Load the libraries
library("tidyverse")
library("ape")
library("coda")
Functions for getting the number of polymorphic sites and informative sites at 1) each visit date, and 2) cumulatively (up to and including that visit date). sequences
should be in character matrix format, with visit dates appended to the sequence names with an underscore (code assumes three visits).
#Number of polymorphic sites
polymorphic<-function(sequences){
sequences=toupper(sequences) #All caps
sequences[sequences=="U"]="T" #Convert U to T
poly.sites<-table(apply(sequences,2,function(x) any(x=="A")+any(x=="C")+any(x=="G")+any(x=="T")))
sum(poly.sites[which(!names(poly.sites)%in%c("0","1"))])
}
#Get polymorphic and informative sites at each visit
getSequenceInfo<-function(sequences){
#Get the sampling times from the tips
seq.dates<-rownames(sequences)%>%word(-1,-1,sep="_")%>%as.numeric()
#Get the max sampling date
max.sample.date<-max(seq.dates)
#Get dates for each visit
v1<-sort(unique(seq.dates))[1]
v2<-sort(unique(seq.dates))[2]
v3<-sort(unique(seq.dates))[3]
#Number of sequences
n.seqs<-length(rownames(sequences))
#Sequence length
seq.length<-dim(sequences)[2]
#Get parsimony informative sites
pis.all<-phyloch::pis(as.DNAbin(sequences))
pis.1st<-phyloch::pis(as.DNAbin(sequences[which(seq.dates==v1),]))
pis.2nd<-phyloch::pis(as.DNAbin(sequences[which(seq.dates==v2),]))
pis.3rd<-phyloch::pis(as.DNAbin(sequences[which(seq.dates==v3),]))
pis.1st2nd<-phyloch::pis(as.DNAbin(sequences[which(seq.dates!=v3),]))
#Get total polymorphic sites
poly.all<-polymorphic(sequences)
poly.1st<-polymorphic(sequences[which(seq.dates==v1),])
poly.2nd<-polymorphic(sequences[which(seq.dates==v2),])
poly.3rd<-polymorphic(sequences[which(seq.dates==v3),])
poly.1st2nd<-polymorphic(sequences[which(seq.dates!=v3),])
return(data.frame(visit1.date=v1,visit2.date=v2,visit3.date=v3,n.seqs,seq.length,poly.1st,poly.2nd,poly.3rd,poly.1st2nd,poly.all,pis.1st,pis.2nd,pis.3rd,pis.1st2nd,pis.all,stringsAsFactors=F))
}
Function to test temporal signal using root-to-tip. tr
should be a tree in phylo
format, with visit dates appended to the tip names with an underscore.
#Function to get the RTT results
getRTTresults<-function(tr){
#Get dates from tree
dates<-as.numeric(str_split_fixed(tr$tip.label,"_",n=3)[,3])
#Do root to tip regression
tr.rtt<-rtt(tr,dates,opt.tol=1E-6)
#Get distance from tips to root for rooted tree
root.dist<-adephylo::distRoot(tr.rtt)
tip.dist<-dates[match(tr.rtt$tip.label,tr$tip.label)]
#Regression of calendar time against genetic time to get estimate of tmrca and substitution rate
rtt.lm<-lm(root.dist~tip.dist)
rtt.tmrca<-unname(-as.double(coef(rtt.lm)[1])/coef(rtt.lm)[2])
rtt.r<-coef(rtt.lm)[2]
#Get one-sided p-value for H1: subs.rate > 0
rtt.pval<-pt(coef(summary(rtt.lm))[,"t value"],rtt.lm$df,lower=FALSE)["tip.dist"]
output<-data.frame(tmrca=rtt.tmrca,subs.rate=rtt.r,p.val=rtt.pval,stringsAsFactors=FALSE)
return(output)
}
Functions to assess convergence and process BEAST log files ready for analysis.
lacksConvergence
checks the ESS of each parameter, and returns TRUE
if any have failed to converge (after removing parameters that should remain fixed).
beastLog
: data frame from BEAST log read in by, for example, read.table
or read_tsv
.min.ess
: minimum effective sample size for convergence. Default is 200.getAgeOfInfection
processes a beastLog
, outputting a data frame with the substitution rate (named subs.rate
to match across all clock models) and TMRCA in days relative to first positive test.
clockModel
: the clock model the log is for. -beastLog
: data frame from BEAST log read in by, for example, read.table
or read_tsv
.maxDate
: the maximum date of sampling for this participant. Used to convert treeHeight into days relative to first positive test.lacksConvergence<-function(beastLog,min.ess=200){
if("state"%in%names(beastLog)){
beastLog<-beastLog[,-which(names(beastLog)=="state")]
}
#Reject any runs with NaNs
if(any(beastLog=="NaN")){
#beastLog[beastLog=="NaN"]<-0
return(TRUE)
}
#Some parameters shouldn't move - remove those
toIgnore<-c(grep("K80_.*freq",names(beastLog)),grep("K81_.*freq",names(beastLog)),grep("TRNEF_.*freq",names(beastLog)),grep("TIMEF_.*freq",names(beastLog)))
if(length(toIgnore>0)){
beastLog<-beastLog[,-toIgnore]
}
#Reject any runs where a column doesn't move
if(any(apply(beastLog,2,function(x) length(unique(x)))==1)){
return(TRUE)
}
#Check for values smaller than -1E100, which seem to be problematic for the ESS function, and replace with -1E50
if(any(beastLog<(-1E100))){
beastLog[beastLog<(-1E100)]<--1E50
}
this.ess<-coda::effectiveSize(beastLog)
return(any(this.ess<=min.ess))
}
#Define function to return a data frame for a participant and model
getAgeOfInfection<-function(clockModel,beastLog,maxDate){
if(length(beastLog)<=1){
#If beast log is missing, only save out the metadata
out<-data.frame(tmrca=NA,subs.rate=NA,stringsAsFactors = F)
}else{
if(clockModel=="ucld"){
beastLog<-beastLog%>%select(treeHeight=treeModel.rootHeight,subs.rate=ucld.mean)
}else{
if(clockModel=="uced"){
beastLog<-beastLog%>%select(treeHeight=treeModel.rootHeight,subs.rate=uced.mean)
}else{
if(clockModel=="rlc"||clockModel=="strict"){
beastLog<-beastLog%>%select(treeHeight=treeModel.rootHeight,subs.rate=clock.rate)
}else{
warning("Clock model unknown")
}
}
}
#Get root height relative to day of diagnosis
out<-beastLog%>%
mutate(tmrca=maxDate-treeHeight)%>%
select(-treeHeight)
}
return(out)
}
Example usage for the lacksConvergence
and getAgeOfInfection
functions.
genes<-c("genome")
pids<-c(10066)
maxSampleDate<-172
model.combos<-c("strict_constant")
#Set burn in
burnin<-2500
#Set min.ess
min.ess<-200
#Set thinning interval (as in, save every nth sample where n is the number set below)
thin<-20
#Initialize the data frame
beast.log<-data.frame(pid=integer(),
model=character(),
gene=character(),
tmrca=integer(),
subs.rate=integer(),
stringsAsFactors=F)
for(gene in genes){
#Run for all participants
for(i in 1:length(pids)){
this.pid<-pids[i]
this.maxSampleDate<-maxSampleDate[i]
#Run for other model combinations
for(j in 1:length(model.combos)){
this.model<-model.combos[j]
#Get BEAST log
this.logfile<-paste0(this.pid,"_",this.model,"_beast.log")
#Check if log file exists
if(!file.exists(this.logfile)){
warning(paste("File missing for ",this.logfile,"\n"))
next
}else{
if(file.size(this.logfile)>0){
this.beastLog<-read.table(this.logfile,sep="\t",header=T,stringsAsFactors=F)
}else{
warning(paste("File empty for ",this.logfile,"\n"))
next
}
}
#Check the length of the log file
if(length(this.beastLog$state)>(2*burnin)){
#Remove burn in and thin
this.beastLog<-this.beastLog[seq((burnin+1),length(this.beastLog$state),thin),]
#Check ESS is greater than min.ess
this.ess<-lacksConvergence(this.beastLog)
if(this.ess==FALSE){
#ESS is greater than min.ESS. Add log to beast.log to plot
this.beastLogExtracted<-getAgeOfInfection(clockModel=word(this.model,1,1,"_"),beastLog=this.beastLog,maxDate=this.maxSampleDate)%>%
cbind(pid=this.pid,model=this.model,gene=gene,stringsAsFactors=F)
}else{
#Did not converge
this.beastLogExtracted<-getAgeOfInfection(clockModel=word(this.model,1,1,"_"),beastLog=NA,maxDate=this.maxSampleDate)%>%
cbind(pid=this.pid,model=this.model,gene=gene,stringsAsFactors=F)
}
}else{
#Too short
this.beastLogExtracted<-getAgeOfInfection(clockModel=word(this.model,1,1,"_"),beastLog=NA,maxDate=this.maxSampleDate)%>%
cbind(pid=this.pid,model=this.model,gene=gene,stringsAsFactors=F)
}
#Add to output data frame
beast.log<-bind_rows(beast.log,this.beastLogExtracted)
}
}
}
head(beast.log)
## pid model gene tmrca subs.rate
## 1 10066 strict_constant genome -27.52797 1.778906e-05
## 2 10066 strict_constant genome -19.73683 1.697652e-05
## 3 10066 strict_constant genome -19.65995 1.859055e-05
## 4 10066 strict_constant genome -21.13862 1.582134e-05
## 5 10066 strict_constant genome -22.35791 1.843267e-05
## 6 10066 strict_constant genome -22.62541 1.646789e-05