Load the libraries

library("tidyverse")
library("ape")
library("coda")

Functions for getting the number of polymorphic sites and informative sites at 1) each visit date, and 2) cumulatively (up to and including that visit date). sequences should be in character matrix format, with visit dates appended to the sequence names with an underscore (code assumes three visits).

#Number of polymorphic sites
polymorphic<-function(sequences){
  sequences=toupper(sequences)      #All caps
    sequences[sequences=="U"]="T"   #Convert U to T
    
    poly.sites<-table(apply(sequences,2,function(x) any(x=="A")+any(x=="C")+any(x=="G")+any(x=="T")))
    
    sum(poly.sites[which(!names(poly.sites)%in%c("0","1"))])
}

#Get polymorphic and informative sites at each visit
getSequenceInfo<-function(sequences){

  #Get the sampling times from the tips
  seq.dates<-rownames(sequences)%>%word(-1,-1,sep="_")%>%as.numeric()
  
  #Get the max sampling date
  max.sample.date<-max(seq.dates)
  
  #Get dates for each visit
  v1<-sort(unique(seq.dates))[1]
  v2<-sort(unique(seq.dates))[2]
  v3<-sort(unique(seq.dates))[3]

  #Number of sequences
  n.seqs<-length(rownames(sequences))
    
  #Sequence length
  seq.length<-dim(sequences)[2]

  #Get parsimony informative sites 
  pis.all<-phyloch::pis(as.DNAbin(sequences))
  pis.1st<-phyloch::pis(as.DNAbin(sequences[which(seq.dates==v1),]))
  pis.2nd<-phyloch::pis(as.DNAbin(sequences[which(seq.dates==v2),]))
  pis.3rd<-phyloch::pis(as.DNAbin(sequences[which(seq.dates==v3),]))
  pis.1st2nd<-phyloch::pis(as.DNAbin(sequences[which(seq.dates!=v3),]))
  
  #Get total polymorphic sites
  poly.all<-polymorphic(sequences)
  poly.1st<-polymorphic(sequences[which(seq.dates==v1),])
  poly.2nd<-polymorphic(sequences[which(seq.dates==v2),])
  poly.3rd<-polymorphic(sequences[which(seq.dates==v3),])
  poly.1st2nd<-polymorphic(sequences[which(seq.dates!=v3),])

  return(data.frame(visit1.date=v1,visit2.date=v2,visit3.date=v3,n.seqs,seq.length,poly.1st,poly.2nd,poly.3rd,poly.1st2nd,poly.all,pis.1st,pis.2nd,pis.3rd,pis.1st2nd,pis.all,stringsAsFactors=F))
}

Function to test temporal signal using root-to-tip. tr should be a tree in phylo format, with visit dates appended to the tip names with an underscore.

#Function to get the RTT results
getRTTresults<-function(tr){
  
  #Get dates from tree
  dates<-as.numeric(str_split_fixed(tr$tip.label,"_",n=3)[,3])

  #Do root to tip regression
  tr.rtt<-rtt(tr,dates,opt.tol=1E-6)
    
  #Get distance from tips to root for rooted tree
  root.dist<-adephylo::distRoot(tr.rtt)
  tip.dist<-dates[match(tr.rtt$tip.label,tr$tip.label)]

  #Regression of calendar time against genetic time to get estimate of tmrca and substitution rate
  rtt.lm<-lm(root.dist~tip.dist)
  rtt.tmrca<-unname(-as.double(coef(rtt.lm)[1])/coef(rtt.lm)[2])
  rtt.r<-coef(rtt.lm)[2]
  
  #Get one-sided p-value for H1: subs.rate > 0
  rtt.pval<-pt(coef(summary(rtt.lm))[,"t value"],rtt.lm$df,lower=FALSE)["tip.dist"]
  
  output<-data.frame(tmrca=rtt.tmrca,subs.rate=rtt.r,p.val=rtt.pval,stringsAsFactors=FALSE)
  return(output)
}

Functions to assess convergence and process BEAST log files ready for analysis.

lacksConvergence<-function(beastLog,min.ess=200){
  if("state"%in%names(beastLog)){
    beastLog<-beastLog[,-which(names(beastLog)=="state")]
  }
  
  #Reject any runs with NaNs 
  if(any(beastLog=="NaN")){
    #beastLog[beastLog=="NaN"]<-0
    return(TRUE)
  }
  
  #Some parameters shouldn't move - remove those
  toIgnore<-c(grep("K80_.*freq",names(beastLog)),grep("K81_.*freq",names(beastLog)),grep("TRNEF_.*freq",names(beastLog)),grep("TIMEF_.*freq",names(beastLog)))
  
  if(length(toIgnore>0)){
    beastLog<-beastLog[,-toIgnore]    
  }

  #Reject any runs where a column doesn't move
  if(any(apply(beastLog,2,function(x) length(unique(x)))==1)){
    return(TRUE)
  }
    
  #Check for values smaller than -1E100, which seem to be problematic for the ESS function, and replace with -1E50
  if(any(beastLog<(-1E100))){
    beastLog[beastLog<(-1E100)]<--1E50
  }

  this.ess<-coda::effectiveSize(beastLog)
  return(any(this.ess<=min.ess))
}

#Define function to return a data frame for a participant and model
getAgeOfInfection<-function(clockModel,beastLog,maxDate){

  if(length(beastLog)<=1){
    #If beast log is missing, only save out the metadata
    out<-data.frame(tmrca=NA,subs.rate=NA,stringsAsFactors = F)
  }else{
    if(clockModel=="ucld"){
      beastLog<-beastLog%>%select(treeHeight=treeModel.rootHeight,subs.rate=ucld.mean)
    }else{
      if(clockModel=="uced"){
        beastLog<-beastLog%>%select(treeHeight=treeModel.rootHeight,subs.rate=uced.mean)
      }else{
        if(clockModel=="rlc"||clockModel=="strict"){
          beastLog<-beastLog%>%select(treeHeight=treeModel.rootHeight,subs.rate=clock.rate)
        }else{
          warning("Clock model unknown")
        }
      }
    }
    
    #Get root height relative to day of diagnosis
    out<-beastLog%>%
      mutate(tmrca=maxDate-treeHeight)%>%
      select(-treeHeight)
    
  }
  return(out)
}

Example usage for the lacksConvergence and getAgeOfInfection functions.

genes<-c("genome")
pids<-c(10066)
maxSampleDate<-172
model.combos<-c("strict_constant")

#Set burn in
burnin<-2500

#Set min.ess
min.ess<-200

#Set thinning interval (as in, save every nth sample where n is the number set below)
thin<-20

#Initialize the data frame
beast.log<-data.frame(pid=integer(),
                      model=character(),
                      gene=character(),
                      tmrca=integer(),
                      subs.rate=integer(),
                      stringsAsFactors=F)

for(gene in genes){

  #Run for all participants
  for(i in 1:length(pids)){
    this.pid<-pids[i]
    this.maxSampleDate<-maxSampleDate[i]
  
    #Run for other model combinations
    for(j in 1:length(model.combos)){

      this.model<-model.combos[j]

      #Get BEAST log
      this.logfile<-paste0(this.pid,"_",this.model,"_beast.log")
      
      #Check if log file exists
      if(!file.exists(this.logfile)){
        warning(paste("File missing for ",this.logfile,"\n"))
        next
      }else{
        if(file.size(this.logfile)>0){
          this.beastLog<-read.table(this.logfile,sep="\t",header=T,stringsAsFactors=F)
        }else{
          warning(paste("File empty for ",this.logfile,"\n"))
          next
        }
      }
      
      #Check the length of the log file
      if(length(this.beastLog$state)>(2*burnin)){
        
        #Remove burn in and thin
        this.beastLog<-this.beastLog[seq((burnin+1),length(this.beastLog$state),thin),]
        
        #Check ESS is greater than min.ess
        this.ess<-lacksConvergence(this.beastLog)
        
        if(this.ess==FALSE){
          
          #ESS is greater than min.ESS. Add log to beast.log to plot
          this.beastLogExtracted<-getAgeOfInfection(clockModel=word(this.model,1,1,"_"),beastLog=this.beastLog,maxDate=this.maxSampleDate)%>%
            cbind(pid=this.pid,model=this.model,gene=gene,stringsAsFactors=F)
 
          
        }else{
          #Did not converge
          this.beastLogExtracted<-getAgeOfInfection(clockModel=word(this.model,1,1,"_"),beastLog=NA,maxDate=this.maxSampleDate)%>%
            cbind(pid=this.pid,model=this.model,gene=gene,stringsAsFactors=F)
        }
      }else{
        #Too short
          this.beastLogExtracted<-getAgeOfInfection(clockModel=word(this.model,1,1,"_"),beastLog=NA,maxDate=this.maxSampleDate)%>%
            cbind(pid=this.pid,model=this.model,gene=gene,stringsAsFactors=F)
      }
      
      #Add to output data frame
      beast.log<-bind_rows(beast.log,this.beastLogExtracted)
    }
  }
}

head(beast.log)
##     pid           model   gene     tmrca    subs.rate
## 1 10066 strict_constant genome -27.52797 1.778906e-05
## 2 10066 strict_constant genome -19.73683 1.697652e-05
## 3 10066 strict_constant genome -19.65995 1.859055e-05
## 4 10066 strict_constant genome -21.13862 1.582134e-05
## 5 10066 strict_constant genome -22.35791 1.843267e-05
## 6 10066 strict_constant genome -22.62541 1.646789e-05