R/Plot_LOOCV.R
In TipDatingBeast: Using Tip Dates with Phylogenetic Trees in BEAST

Documented in PlotLOOCV

PlotLOOCV <- function(name, burnin = 0.1) {
  
  #library (TeachingDemos)
  
  opar <-par(no.readonly=TRUE)
  on.exit(par(opar))
  
  # read original file
  
  inFileName <- paste0(name, '.xml')
  inFile <- readLines(inFileName)
  versionLine <- readLines(inFileName,n=1)
  ver <- grepl(pattern = 'version=\"1.0', x = versionLine)
  ver2 <- grepl(pattern = 'version=\"2.', x = versionLine)  
  ver1 = F
  if (ver == T & ver2 ==  F) {ver1 = T}
  
  if (ver1 == T) {
    
    check <- any(grepl(pattern = 'Generated by BEAUTi v1.10.', x = inFile))
    if (check == T) {stop(
      "Leave-one-out analysis not yet implemented for this Beast 1 version")}
    
    matchLines <- grep(pattern = "date value=", x = inFile, value = T)
    values <- na.omit(as.numeric (gsub("[^0-9].", "", matchLines)))
    taxa = length(values)
    for (i in 1 : taxa){
      ## Read replicates log files
      fileRep <- paste0(name, ".Taxon", i, ".log")
      
      if (file_test("-f", fileRep) == F) {stop(
        "Log files not found, check files and verify that follow the proper naming convention
        --filename.Taxon1.log")} 
      
      temp1 <- read.table(fileRep, header = TRUE, sep = '\t')
      bn <- dim(temp1)[1] - round(dim(temp1)[1]*(burnin), 0)
      temp2 <- tail(temp1, bn)
      coltemp <- colnames(temp2[2])
      col <- unlist(strsplit(colnames(temp2[2]), split = "age.")) [2]
      
      if (i == 1) {data <- cbind(temp2[, 2])} else {
        data <- cbind(data, temp2[, 2])}
      if (i == 1) {colnames = col} else {
        colnames = c(colnames, col)}
      
      print (paste("log of taxon", i, "processed"))
    }
  }
  
  if (ver2 == T) {
    
    linearDates=F
    
    taxa <- length(grep('taxon=', inFile))
    line <- grep(pattern = 'traitname=\"date|traitname=\'date', x = inFile)
    
    linearDates <- grepl(pattern = 'value=', inFile[line])
    
    if (linearDates == T) {
      
      dateLine <- inFile[line]
      step1 <- gsub('\">','',strsplit(dateLine, 'value=\"')[[1]][2])
      step2 <- unlist(strsplit(step1, ","))
      numberDates <- length(step2)
      
      date <- unlist(strsplit(step2, "="))
      dateHap <- date[c(T, F)]
      dateHap <- dateHap[1: numberDates]
      dateValues <- date[c(F, T)]
      
      values <- (as.numeric(gsub(",$", "", dateValues)))

      for (i in 1 : taxa){
        ## Read replicates log files
        fileRep <- paste0(name, ".Taxon", i, ".log")  
        temp1 <- read.table(fileRep, header = TRUE, sep = '\t')
        bn <- dim(temp1)[1] - round(dim(temp1)[1]*(burnin), 0)
        temp2 <- tail(temp1, bn)
        coltemp <- colnames(temp2[3])
        col <- unlist(strsplit(colnames(temp2[3]), split = "height.")) [2]
        
        if (i == 1) {data <- cbind(temp2[, 3])} else {
          data <- cbind(data, temp2[, 3])}
        if (i == 1) {colnames = col} else {
          colnames = c(colnames, col)}
        print (paste("log of taxon", i, "processed"))
        
      }
    }
    
    if (linearDates == F) {
      
      datePositions = c()
      
      repeat {
        if (length(grep("value=", inFile[line])) > 0) line <- line + 1
        if (length(grep("alignment", inFile[line])) > 0) break
        if (length(grep("=", inFile[line])) > 0) {datePositions <- c(datePositions, line)}
        line <- line + 1
      }      
     
      numberDates <- length(datePositions)
      dateLines <- inFile[datePositions]
      dateLines <- trimws(dateLines)
      date <- unlist(strsplit(dateLines, "="))
      dateHap <- date[c(T, F)]
      dateHap <- dateHap[1: numberDates]
      values <- date[c(F, T)]
      values <- na.omit(as.numeric (gsub("[^\\d]+", "", values, perl = T)))
      #lastLine <- length(grep("<taxa", dateValues))
      
      for (i in 1 : taxa){
        ## Read replicates log files
        fileRep <- paste0(name, ".Taxon", i, ".log")  
        temp1 <- read.table(fileRep, header = TRUE, sep = '\t')
        bn <- dim(temp1)[1] - round(dim(temp1)[1]*(burnin), 0)
        temp2 <- tail(temp1, bn)
        coltemp <- colnames(temp2[3])
        col <- unlist(strsplit(colnames(temp2[3]), split = "height.")) [2]
        
        if (i == 1) {data <- cbind(temp2[, 3])} else {
          data <- cbind(data, temp2[, 3])}
        if (i == 1) {colnames = col} else {
          colnames = c(colnames, col)}
        print (paste("log of taxon", i, "processed"))
        
      }
    }
  }
  
  colnames(data) = colnames
  ### Convert in calendar years, last sampling date
  if (ver1 == T) {
    data = ceiling(max(values)) - data
  }
  
  stats = matrix('NA',3,dim(data)[2])
  colnames(stats) = colnames(data)
  rownames(stats) = c("median", "min", "max")
  
  for (i in 1:dim(stats)[2]){
    stats[1,i] = median(data[, i])  
    stats[2,i] = emp.hpd(data[, i], conf = 0.95)[1]
    stats[3,i] = emp.hpd(data[, i], conf = 0.95)[2]
  }
  
  ### Export the data into a text table that summarizes the results
  write.table (stats, paste0(name, "_leave_one_out.txt"))
  mindata = floor(as.numeric(min(stats[2, ])))
  maxdata = ceiling(as.numeric(max(stats[3, ])))
  xsp <- maxdata - mindata
  xlim <- c(mindata - xsp * 0.35, maxdata)
  ylim <- c(-2*taxa, 0)
  
  fail = NULL
  plot(1, xlim = xlim, ylim = ylim, axes = F, xlab = "",
       ylab = "", type = "n", ann = F)
  for (i in 1 : taxa){
    median = (as.numeric(stats[1, i]))
    min = round((as.numeric(stats[2, i])), 3)
    max = round((as.numeric(stats[3, i])), 3)
    label = substr(colnames[i], 1, 20)
    arrows (min, -2*i, max, -2*i, angle = 90, code = 3, length = 0.08, lwd = 1)
    points (median,  -2*i, cex = 1, pch = 3)
    if (min <= values[i] & values[i] <= max) {pt = 1; col = "black"}
    else {pt = 20; col = "red"; fail <- append(fail, i)}
    points (values[i],  -2*i, cex = 1, pch = pt, col = col, bg = col)
    text ((mindata - xsp * 0.20), -2*i, labels = label, cex = 0.5)
  } 
  
  #Scale in bottom
  axis(1, at = seq(mindata, maxdata), labels = seq(mindata, maxdata),
       cex.axis = 0.7, lwd = 1, line = 0)
  LOOCV_result <- all(fail == 0)
  
  # Print result over the plot
  if (LOOCV_result == TRUE) {
    mtext("Pass!!! Age estimation for all taxa inside the expected 95% HPD",
          side = 3, line = 1, col = "red")
  } else {mtext(paste("Age estimation for taxon/taxa",
                      paste(fail, collapse = ", "), 
                      "is/are not overlapping with expected 95% HPD"), side = 3,
                line = 2, col = "red");
    mtext("Attention !!! check LOOCV report file", side = 3, line = 1, 
          col = "red");
    write.table (colnames[fail], row.names = fail, sep = ",",
                 col.names = "Taxon not overlapping with estimated 95% HPD (position, name)",
                 paste0(name, "_LOOCV_report.txt"))
  }
  
  pdf(paste0("Fig_LOOCV_", name, ".pdf"))
  
  plot(1, xlim = xlim, ylim = ylim, axes = F, xlab = "",
       ylab = "", type = "n", ann = F)
  for (i in 1 : taxa){
    median = (as.numeric(stats[1, i]))
    min = round((as.numeric(stats[2, i])), 3)
    max = round((as.numeric(stats[3, i])), 3)
    label = substr(colnames[i], 1, 20)
    arrows (min, -2*i, max, -2*i, angle = 90, code = 3, length = 0.08, lwd = 1)
    points (median,  -2*i, cex = 1, pch = 3)
    if (min <= values[i] & values[i] <= max) {pt = 1; col = "black"}
    else {pt = 20; col = "red"; fail}
    points (values[i],  -2*i, cex = 1, pch = pt, col = col, bg = col)
    text ((mindata - xsp * 0.20), -2*i, labels = label, cex = 0.5)
  } 
  #Scale in bottom
  axis(1, at = seq(mindata, maxdata), labels = seq(mindata, maxdata),
       cex.axis = 0.7, lwd = 1, line = 0)
  LOOCV_result <- all(fail == 0)
  
  # Print result over the plot
  if (LOOCV_result == TRUE) {
    mtext("Pass!!! Age estimation for all taxa inside the expected 95% HPD",
          side = 3, line = 1, col = "red")
  } else {mtext(paste("Age estimation for taxon/taxa",
                      paste(fail, collapse = ", "), 
                      "is/are not overlapping with expected 95% HPD"), side = 3,
                line = 2, col = "red");
    mtext("Attention !!! check LOOCV report file", side = 3, line = 1, 
          col = "red")
  }
  
  dev.off()
}