R/computeRTCor.R
In SwathXtend: SWATH extended library generation and statistical data analysis

###########################################################################
#' Plot for retention time correlation of two libraries
#' @param dat1 A data frame containing the first spectrum library
#' @param dat2 A data frame containing the second spectrum library
#' @param method a character string indicating the method for calculating
#' correlation coefficient. One of "time" (default) and "hydro".
#' @param wb an openxlsx workbook object
#' @param sheet A name or index of a worksheet
#' @param label1 a character string representing the x axis label for plotting
#' @param label2 a character string representing the y axis label for plotting
#' @param nomod a logic value, representing if the modified peptides and its
#' fragment ions will be removed. FALSE (default) means not removing.
#' @return two .png files of RT correlation and RT residual plots will be saved
#' under current working directory
#' @examples 
#' libfiles <- paste(system.file("files",package="SwathXtend"),
#'    c("Lib2.txt","Lib3.txt"),sep="/")
#' datBaseLib <- readLibFile(libfiles[1])
#' datExtLib <- readLibFile(libfiles[2])
#' computeRTCor(datBaseLib, datExtLib)
############################################################################ 

computeRTCor <- function(dat1, dat2, datHydroIndex, method=c("time", "hydro"), wb=NULL, sheet=NULL,
                      label1="Base Lib", label2 = "External Lib", nomod=FALSE)
{
  
  method <- match.arg(method)
  
  if(method == "time"){
    
    datRTrain <- getRTrain(dat1, dat2, nomod=nomod)
    
    bestModel <- selectModel(datRTrain)
    
    
    if(nrow(datRTrain) > 10){
      
      r2 <- cor(datRTrain[,2],datRTrain[,3])^2
      

      predicted <- predict(bestModel, newdata=datRTrain)
        
      rmse <- sqrt(mean((datRTrain[,2]-predicted)^2))      
      
      
      # plot RT correlation

		  png("RT_correlation.png")
		  
		  plot(datRTrain[,2],datRTrain[,3],
			   xlab=paste(label1, "RT"),ylab=paste(label2, "RT"))      
		  
		  
		  lines(lowess(datRTrain[,2],datRTrain[,3]),col="red")
		  
		  r2label <- bquote(italic(R)^2 ==. (format(r2, digits = 2)))
		  
		  text(min(datRTrain[,2])+2, max(datRTrain[,3])-2,
			   labels = r2label,pos=4)
		  dev.off()
		  
		  
		  if(!is.null(wb) & !is.null(sheet))
			insertImage(wb, sheet, "RT_correlation.png", width=6, height=5, startRow=1, startCol=1)
      
      
      # plot residuals
		  png("RT_residual.png")
		  
	#       plot(residuals(fit))
		  resids <- predicted-datRTrain[,2]

		  plot(resids, main=paste("Residual plot for",
								  attr(bestModel,"class")[length(attr(bestModel,"class"))]))
		  
		  
		  abline(h=floor(rmse),col="red")
		  abline(h=-floor(rmse), col="red")
		  axis(2, c(-floor(rmse),floor(rmse)), col.ticks="red")
		  
		  rmselabel <- bquote(italic(rmse) ==. (format(rmse, digits=2)))
		  
		  text(2, max(resids), labels = rmselabel, pos=4)     
		  
		  
		  dev.off()


		  if(!is.null(wb) & !is.null(sheet))
			insertImage(wb, sheet, "RT_residual.png", width=6, height=5, startRow=1, startCol=10)
  

    }
    
    list(RT.cor = r2, RMSE = rmse)
    
  } else if (method == "hydro"){
    
    if(missing(datHydroIndex)) stop("Missing data frame of hydrophibicity index.")
    
    colnames(datHydroIndex)[grep("sequence",tolower(colnames(datHydroIndex)))]<-
      "sequence"
    
    colnames(datHydroIndex)[grep("hydro",tolower(colnames(datHydroIndex)))] <- "hydro"
    
    colnames(datHydroIndex)[grep("len",tolower(colnames(datHydroIndex)))] <- "length"
    
    dat1 <- dat1[!duplicated(dat1$stripped_sequence),]
    
    datHydroIndex <- datHydroIndex[!duplicated(datHydroIndex$sequence),]
    
    datHydroIndex$hydro <- as.numeric(datHydroIndex$hydro)
    
    datRTrain <- merge(dat1[,c("stripped_sequence","RT_detected")],
                        datHydroIndex[,c("sequence","hydro","length")],
                        by.x="stripped_sequence",
                        by.y=tolower("sequence"),all=FALSE)
    
    datRTrain$length <- log(datRTrain$length)
    
    f.h= as.formula("RT_detected~hydro")
    
    f.hl <- as.formula("RT_detected~hydro+length")
    
    
    if(nrow(datRTrain) > 10){
      fit.h <- lm(f.h, data=datRTrain)
      fit.hl <- lm(f.hl, data=datRTrain)
      
		  png("RT_correlation_hydro.png")
		  
		  plot(datRTrain[,2]~datRTrain[,3],
			   xlab="Hydrophibicity index",ylab=paste(label1,"RT"))
		  
		  r2 <- cor(datRTrain[,3],datRTrain[,2])^2
		  
		  lines(lowess(datRTrain[,3],datRTrain[,2]),col="red")
		  
		  r2label <- bquote(italic(R)^2 ==. (format(r2, digits = 2)))
		  
		  text(min(datRTrain[,3])+2, max(datRTrain[,2])-2,
			   labels = r2label,pos=4)
		  dev.off()
		  
		  
		  if(!is.null(wb) & !is.null(sheet))
			insertImage(wb, sheet, "RT_correlation_hydro.png", width=6, height=5, startRow=1, startCol=1)
		  
		  
		  png("RT_correlation_hydroNlength.png")
		  layout(matrix(1:2,nrow=2))
		  
		  plot(datRTrain[,2]~datRTrain[,3],
			   xlab=paste(label1,"Hydro"),ylab=paste(label1,"RT"))
		  
		  r2 <- cor(datRTrain[,3],datRTrain[,2])^2
		  
		  lines(lowess(datRTrain[,3],datRTrain[,2]),col="red")  
		  
		  r2label <- bquote(italic(R)^2 ==. (format(r2, digits = 2)))
		  
		  text(min(datRTrain[,3])+2, max(datRTrain[,2])-2,
			   labels = r2label,pos=4)
		  
		  plot(datRTrain[,2]~datRTrain[,4],
			   xlab="ln(Length)",ylab="RT",
			   main="Correlation between ln(Length) and RT")
		  
		  r2 <- cor(datRTrain[,4],datRTrain[,2])^2
		  
		  lines(lowess(datRTrain[,4],datRTrain[,2]),col="red")  
		  
		  r2label <- bquote(italic(R)^2 ==. (format(r2, digits = 2)))
		  
		  text(max(datRTrain[,4])-0.1, min(datRTrain[,2]+10),
			   labels = r2label)
		  dev.off()
		  
		  
		  if(!is.null(wb) & !is.null(sheet))
			insertImage(wb, sheet, "RT_correlation_hydroNlength.png", width=6, height=5, startRow=1, startCol=10)

      
      rmse.h <- sqrt(mean(resid(fit.h)^2))
      rmse.hl <- sqrt(mean(resid(fit.hl)^2))

      ## plot
		  png("RT_residual_hydro.png")
		  
		  plot(residuals(fit.h))
		  
		  abline(h=floor(rmse.h),col="red")
		  abline(h=-floor(rmse.h), col="red")
		  axis(2, c(-floor(rmse.h),floor(rmse.h)), col.ticks="red")
		  
		  rmselabel <- bquote(italic(rmse) ==. (format(rmse.h, digits=2)))
		  
		  text(2, max(resid(fit.h)), labels = rmselabel, pos=4)
		  dev.off()
		  
		  png("RT_residual_hydroNlength.png")
		  
		  plot(residuals(fit.hl))
		  
		  abline(h=floor(rmse.hl),col="red")
		  abline(h=-floor(rmse.hl), col="red")
		  axis(2, c(-floor(rmse.hl),floor(rmse.hl)), col.ticks="red")
		  
		  rmselabel <- bquote(italic(rmse) ==. (format(rmse.hl, digits=2)))
		  
		  text(2, max(resid(fit.hl)), labels = rmselabel, pos=4)
		  dev.off()
		  
		  
		  if(!is.null(wb) & !is.null(sheet))
			insertImage(wb, sheet, "RT_residual_hydro.png", width=6, height=5, startRow=1, startCol=20)
		  

    }
  } else { # sequence
    d <- dat1[,c("stripped_sequence","RT_detected")]
    
    d.aa <- do.call(rbind, lapply(d$stripped_sequence, convertAACount))
    
    d.aa$RT_detected <- d$RT_detected
    
    fit.seq <- lm(RT_detected ~., d.aa[,-1])
    
    rmse <- sqrt(mean(resid(fit.seq)^2))
    

		plot(residuals(fit.seq))
		
		abline(h=floor(rmse),col="red")
		abline(h=-floor(rmse), col="red")
		axis(2, c(-floor(rmse),floor(rmse)), col.ticks="red")
		
		rmselabel <- bquote(italic(rmse) ==. (format(rmse, digits=2)))
		
		text(length(resid(fit.seq))-200, max(resid(fit.seq)), labels = rmselabel,pos=2)
    

    
  }
  
}