borealis: Tools for reproducible geometric morphometric analysis

Documented in qPCR.plot

#' Plot raw qPCR data
#'
#' This function takes raw real-time PCR data and produces a convenient plot and table,
#' which can be used to assess the quality of the data.
#'
#' The function takes information on the standard concentrations (unitless, but assumed
#' to be log10 of template molecules per microliter) and Cq values measured by the
#' real-time PCR instrument. A linear regression provides intercept and slope, from which
#' PCR efficiency is calculated. The correlation coefficient is also displayed.
#' Potential outlier values in the standard Cq values are highlighted on the plot in red.
#' Efficiency values outside the range of 1.8 to 2.2 (90% - 110%) are also highlighted,
#' as are correlation coefficients (R^2) less than 0.9.
#'
#' If Cq values are provided for unknown samples, they are plotted on the line and the
#' corresponding starting concentrations are provided in the output table.
#'
#' If they are provided, the minimum Cq values for negative controls (NRT and NTC) are
#' plotted as horizontal lines (gray and blue, respectively) and used to call unknowns
#' as reliably above zero.
#'
#' Any additional arguments are passed to \code{graphics::plot}.
#'
#' @source   Dave Angelini \email{david.r.angelini@@gmail.com} [aut, cre]
#'
#' @param std.conc A numeric vector providing the concentrations of the standards used.
#' @param std.cq A numeric vector of Cq values measured for the standard samples.
#' @param unk.cq A numeric vector of Cq values measured for unknown samples.
#' @param nrt.cq A numeric vector of Cq values measured for no-reverse transcription control (NRT) samples.
#' @param ntc.cq A numeric vector of Cq values measured for no-template control (NTC) samples.
#' @param replicate.number The number of technical replicates in the standards (Default value is 3).
#' @param outlier.method A character specifying the method to use in determining outliers among the standards.
#'     The default is \code{absolute} which flags values more than 1.5 cycles from the median.
#'     Alternatively \code{IQR} will flags replicates more than 1.5 times the interquartile range from the median Cq value.
#' @param unk.replicates Specifies whether \code{unk.cq} contains technical replicates (or previously averaged) values.
#'     The default value is \code{FALSE}. If set to \code{TRUE}, then the function assumes
#'     the same \code{replicate.number} used for standards.
#'     Alternatively, a single numeric value can specify a different number of technical
#'     replicates for the unknown samples.
#' @param error.bars A logical factor specifying whether to include error bars on the unknowns.
#'     Only applies if there are replicates for the unknowns.
#' @param unk.ids An optional character vector with ID names for the unknown samples.
#'     Alternative, if set to \code{TRUE}, then \code{unk.cq} values will be numbered in order.
#'     If \code{unk.replicates} is \code{TRUE} or set to a numeric, then one only one
#'     value should be provided in the \code{unk.ids} vector for each sample.
#' @param unk.loading A single numeric value or a vector with one value for each unknown sample.
#'     (If there are technical replicates for unknowns, provide one value for each replicate group.)
#'     If the plate contains unknowns with different loading volumes or concentrations, this
#'     argument can be used to adjust the calculation of starting template concentrations in the
#'     unknowns to a per-unit value.
#'     This is applied to the concentrations calculated in the output table.
#'     To apply the argument to the plot too, set \code{unk.loading.on.plot = TRUE}
#' @param unk.loading.on.plot A logical factor specifying whether to apply the \code{unk.loading}
#'     to the unknown values as they appear on the plot.
#' @param main An overall title for the plot.
#' @param xlab A title for the x axis.
#' @param ylab A title for the y axis.
#' @param plot.only A logical factor specifying whether to limit output to the plot.
#'     If \code{FALSE} (the default), then the function also returns tables of the input data
#'     and the inferred unknown concentrations as well as descriptive statistics from the standards.
#'
#' @export
#'
#' @examples
#' std.conc <- c(rep(6,3),rep(5,3),rep(4,3),rep(3,3))
#' std.cq <- c(25.74, 26.33, 25.03,  28.42, 28.90, 27.30,
#'             32.15, 32.31, 30.61,  34.46, 34.04, 34.29)
#' unk.cq <- c(28.531, 29.331, 29.466,  29.597, 30.168, 30.258,
#'             30.535, 31.206, 31.279,  32.332, 33.096, 32.114)
#' nrt.cq <- c(32.76, 32.80, 31.51)
#' ntc.cq <- c(36.08, 35.56, 37.62)
#'
#' qPCR.plot(std.conc, std.cq, unk.cq, nrt.cq, ntc.cq, main="TFX (FAM)")
#'
#' qPCR.plot(std.conc, std.cq, unk.cq, nrt.cq, ntc.cq, main="TFX (FAM)",
#'           unk.ids = TRUE)
#'
#' qPCR.plot(std.conc, std.cq, unk.cq, nrt.cq, ntc.cq, main="TFX (FAM)",
#'           unk.replicates = TRUE,
#'           unk.ids = c("A","B","C","control"))
#'
#' qPCR.plot(std.conc, std.cq, unk.cq, nrt.cq, ntc.cq, main="TFX (FAM)",
#'           unk.replicates = TRUE,
#'           unk.loading = c(0.5,1,2,2),
#'           unk.loading.on.plot = TRUE,
#'           unk.ids = c("A","B","C","control"))
#'
#' # It's often convenient to scan in values copied from a spreadsheet, e.g.
#' # std.conc <- scan()
#'
#' # If no inputs are provided, the function simulates data and analysis,
#' # which can be useful when training people new to real-time PCR.
#' qPCR.plot()
#'

qPCR.plot <- function(std.conc=NULL, std.cq=NULL, unk.cq=NULL, nrt.cq=NULL, ntc.cq=NULL,
                      replicate.number = 3,
                      outlier.method = c("absolute","IQR"),
                      unk.replicates = FALSE,
                      error.bars = TRUE,
                      unk.ids = NULL,
                      unk.loading = 1,
                      unk.loading.on.plot = FALSE,
                      main = NULL,
                      xlab = 'log10 template / ul',
                      ylab = 'Cq',
                      plot.only = FALSE,
                      ...)
{ # Begin the function

  # Sub-function
  robust.rep <- function(x, times, ...) {
    if (is.null(times)) { return(NULL) } else {
      if (is.na(times)) { return(NULL) } else {
        if (times < 1) { return(NULL) } else {
          rep(x=x, times=times, ...)
        }
      }
    }
  }

  # If no data is provided, simulate something plausible
  if (is.null(std.conc) | is.null(std.cq)) {
    DataProvided <- FALSE
    warning('No data provided. Simulating values.')
    std.conc <- 4:6
    std.conc <- sort(robust.rep(std.conc,replicate.number))
    a <- -10/3 # slope
    b <- 40 # intercept
    noise.value <- 1 # standard deviation to add as noise to simulated replicates
    std.cq <- a*std.conc + b + rnorm(n=length(std.conc), mean=0, sd=noise.value)
    unk.cq <- sort(rep(runif(3,min=min(std.cq),max=max(std.cq)),replicate.number)) + rnorm(n=length(std.conc), mean=0, sd=noise.value)
  } else {
    DataProvided <- TRUE
  }

  # Linear model
  if (length(std.cq)==length(std.conc)) {
    model <- lm(std.cq~std.conc)
    # se.e <- sqrt(sum((std.cq - model$fitted.values)^2)/length(std.cq)) # Standard error of the estimate
  } else {
    stop("`std.conc` and `std.cq` must be equal in length. (See the help entry: `?qPCR.plot`.)")
  }
  if (length(std.cq) %% replicate.number != 0) {
    x <- length(std.cq) %% replicate.number
    s <- paste0("`std.conc` and `std.cq` must be multiples of `replicate.number`\n",
                "`replicate.number.` = ",replicate.number,"\n",
                "`length(std.cq) %% replicate.number` = ",x,"\n",
                "(See the help entry: `?qPCR.plot`=)")
    stop(s)
  }

  # Predict values for unknowns
  if (!is.null(unk.cq)) {
    predicted.conc <- (unk.cq - model$coefficients[1]) / model$coefficients[2]
  }

  # Title
  title.text <- ifelse(DataProvided,ifelse(is.null(main),'',main),'Simulated Data')

  # Find outliers, flagged as technical replicates more than 1.5 cycles from the median
  n <- sort(robust.rep(1:(length(std.cq)/replicate.number),replicate.number))
  if (outlier.method[1] == "absolute") {
    outliers <- unlist(by(std.cq,n,function(x){abs(x - median(x,na.rm=TRUE)) > 1.5}))
  } else {
    if (outlier.method[1] == "IQR") {
      outliers <- unlist(by(std.cq,n,function(x){abs(x - median(x,na.rm=TRUE)) > 1.5*IQR(x, na.rm = TRUE)}))
    } else {
      warning("")
      outliers <- unlist(by(std.cq,n,function(x){abs(x - median(x,na.rm=TRUE)) > 1.5}))
    }
  }

  # Find bounds for the plot
  if (is.null(unk.cq)) { xvalues <- std.conc ; yvalues <- std.cq }
    else { xvalues <- c(std.conc,predicted.conc) ; yvalues <- c(std.cq,unk.cq) }
  if (!is.null(nrt.cq)) { yvalues <- c(yvalues,mean(nrt.cq, na.rm=TRUE)) }
  if (!is.null(ntc.cq)) { yvalues <- c(yvalues,mean(ntc.cq, na.rm=TRUE)) }
  x.min <- min(xvalues,na.rm=TRUE) ; x.max <- max(xvalues,na.rm=TRUE)
  y.min <- min(yvalues,na.rm=TRUE) ; y.max <- max(yvalues,na.rm=TRUE)

  # Plot
  plot(x = std.conc, y = std.cq,
       type = "n",
       main=title.text,
       xlab=xlab, ylab=ylab,
       xlim=c(x.min,x.max), ylim=c(y.min,y.max),
       ... )
  points(x = std.conc[!outliers], y = std.cq[!outliers])
  points(x = std.conc[outliers], y = std.cq[outliers], pch=16, col='darkred')

  # Add negative control baselines
  min.control.cq <- NULL
  if (!is.null(nrt.cq)) {
    abline(h=min(nrt.cq, na.rm=TRUE), col='gray75')
    min.control.cq <- min(nrt.cq)
  }
  if (!is.null(ntc.cq)) {
    abline(h=min(ntc.cq, na.rm=TRUE), col='darkblue')
    min.control.cq <- min(c(min.control.cq,ntc.cq))
  }

  # Annotate with stats
  label.table <- data.frame(row.names = c('efficiency','slope','intercept','R^2') )
  label.table$value <- c(
    10^(-1/model$coefficients[2]),
    model$coefficients[2],
    model$coefficients[1],
    cor(std.cq,std.conc, use='complete.obs')^2
  )

  # Flag stats outside desired ranges
  label.table['efficiency','OK'] <- (label.table['efficiency','value'] < 2.2) & (label.table['efficiency','value'] > 1.8)
  label.table['slope','OK'] <- (label.table['slope','value'] < (-1/log10(2.2))) & (label.table['slope','value'] > (-1/log10(1.8)))
  label.table['intercept','OK'] <- (label.table['intercept','value'] < 60) & (label.table['intercept','value'] > 5)
  label.table['R^2','OK'] <- (label.table['R^2','value'] < 1) & (label.table['R^2','value'] > 0.9)
  abline(model, col=ifelse(label.table['slope','OK'],'gray65','darkred'))

  # Position text
  label.table$x1 <- x.max-(x.max-x.min)/3
  label.table$x2 <- x.max-(x.max-label.table$x1)/3
  label.table[2:4,'value'] <- signif(label.table[2:4,'value'],4)
  label.table['efficiency','value'] <- paste0(signif(50*label.table['efficiency','value'],3),'%')
  rect(xleft = label.table[1,'x1']-(x.max-x.min)*0.025, xright = x.max+(x.max-x.min)*0.025,
       ytop = y.max+(y.max-y.min)*0.025, ybottom = y.max-((y.max-y.min)/20)*5.25,
       col='white', border = NA)

  # Loop to write the text
  for (i in 1:length(row.names(label.table))) {
    name.i <- row.names(label.table)[i]
    text(x=label.table[i,'x1'], y=y.max-((y.max-y.min)/20)*(i-1),
         adj = c(0,1), cex=0.85, labels=paste0(name.i,':'),
         col = ifelse(label.table[i,'OK'],'black','darkred'))
    text(x=label.table[i,'x2'], y=y.max-((y.max-y.min)/20)*(i-1),
         adj = c(0,1), cex=0.85, labels=label.table[i,'value'],
         col = ifelse(label.table[i,'OK'],'black','darkred'))
  }

  # Plot unknowns
  unk.cq.range <- NULL
  unk.conc.ci <- NULL
  if (!is.null(unk.cq)) {
    if (is.logical(unk.replicates)) {
      if (unk.replicates) {
        unk.replicates <- replicate.number
      }
    }
    if (is.numeric(unk.replicates)) {
      if (length(unk.cq) %% unk.replicates == 0) {
        # Find the mean of unk.cq for each replicate
        n <- sort(robust.rep(1:(length(unk.cq)/unk.replicates),unk.replicates))
        avg <- c(by(unk.cq, n, mean, na.rm = TRUE))
        x <- c(by(unk.cq,n,function(x){max(abs(x - mean(x,na.rm=TRUE)))}))
        unk.cq.range <- data.frame(
          x0 = NA,
          y0 = avg-x,
          y1 = avg+x
        )
        inv.lm <- lm(std.conc ~ std.cq)
        x <- predict(inv.lm, data.frame(std.cq=avg), interval="confidence")
        unk.conc.ci <- data.frame(
          x0 = x[,2],
          x1 = x[,3],
          y0 = avg
        )
        unk.cq <- avg
      } else {
        x <- length(unk.cq) %% unk.replicates
        s <- paste0("`unk.cq` length must a multiple of `unk.replicates`\n",
                    "`unk.replicates` = ",unk.replicates,"\n",
                    "`length(unk.cq) %% unk.replicates` = ",x,"\n",
                    "Plotting all values of `unk.cq`. ",
                    "(See the help entry: `?qPCR.plot`)")
        warning(s)
      }
    }

    # Predict concentrations for unknown Cq values
    predicted.conc <- (unk.cq - model$coefficients[1]) / model$coefficients[2]
    above.zero <- robust.rep(TRUE,length(unk.cq))
    if (unk.loading.on.plot) {
      predicted.conc <- predicted.conc - log10(unk.loading)
    }
    if (!is.null(min.control.cq)) {
      above.zero <- (unk.cq < min.control.cq)
      points(x = predicted.conc[!above.zero], y = unk.cq[!above.zero], pch=8, col="midnightblue")
    }
    points(x = predicted.conc[above.zero], y = unk.cq[above.zero], pch=16)

    # Add error bars
    if (error.bars & !(is.null(unk.conc.ci))) {
      unk.cq.range$x0 <- predicted.conc
      if (unk.loading.on.plot) {
        unk.conc.ci$x0 <- unk.conc.ci$x0 - log10(unk.loading)
        unk.conc.ci$x1 <- unk.conc.ci$x1 - log10(unk.loading)
      }
      with(unk.cq.range, segments(x0=x0, y0=y0, y1=y1, col = "gray50") )
      with(unk.conc.ci, segments(x0=x0, x1=x1, y0=y0, col = "gray50") )
    }

    # Add unknown IDs
    if (!is.null(unk.ids)) {
      if (is.logical(unk.ids)) {
        if (unk.ids) {
          unk.ids <- 1:length(unk.cq)
        }
      } else {
        if (length(unk.ids) != length(unk.cq) ) {
          if (length(unk.ids) > length(unk.cq) ) { unk.ids <- unk.ids[1:length(unk.cq)] }
          warning("Length of `unk.ids` does not equal length of `unk.cq`. (See the help entry: `?qPCR.plot`.)")
        }
      }
      text(x = predicted.conc, y = unk.cq, labels = unk.ids,
           pos = 4, cex = 0.85, col = "grey25", font = 2)
    }

  } # End  if (!is.null(unk.cq))
  # End of Plot unknowns section

  # Highlight outliers in stats table
  if (any(outliers)) {
    text(x=label.table[i,'x1'], y=y.max-((y.max-y.min)/20)*(i),
         adj = c(0,1), cex=0.85, labels='Potential outliers', col = 'darkred')
  }

  if (!plot.only) {
    # Format output
    output <- list(
      stds = data.frame(
        conc = std.conc,
        cq = std.cq,
        outlier = outliers
      )
    )

    output$outlier.method = outlier.method[1]

    if(!is.null(min.control.cq)) {
      output$controls <- data.frame(
        type = c(robust.rep("NTC",length(ntc.cq)),robust.rep("NRT",length(nrt.cq))),
        cq   = c(ntc.cq,nrt.cq)
      )
    } else {
      above.zero <- robust.rep(NA,length(unk.cq))
    }

    if (!is.null(unk.cq)) {
      output$unknowns <- data.frame(
        cq = unk.cq,
        conc = predicted.conc - log10(unk.loading)
      )
      if (!is.null(unk.conc.ci)) {
        output$unknowns$ci.lo <- unk.conc.ci$x0 - log10(unk.loading)
        output$unknowns$ci.hi <- unk.conc.ci$x1 - log10(unk.loading)
      }
      output$unknowns$above.zero <- above.zero
      if (!is.null(unk.ids)) {
        rownames(output$unknowns) <- unk.ids
      }
    }

    output$unk.loading <- unk.loading

    output$stats = as.data.frame(label.table[,1:2])

    return(output)
  } # End  if (!plot.only)

} # End function