R/NanoStringQualityMetrics.R

Defines functions centeredSampleClustering geneClustering densityPlot countsInBlankSamples_verticalPlot pcaPlot zfacFun panelCor scatterPair negCtrlsByLane_verticalPlot colByCovar colByFun cutoffByMMAD cutoffByVar dCoVar myCols

Documented in colByCovar colByFun countsInBlankSamples_verticalPlot cutoffByMMAD cutoffByVar dCoVar densityPlot geneClustering myCols negCtrlsByLane_verticalPlot panelCor pcaPlot scatterPair zfacFun

##' @title          myCols
##' @description    Function that defines nice colors
##'
##' @return
##' A vector of colors
##'
##' @author Dorothee Nickles
myCols <- function() {
  myColsSet1 <- c("antiquewhite3", "aquamarine3", "azure3", "lightpink1", "cadetblue4", "coral2",
                  "firebrick", "lightblue", "olivedrab", "palevioletred3")
  myColsSet2 <- c("darkgoldenrod", "darkgoldenrod2", "cornflowerblue", "darkkhaki", "coral3",
                  "chartreuse3", "dodgerblue4", "blueviolet", "darkturquoise", "thistle4") 
  myColsSet3 <- c("darkgreen", "darkmagenta", "cyan4", "darkorange", "darkred", "gold",
                  "darkblue", "deeppink2", "yellowgreen", "orangered3")
  c(myColsSet3, myColsSet2, myColsSet1)
}

##' @title          dCoVar
##' @description    Determine standard deviation at a certain percent CV
##'
##' @param  x       numeric vector 
##' @param  d       scalar numeric, percent CV
##' @param  ...     additional parameters passed on to mean()
##'
##' @return standard deviation of x at d percent of CV
##'
##' @author Dorothee Nickles
##'
dCoVar <- function(x, d, ...) {
  d * mean(x, ...)
}

##' @title          cutoffByVar
##' @description    Determine cutoffs of x (for outlier detection) based on a certain percent CV
##'
##' @param  x       numeric vector 
##' @param  d       scalar numeric, percent CV; passed on to dCoVar
##' @param  ...     additional parameters passed on to mean()
##'
##' @return
##' A list of length 2, with a scalar numeric in each slot, one giving the lower threshold (mean(x) - CV% based cutoff),
##' the other giving the upper threshold (mean(x) + percent CV based cutoff) for outlier definition.
##'
##' @author Dorothee Nickles
##'
cutoffByVar <- function(x, d, ...) {
  thresholds <- list(mean(x, ...) - dCoVar(x, d, ...), mean(x, ...) + dCoVar(x, d, ...))
  return(thresholds)
}

##' @title          cutoffByMMAD
##' @description    Determine cutoffs of x (for outlier detection) based on median
##'
##' @param  x       numeric vector 
##' @param  d       scalar numeric, factor by which to multiply MAD of x
##' @param  ...     additional parameters passed on to median()
##'
##' @return
##' A list of length 2, with a scalar numeric in each slot, one giving the lower threshold (median(x) - d * mad(x)),
##' the other giving the upper threshold (median(x) + d * mad(x)) for outlier definition.
##'
##' @author Dorothee Nickles
##'
cutoffByMMAD <- function(x, d, ...) {
  thresholds <- list(median(x, ...) - d * mad(x, ...), median(x, ...) + d * mad(x, ...))
  return(thresholds)
}

##' @title          colByFun
##' @description    Color x based on upper and lower thresholds
##'
##' @param  x           Numeric vector 
##' @param  thresholds  List of length 2, with a scalar numeric in each slot, one giving the lower the upper threshold (for outlier definition)
##'
##' @return
##' A vector of colors, with "red" for all values of x exceeding thresholds and "black" for all other values
##'
##' @author Dorothee Nickles
##'
colByFun <- function(x, thresholds) {
  ifelse(x < thresholds[[1]], "red", 
      ifelse(x > thresholds[[2]], "red", "black"))
}

##' @title          colByCovar
##' @description    Define colors based on a covariate of an RccSet object
##'
##' @param  pdata   pData() of an RccSet object
##' @param  covar   character, colname in the pdata used to stratify (color) data
##'
##' @return
##' A list of length 2, with [["color"]] being a character vector of colors (one color for each level of covar) of length=number of observations
##' and [["legend"]] providing the levels of covar to map colors to covar
##'
##' @author Dorothee Nickles
##'
colByCovar <- function(pdata, covar) {
  if(!covar %in% colnames(pdata)) {
    stop("The covariate you chose is not defined.")
  } else {
    diffTypes <- list()
    diffTypes[["color"]] <- myCols()[as.numeric(as.vector(factor(pdata[, covar], 
            labels=c(1:length(unique(pdata[, covar]))))))]
    diffTypes[["legend"]] <- unique(pdata[, covar])
    return(diffTypes)
  }         
}

setGeneric( "fovPlot", function( rccSet ) standardGeneric( "fovPlot" ) )
##' @rdname fovPlot
##' @aliases fovPlot
##'
##' @title Fields of view (FOV) plot
##'
##' @description
##' Plot the fraction of successfully imaged fields of view (FOV) in the given RccSet.
##' The RccSet's phenoData should have 'FovCount' and 'FovCounted' columns populated with the total
##' and successfully imaged FOV counts, respectively. Samples with a FOV counted/FOV count of less than
##' 80% are marked in red (dashed red line indicates threshold).
##'
##' @param  rccSet  An RccSet object
##'
##' @return
##' A plot
##'
##' @author Dorothee Nickles
##'
setMethod(
  "fovPlot",
  "RccSet",
  function(rccSet) {
    fov <- as.numeric(as.vector(pData(rccSet)$FovCounted))/
           as.numeric(as.vector(pData(rccSet)$FovCount))
    plot(fov,
         pch=16,
         main="FOV Counted versus FOV Count",
        #ylab="FOV Counted/FOV Count",
         ylab="ratio",
         xlab="Sample Index",
         las=1,
         col=ifelse(fov < 0.8, "red", "black"),
         ylim=c(min(fov, 0.8), 1))
    abline(h=0.8, col="red", lty=2)	
  }
  )

setGeneric( "bdPlot", function( rccSet ) standardGeneric( "bdPlot" ) )
##' @rdname bdPlot
##' @aliases bdPlot
##'
##' @title Binding density plot
##'
##' @description
##' Plot the binding density of each sample in an RccSet object.
##' Samples with a binding density < 0.05 or > 2.25 (thresholds defined by NanoString) 
##' are marked in red (dashed red line indicates threshold).
##'
##' @param  rccSet  An RccSet object
##'
##' @return
##' A plot
##'
##' @author Dorothee Nickles
##'
setMethod(
  "bdPlot",
  "RccSet",
  function(rccSet) {
    plot(as.numeric(as.vector(pData(rccSet)$BindingDensity)), pch=16, 
         ylim=c(0, max(c(as.numeric(as.vector(pData(rccSet)$BindingDensity)), 2.25))),
         main="Binding Density", ylab="Binding density",
         xlab="Sample Index",
         las=1,
         col=ifelse(as.numeric(as.vector(pData(rccSet)$BindingDensity)) > 2.25, "red", 
              ifelse(as.numeric(as.vector(pData(rccSet)$BindingDensity)) < 0.05, "red", "black")))
    abline(h=c(0.05, 2.25), col="red", lty=2)
  }
  )

setGeneric( "flagSamplesTech", function( rccSet ) standardGeneric( "flagSamplesTech" ) )
##' @rdname flagSamplesTech
##' @aliases flagSamplesTech
##'
##' @title          flagSamplesTech
##' @description    Flag samples based on their technical performance, i.e. field of vision (FOV) counted and binding density
##'
##' @details
##' Samples with a FOV counted/FOV count of less than 80% will be flagged. Also samples with a binding density
##'
##' @param  rccSet  An RccSet object
##'
##' @return
##' A numeric vector giving the indices of samples with outlier values in FOV counted and
##' binding density < 0.05 or > 2.25 (thresholds defined by NanoString) will be flagged.
##'
##' @author Dorothee Nickles
##'
setMethod(
  "flagSamplesTech",
  "RccSet",
  function(rccSet) {
    return(unique(c(which(as.numeric(as.vector(pData(rccSet)$FovCounted))/as.numeric(as.vector(pData(rccSet)$FovCount)) < 0.8),
                    which(as.numeric(as.vector(pData(rccSet)$BindingDensity)) > 2.25 | as.vector(pData(rccSet)$BindingDensity) < 0.05))))
  }
  )

setGeneric( "ctrlsOverviewPlot", function( rccSet ) standardGeneric( "ctrlsOverviewPlot" ) )
##' @rdname ctrlsOverviewPlot
##' @aliases ctrlsOverviewPlot
##'
##' @title          ctrlsOverviewPlot
##' @description    Plot individual negative and positive controls across all samples in an RccSet object.
##'
##' @param  rccSet  An RccSet object
##'
##' @return
##' A plot with two panels, one for the negative controls, one for the positive controls.
##'
##' @author Dorothee Nickles
##'
setMethod(
  "ctrlsOverviewPlot",
  "RccSet",
  function(rccSet) {
    par(mfrow=c(1,2))
  
    M <- assayData(rccSet)$exprs
  
    negCtrls <- (fData(rccSet)$CodeClass == "Negative")
    M.negCtrls <- M[ negCtrls, ]
    rownames(M.negCtrls) <- fData(rccSet)$GeneName[negCtrls]
    M.negCtrls.ordered <- M.negCtrls[ order(rownames(M.negCtrls)), ]
  
    boxplot(t(M.negCtrls.ordered),
            las=2,
            cex.axis=0.6,
            ylab="counts (raw)",
            log="y",
            main="Negative Controls")
  
    posCtrls <- which(fData(rccSet)$CodeClass == "Positive")
    M.posCtrls <- M[ posCtrls, ]
    rownames(M.posCtrls) <- fData(rccSet)$GeneName[posCtrls]
    M.posCtrls.ordered <- M.posCtrls[ order(rownames(M.posCtrls)), ]
  
    boxplot(t(M.posCtrls.ordered),
            las=2,
            cex.axis=0.6, 
            ylab="counts (raw)",
            log="y",
            main="Positive Controls")
  }
  )

setGeneric( "negCtrlsPlot", function( rccSet ) standardGeneric( "negCtrlsPlot" ) )
##' @rdname negCtrlsPlot
##' @aliases negCtrlsPlot
##'
##' @title          negCtrlsPlot
##' @description    Plot negative controls across all samples in an RccSet object
##'
##' @details
##' In the second panel, boxplots are colored by lane (as specified in the pData slot). Bars on top of
##' the panel indicate the stage position for each cartridge/sample (as specified in the pData slot).
##'
##' @param  rccSet  An RccSet object
##'
##' @return
##' A plot with two panels: one showing boxplots for the individual negative controls across all samples, and 
##' one showing boxplots for the negative control counts for each individual sample (lane-specific background).
##'
##' @author Dorothee Nickles
##'
setMethod(
  "negCtrlsPlot",
  "RccSet",
  function(rccSet) {
    
    negCtrls <- (fData(rccSet)$CodeClass == "Negative")
    M <- assayData(rccSet)$exprs
  
    par(mfrow=c(1,2))
  
    boxplot(t(M[ negCtrls, ]),
            las=2,
            cex.axis=0.6,
            ylab="counts (raw)",
            log="y",
            main="Negative Controls")
  
    boxplot(M[ negCtrls, ],
            log="y",
            xaxt="n",
            col=myCols()[ pData(rccSet)$LaneID ],
            ylab="counts",
            xlab="lane")
  
    axis(1,
         at = 1:ncol(M),
         labels = as.integer(pData(rccSet)$LaneID),
         cex.axis=0.6)
  
    n <- 1
    for (i in 1:length(unique(pData(rccSet)$StagePosition))) {
      tmp <- which(pData(rccSet)$StagePosition == pData(rccSet)$StagePosition[i])
      lines(y = rep(max(M[ negCtrls, ]) + 5, length(tmp)),
            x = n:(n+length(tmp)-1),
            col=i,
            lwd=4,
            xpd=TRUE)
      n <- n + length(tmp)
    }
  }
  )

setGeneric( "negCtrlsByLane", function( rccSet ) standardGeneric( "negCtrlsByLane" ) )
##' @rdname negCtrlsByLane
##' @aliases negCtrlsByLane
##'
##' @title          negCtrlsByLane
##' @description    Plot negative controls per lane in an RccSet object
##'
##' @details
##' Boxplots are colored by lane (as specified in the pData slot). Bars on top of
##' the panel indicate the stage position for each cartridge/sample (as specified in the pData slot).
##'
##' @param  rccSet  An RccSet object
##'
##' @return
##' A plot with boxplots for the negative control counts for each individual sample (lane-specific background)
##'
##' @author Dorothee Nickles
##'
setMethod(
  "negCtrlsByLane",
  "RccSet",
  function(rccSet) {
  
    negCtrls <- (fData(rccSet)$CodeClass == "Negative")
  
    M <- assayData(rccSet)$exprs
    laneid <- as.numeric(pData(rccSet)$LaneID)
  
    boxplot(
       M[ negCtrls, ]
      ,log="y"
      ,xaxt="n"
      ,col=myCols()[laneid]
      ,ylab="counts"
      ,xlab="lane"
      ,main="Negative Controls by lane -\nlane-specific background"
    )
    axis(1,
         at = 1:ncol(M),
         labels = laneid,
         cex.axis = min(0.2 + 1/log2(length(laneid)), 0.8), las=2)
    ## disable addition of StagePosition indicator for now
    #n <- 1
    #for (i in 1:length(unique(pData(rccSet)$StagePosition))) {
    #  tmp <- which(pData(rccSet)$StagePosition == pData(rccSet)$StagePosition[i])
    #  lines(y = rep(max(M[ negCtrls, ]) + 5, length(tmp)),
    #        x = n:(n+length(tmp)-1),
    #        col = i,
    #        lwd = 4,
    #        xpd = TRUE)
    #  n <- n + length(tmp)
    #}
    legend(x=(length(laneid)+3), y=max(M[ negCtrls, ]), cex=0.5,
            xpd=TRUE, bty="n", fill=myCols()[sort(unique(laneid))],
            legend=paste(rep("Lane", length(unique(laneid))), sort(unique(laneid))))
  }
  )

##' Plot of negative controls by lane (vertical orientation)
##'
##' @param rccSet
##' An RccSet
##'
##' @param outputFile
##' Output PNG filename
##'
##' @return
##' A PNG file containing a boxplot of the counts for negative controls by lane
##' in the input. The PNG is set to a fixed resolution of 300 pixels per inch and
##' a fixed width of 2250 pixels (i.e. 7.5" at 300ppi), but the height varies
##' with the size of the input. The font size is also fixed so that the labels
##' will be legible even for large datasets.
##'
negCtrlsByLane_verticalPlot <- function(rccSet, outputFile)
{
  rccSet.sorted <- rccSet[ , rev(order(rownames(pData(rccSet))))]
  
  negCtrls <- (fData(rccSet.sorted)$CodeClass == "Negative")
  
  M <- assayData(rccSet.sorted)$exprs
  laneid <- as.numeric(pData(rccSet.sorted)$LaneID)

  png(filename=outputFile,
      width=2250,
      height=200 + 30*ncol(M),
      res=300,
      units="px",
      pointsize=7
  )
  
  par(mai=c(0.5,3,0.5,0.75))
  
  par(yaxs="i")
  
  boxplot(M[negCtrls, ],
          horizontal=TRUE,
          log="x",
          xlab="Counts (raw)",
          ylab="",
          xlim=c(0,(ncol(M)+1)),  # This gets assigned to ylim by boxplot() when horizontal=TRUE.
          las=1,
          col=myCols()[laneid],
          main="Negative controls by lane -\nlane-specific background"
  )

  ## disable addition of StagePosition indicator for now
  #n <- 1
  #for (i in 1:length(unique(pData(rccSet)$StagePosition))) {
  #  tmp <- which(pData(rccSet)$StagePosition == pData(rccSet)$StagePosition[i])
  #  lines(y = rep(max(M[ negCtrls, ]) + 5, length(tmp)),
  #        x = n:(n+length(tmp)-1),
  #        col = i,
  #        lwd = 4,
  #        ypd = TRUE)
  #  n <- n + length(tmp)
  #}

  legend(x=max(M[ negCtrls, ])*1.2,
         y=(length(laneid)+3),
        #cex=0.5,
         xpd=TRUE,
         bty="n",
         fill=myCols()[sort(unique(laneid))],
         legend=paste(rep("Lane", length(unique(laneid))), sort(unique(laneid))))
  
  dev.off()
  
  return(TRUE)
}

##' @title          scatterPair
##' @description    Helper function for a scatter plot inside a pairs plots
##'
##' @param  x   integer, x positions
##' @param  y   integer, y positions
##'
##' @return
##' A scatter plot x versus y.
##'
scatterPair <- function(x,y) {points(x,y, pch=20)}

##' @title          panelCor
##' @description    Helper function for printing correlation coefficients inside a pairs plots
##'
##' @param  x       integer
##' @param  y       integer, same length as x
##' @param  digits  scalar integer, indicating the number of decimal positions for displaying the correlation coefficient
##' @param  cex.cor scalar numeric to specify relative font size for priting the correlation coefficient
##' @param  doTest  boolean, whether a results of cor.test should be displayed as well
##'
##' @return
##' Prints correlation coefficients (and p-values if doTest = TRUE) within a pairs plot.
##'
panelCor <- function(x, y, digits=2, cex.cor=0.75, doTest=FALSE)
{
  usr <- par("usr"); on.exit(par(usr))
  par(usr = c(0, 1, 0, 1))
  r <- abs(cor(x, y))
  txt <- format(c(r, 0.123456789), digits=digits)[1]
  if (doTest) {
    test <- cor.test(x,y)
    Signif <- ifelse(round(test$p.value,3)<0.001,"p<0.001",paste("p=",round(test$p.value,3)))  
    text(0.5, 0.25, paste("r=",txt))
    text(.5, .75, Signif)
  } else {
    text(0.5, 0.5, paste("r=",txt), cex=cex.cor)
  }
}

setGeneric( "negCtrlsPairs", function( rccSet, ... ) standardGeneric( "negCtrlsPairs" ) )
##' @rdname negCtrlsPairs
##' @aliases negCtrlsPairs
##'
##' @title          negCtrlsPairs
##' @description    Pairs plot of negative controls across all samples in an RccSet object
##'
##' @param  rccSet          An RccSet object
##' @param  log.transform   boolean, whether data needs to be log2 transformed
##'
##' @return
##' Pairs plot of the negative controls with a scatter plot in the lower panel and
##' correlatation coefficients printed in the upper panel.
##'
##' @author Dorothee Nickles
##'
setMethod(
  "negCtrlsPairs",
  "RccSet",
  function(rccSet, log.transform=FALSE) {
  
    M <- assayData(rccSet)$exprs
    negCtrls <- fData(rccSet)$CodeClass == "Negative"
  
    tmp <- M[ negCtrls, ]
    if (log.transform) {
      tmp <- log2(tmp)
    }
  
    pairs(t(tmp),
          upper.panel=panelCor,
          lower.panel=scatterPair)
  }
  )

setGeneric( "posNormFactPlot", function( rccSet ) standardGeneric( "posNormFactPlot" ) )
##' @rdname posNormFactPlot
##' @aliases posNormFactPlot
##'
##' @title          posNormFactPlot
##' @description    Plot positive control scaling factor for each sample in an RccSet object
##'
##' @param  rccSet  An RccSet object
##'
##' @return
##' A plot of the positive control scaling factor for each sample in an RccSet object.
##' Samples with a positive control scaling factor < 0.3 or > 3 (thresholds defined by NanoString) are
##' marked in red (dashed red line indicates threshold).
##'
##' @author Dorothee Nickles
##'
setMethod(
  "posNormFactPlot",
  "RccSet",
  function(rccSet)
  {
    if ("posCtrlData" %in% ls(assayData(rccSet))) {
        M <- assayData(rccSet)$exprs
        positives <- (fData(rccSet)$CodeClass == "Positive")

        posFactor <- pData(rccSet)$PosFactor
      
        plot(posFactor,
             las = 1,
             cex.axis = 0.6,
             ylab = "Positive control scaling factor",
             pch = 16,
             xlab = "Sample index",
             main = "Positive control scaling factors across samples", 
             col = ifelse(posFactor < 0.3, "red", ifelse(posFactor > 3, "red", "black")),
             ylim = c(min(c(posFactor, 0.3)), max(c(posFactor, 3))))
        abline(h = c(0.3, 3),
               col = "red",
               lty = 2)
    } else {
        print("This plot requires the RccSet to have positive control normalized data")
    }
  }
  )

##' @title          zfacFun
##' @description    Calculate Z' Factor
##'
##' @param  p   numeric vector: measurements for the positive controls (or actual measurement)
##' @param  n   numeric vector: measurements for the negative controls
##'
##' @return     Scalar numeric: the Z' Factor
##'
##' @author Dorothee Nickles
##'
zfacFun <- function(p, n) {
  return( 1 - ((3* (mad(p) + mad(n))) / abs(median(p) - median(n))) )
}

setGeneric( "ctrlsZprimePlot", function( rccSet ) standardGeneric( "ctrlsZprimePlot" ) )
##' @rdname ctrlsZprimePlot
##' @aliases ctrlsZprimePlot
##'
##' @title          ctrlsZprimePlot
##' @description    Plot distribution of counts and the Z' Factors comparing the nagative controls and
##'                 the three highest input positive controls of an RccSet object.
##'
##' @param  rccSet  An RccSet object
##'
##' @return         A plot
##'
##' @author Dorothee Nickles
##'
setMethod(
  "ctrlsZprimePlot",
  "RccSet",
  function(rccSet) {
  
    M <- assayData(rccSet)$exprs
    negatives    <- (fData(rccSet)$CodeClass == "Negative")
    positive_128 <- (fData(rccSet)$SpikeInInput == 128)
    positive_32  <- (fData(rccSet)$SpikeInInput == 32)
    positive_8   <- (fData(rccSet)$SpikeInInput == 8)
  
    posN <- log2(apply( M[negatives, ], 2, median ))
    
    zfac1 <- round(zfacFun(log2(M[ positive_128, ]), posN), digits=2)
    zfac2 <- round(zfacFun(log2(M[ positive_32,  ]), posN), digits=2)
    zfac3 <- round(zfacFun(log2(M[ positive_8,   ]), posN), digits=2)
    plot(density(posN),
         col = myCols()[1],
         xlim = c(1, log2(max(M[ positive_128, ]))),
         main = "Frequency distributions of log2 transformed counts\nfor negative and positive controls")
    lines(density(log2(M[ positive_8,   ])), col=myCols()[2])        		
    lines(density(log2(M[ positive_32,  ])), col=myCols()[3])
    lines(density(log2(M[ positive_128, ])), col=myCols()[4])
    legend("top",
           lty = 1,
           col = myCols()[1:4],
           legend = c("negCtrls",
                      paste0("pos8: Z'=", zfac3),
                      paste0("pos32: Z'=", zfac2),
                      paste0("pos128: Z'=", zfac1)))
  }
  )

setGeneric( "iqrPlot", function( rccSet, ... ) standardGeneric( "iqrPlot" ) )
##' @rdname iqrPlot
##' @aliases iqrPlot
##'
##' @title
##' iqrPlot
##'
##' @description
##' Plot the interquartile range (IQR) for a certain code class of probes in an
##' RccSet object.
##'
##' @details
##' IQR of the specified code class for each sample in the RccSet are plotted
##' and outliers (as determined by the function specified in the method
##' argument) are marked in red (thresholds for outlier definition are plotted
##' as red dashed lines).
##'
##' @param rccSet
##' An RccSet object
##'
##' @param codeClass
##' Character string specifying the code class (as annotated in the
##' fData(rccSet)$CodeClass column) for which the IQR shall be determined.
##'
##' @param method
##' Character string specifying the method for outlier detection: either
##' "cutoffByMMAD" or "cutoffByVar".
##'
##' @param stringency
##' Numeric value passed to the cutoff function specified by the method
##' argument (see the 'd' argument of cutoffByMMAD and cutoffByVar).
##'
##' @return
##' A plot
##'
##' @seealso \code{\link{cutoffByMMAD}}, \code{\link{cutoffByVar}}
##'
##' @author Dorothee Nickles
##'
setMethod(
  "iqrPlot",
  "RccSet",
  function(rccSet,
           codeClass = c("Negative", "Positive", "Endogenous", "Housekeeping"),
           method = c("cutoffByMMAD", "cutoffByVar"),
           stringency = 4)
  {
    codeClass <- match.arg(codeClass)
    fun <- match.fun(method)
  
    M <- assayData(rccSet)$exprs
  
    iqr <- apply(log2(M[ fData(rccSet)$CodeClass == codeClass, ]), 2, IQR)
    thresholds <- fun(iqr, stringency)
  
    plot(iqr,
         las = 1,
         cex.axis = 0.6,
         ylab = "counts (log2)",
         pch = 16,
         xlab = "Sample index",
         main = sprintf("IQR of code class %s across samples", codeClass), 
         col = colByFun(iqr, thresholds), 
         ylim = c(min(c(iqr, unlist(thresholds))), max(c(iqr, unlist(thresholds)))))
  
    abline(h = unlist(thresholds),
           col = "red",
           lty = 2)
  }
  )

setGeneric( "posR2Plot", function( rccSet ) standardGeneric( "posR2Plot" ) )
##' @rdname posR2Plot
##' @aliases posR2Plot
##'
##' @title          posR2Plot
##' @description    Plot the R squared of linear fit of counts versus input for positive controls in an RccSet object.
##'
##' @details
##' R squared for each sample in the RccSet are plotted and samples with R squared < 0.95 are marked in red
##' (threshold indicated by dashed red line).
##'
##' @param  rccSet  RccSet object
##'
##' @return         A plot
##'
##' @author Dorothee Nickles
##'
setMethod(
  "posR2Plot",
  "RccSet",
  function(rccSet)
  {
    par(mfrow=c(1,1))
  
    M <- assayData(rccSet)$exprs
    positives <- (fData(rccSet)$CodeClass == "Positive")
  
    tmp <- apply(log2(M[positives, ]),
                 2,
                 function(x) { summary(lm(x ~ log2(fData(rccSet)$SpikeInInput[ positives ])))$r.squared })
    plot(tmp,
         pch = 16,
         col = ifelse(tmp < 0.95, "red", "black"),
         main = "R2 for linearity in positive controls")
  
    abline(h=0.95, col="red", lty=2)
  }
  )

setGeneric( "posSlopePlot", function( rccSet, ... ) standardGeneric( "posSlopePlot" ) )
##' @rdname posSlopePlot
##' @aliases posSlopePlot
##'
##' @title
##' posSlopePlot
##'
##' @description
##' Plot the slope of linear fit of counts versus input for positive controls
##' in an RccSet object.
##'
##' @param rccSet
##' An RccSet object
##'
##' @param method
##' Character string specifying the method for outlier detection: either
##' "cutoffByMMAD" or "cutoffByVar".
##'
##' @param stringency
##' Numeric value passed to the cutoff function specified by the method
##' argument (see the 'd' argument of cutoffByMMAD and cutoffByVar).
##'
##' @return
##' A plot
##'
##' @details
##' The slope for each sample in the RccSet are plotted and and outliers (as
##' determined by the function specified by the method argument) are marked in
##' red (thresholds for outlier definition are plotted as red dashed lines).
##'
##' @author Dorothee Nickles
##'
setMethod(
  "posSlopePlot",
  "RccSet",
  function(rccSet,
           method = c("cutoffByMMAD", "cutoffByVar"),
           stringency = 4)
  {
    fun <- match.fun(method)
  
    M <- assayData(rccSet)$exprs
    positives <- (fData(rccSet)$CodeClass == "Positive")
  
    tmp <- apply(log2(M[positives, ]),
                 2,
                 function(x) { coefficients(lm(x ~ log2(fData(rccSet)$SpikeInInput[ positives ])))[2] })
    thresholds <- fun(tmp, stringency)
    plot(tmp,
         pch = 16,
         col = colByFun(tmp, thresholds),
         main = "Slope in positive controls",
         ylab = "slope",
         xlab = "Sample index",
         ylim = c(min(c(tmp, unlist(thresholds))), max(c(tmp, unlist(thresholds)))),
         las = 1)
  
    abline(h=unlist(thresholds), col="red", lty=2)
  }
  )


setGeneric( "posRatioPlot", function( rccSet, ... ) standardGeneric( "posRatioPlot" ) )
##' @rdname posRatioPlot
##' @aliases posRatioPlot
##'
##' @title
##' posRatioPlot
##'
##' @description
##' Plot the ratio of the mean of positive control counts for each sample and
##' the overall mean of positive control counts in an RccSet object.
##'
##' @details
##' The ratio for each sample in the RccSet is plotted and and outliers (as
##' determined the cutoff function specified by the method argument) are marked
##' in red (thresholds for outlier definition are plotted as red dashed lines).
##'
##' @param rccSet
##' An RccSet object
##'
##' @param method
##' Character string specifying the method for outlier detection: either
##' "cutoffByMMAD" or "cutoffByVar".
##'
##' @param stringency
##' Numeric value passed to the cutoff function specified by the method
##' argument (see the 'd' argument of cutoffByMMAD and cutoffByVar).
##'
##' @return
##' A plot
##'
##' @author Dorothee Nickles
##'
setMethod(
  "posRatioPlot",
  "RccSet",
  function(rccSet,
           method = c("cutoffByMMAD", "cutoffByVar"),
           stringency = 4)
  {
    fun <- match.fun(method)
  
    M <- assayData(rccSet)$exprs
    positives <- (fData(rccSet)$CodeClass == "Positive")
  
    meanPos <- log2(colMeans(M[positives, ]))
    meanPosMean <- mean(meanPos)
    meanPosRatio <- meanPos/meanPosMean
    thresholds <- fun(meanPosRatio, stringency)
  
    plot(y = meanPosRatio,
         x = 1:ncol(M),
         las = 2,
         cex.axis = 0.6, 
         ylab = "mean pos ctrl counts/overall mean of pos ctrl counts",
         xlab="Sample index",
         main = "", 
         pch = 16,
         xlab = "samples",
         col = colByFun(meanPosRatio, thresholds),
         log = "y",
         ylim = c(min(c(meanPosRatio, unlist(thresholds))), max(c(meanPosRatio, unlist(thresholds)))))
  
    abline(h=unlist(thresholds), col="red", lty=2)
  }
  )

setGeneric( "allSumPlot", function( rccSet, ... ) standardGeneric( "allSumPlot" ) )
##' @rdname allSumPlot
##' @aliases allSumPlot
##'
##' @title
##' allSumPlot
##'
##' @description
##' Plot the sum of all counts (endogenous and housekeeping genes only) for each
##' sample in an RccSet object.
##'
##' @details
##' The sum of counts for each sample in the RccSet are plotted and and outliers
##' (as determined the cutoff function specified by the method argument) are
##' marked in red (thresholds for outlier definition are plotted as red dashed
##' lines).
##'
##' @param rccSet
##' An RccSet object
##'
##' @param method
##' Character string specifying the method for outlier detection: either
##' "cutoffByMMAD" or "cutoffByVar".
##'
##' @param stringency
##' Numeric value passed to the cutoff function specified by the method
##' argument (see the 'd' argument of cutoffByMMAD and cutoffByVar).
##'
##' @return
##' A plot
##'
##' @author Dorothee Nickles
##'
setMethod(
  "allSumPlot",
  "RccSet",
  function(rccSet,
           method = c("cutoffByMMAD", "cutoffByVar"),
           stringency = 4)
  {
    fun <- match.fun(method)
  
    M <- assayData(rccSet)$exprs
    nonctrls <- (fData(rccSet)$CodeClass %in% c("Endogenous", "Housekeeping"))
  
    allSum <- colSums(M[nonctrls, ])
    thresholds <- fun(log2(allSum), stringency)
    blankLabel <- getBlankLabel(rccSet)
  
    plot(y = allSum,
         x = 1:ncol(M),
         log = "y",
         las = 1,
         cex.axis = 0.6,
         ylab = "sum of counts",
         xlab = "Sample index",
         main = "Sum of raw counts for all probes", 
         col = colByFun(log2(allSum), thresholds),
         pch = ifelse( (pData(rccSet)$SampleType %in% blankLabel), 17, 16),
         ylim = as.integer(c(min(c(allSum, 2^unlist(thresholds))), max(c(allSum, 2^unlist(thresholds)))))
         )
  
    abline(h=2^unlist(thresholds), col="red", lty=2)
  }
  )

setGeneric( "posSumVsAllSumPlot", function( rccSet, ... ) standardGeneric( "posSumVsAllSumPlot" ) )
##' @rdname posSumVsAllSumPlot
##' @aliases posSumVsAllSumPlot
##'
##' @title
##' posSumVsAllSumPlot
##'
##' @description
##' Plot the ratio of sums of positive control counts to all counts for all
##' samples in an RccSet object.
##'
##' @details
##' The ratio for each sample in the RccSet is plotted and and outliers (as
##' determined by the cutoff function specified by the method argument) are
##' marked in red (thresholds for outlier definition are plotted as red
##' dashed lines).
##'
##' @param rccSet
##' An RccSet object
##'
##' @param method
##' Character string specifying the method for outlier detection: either
##' "cutoffByMMAD" or "cutoffByVar".
##'
##' @param stringency
##' Numeric value passed to the cutoff function specified by the method
##' argument (see the 'd' argument of cutoffByMMAD and cutoffByVar).
##' (If the median ratio is less than 1, three times this value will be
##' used.)
##'
##' @return
##' A plot
##'
##' @author Dorothee Nickles
##'
setMethod(
  "posSumVsAllSumPlot",
  "RccSet",
  function(rccSet,
           method = c("cutoffByMMAD", "cutoffByVar"),
           stringency = 4)
  {
    fun <- match.fun(method)
  
    M <- assayData(rccSet)$exprs
  
    iSum <- colSums(M[fData(rccSet)$CodeClass %in% "Positive",])
    nSum <- colSums(M[fData(rccSet)$CodeClass %in% c("Endogenous", "Housekeeping"),])

    if (median(iSum/nSum) < 1) {
      stringency <- 3*stringency
    }

    thresholds <- fun(iSum/nSum, stringency)

    blankLabel <- getBlankLabel(rccSet)
  
    plot(y = iSum/nSum,
         x = 1:ncol(M),
         las = 1,
         cex.axis = 0.6,
         ylab = "ratio",
         xlab = "Sample index",
         main = sprintf("Sum of positive control counts /\nsum of endogenous and housekeeping gene counts"),
         col = colByFun(iSum/nSum, thresholds),
         pch = ifelse( (pData(rccSet)$SampleType %in% blankLabel), 17, 16),
         ylim = c(min(c(iSum/nSum, unlist(thresholds))), max(c(iSum/nSum, unlist(thresholds)))))

    abline(h=unlist(thresholds), col="red", lty=2)
  }
  )


setGeneric( "flagSamplesCtrl", function( rccSet, ... ) standardGeneric( "flagSamplesCtrl" ) )
##' @rdname flagSamplesCtrl
##' @aliases flagSamplesCtrl
##'
##' @title
##' flagSamplesCtrl
##'
##' @description
##' Flag samples based on the performance of their controls.
##'
##' @details
##' The method and stringency arguments determine a cutoff value used to
##' flag outlier samples based on the interquartile range of negative and
##' positive controls and in the slope of the linear fit of count versus input
##' of positive controls. Outliers will also be flagged if their positive
##' control scaling factor (see posCtrlNorm) is outside the NanoString
##' recommended range (i.e. below 0.3 or greater than 3).
##'
##' @param rccSet
##' An RccSet object
##'
##' @param method
##' Character string specifying the method for outlier detection: either
##' "cutoffByMMAD" or "cutoffByVar".
##'
##' @param stringency
##' Numeric value passed to the cutoff function specified by the method
##' argument (see the 'd' argument of cutoffByMMAD and cutoffByVar).
##'
##' @return
##' A numeric vector giving the indices of samples with outlier values as
##' described above.
##'
##' @author Dorothee Nickles
##'
setMethod(
  "flagSamplesCtrl",
  "RccSet",
  function(rccSet,
           method = c("cutoffByMMAD", "cutoffByVar"),
           stringency = 4)
  {
    method <- match.arg(method)
    fun <- match.fun(method)

    M <- assayData(rccSet)$exprs
  
    negatives    <- (fData(rccSet)$CodeClass == "Negative")
    positives    <- (fData(rccSet)$CodeClass == "Positive")
    positive_128 <- (fData(rccSet)$SpikeInInput == 128)
  
    a <- apply(log2(M[negatives, ]), 2, IQR)
    ctrls <- which(colByFun(a, fun(a, stringency)) == "red")
  
    a <- apply(log2(M[positives, ]), 2, IQR)
    ctrls <- c(ctrls, which(colByFun(a, fun(a, stringency)) == "red"))
  
    #a <- log2(M[positive_128, ])
    #ctrls <- c(ctrls, which(colByFun(a, fun(a, stringency)) == "red"))
  
    a <- apply(log2(M[positives, ]),
               2,
               function(x) { coefficients(lm(x ~ log2(fData(rccSet)$SpikeInInput[ positives ])))[2] })
    ctrls <- c(ctrls, which(colByFun(a, fun(a, stringency)) == "red"))

    if ("PosFactor" %in% rownames(pData(rccSet))) {
        posFactor <- pData(rccSet)$PosFactor
        ctrls2 <- c(which(posFactor < 0.3), which(posFactor > 3))
    } else {
        ctrls2 <- integer(0)
    }

    return(unique(ctrls, ctrls2))
  }
  )

##' @title          pcaPlot
##' @description    Wrapper function to perform a PCA analysis on the exprs slot of an RccSet object
##'                 and plot some results
##'
##' @param  exx     exprs() of an RccSet object
##' @param  ...     additional parameters passed on to the plotting functions
##'
##' @return
##' PCA screeplot and a plot with two panels, one plotting PC1 versus PC2, the other plotting
##' PC1 versus PC3.
##'
##' @author Dorothee Nickles
##'
pcaPlot <- function(exx, ...) {		## takes exprs of an expression set object and title for plots
  if (!is.matrix(exx)) {
    stop("Input is not a matrix")   # was: "stop("This function takes a matrix. Are you sure you put in exprs(rccSet)?")" -RZ 2015-03-24
  }
  pca_test <- prcomp(exx, scale=TRUE, tol=0)
  par(mfrow=c(1,1))
  screeplot(pca_test, ...)
  par(mfrow=c(1,2))
  plot(pca_test$rotation[,1], pca_test$rotation[,2], ylab="PC2", xlab="PC1", pch=16, ...)
  #text(x=pca_test$rotation[,1], y=pca_test$rotation[,2], label=rownames(pca_test$rotation),
  #  xpd=TRUE)
  plot(pca_test$rotation[,1], pca_test$rotation[,3], pch=16, ylab="PC3", xlab="PC1", ...)
  #text(x=pca_test$rotation[,1], y=pca_test$rotation[,3], label=rownames(pca_test$rotation),
  #  xpd=TRUE)
  #plot(pca_test$rotation[,2], pca_test$rotation[,3], pch=16, ylab="PC3", xlab="PC2", ...)
  #text(x=pca_test$rotation[,2], y=pca_test$rotation[,3], label=rownames(pca_test$rotation),
  #  xpd=TRUE)
  #plot(pca_test$rotation[,2], pca_test$rotation[,4], pch=16, ylab="PC4", xlab="PC2", ...)
}

setGeneric( "lodAssess", function( rccSet, ... ) standardGeneric( "lodAssess" ) )
##' @rdname lodAssess
##' @aliases lodAssess
##'
##' @title
##' lodAssess
##'
##' @description
##' Assess how many genes in each sample in an RccSet object are below the
##' limit of detection. (The current implementation does a straightforward
##' column sum on the presence/absence matrix (paData) in assayData.)
##'
##' @param rccSet
##' An RccSet object
##'
##' @return
##' A numeric vector giving the number of missing genes (endogenous and
##' housekeeping genes) for each sample in an RccSet. If paData is not
##' found in the input's assayData, NULL is returned.
##'
##' @author Dorothee Nickles
##'
setMethod(
  "lodAssess",
  "RccSet",
  function(rccSet)
  {
    if ("paData" %in% ls(assayData(rccSet))) {
      nonctrls <- (fData(rccSet)$CodeClass %in% c("Endogenous", "Housekeeping"))
      misValues <- colSums( assayData(rccSet)$paData[nonctrls, ] == FALSE )
      return(misValues)
    } else {
      return(NULL)
    }
  }
  )

setGeneric( "flagSamplesCount", function( rccSet, ... ) standardGeneric( "flagSamplesCount" ) )
##' @rdname flagSamplesCount
##' @aliases flagSamplesCount
##'
##' @title
##' flagSamplesCount
##'
##' @description
##' Flag samples based on overall counts
##'
##' @details
##' The method and stringency arguments deterine a cutoff value used to flag
##' samples as outliers: samples will be flagged if the sum of counts of their
##' endogeneous genes exceeds the cutoff or if the ratio of the sums of their
##' positive controls to the sums of their endogenous genes exceeds three
##' times the cutoff. Samples will also be flagged if the fraction of
##' genes below the lower limit of detection exceeds the maxMiss value.
##'
##' @param rccSet
##' An RccSet object
##'
##' @param method
##' Character string specifying the method for outlier detection: either
##' "cutoffByMMAD" or "cutoffByVar".
##'
##' @param stringency
##' Numeric value passed to the cutoff function specified by the method
##' argument (see the 'd' argument of cutoffByMMAD and cutoffByVar).
##'
##' @param maxMiss
##' Numeric specifying the allowable fraction of genes below the lower limit
##' of detection in a sample.
##'
##' @return
##' A numeric vector giving the indices of samples with outlier values
##' according to the criteria described above.
##'
##' @author Dorothee Nickles
##'
setMethod(
  "flagSamplesCount",
  "RccSet",
  function(rccSet,
           method = c("cutoffByMMAD", "cutoffByVar"),
           stringency = 4,
           maxMiss = 0.2)
  {
    fun <- match.fun(method)
  
    M <- assayData(rccSet)$exprs
  
    positives  <- (fData(rccSet)$CodeClass == "Positive")
    endogenous <- (fData(rccSet)$CodeClass == "Endogenous")
  
    posSum <- colSums(M[positives, ])
    allSum <- colSums(M[endogenous, ])

    if ("paData" %in% ls(assayData(rccSet))) {
      misValues <- lodAssess(rccSet)
      flagged_maxMiss <- which(misValues/nrow(M) > maxMiss)
    } else {
      flagged_maxMiss <- integer(0)
    }

    sampC <- unique(c(
      which(colByFun(allSum, fun(allSum, stringency)) == "red"),
      which(colByFun(posSum/allSum, fun(posSum/allSum, stringency*3)) == "red"),
      flagged_maxMiss
    ))
  
    return(sampC)
  }
  )

setGeneric( "lodPlot", function( rccSet, ... ) standardGeneric( "lodPlot" ) )
##' @rdname lodPlot
##' @aliases lodPlot
##'
##' @title
##' lodPlot
##'
##' @description
##' Function to plot the number of missing genes per sample in an RccSet object.
##'
##' @details
##' Samples with more than 50% missingness are marked in red. If blank
##' measurements are present, they are represented as triangles.
##'
##' @param rccSet
##' An RccSet object
##'
##' @param maxMiss
##' Numeric specifying the allowable fraction of genes below the lower limit
##' of detection in a sample.
##'
##' @return
##' A plot
##'
##' @author Dorothee Nickles
##'
setMethod(
  "lodPlot",
  "RccSet",
  function(rccSet,
           maxMiss = 0.2)
  {
    if ("paData" %in% ls(assayData(rccSet))) {

        blankLabel <- getBlankLabel(rccSet)
        nonctrls <- (fData(rccSet)$CodeClass %in% c("Endogenous", "Housekeeping"))
        n_nonctrls <- sum(nonctrls)
      
        misValues <- lodAssess(rccSet)
      
        plot(misValues/n_nonctrls, 
             col = ifelse(misValues/n_nonctrls >= maxMiss, "red", "black"),
             pch = ifelse(pData(rccSet)$SampleType %in% blankLabel, 17, 16),
             main = "Sample Missingness",
             ylab = "Fraction of genes below background estimate",
             xlab = "Sample index",
             las = 1)
      
        abline(h=maxMiss, col="red", lty=2)   # was: abline(h=length(nonctrls)*maxMiss, col="red", lty=2) -RZ 2015-03-27

    } else {
        print("This plot requires the RccSet to have the presence/absence matrix")
    }
  }
  )

setGeneric( "sampleClustering", function( rccSet, ... ) standardGeneric( "sampleClustering" ) )
##' @rdname sampleClustering
##' @aliases sampleClustering
##'
##' @title
##' Clustering by sample correlation
##' 
##' @description
##' Clustering by sample correlation
##'
##' @param rccSet
##' An RccSet
##'
##' @param outputFile
##' Output PNG filename
##'
##' @param main
##' Plot title
##' 
##' @param annCol
##' See \code{\link{aheatmap}}
##'
##' @param covar
##' Covariate (e.g. "SampleType")
##'
##' @return
##' A PNG file showing clustering of samples by correlation. Any zero-variance
##' samples are omitted from the heatmap.
##' The width and height of the PNG file are set to vary with the size of the input.
##'
##' @importFrom NMF aheatmap
##' @importFrom RColorBrewer brewer.pal
##'
##' @author Dorothee Nickles, Robert Ziman
##'
setMethod(
  "sampleClustering",
  "RccSet",
  function(rccSet,
           outputFile,
           main = "Sample correlations in raw data",
           annCol = NULL,
           covar = "SampleType")
  {
    M <- assayData(rccSet)$exprs
  
    # Remove any zero-variance rows. These rows are problematic
    # in the heatmap-plotting code that follows (they cause NA values in the output of cor() which
    # in turn causes hclust() to error out).

    zero_variance <- apply(M, 2, function(x){ var(x) == 0 })
    if (any(zero_variance)) {
      warning(
        paste(
          c(
            "The following samples were excluded because of zero variance:",
            paste0("    ", colnames(M)[zero_variance])
          ),
          collapse="\n"
        )
      )
      M <- M[ , !zero_variance]
      pdata <- pData(rccSet)[ !zero_variance ] 
    } else {
     #M <- M
      pdata <- pData(rccSet)
    }

    # Assign annCol (specifications of column annotation tracks displayed as
    # coloured rows on top of the heatmaps)

    cols <- colByCovar(pdata, covar)
    if (missing(annCol)) {
      annCol <- data.frame(SampleSubgroup = pdata[ , covar])
    }

    # Check if the input is too large to be comfortably plotted.

    if (ncol(M) > 1000)
      stop("This function is designed to plot a heatmap of up to 1000 samples")

    # Generate the heatmap. Given the finicky aheatmap() behavior, need different
    # settings when the input is small (<50 samples) or large (>50). 

    if (ncol(M) < 50) {

      width = (20*ncol(M) + 250) / 72
      height = (20*ncol(M) + 250) / 72

      png( filename = outputFile, width = width, height = height, units="in", res=72 )

      aheatmap( cor(M),
                annCol      = annCol,
                color       = brewer.pal(8, "Blues"),
                treeheight  = 0,
                legend      = TRUE,
                fontsize    = 10,
                cexRow      = 1,
                cexCol      = 1,
                cellwidth   = 20,
                cellheight  = 20,
                main        = main)

      dev.off()

    } else {

      width <- (10 * ncol(M) + 600) / 300
      height <- (10 * ncol(M) + 200) / 300

      png( filename = outputFile, width = width, height = height, units = "in", res = 300 )

      aheatmap( cor(M),
                annCol      = annCol,
                color       = brewer.pal(8, "Blues"),
                treeheight  = 0,
                legend      = TRUE,
                fontsize    = 2,
               #cexRow      = 0.5,       # These won't work when the input matrix is large (>200 columns);
               #cexCol      = 0.5,       # use cellwidth/cellheight instead. See aheatmap() source.
                cellwidth   = 2,
                cellheight  = 2,
                gp          = grid::gpar(cex=2),  # Required to compensate for hardcoded values in the aheatmap() source
                main        = main )

      dev.off()

    }

  }
  )

##' Plot counts in blank samples (vertical orientation)
##'
##' @param rccSet
##' An RccSet
##'
##' @param outputFile
##' Output PNG filename
##'
##' @return
##' A PNG file containing a boxplot of the gene-wise counts for the blank samples
##' in the input. The PNG is set to a fixed resolution of 300 pixels per inch and
##' a fixed width of 2250 pixels (i.e. 7.5" at 300ppi), but the height varies
##' with the size of the input. The font size is also fixed so that the labels
##' will be legible even for large datasets.
##'
countsInBlankSamples_verticalPlot <- function(rccSet, outputFile)
{
  if (hasBlanks(rccSet))
  {
    blankLabel <- getBlankLabel(rccSet)
    blanks <- (pData(rccSet)$SampleType %in% blankLabel)
    endo   <- which(fData(rccSet)$CodeClass %in% c("Endogenous", "Housekeeping"))
    
    M <- assayData(rccSet)$exprs
    rownames(M) <- fData(rccSet)$GeneName   
    
    png(filename=outputFile,
        width=2250,
        height=200 + 30*nrow(M),
        res=300,
        units="px",
        pointsize=7
        )
    
    par(mai=c(0.5,1.5,0.5,0.25))

    par(yaxs="i")
    
    boxplot(t(M[endo, blanks]),
            horizontal=TRUE,
            log="x",
            xlab="Counts (raw)",
            ylab="",
            las=1,
            pch=".",
            main="Gene-wise counts in blank measurements"
            )
    sapply(endo,
           function(n) {
             points(x = median(M[n, blanks]) + (2.5*mad(M[n, blanks])),
                      #
                      # ***** TODO: replace with actual background estimate *****
                      #
                    y=n,
                    col="red",
                    pch=20,
                    cex=0.7)
           })
    
    dev.off()
  }
  else {
    print("No blank samples are present")
  }
  
  return(TRUE)
}


##' @title          densityPlot
##' @description    Plot the density of counts for all endogenous and housekeeping genes for each sample in an RccSet object
##'
##' @param  M               One of the matrices from the assayData() of an RccSet object (make sure
##'                         to set the log.transform parameter accordingly)
##' @param  log.transform   Scalar boolean
##' @param  pdata           pData() of the RccSet object
##' @param  covar           character; colname in pData() that can be used to label genes by a category of interest (passed on to colByCovar)
##' @param  ...             additional plotting parameters
##'
##' @return
##' A density plot
##'
##' @author Dorothee Nickles
##'
densityPlot <- function(M, log.transform=FALSE, pdata, covar, ...) {	## takes exprSet
  
  stopifnot(is(M, "matrix"))

  if (log.transform) {
    M <- log2(M)  
  }
  
  cols <- colByCovar(pdata, covar)
  plot(density(M[,1]),
       ylim=c(0,0.5),
       col="white",
       las=1,
       ...)
  sapply(1:ncol(M),
         function(x) { lines(density(M[,x]),
                       col=cols[["color"]][x]) })
  legend("topright", fill=unique(cols[["color"]]), legend=cols[["legend"]])
}

setGeneric( "assessHousekeeping", function( rccSet, ... ) standardGeneric( "assessHousekeeping" ) )
##' @rdname assessHousekeeping
##' @aliases assessHousekeeping
##'
##' @title          assessHousekeeping
##' @description    Assess correlation and variance/variability of housekeeping genes
##'
##' @details
##' Pairwise correlations of all defined housekeeping genes will be assessed and pairwise scatterplots will be generated. This function does not only
##' output pairwise correlation coefficients, but also - for each housekeeping gene - the variance, the interquartile range (IQR) and median expression
##' level across all samples in the experiment.
##'
##' @param  rccSet      An RccSet object
##' @param  hk          Either a boolean vector of length nrow(exprs(rccSet)) or a numeric vector of indices
##'                     which genes in exprs(rccSet) are housekeeping genes
##' @param  covar       character; colname in fData(rccSet) that can be used to label genes by a category of interest
##' @param  annotate    Scalar boolean; if TRUE (default), probes will be "annotated" using the "GeneName" column in the fData(rccSet) slot
##' @param  digits      Scalar integer, the number of decimal places
##' @param  plot        Scalar boolean, plot pairwise relationships ?
##'
##' @return
##' A dataframe with one row per housekeeping genes and several columns with metrics suggested to
##' assess performance of defined housekeeping genes.
##'
##' @author Dorothee Nickles
##'
setMethod(
  "assessHousekeeping",
  "RccSet",
  function(rccSet, hk, covar, annotate=TRUE, plot=TRUE, digits=2) {
  
    if (!is.logical(hk)) {
      stop("Please provide logical vector assigning whether each gene serves as a housekeeping gene or not.")
    }
    if (sum(hk) == 1) {
      stop("You need more than one housekeeping gene to use this function.")
    }
  
    M <- assayData(rccSet)$exprs
  
    values <- rownames(M[hk, ])
    if (annotate) {
      values <- fData(rccSet)$GeneName[hk]
    }
    if (any(duplicated(values))) {
      n <- 1
      for (r in which(duplicated(values))) {
        values[r] <- paste(values[r], n, sep="v")
        n <- n+1
      }
    }
  
    if(plot == TRUE){
      pairs(t(log2(M[hk, ])), 
            main = "Correlation of housekeeping genes",
            col = colByCovar(pData(rccSet), covar)[["color"]],
            labels = values,
            pch = 20,
            las = 1)
    }
  
    output <- data.frame(matrix(nrow=sum(hk), ncol=0))
    output[,1] <- values
    output[,2] <- var(t(log2(M[hk, ])))[,1]
    output[,3] <- apply(log2(M[hk, ]), 1, IQR)
    output[,4] <- apply(log2(M[hk, ]), 1, median)
    output[, 5:(sum(hk)+4)] <- cor(t(log2(M[hk, ])))
    output[,c(2:ncol(output))] <- round(output[,c(2:ncol(output))], digits=digits)
    colnames(output) <- c("Gene", "Variance", "IQR", "median", paste("cor", values, sep="_"))
  
    return(output)
  }
  )


##' Gene clustering heatmap
##'
##' @param rccSet
##' An RccSet
##'
##' @param outputFile
##' Output PNG filename
##'
##' @param main
##' Plot title
##'
##' @param covar
##' Colname in the rccSet's fData that can be used to label genes by a category of interest
##'
##' @return
##' A PNG file showing clustering of genes by correlation across an experiment
##' Positive and negative control probes and any zero-variance genes (typically
##' housekeeping genes) are omitted from the heatmap.
##' The width and height of the PNG file are set to vary with the size of the input.
##'
##' @importFrom NMF aheatmap
##'
##' @author Dorothee Nickles, Robert Ziman
##'
geneClustering <- function(rccSet,
                           outputFile,
                           main = "Gene clustering",
                           covar = NULL
                          #gsubset = NULL
                               ## @param gsubset
                               ## Either a boolean vector of length nrow(exprs(rccSet)) or a numeric vector of
                               ## indices indicating a subset of genes in exprs(rccSet) with which to generate
                               ## the heatmap instead of the full set.
                           )
{
  stopifnot(is(rccSet, "RccSet"))

  M <- assayData(rccSet)$normData
  rownames(M) <- fData(rccSet)$GeneName

  # Remove positive and negative control probes from the matrix.

  nonctrls <- (fData(rccSet)$CodeClass %in% c("Endogenous", "Housekeeping"))
  M <- M[nonctrls, ]

  # Remove any zero-variance rows -- typically the housekeeping genes. These rows are problematic
  # in the heatmap-plotting code that follows (they cause NA values in the output of cor() which
  # in turn causes hclust() to error out), and the housekeeping genes are not informative anyway.

  zero_variance <- apply(M, 1, function(x) { var(x) == 0 })
  if (any(zero_variance)) {
    warning(
      paste(
        c(
          "The following features were excluded because of zero variance:",
          paste0("    ", rownames(M)[zero_variance])
        ),
        collapse="\n"
      )
    )
    M <- M[ !zero_variance, ]
  }

  # Check if the input is too large to be comfortably plotted.

  if (nrow(M) > 1000)
    stop("This function is designed to plot a heatmap of up to 1000 genes")

  # Generate the heatmap. Given the finicky aheatmap() behavior, need different
  # settings when the input is small (<50 genes) or large (>50).

  if (nrow(M) < 50) {

    width = (20*nrow(M) + 250) / 72
    height = (20*nrow(M) + 250) / 72

    png( filename = outputFile, width = width, height = height, units="in", res=72 )

    aheatmap( cor(t(M)),
             #annCol      = plabel,
              color       = "-RdBu",
              treeheight  = 0,
              legend      = TRUE,
              fontsize    = 10,
              cexRow      = 1,
              cexCol      = 1,
              cellwidth   = 20,
              cellheight  = 20,
             #gp=grid::gpar
              main        = main)

    dev.off()

  } else {

    width <- (10 * nrow(M) + 500) / 300
    height <- (10 * nrow(M) + 200) / 300

    png( filename = outputFile, width = width, height = height, units = "in", res = 300 )

    aheatmap( cor(t(M)),
             #annCol      = plabel,
              color       = "-RdBu",
              treeheight  = 0,
              legend      = TRUE,
              fontsize    = 2,
             #cexRow      = 0.5,       # These won't work when the input matrix is large (>200 genes or so);
             #cexCol      = 0.5,       # use cellwidth/cellheight instead. See aheatmap() source.
              cellwidth   = 2,
              cellheight  = 2,
              gp          = grid::gpar(cex=2),  # Required to compensate for hardcoded values in the aheatmap() source
              main        = main )

    dev.off()

  }
}


centeredSampleClustering <- function(gmrccSet,
                                     outputFile,
                                     main,
                                     flags,
                                     excludeFlagged)
{
    if (missing(gmrccSet) || missing(outputFile) || missing(main) || missing(flags) || missing(excludeFlagged))
        stop("missing args")

    if (excludeFlagged)
    {
        flagged <- unique(unlist(apply(flags, 2, which)))

        if (length(flagged) > 0) {
            temp_rccSet <- copyRccSet(gmrccSet)[, -flagged]
        } else {
            temp_rccSet <- copyRccSet(gmrccSet)
        }
        
        gmrccSet <- contentNorm(temp_rccSet,
                                method = "global",
                                summaryFunction = "median",
                                inputMatrix = preproc(gmrccSet)$normData_inputMatrix,
                                quietly = TRUE)
    }

    nonctrls <- (fData(gmrccSet)$CodeClass %in% c("Endogenous", "Housekeeping"))

    M <- t(
            apply( assayData(gmrccSet)$normData[nonctrls, ],
                  1,
                  scale, center = TRUE, scale = FALSE )
          )
    colnames(M) <- colnames( assayData(gmrccSet)$normData[nonctrls, ] )
        #
        # TODO: The apply(... 1, scale, ...) combination above transposes the matrix
        # and strips the rownames. See if the same output -- un-transposed and with names
        # intact -- can be generated with a direct call to scale(). -RZ 2015-04-24
        #
    rownames(M) <- fData(gmrccSet)$GeneName[nonctrls]

    # Remove any zero-variance rows or cols. These are problematic
    # in the heatmap-plotting code that follows (they cause NA values in the output of cor() which
    # in turn causes hclust() to error out).

    zero_variance <- apply(M, 1, function(x) { var(x) == 0 })
    if (any(zero_variance)) {
      warning(
        paste(
          c(
            "The following genes were excluded because of zero variance:",
            paste0("    ", rownames(M)[zero_variance])
          ),
          collapse="\n"
        )
      )
      M <- M[!zero_variance, ]
    }

    zero_variance <- apply(M, 2, function(x) { var(x) == 0 })
    if (any(zero_variance)) {
      warning(
        paste(
          c(
            "The following samples were excluded because of zero variance:",
            paste0("    ", colnames(M)[zero_variance])
          ),
          collapse="\n"
        )
      )
      M <- M[ , !zero_variance]
    }

    # Check if the input is too large to be comfortably plotted.

    if (nrow(M) > 1000 || ncol(M) > 1000)
        stop("This function is designed to plot a heatmap of up to 1000 genes and 1000 samples")

    # Generate the heatmap. Given the finicky aheatmap() behavior, need different
    # settings when the input is small (<50 samples) or large (>50).

    row.cl <- as.dendrogram( hclust( as.dist( 1-cor(t(M)) ), method="ward.D" ) )
    col.cl <- as.dendrogram( hclust( as.dist( 1-cor(M) ), method="ward.D" ) )

    #annColors=lapply( seq_along( colnames( flags )), function(n){
    #  setNames( c("grey90", brewer.pal(ncol(flags), "Set1")[n]), c(FALSE, TRUE))
    #  })
    #names(annColors) <- colnames(flags)

    if (excludeFlagged)
        annCol <- data.frame(Counts = colSums(exprs(gmrccSet)[nonctrls, ]))
    else
        annCol <- flags

    if (nrow(M) < 50 && ncol(M) < 50) {

      width = (20*ncol(M) + 350) / 72
      height = (20*nrow(M) + 350) / 72

      png( filename = outputFile, width = width, height = height, units="in", res=72 )

      aheatmap( M, 
               #annColors   = annColors,
                annCol      = annCol,
                color       = "-RdBu",
                Colv        = col.cl,
                Rowv        = row.cl,
#                labRow      = NA,
#                labCol      = NA,
                scale       = "none",
                breaks      = 0,
                treeheight  = 0,
                legend      = TRUE,
                fontsize    = 10,
                cexRow      = 1,
                cexCol      = 1,
                cellwidth   = 20,
                cellheight  = 20,
                main        = main )

      dev.off()

    } else {

      width <- (10 * ncol(M) + 500) / 300
      height <- (10 * nrow(M) + 200) / 300

      png( filename = outputFile, width = width, height = height, units = "in", res = 300 )

      aheatmap( M, 
               #annColors   = annColors,
                annCol      = annCol,
                color       = "-RdBu",
                Colv        = col.cl,
                Rowv        = row.cl,
#                labRow      = NA,
#                labCol      = NA,
                scale       = "none",
                breaks      = 0,
                treeheight  = 0,
                legend      = TRUE,
                fontsize    = 2,
               #cexRow      = 0.5,       # These won't work when the input matrix is large (>200 columns);
               #cexCol      = 0.5,       # use cellwidth/cellheight instead. See aheatmap() source.
                cellwidth   = 2,
                cellheight  = 2,
                gp          = grid::gpar(cex=2),  # Required to compensate for hardcoded values in the aheatmap() source
                main        = main )

      dev.off()
      
    }
}

Try the NanoStringQCPro package in your browser

Any scripts or data that you put into this service are public.

NanoStringQCPro documentation built on Nov. 8, 2020, 8:11 p.m.