R/antQC.R

Defines functions antQC

Documented in antQC

#' @title Perform ANT based QC check
#'
#' @description Identifies samples with higher than expected ANT values.  Expected ANT values are derived from 72 technical replicates.
#'
#'
#' @param data data frame with raw counts and no additional header information such that the data can be coerced to numeric format.  If the first column contains probe names these will be assigned to rownames and then the probe name column will be removed.
#' @param ant.genes A character vector for he ANT gene names in the data.  The default is the OBP ANT genes from the current parser
#' @param ymax A single number to give the upper and lower limits of the QC plot. If Null is automatically set by results.
#' @param antString (Optional) A string or regular expression used to identify ANT probes in the data.
#' @param norm.method The normalization method to use.  Currently only voom is supported.
#'
#' @return Returns a list with the QC plot (same as in NPSeqQC app) and a data.frame with samples names and pass/fail information
#'
#' @usage antQC(data, ant.genes = c("NEG_CTRL_ANT1", "NEG_CTRL_ANT2", "NEG_CTRL_ANT3", "NEG_CTRL_ANT4","NEG_CTRL_ANT5"), norm.method = "voom", ymax = NULL, antString = "ANT")
#'
#' @examples
#' \donttest{
#' aqc <- antQC(data=df)
#' #plot the QC chart
#' aqc$plot
#' #see which samples passed or failed
#' aqc$result
#'
#' }
#'
#' @export
#' @name antQC
#' @author Biostats
#'
#'
antQC <- function(data,
                  ant.genes = c("NEG_CTRL_ANT1", "NEG_CTRL_ANT2", "NEG_CTRL_ANT3", "NEG_CTRL_ANT4","NEG_CTRL_ANT5"),
                  norm.method = "voom",
                  ymax = NULL,
                  antString = "ANT"
){
  err <- vector()#shell for error collection
  #clean out empty columns
  allNA <- which(apply(data,2,function(x)all(is.na(x))))
  if(length(allNA)>0){
    newdata <- data[,-allNA]
  }else{
    newdata <- data
  }
  #check for probe names in first column by checking to see if they can't be converted to numeric
  if(any(is.na(as.numeric(as.character((newdata[,1])))))){
    #assign rownames from first column
    rownames(newdata) <- newdata[,1]
    newdata <- newdata[,-1]
  }

  #check if any column is not numeric
  if(any(apply(newdata,2,FUN=function(x) !is.numeric(x)))) {
    newdata <- apply(newdata, c(1,2), FUN=function(x) as.numeric(as.character(x)))
    if(any(apply(newdata, 2, FUN=function(x) is.na(x)))) stop("Some data could not be coerced to numeric.")
  }

  if(norm.method=="voom"){#there is only one method but this if remains for future expansions
    normnew <- data.frame(voom(counts=newdata)$E)#transform to CPM
    rownames(normnew)<-rownames(newdata)
  }
  Gmean<-2.61125302287796 #these values come from an experiment of 72 technical replicates and represent "population" values
  SD<-2.38073004222091
  ##process new data
  #Check for proper ANT names
  if(!any(rownames(normnew)%in%ant.genes)){#First check to see if ANT names match
    #if ANT doesn't match then add warning to error vector and continue using grep
    data <- normnew[grepl(antString, rownames(normnew)), ]
    err<-c(err,"Warning: The ANT probe names do not match.  Check that you have used the most current parser.  QC has been completed by searching for ANT probes.  Probes used for ANT are: ", paste(rownames(normnew)[grepl("ANT", rownames(normnew))], sep=" ,"))
  }else{
    data <- normnew[rownames(normnew)%in%ant.genes, ]
  }

  gene_mean <- apply(data, 2, mean)#Calculate the mean for each sample
  delta_mean <- gene_mean - Gmean #Calculate the difference between the expected mean and the observed mean

  #create data set with QC information for each sample
  df <- data.frame(Sample_Name=names(delta_mean), DeltaMean = delta_mean)
  df$SD <- SD
  df$Gmean <- Gmean
  df$Outlier <- (df$DeltaMean>2*SD)
  df$Dataset <- "New"


  df$Dataset <- factor(df$Dataset)
  Levels <- levels(df$Dataset)
  nlevels <- length(Levels)
  #create simple table to display on app with pass and fail info
  simpledf<-df
  simpledf$Result<-ifelse(simpledf$Outlier,"FAIL","PASS")
  simpledf<-simpledf[,c("Sample_Name","Result")]
  #set plotting area based on the maximum deviation
  if(is.null(ymax)) ymax <- max(round(max(abs(df$DeltaMean))+0.5),round(2*SD+0.5))

  df$Sample_Name <- factor(df$Sample_Name, levels=unique(df$Sample))

  nlevels<-1
  #this is the actual plot, everything else is related to data export
  pcplot <- stripplot(DeltaMean~df$Sample_Name, df, groups=Dataset,
                      scale=list(x=list(rot=90,cex=0.8)),ylim=c(-ymax, ymax),
                      par.settings = list(superpose.symbol = list(cex=1.2,col=rep(c("black","magenta","green","orange","blue"),3)[1:nlevels],
                                                                  pch=rep(c(19,17,15),each=5)[1:nlevels])),
                      panel=panel.superpose,
                      panel.group = function(x,y,...){
                        panel.stripplot(x,y,...)
                        #col=ifelse(abs(y)>3*SD,2,1),...)
                        panel.abline(h=c(-2*SD, 2*SD),col="green",lty=2)
                        panel.abline(h=c(-3*SD, 3*SD),col="red",lty=2)
                        panel.abline(v=as.numeric(x),col="grey",lty=2)
                      })
  print(err)
  return(list(results=simpledf, plot=pcplot, errors=err))
}
rocrat/HTGPackage documentation built on May 25, 2017, 8:32 a.m.