R/getQAStats.R
In QUALIFIER: Quality Control of Gated Flow Cytometry Experiments

.getQAStats.env <- function(obj,gsid,...){
			if(missing(gsid))
				stop("missing gsid!")
			
			
			gs <- obj$gs[[gsid]]
			
#			browser()
			
			statsOfGS <- getQAStats(gs, ...)
			
			
#			browser()
			for(curID in names(statsOfGS)){

                    statsOfGS[[curID]][,eval(qa.par.get("idCol")):= as.integer(curID)]
				}
			statsOfGS <- rbindlist(statsOfGS)
			
                    
			##append sid and gsid
			if(nrow(obj$stats)==0)
				msid<-0
			else
				msid<-max(obj$stats$sid)
			statsOfGS[,sid := 1:nrow(statsOfGS)+msid]
			statsOfGS[,gsid := gsid]
			

			#save it db (remove the old records with the same gsid
			
			ind <- obj$stats$gsid == gsid
			obj$stats<-rbind(obj$stats[!ind,],statsOfGS[,colnames(obj$stats),with=FALSE])
			print("stats saved!")
			
		}
#' @title 
#' Get statistics from gated data for QA check.
#'
#' @description 
#' \code{getQAStats} calculates and extract statistics of each
#'gated cell population in the gating hierarchies generated by flowWorkspace
#'package. 
#'
#' @details 
#'This is the second preprocessing step followed by parsing gating template
#'from flowJo workspace with \code{\link{flowjo_to_gatingset}}.  Different QA checks
#'can be performed after this step is done.  when obj is an \code{environment}
#'the results are stored as a dataframe with the name of "statsOfGS" in the
#'environment.
#'
#' 
#' @aliases 
#' getQAStats 
#' getQAStats-methods 
#' getQAStats,environment-method
#' getQAStats,GatingSet-method 
#' getQAStats,GatingSetList-method
#' getQAStats,GatingHierarchy-method
#'
#'@docType methods
#' 
#'@param obj A \code{\link[=GatingHierarchy-class]{gating hierarchy}} that
#'stores the gated cell populations for one FCS file; or a
#'\code{\link[=GatingSet-class]{GatingSet}} or a
#'\code{\link[=GatingSetList-class]{GatingSetList}} containing multiple
#'\code{gating hierarchies} or an \code{environment} that stores the
#'\code{GatingSet} and all other information.
#'@param ...  other arguments
#'
#' isFlowCore: A \code{logical} scalar indicating whether the statistics are the original ones in flowJo xml workspace or the re-calculated version by flowCore in R.  
#' 
#' isMFI a \code{logical} flag indicating whether to calculate MFI which causes the reading of raw data 
#' 
#' isSpike a \code{logical} flag indicating whether to calculate spike for each channel which causes the reading of raw data
#' 
#' isRaw \code{logical} whether to calculate the MFI value in raw scale.
#' 
#' pops a \code{numeric} or \code{character} vector as the population indices specifing a subset of populations to extract stats from
#' 
#' isChannel a \code{logical} flag indicating whether to extract channel information from 1d gates in order to perform channel-specific QA
#' 
#' nslaves: An \code{integer} scalar indicating the number of nodes. It is used for parallel computing when obj is \code{GatingHierarchy} or
#' \code{GatingSet}.  When the \code{parallel} package is loaded and nslaves is NULL, its value is automatically decided by available number of nodes.  
#' When it is set to 1, then forced to run in serial mode.
#' 
#' 
#'@return
#' a data frame when obj is a \code{GatingHierarchy} or \code{GatingSet}
#' 
#'
#'@author Mike Jiang,Greg Finak
#'
#'Maintainer: Mike Jiang <wjiang2@@fhcrc.org>
#'@seealso \code{\link{qaCheck}},\code{\link{qaTask}},\code{\link{qaReport}}
#'@keywords methods
#'@examples
#'
#'\dontrun{
#' 
#'  getQAStats(G[[1]])#extract stats from a gating hierarchy
#'  getQAStats(G[[1]], isMFI = TRUE, pops = c("/boundary/lymph/CD3/CD4"))#extract stats from a gating hierarchy
#'  getQAStats(G)#from a gating set
#' 
#'}
#' 
#' @export     
setGeneric("getQAStats",function (obj, ...)standardGeneric("getQAStats"))   
setMethod("getQAStats",signature=c("environment"),function(obj,gsid,...){
       .getQAStats.env(obj,gsid,...)
      })

setMethod("getQAStats",signature("GatingSet"),function(obj, ...){
#			browser()
			
			print("extracting stats...")
			
			
			IDs <- pData(obj)[sampleNames(obj),qa.par.get("idCol")]
			if(is.null(IDs)||length(IDs)!=length(obj))
			{
				stop("Not all IDs for the current sample set are found in meta data of this GatingSet!")	
			}
			

			statsOfGS <- lapply(obj, getQAStats,...)
			
            names(statsOfGS) <- IDs
            
			statsOfGS
			
		})
##extract stats from a gating hierarchy\\
setMethod("getQAStats",signature("GatingHierarchy"),function(obj, ...){
      .getQAStats.gh(obj, ...)    
    })

#' @param isFlowCore a \code{logical} flag indicating whether to extract flowCore stats or flowJo stats
#' @param isMFI a \code{logical} flag indicating whether to calculate MFI which causes the reading of raw data 
#' @param isSpike a \code{logical} flag indicating whether to calculate spike for each channel which causes the reading of raw data
#' @param isRaw \code{logical} whether to calculate the MFI value in raw scale.
#' @param pops a \code{numeric} or \code {character} vector as the population indices specifing a subset of populations to extract stats from
#' @param isChannel a \code{logical} flag indicating whether to extract channel information from 1d gates in order to perform channel-specific QA
#' it is automatically set to TRUE when isMFI is TRUE
#' @importFrom Biobase rowMedians
.getQAStats.gh <- function(obj,isFlowCore=TRUE,isMFI = FALSE,isSpike = FALSE, isChannel = FALSE, pops = NULL, isRaw = FALSE, ...){
			
    			message("reading GatingHierarchy:",sampleNames(obj))
          
    			if(isMFI){
                  isChannel <- TRUE
                  if(isRaw)
                    inv.list <- gh_get_transformations(obj, inverse = TRUE)
                }
                
    			#check if data is gated
    			isGated <- obj@flag
          #isPath is deperated and prefixed node name is no longer supported
    			#thus nodePaths is the same as nodes
		      nodes<-gs_get_pop_paths(obj, showHidden = TRUE)
          # nodePaths<-gs_get_pop_paths(obj, isPath = TRUE, showHidden = TRUE)
          #convert to QUALIFIER's path
          # nodePaths[1]<-paste("/",nodePaths[1],sep="")
          # nodePaths[-1]<-paste("/root",nodePaths[-1],sep="")
          
          #subset the nodes
    
          if(!is.null(pops)){
            if(is.numeric(pops)){
              nodes <- nodes[pops]
              # nodePaths <- nodePaths[pops]
            }else{
              nodes <- nodes[match(pops,nodes)]
              
              #validity check for pops
              invalidNodes <- is.na(nodes)
              if(any(invalidNodes))
                stop("Invalid pops:", paste0(pops[invalidNodes]))
              
              # nodePaths <- nodePaths[match(pops,nodes)]
            }
            
          }
                
                
    			nNodes<-length(nodes)
                
                if(isGated&&(isSpike||isMFI||isChannel)){
                  fdata<-gh_pop_get_data(obj, use.exprs = isGated&&(isSpike||isMFI))#only spike and MFI require the actual raw data reading
                  pd<-pData(parameters(fdata))
                  params <- pd$name
  
                }
    			
                          
    			statslist <- lapply(1:nNodes,function(i){
    
    			# curPopName<-nodePaths[i]
    			curNode<-nodes[i]
                
                
                ##############################################
                #extract channel and stain info
                #it is mainly used for QA on channel/stain-specific 1d gate
                #e.g. marginal events for each channel (defined by 1d gate on that channel)
                # or MFI stability for each stain+channel (defined by 1d gate on each channel)
                # or redudant staining across aliquots (1d gate on each stain+channel)
                # or Spike on each channel
                # Thus not really needed for 2-d gates unless we want to do QA on MFI
                #######################################################################
#                browser()
                chnl <- as.character(NA)   #convert from logical NA to character NA for rbindlist to behave appropriately
                if(isGated&&!.isRoot(obj,curNode)&&isChannel)
  				{
                    curGate<-gh_pop_get_gate(obj,curNode)
                  #only extract chnl info gates:
                  #1. non bool gate
                  #2. MFI is true
                  #3. or 1d gate
                  if(class(curGate)!="booleanFilter"){
                    chnl <- parameters(curGate)
                    if(!isMFI&&length(chnl)>1)
                      chnl <- as.character(NA) #reset chnl to NA for 1d Gate when isMFI = FALSE
                  }
  				}
  				
  
  				if(all(is.na(chnl)))
  					stain<-as.character(NA)
  				else
  				{
  					stain<-as.vector(pd[match(chnl,pd[,"name"]),"desc"])
  				}
          ##get count and proportion
          statsOfNode <- flowWorkspace:::.getPopStat(obj,curNode)
          
          if(isFlowCore)
          {
            statsOfNode <- statsOfNode$openCyto
          }else
          {
            statsOfNode<-statsOfNode$openCyto
          }
          #reset chnl for 2d gate since channel are meaningless for counts and % 
          if(!is.na(chnl)&&length(chnl)>1){
            thisChnl <- as.character(NA)
            thisStain <- as.character(NA)
          }else{
            thisChnl <- chnl
            thisStain <- stain
          }
#                browser()
  				statsOfNode <- data.table(channel=thisChnl
                                          ,stain=thisStain
                                           ,stats= names(statsOfNode)
    										,value = unname(statsOfNode)
  										)
  #				browser()
  				#get spikes meatures for each channel at root level
  				if(isGated&&.isRoot(obj,curNode)&&isSpike)
  				{
  
  #					browser()
  					expr <- exprs(fdata)
  					
  					time <- flowCore:::findTimeChannel(expr)
  					if(!(time %in% colnames(expr)))
  						stop("Invalid name of variable (", time, ") recording the ",
  								"\ntime domain specified as 'time' argument.", call.=FALSE)
  					nonTimeChnls<-params[!params%in%time]
  					stain<-as.vector(pd[match(nonTimeChnls,pd[,"name"]),"desc"])
  					
  					spikes <- unlist(lapply(nonTimeChnls,.timelineplot,x=fdata, binSize=50))
  					
  					statsOfNode <- rbindlist(list(statsOfNode
                                                    ,data.table(channel=nonTimeChnls
                                                                ,stain=stain
                                                                ,stats="spike"
                                                                 ,value=spikes)
                                                    )
                                              )
  					
  				}
  
  
  				#get MIF meatures
  				if(!is.na(chnl)&&isMFI)
  				{
            curData <- gh_pop_get_data(obj,curNode)
  					
  					mat <- exprs(curData)[,chnl,drop=FALSE]
  					chnames <- colnames(mat)
  					MFI <- rowMedians(t(mat))#using rowMedian to speed up
            
  					if(all(!is.na(MFI))){
  					  if(isRaw){
  					    #inverse transform the MFI
  					   MFI <- mapply(chnames, MFI, FUN = function(ch, thisMFI){
  					      ind <- grep(ch, names(inv.list))
  					      if(length(ind) > 0){
  					        inv.fun <- inv.list[[ind]]
  					        thisMFI <- inv.fun(thisMFI)
  					      }
  					      thisMFI
  					    }, USE.NAMES = FALSE)
  					  }
  						statsOfNode <- rbindlist(list(statsOfNode
                                                        ,data.table(channel=chnames
                                                                    ,stain=stain
                                                                    ,stats="MFI"
                                                                     ,value=MFI)
                                                         )
                                                     )
  					}
				}
        #append two columns (they are currently redudant and should only keep one)
        statsOfNode[, node := curNode]
        statsOfNode[, population := curNode]
        statsOfNode

			})
            
            ##merge to one table
            statsOfGh<-rbindlist(statslist)
            
			statsOfGh
			
			
}
#' @importFrom flowWorkspace lapply
setMethod("getQAStats",signature("GatingSetList"),function(obj,...){
      res <- lapply(obj,function(gs){
            getQAStats(gs,...)
          }, level =1)
      
      unlist(res,recursive = FALSE)
    })
##copy from timelineplot of flowViz, add the plot flag
.timelineplot <- function(x, channel, binSize, varCut=1, ...)#add plot flag to enable spike score computing without plots
{
#	browser()
	## Sanity checking up front
	if(!length(channel)==1)
		stop("'channel' must be character scalar")
	if(!channel %in% flowCore::colnames(x))
		stop(channel, " is not a valid channel in this flowSet.")
	if(tolower(channel) == "time")
		stop("Argument 'channel' can not be the time channel")
	
	## Bin the data and compute local variances and locations
	time <- flowCore:::findTimeChannel(xx= exprs(x))
	timeData <- flowCore:::prepareSet(x, parm=channel, time=time,
			binSize=binSize )
#	browser()	
	
	## Standardize to compute meaningful scale-free QA scores
	med <- median(timeData$smooth[,2], na.rm=TRUE)
	vars <- mean(timeData$variance)
	stand <-  abs(timeData$smooth[,2]-med)/(vars*varCut)
	
#    browser()
	## the QA score and correlation coefficient
	
	qaScore <-sum(stand[stand>0])/varCut
	
	
	## The return value with attributes attached.
	attr(qaScore, "binSize") <- binSize
	return(qaScore)
}