flowIncubator: Routines that don't belong to the core flow packages yet.

Documented in extract_citrus_clusters

#' This function extracts clusted flow cytometric expression data from an RDS containing 
#' clusters generated by CITRUS (Bruggner et al, PNAS 2014) and returns them as a named list of matrices
#' There are two ways to use it:
#'
#' 1. to extract *all* CV.min clusters from the RDSfile, leave cluster_id=NULL
#' mat_of_cvmin_clusters <- extract_citrus_clusters(""citrusOutput_10000cells/Age.rds", anno, markers)
#'
#' 2. to extract a particular cluster from the RDSfile, specify its cluster_id
#' mat_of_cluster12345 <- extract_citrus_clusters(""citrusOutput_10000cells/Age.rds", anno, cluster_id=12345, markers)

#' @param RDSfile path to the Citrus .rds file that contains the Citrus run (eg. "/path/to/Age.rds")
#' @param anno data.frame containing annotation. anno <- read.csv("annotation.txt") in directory of each Citrus run
#' @param cluster_id numeric id of the cluster to extract , if NULL then extract all cv.min clusters found in RDS
#' @param markers: named list translating markers <-> channels 

#' @return a named list of expression matrices (list has length 1 if you specified a single cluster, option 2 above)
#' @import tools

extract_citrus_clusters <- function(RDSfile, anno, cluster_id=NULL, markers=markers) {
  
  short_name <- basename(file_path_sans_ext(RDSfile)) 
  rds <- readRDS(RDSfile)
  members <- rds$citrus.foldClustering$allClustering$clusterMembership
  chnls <- rds$citrus.foldClustering$allClustering$clusteringColumns
  pd <- as.data.frame(cbind(rds$citrus.combinedFCSSet$fileNames, rds$citrus.combinedFCSSet$fileIds))
  colnames(pd) <- c("FCS_File","fileId")
  pd <- merge(pd, anno, by="FCS_File")
  
  if (is.null(cluster_id)) {
  # no cluster_id supplied, get the glmnet cv.min clusters from the RDS  
    if ( class(rds$conditionRegressionResults[[short_name]]$glmnet$differentialFeatures$cv.min) == "character") { #only 1 cluster ?!
      clusters <- as.numeric(rds$conditionRegressionResults[[short_name]]$glmnet$differentialFeatures$cv.min[2])
    } else {
      clusters <- rds$conditionRegressionResults[[short_name]]$glmnet$differentialFeatures$cv.min$clusters
    }
    expr <- as.data.frame(rds$citrus.combinedFCSSet$data)
    cat("supplied Citrus RDS file contains", length(clusters), "glmnet cv.min clusters :", clusters )
    cat("\n extracting expression values...")
    events <- members[clusters]
    names(events) <- clusters
    
  } else {
  # cluster_id supplied, just extract expression of specified cluster
    expr <- as.data.frame(rds$citrus.combinedFCSSet$data)
    events <- members[cluster_id]
    names(events) <- cluster_id  
  }
  
  mat <- list()
  
  for (i in 1:length(events)){
    idxs <- events[[i]]
    x <- expr[idxs, chnls]
    x <- expr[idxs, c(chnls,"fileEventNumber", "fileId")]
    x <- merge(x, pd, by="fileId")
    x <- x[,c(chnls, "ID")]
    
    # return MFI instead of raw
    x <- aggregate(.~ID, data=x, median, na.rm=TRUE)
    x <- merge(x, pd, by="ID")
    
    cluster_name <- names(events)[i]
    mat[[i]] <- x
    names(mat)[i] <- names(events)[i]
    
    # name rows with PTIDs, colnames with friendly marker names
    rownames(mat[[i]]) = mat[[i]]$ID
    setnames(mat[[i]], as.vector(markers), names(markers))
  }
  # `mat` is a list of matrices, named for the Citrus cluster IDs
  return(mat)
}