R/cellmap.R

Defines functions cellmap

Documented in cellmap

#' Estimate the cell type proportions of mixture bulk RNA
#' 
#' This function estimates cell type proportions of mixture bulk RNA samples based on pre-trained cell type profiles.
#' 
#' @param strBulk The full path to the query mixture bulk expression file. 
#'   Expression matrix separated by tabs with rows are genes, columns are samples. 
#'   First row includes the sample names or ids, while first column consists of gene symbols or gene ensembl id.
#' @param strProfile The full path to a pre-trained CellMap cell type profile.
#'   The profile with ‘rds’ as file extension generated by \code{cellMapTraining} function.
#' @param strPrefix The prefix with path of the result files. 
#'   There are two files produced: a pdf file contains all cell type decomposition figures; 
#'   a tab separated table file including composition and p-values.
#' @param delCT Cell types should not be considered in the decomposition estimation.
#'   A string with exact cell type names defined in the CellMap profile. 
#'   If more than one cell types needed to be removed, please separate them by commas (,). Default is \code{NULL}.
#' @param cellCol R colors for all cell types. A named vector of R colors, where names are cell type names.
#'   Default is \code{NULL}, which means \code{$para$cellCol} from the provided CellMap profile will be used.
#' @param geneNameReady A boolean to indicate if the gene names in the query mixture bulk expression matrix is official symbol already.
#'   The \code{FALSE} option also works with the official symbol is used in the expression matrix.
#'   Default is \code{FALSE}, which enable to find official symbol by an R package called \code{biomaRt}.
#' @param ensemblPath The path to a folder where ensembl gene definition is/will be saved.
#'   The ensembl gene definition file will be saved if it never run before.
#'   Default is \emph{Data/} in the current working directory.
#' @param ensemblV The version of the ensembl to be used for the input query bulk expression. Default is 97.
#' @param bReturn A Boolean indicate if return object is needed.
#'   \code{False}, no object returned but plots in a pdf as well as a tables in a tsv file.
#'   \code{True}, return an R list object including details of raw decomposition results without generating any file.
#' @param bCutoff A numeric indicate the significant level. Default is 0.05.
#' @param core The number of computation nodes could be used. Default is 2.
#' 
#' @return If \code{bReturn} is set to be \code{TRUE}, a named list object with detailed decomposition results is returned.
#'   The following objects are in the list, and they can be accessed by ($) of the returned list object:
#'   * \code{compoP} A matrix of the raw fitting coefficient for each sample (column) and each cell type (row).
#'       It needed to normalize the sum of each column to be 1, in order to martch the output compisition table.
#'   * \code{compoP} A matrix of the fitting p-values for each sample (column) and each cell type (row).
#'   * \code{overallP} A vector of the overall fitting p-value for each sample.
#'   * \code{rmse} A vector of the fitting RMSE for each sample.
#'   * \code{coverR} A numeric indicate the ratio of cell type signature genes covered in the mixture bulk expression data
#'   * \code{rawComp} A named list of all raw composition matrix, p-values, RMSE for each sample.
#'   * \code{rawSets} A matrix of all sets of pure cell type combinations.
#'   * \code{missingF} A vector of cell type signature genes which are not in the query bulk expresion data.
#'   * \code{missingByCellType} A named list of cell type signature genes which are not in the query bulk expresion data for each cell type.

#' @examples
#' strMix <- system.file("extdata","bulk.txt",package="cellmap")
#' strProfile <- system.file("extdata","CNS6.rds",package="cellmap")
#' cellmap(strMix,strProfile,strPrefix="~/cellmap_CNS6_test")
#' 
#' @export
cellmap <- function(strBulk,
                         strProfile,
                         strPrefix=substr(strBulk,1,nchar(strBulk)-4),
                         delCT=NULL,
                         cellCol=NULL,
                         geneNameReady=FALSE,
                         ensemblPath="Data/",
                         ensemblV=97,
                         bReturn=F,
                         pCutoff=0.05,
                         core=2){
  eval(strBulk)
  eval(strProfile)

  oriSC <- strProfile
  if(!file.exists(strProfile)){
    strProfile <- paste("profile/",strProfile,".rds",sep="")
    if(!file.exists(strProfile)){
    	strProfile <- system.file("extdata",basename(strProfile),package="cellmap")
    	if(!file.exists(strProfile)) stop(paste0(oriSC," cannot be found!"))
    }
  }
  profile <- readRDS(strProfile)
  if(!is.null(cellCol)) profile$para$cellCol[names(cellCol)] <- cellCol
  
  X <- condensGene(read.table(strBulk,header=T,sep="\t",check.names=F,as.is=T),geneNameReady,ensemblV=ensemblV,ensemblPath=ensemblPath)#,row.names=1
  
  ## deconvolution --------
  BiocParallel::register(BiocParallel::MulticoreParam(core))
  strF <- paste0(strPrefix,"_CellMap_",profile$para$version,".pdf")
  print(system.time({cellComp <- deconv(X,profile,rmCellType=delCT,modelForm=profile$para$modelForm)}))

  #saveRDS(cellComp,file=gsub("pdf","rds",strF))
  if(bReturn) return(cellComp)
  
  pdf(strF,width=5+round(ncol(X)/3))
  plotComposition(cellComp$composition,
                  profile$para$cellCol,pV=cellComp$compoP,cutoff.p=pCutoff,
                  ggplotFun=ggplot2::ggtitle(paste0("CellMap(",profile$para$version,") feature coverage:",cellComp$coverR,"%")))
  tmp <- dev.off()

  conn <- file(gsub("pdf$","tsv",strF),"w")
  cat(paste('#Bulk:',strBulk,"Profile:",strProfile,"delCT:",paste(delCT,collapse=";"),"\n"),file=conn)
  cat(paste0("CellMap (",profile$para$version,") feature coverage: ",cellComp$coverR,"%"),"\n",file=conn)
  write.table(apply(cellComp$composition,2,function(x){return(round(x/sum(x),2))}),file=conn,sep="\t",col.names = NA,quote=F)
  if(!is.null(cellComp$compoP)){
    cat("pValue:\n",file=conn)
    write.table(cellComp$compoP,file=conn,sep="\t",col.names = NA,quote=F)
  }
  if(!is.null(cellComp$overallP)){
    cat("overall p-value:\n",file=conn)
    write.table(matrix(cellComp$overallP,nrow=1,dimnames=list("overallP",names(cellComp$overallP))),
                file=conn,sep="\t",col.names = NA,quote=F)
  }
  cat("\n\nmissing following cell type signature genes:\n",paste(cellComp$missingF,collapse="\n"),file=conn,sep="")
  close(conn)
  message("Decomposition on all samples is finished successfully!")
}
interactivereport/CellMap documentation built on March 17, 2024, 2:01 a.m.