R/LIONESS.R

Defines functions lioness lionessPy

Documented in lioness lionessPy

#' Run python implementation of LIONESS
#'
#' \strong{LIONESS}(Linear Interpolation to Obtain Network Estimates for Single Samples) is a method to estimate sample-specific regulatory networks.
#'  \href{https://pubmed.ncbi.nlm.nih.gov/30981959/}{[(LIONESS publication)])}.
#'
#' @param expr_file Character string indicating the file path of expression values file, with each gene(in rows) across samples(in columns).
#' @param motif_file An optional character string indicating the file path of a prior transcription factor binding motifs dataset.
#'          When this argument is not provided, analysis will continue with Pearson correlation matrix.
#' @param ppi_file An optional character string indicating the file path of protein-protein interaction edge dataset.
#'          Also, this can be generated with a list of proteins of interest by \code{\link{sourcePPI}}.
#'          
#' @param computing 'cpu' uses Central Processing Unit (CPU) to run PANDA; 'gpu' use the Graphical Processing Unit (GPU) to run PANDA. The default value is "cpu".
#' 
#' @param precision 'double' computes the regulatory network in double precision (15 decimal digits); 'single' computes the regulatory network in single precision (7 decimal digits) which is fastaer, requires half the memory but less accurate. The default value is 'double'. 
#' @param save_tmp 'TRUE' saves middle data like expression matrix and normalized networks; 'FALSE' deletes the middle data. The default value is 'TURE'.
#' @param modeProcess 'legacy' refers to the processing mode in netZooPy<=0.5, 'union': takes the union of all TFs and genes across priors and fills the missing genes in the priors with zeros; 'intersection': intersects the input genes and TFs across priors and removes the missing TFs/genes. Default values is 'union'.
#' @param remove_missing Only when modeProcess='legacy': remove_missing='TRUE' removes all unmatched TF and genes; remove_missing='FALSE' keeps all tf and genes. The default value is FALSE.
#' 
#' @param start_sample Numeric indicating the start sample number, The default value is 1.
#' @param end_sample Numeric indicating the end sample number. The default value is 'None' meaning no end sample, i.e. print out all samples.
#' @param save_single_network Boolean vector, "TRUE" wirtes out the single network in npy/txt/mat formats, directory and format are specifics by params "save_dir" and "save_fmt". The default value is 'FALSE' 
#' @param save_dir Character string indicating the folder name of output lioness networks for each sample by defined.
#'        The default is a folder named "lioness_output" under current working directory. This paramter is valid only when save_single_network = TRUE.
#' @param save_fmt Character string indicating the format of lioness network of each sample. The dafault is "npy". The option is txt, npy, or mat. This paramter is valid only when save_single_network = TRUE.
#'
#' @return A data frame with columns representing each sample, rows representing the regulator-target pair in PANDA network generated by \code{\link{pandaPy}}. 
#'         Each cell filled with the related score, representing the estimated contribution of a sample to the aggregate network.

#'
#' @examples
#'
#' # refer to the input datasets files of control in inst/extdat as example
#' control_expression_file_path <- system.file("extdata", "expr10_reduced.txt", package = "netZooR", 
#'     mustWork = TRUE)
#' motif_file_path <- system.file("extdata", "chip_reduced.txt", package = "netZooR", mustWork = TRUE)
#'     ppi_file_path <- system.file("extdata", "ppi_reduced.txt", package = "netZooR", mustWork = TRUE)
#' 
#' # Run LIONESS algorithm
#' \donttest{
#' control_lioness_result <- lionessPy(expr_file = control_expression_file_path, 
#'     motif_file = motif_file_path, ppi_file = ppi_file_path, 
#'     modeProcess="union",start_sample=1, end_sample=1, precision="single")
#' }
#' unlink("lioness_output", recursive=TRUE)
#' @import reticulate
#' @export
lionessPy <- function(expr_file, motif_file=NULL, ppi_file=NULL, computing="cpu", precision="double", save_tmp=TRUE, modeProcess="union", remove_missing=FALSE, start_sample=1, end_sample="None", save_single_network=FALSE, save_dir="lioness_output", save_fmt='npy'){

  if(missing(expr_file)){
    stop("Please provide the gene expression value with option e, e.g. e=\"expression.txt\"") 
  }else{ 
    expr.str <- paste("\'", expr_file, "\'", sep = '') 
  }
  
  if(is.null(motif_file)){
    motif.str <- 'None'
    message("Prior motif network is not provided, analysis continues with Pearson correlation matrix.") 
  }else{ 
    motif.str <- paste("\'", motif_file,"\'", sep = '') 
  }
  
  if(is.null(ppi_file)){
    ppi.str <- 'None'
    message("No PPI provided.") }
  else{ ppi.str <- paste("\'", ppi_file, "\'", sep = '') }
  
  # computing variable
  if(computing=="gpu"){
    computing.str <- "computing='gpu'"
  } else{ computing.str <- "computing='cpu'"}
  
  # precision variable
  if(precision == "single" ){
    precision.str <- "precision='single'"
  } else{ precision.str <- "precision='double'"}
  
  # save_memory variable always is "save_memory=False" 
  savememory.str <- "save_memory=False" 
  
  # save_tmp variable
  if(save_tmp==FALSE){
    savetmp.str <- "save_tmp= False"
  } else{ savetmp.str <- "save_tmp=True" }
  

  # when pre-processing mode is legacy
  if(modeProcess == "legacy"){
    
    if(remove_missing == TRUE){
      message("Use the legacy mode to pre-process the input dataset and keep only the matched TFs or Genes")
      mode.str <- "modeProcess ='legacy', remove_missing = True"  
    } else{ message("Use the legacy mode (netZooPy version <= 0.5) to pre-process the input dataset and keep all the unmatched TFs or Genes")
      mode.str <- "modeProcess ='legacy', remove_missing = False"}
  }
  
  # when pre-processing mode is union
  else if(modeProcess == "union"){
    mode.str <- "modeProcess ='union'"
  }
  
  else if(modeProcess == "intersection"){
    mode.str <- "modeProcess ='intersection'"
  }
  

   # source the pandaPy and lionessPy from GitHub raw website.
   pandapath   <- system.file("extdata", "panda.py", package = "netZooR", mustWork = TRUE)
   lionesspath <- system.file("extdata", "lioness.py", package = "netZooR", mustWork = TRUE) 
   reticulate::source_python(pandapath,convert = TRUE)
   reticulate::source_python(lionesspath,convert = TRUE)
   # run py code to create an instance named "panda_ob" of Panda Class
   pandaObj.str <-  paste("panda_obj=Panda(", expr.str, ",", motif.str,",", ppi.str, ",", computing.str, ",", precision.str, ",", savememory.str, ",", savetmp.str, "," , "keep_expression_matrix=True", ",",  mode.str, ")", sep ='')
   py_run_string(pandaObj.str,local = FALSE, convert = TRUE)
   
   # assign "panda_network" with the output PANDA network
   py_run_string("panda_network=panda_obj.export_panda_results",local = FALSE, convert = TRUE)
   panda_net <- py$panda_network
   
   # re-assign data type of cloumn.
   panda_net$tf   <- as.character(panda_net$tf)
   panda_net$gene <- as.character(panda_net$gene)
   
   # rename first two columns
   names(panda_net)[names(panda_net) == "tf"] <- "TF"
   names(panda_net)[names(panda_net) == "gene"] <- "Gene"
   
   if( length(intersect(panda_net$Gene, panda_net$TF))>0){
     panda_net$TF <- paste('reg_', panda_net$TF, sep='')
     panda_net$Gene <- paste('tar_', panda_net$Gene, sep='')
     message("Rename the content of first two columns with prefix 'reg_' and 'tar_' as there are some duplicate node names between the first two columns" )
   }
   
   # create an instance named "lioness_obj" of Lioness Class.
   # when save_single_network = TRUE
   if(save_single_network==TRUE){
     py_run_string(paste("lioness_obj = Lioness(panda_obj,computing='", computing, "' , ","start=", start_sample," , ", "end=" ,end_sample, " , ", "save_dir='", save_dir, "' , " , "save_fmt='" , save_fmt, "' , " , "save_single = True )",sep = "" ))
   }else if(save_single_network==FALSE){
     py_run_string(paste("lioness_obj = Lioness(panda_obj,computing='", computing, "' , ","start=", start_sample," , ", "end=" ,end_sample, " , ", "save_single = False )",sep = "" ))
  }
   
   # retrieve the "total_lioness_network" attribute of instance "lionesss_obj"
   py_run_string("lioness_network = lioness_obj.export_lioness_results",local = FALSE, convert = TRUE)
   
   # convert the python variable "lionesss_network" to a data.frame in R environment.
   lioness_net <- py$lioness_network
   # cbind the first two columns of PANDA output with LIONESS output.
   lioness_net$tf   <- panda_net$TF
   lioness_net$gene <- panda_net$Gene
   lioness_output   <- lioness_net
   return(lioness_output)
}

#' Compute LIONESS (Linear Interpolation to Obtain Network Estimates for Single Samples)
#'
#'
#' @param motif A motif dataset, a data.frame, matrix or exprSet containing 3 columns.
#' Each row describes an motif associated with a transcription factor (column 1) a
#' gene (column 2) and a score (column 3) for the motif.
#' @param expr A mandatory expression dataset, as a genes (rows) by samples (columns) data.frame
#' @param ppi A Protein-Protein interaction dataset, a data.frame containing 3 columns.
#' Each row describes a protein-protein interaction between transcription factor 1(column 1),
#' transcription factor 2 (column 2) and a score (column 3) for the interaction.
#' @param network.inference.method String specifying choice of network inference method. Default is "panda".
#' Options include "pearson". 
#' @param ncores int specifying the number of cores to be used. Default is 1. 
#' (Note: constructing panda networks can be memory-intensive, and the number of cores should take into consideration available memory.)
#' @param ... additional arguments for panda analysis
#' @keywords keywords
#' @importFrom matrixStats rowSds
#' @importFrom matrixStats colSds
#' @importFrom Biobase assayData
#' @importFrom reshape melt.array
#' @importFrom pandaR panda
#' @importFrom parallel mclapply
#' @export
#' @return A list of length N, containing objects of class "panda" 
#' corresponding to each of the N samples in the expression data set.\cr
#' "regNet" is the regulatory network\cr
#' "coregNet" is the coregulatory network\cr
#' "coopNet" is the cooperative network
#' @examples
#' data(pandaToyData)
#' lionessRes <- lioness(expr = pandaToyData$expression[,1:3], motif = pandaToyData$motif, 
#'   ppi = pandaToyData$ppi,hamming=1,progress=FALSE)
#' @references
#' Kuijjer, M.L., Tung, M., Yuan, G., Quackenbush, J. and Glass, K., 2015. 
#' Estimating sample-specific regulatory networks. arXiv preprint arXiv:1505.06440.
#' Kuijjer, M.L., Hsieh, PH., Quackenbush, J. et al. lionessR: single sample network inference in R. BMC Cancer 19, 1003 (2019). https://doi.org/10.1186/s12885-019-6235-7
lioness = function(expr, motif = NULL, ppi = NULL, network.inference.method = "panda", ncores = 1, ...){
  N <- ncol(expr)
  if(ncores < 1){
    print('Setting number of cores to 1.')
    ncores <- 1
  }
  if(network.inference.method == "panda")
  {
    fullnet <- panda(motif, expr, ppi, ...)
    return(mclapply(seq_len(N), function(i) {
      print(paste("Computing network for sample ", i))
      N * fullnet@regNet - (N - 1) * panda(motif, expr[, -i], 
                                           ppi, ...)@regNet
    }, mc.cores = ncores))
  }
  if(network.inference.method == "pearson")
  {
    fullnet <- cor(t(expr))
    
    return(mclapply(seq_len(N), function(i) {
      print(paste("Computing network for sample ", i))
      N * fullnet - (N - 1) * cor(t(expr[,-i]))
    }, mc.cores = ncores))
  }
  if(!(network.inference.method %in% c("panda","pearson")))
  {
    stop(paste(c("The method",network.inference.method,"is not yet supported."),collapse=" "))
    geterrmessage()
  }
}
netZoo/netZooR documentation built on Oct. 16, 2024, 10:23 p.m.