R/gess_cor.R

Defines functions cor_sig_search gess_cor

Documented in gess_cor

#' @rdname gess
#' @description 
#' Correlation-based similarity metrics, such as Spearman or Pearson 
#' coefficients, can be used as Gene Expression Signature Search (GESS) methods.
#' As non-set-based methods, they require quantitative gene expression values 
#' for both the query and the database entries, such as normalized intensities 
#' or read counts from microarrays or RNA-Seq experiments, respectively.
#' @details 
#' For correlation searches to work, it is important that both the query and
#' reference database contain the same type of gene identifiers. The expected 
#' data structure of the query is a matrix with a single numeric column and the 
#' gene labels (e.g. Entrez Gene IDs) in the row name slot. For convenience, the
#' correlation-based searches can either be performed with the full set of genes
#' represented in the database or a subset of them. The latter can be useful to
#' focus the computation for the correlation values on certain genes of interest
#' such as a DEG set or the genes in a pathway of interest. For comparing the
#' performance of different GESS methods, it can also be advantageous to subset
#' the genes used for a correlation-based search to same set used in a set-based
#' search, such as the up/down DEGs used in a LINCS GESS. This way the search
#' results of correlation- and set-based methods can be more comparable because
#' both are provided with equivalent information content.
#' @examples 
#' 
#' ######## Correlation-based GESS method #########
#' # qsig_sp <- qSig(query=query_mat, gess_method="Cor", refdb=db_path)
#' # sp <- gess_cor(qSig=qsig_sp, method="spearman")
#' # result(sp)
#' @export
gess_cor <- function(qSig, method="spearman", 
                     chunk_size=5000, ref_trts=NULL, workers=1, 
                     cmp_annot_tb=NULL, by="pert", cmp_name_col="pert",
                     addAnnotations = TRUE){
  if(!is(qSig, "qSig")) stop("The 'qSig' should be an object of 'qSig' class")
  if(gm(qSig) != "Cor"){
      stop("The 'gess_method' slot of 'qSig' should be 'Cor' 
           if using 'gess_cor' function")
  }
  query <- qr(qSig)
  db_path <- determine_refdb(refdb(qSig))
  
  ## calculate cs_raw of query to blocks (e.g., 5000 columns) of full refdb
  full_mat <- HDF5Array(db_path, "assay")
  rownames(full_mat) <- as.character(HDF5Array(db_path, "rownames"))
  colnames(full_mat) <- as.character(HDF5Array(db_path, "colnames"))
  
  if(! is.null(ref_trts)){
      trts_valid <- trts_check(ref_trts, colnames(full_mat))
      full_mat <- full_mat[, trts_valid]
  }
  
  full_dim <- dim(full_mat)
  full_grid <- colAutoGrid(full_mat, ncol=min(chunk_size, ncol(full_mat)))
  ### The blocks in 'full_grid' are made of full columns 
  nblock <- length(full_grid) 
  resultDF <- bplapply(seq_len(nblock), function(b){
    ref_block <- read_block(full_mat, full_grid[[as.integer(b)]])
    cor_res <- cor_sig_search(query=query, refdb=ref_block, method=method)
    return(data.frame(cor_res))}, BPPARAM = MulticoreParam(workers = workers))
  resultDF <- do.call(rbind, resultDF)

  resultDF <- resultDF[order(abs(resultDF$cor_score), decreasing = TRUE), ]
  row.names(resultDF) <- NULL
  
  if(addAnnotations == TRUE){
  res <- sep_pcf(resultDF)
  # add compound annotations
  res <- addGESSannot(res, refdb(qSig), cmp_annot_tb = cmp_annot_tb[,!colnames(cmp_annot_tb) %in% "t_gn_sym"], by, cmp_name_col)
  } else {
    res <- tibble(resultDF)
    colnames(res)[1] <- "pert"
  }
  x <- gessResult(result = res,
                  query = qr(qSig),
                  gess_method = gm(qSig),
                  refdb = refdb(qSig))
  return(x)
}

cor_sig_search <- function(query, refdb, method){
  res_list <- NULL
  # make sure rownames of query and refdb are the same
  common_gene <- intersect(rownames(query), rownames(refdb))
  query2 <- as.matrix(query[common_gene,])
  colnames(query2) <- colnames(query)
  refdb <- refdb[common_gene,]
  for(i in seq_len(ncol(query2))){
    cor <- as.numeric(cor(query2[,i], refdb, method = method))
    names(cor) <- colnames(refdb)
    trend=cor
    trend[cor>=0]="up"
    trend[cor<0]="down"
    res <- data.frame(set=names(cor), trend=trend, cor_score = cor, 
                      stringsAsFactors = FALSE)
    res_list <- c(res_list, list(res))
  }
  names(res_list) <- colnames(query)
  if(length(res_list)==1) return(res_list[[1]])
  return(res_list)
}
girke-lab/signatureSearch documentation built on Feb. 21, 2024, 8:32 a.m.