R/select_db.R

Defines functions select_db

Documented in select_db

#'  Data preprocessing
#'
#' @param Assay The TCGA dataset downloaded by download_db.
#' @param cmat The RNA-seq count data of clinical patients.
#' @param MS_status Filter TCGA samples with the MSI status (MSI or MSS).
#' @param driver_gene  A character with the driver gene symbol.
#' @param MUT_status Filter TCGA samples with the MUT status of driver gene (MUT or NULL).
#' @param CNA_status Filter TCGA samples with the CNA status of driver gene (AMP, GAIN, HETLOSS, or HOMDEL).
#' @param geneList A data.frame of 2 column with gene and status. Only used in multiple driver genes.
#' @param minSampleSize Minimal size of the output samples (10 by default).
#' @param removeBatchEffect Remove batch effects between the TCGA dataset and the clinical patients (TRUE by default).
#' @param OrgDb The human annotation db for ID convert.
#'
#' @return A list with preprocessed profiles of the clinical patients (cmat), and preprocessed profiles of the TCGA dataset (mat).
#' @export
#' @importFrom SummarizedExperiment assay
#' @importFrom clusterProfiler bitr
#' @importFrom limma removeBatchEffect
#' @import dplyr
#' @import octad.db

select_db = function(Assay, cmat, MS_status = NULL, driver_gene = NULL,
                     MUT_status = NULL, CNA_status = NULL, geneList = NULL, minSampleSize = 10,
                     removeBatchEffect = TRUE, OrgDb){

  sample_s = NULL
  sampleID = octad.db::phenoDF$sample.id
  clinical <- as.data.frame(Assay@colData@listData)
  mat <- SummarizedExperiment::assay(Assay,"mrna_seq_v2_rsem")
  mut <- SummarizedExperiment::assay(Assay,"mutations")
  cna <- SummarizedExperiment::assay(Assay,"cna")

  row.names(mut) <- apply(mut, 1, function(x){x[which(!is.na(x))[1]]})
  cna <- ifelse(cna == 2, 'AMP',
                ifelse(cna == 1, 'GAIN',
                       ifelse(cna == -1, 'HETLOSS',
                              ifelse(cna == -2, 'HOMDEL', NA))))

  if (sum(is.na(suppressWarnings(as.numeric(row.names(cmat))))) == 0 | sum(grepl('^ENSG',row.names(cmat))) > 0) {
    stop('Row names of cmat should be SYMBOL.')
  }
  if (!is.null(MS_status)){
    if (MS_status == 'MSI') {
      sample_s <- clinical$PATIENT_ID[clinical$MSI_SCORE_MANTIS > 0.6]
    }
    if (MS_status == 'MSS') {
      sample_s <- clinical$PATIENT_ID[clinical$MSI_SCORE_MANTIS < 0.4]
    }
    if (length(sample_s) < minSampleSize) {
      if (length(sample_s) == 0) {
        stop(paste0('No patients with ',MS_status))
      }
      warning(paste0('Insufficient patients with ',MS_status))
    }
  }

  if (!is.null(driver_gene)){
    if (!is.null(MUT_status)){
      sample_s <- colnames(mut)[which(!is.na(mut[driver_gene,]))]
      if (length(sample_s) < minSampleSize) {
        if (is.na(sample_s)) {
          stop(paste0('No patients with ',driver_gene,' ',MUT_status))
        }
        warning(paste0('Insufficient patients with ',driver_gene,' ',MUT_status))
      }
    }
    if (!is.null(CNA_status)){
      sample_s <- ifelse(CNA_status == 'AMP', colnames(cna)[which(cna[driver_gene,] == 'AMP')],
                         ifelse(CNA_status == 'GAIN', colnames(cna)[which(cna[driver_gene,] == 'GAIN')],
                                ifelse(CNA_status == 'HETLOSS', colnames(cna)[which(cna[driver_gene,] == 'HETLOSS')],
                                       ifelse(CNA_status == 'HOMDEL', colnames(cna)[which(cna[driver_gene,] == 'HOMDEL')]))))
      if (length(sample_s) < minSampleSize) {
        if (is.na(sample_s)) {
          stop(paste0('No patients with ',driver_gene,' ',CNA_status))
        }
        warning(paste0('Insufficient patients with ',driver_gene,' ',CNA_status))
      }
    }
  }

  if (!is.null(geneList)) {
    if (!colnames(geneList) %in% c('gene','status')) {
      stop('GeneList must be a dataframe with `gene` and `status` columns. ')
    }

    mut_list <- geneList[,geneList$status == 'MUT']
    cna_list <- list(
      geneList[,geneList$status == 'AMP'],
      geneList[,geneList$status == 'GAIN'],
      geneList[,geneList$status == 'HETLOSS'],
      geneList[,geneList$status == 'HOMDEL']
    )

    select_sample = function(ls){
      return(ifelse(dim(ls)[2] == 1,
                    colnames(mut)[which(!is.na(mut[ls$gene,]))],
                    colnames(mut)[which(apply(mut[ls$gene,], 2, function(x){sum(is.na(x))}) == 0)]))
    }

    sample_s <- Reduce(intersect,
                       list(select_sample(mut_list),
                            unlist(lapply(cna_list, function(x){select_sample(x)}))))

    if (length(sample_s) < minSampleSize) {
      warning(paste0('Insufficient patients with genetic alterations in `geneList`.'))
    }
  }


  gene <- suppressMessages(suppressWarnings(clusterProfiler::bitr(row.names(mat), fromType='ENTREZID', toType='SYMBOL',OrgDb)))
  gene <- gene[!duplicated(gene$ENTREZID),]
  mat <- mat[row.names(mat) %in% gene$ENTREZID,]
  mat <- mat[gene$ENTREZID,]
  row.names(mat) <- gene$SYMBOL

  if (is.null(sample_s)) {sample_s <- colnames(mat)} else {sample_s <- intersect(paste0(sample_s,'-01'),colnames(mat))}
  sample_s <- intersect(sampleID,sample_s)
  mat <- mat[,sample_s]

  commongene <- intersect(row.names(mat)[complete.cases(mat)],row.names(cmat)[complete.cases(cmat)])
  cmat <- cmat[commongene,]
  mat <- mat[commongene,]

  pmat <- list(
    mat = mat,
    cmat = cmat)

  if (isTRUE(removeBatchEffect)) {
    message('... Removing batch effect ...')
    mat <- log2(mat + 1)
    cmat <- log2(cmat + 1)
    X <- t(cbind(mat, cmat))
    X <- t(limma::removeBatchEffect(t(X), batch = c(rep("TCGA", dim(mat)[2]),
                                                    rep("clinical", dim(cmat)[2]))))
    pmat <- list(
      mat = t(X[1:dim(mat)[2],]),
      cmat = t(X[(dim(mat)[2]+1):dim(X)[1],])
    )
  }
  return(pmat)
}
AllenSpike/CPDR documentation built on April 18, 2022, 4:38 p.m.