selection.r

## License GPL2.0
## Developed by Bo Li, bli@jimmy.harvard.edu, 2016

## Select genes whose expression levels are negatively correlated with tumor purity,
## Select them because they are informative to deconvolve immune calls in the tumor tissue

options(warn=-1)
library(sqldf)
library(sva)
library(crayon)

baseDir <- (function() {
  args <- commandArgs(trailingOnly = FALSE)
  file.arg.name <- "--file="
  script.name <- sub(file.arg.name, "", args[grep(file.arg.name, args)])
  script.basename <- dirname(script.name)
  return(script.basename)
})()


source(paste(baseDir, '/utils.r', sep=''))
TimerINFO('Aggregating immune expression data')


cancers <- c('kich', 'blca', 'brca', 'cesc', 'gbm', 'hnsc', 'kirp', 'lgg',
             'lihc', 'luad', 'lusc', 'prad', 'sarc', 'pcpg', 'paad', 'tgct',
             'ucec', 'ov', 'skcm', 'dlbc', 'kirc', 'acc', 'meso', 'thca',
             'uvm', 'ucs', 'thym', 'esca', 'stad', 'read', 'coad', 'chol')

GetCancerSampleID <- function(sID, num.res=3){
  ## Edit TCGA ID, with the option of keeping the first num.res fields

  ## Args:
  ##    sID: vector of original TCGA sample IDs
  ##    num.res: how many fields to keep

  ## Return:
  ##    Vector of sample IDs with the extracted fields


  mm <- c()
  for (id in sID) {
    tmp <- unlist(strsplit(id, '-'))
    if (length(tmp) == 1) {
      tmp <- unlist(strsplit(id, '\\.'))
    }
    ll <- 'TCGA'
    for (j in 2:num.res) {
      ll <- paste(ll, tmp[j], sep='-')
    }
    mm <- c(mm, ll)
  }
  return(mm)
}

ConvertRownameToLoci <- function(cancerGeneExpression) {
  ## Extract only the loci information for row name

  ## Example of origin row name is 'LOC389332|389332'
  ## Coverted row name is 'LOC389332'

  ## Args:
  ##   geneExpression: the orginal geneExpression load from .Rdata file
  ##
  ## Returns:
  ##   Modified geneExpression


  tmp <- strsplit(rownames(cancerGeneExpression), '\\|')
  tmp <- sapply(tmp, function(x) x[[1]])
  tmp.vv <- which(nchar(tmp) > 1)
  rownames(cancerGeneExpression) <- tmp
  return(cancerGeneExpression[tmp.vv,])
}

LoadCancerGeneExpression <- function(cancer) {
  ## Load and process gene expression data

  ## Args:
  ##   cancer: String of cancer type
  ##
  ## Returns:
  ##   The data frame of gene expression for the input cancer category
  ##   (col for sample, row for gene)

  if (cancer %in% c('gbm', 'ov')) {
    geneExpression <- get(load(paste(baseDir,'/data/RNAseq/',cancer,'Affy.Rdata',
                                     sep='')))
  } else {
    geneExpression <- get(load(paste(baseDir,'/data/RNAseq/',cancer,'RNAseq.Rdata',
                                     sep='')))
  }

  geneExpression <- as.matrix(geneExpression)
  mode(geneExpression) <- 'numeric'
  if(!cancer %in% c('gbm', 'ov', 'esca', 'stad')) {
    ## rsem scaled estimates needs multiply 1e6, Array or RPKM does not need.
    geneExpression <- geneExpression * 1e6
  }

  geneExpression <- ConvertRownameToLoci(geneExpression)
  colnames(geneExpression) <- GetCancerSampleID(colnames(geneExpression), 4)

  return(geneExpression)
}




LoadImmuneMarkerProbe <- function () {
  ## Load immune marker probe from Abbas et al., 2005

  ## Return:
  ##   Vector of probeID for immule marker genes
  ##   (name: cancer category, value: probe ID)

  tmp <- read.csv(paste(baseDir, 'data/immune_datasets/IRIS-marker-gene.txt', sep='/'),
                  header=T, sep='\t', stringsAsFactors=F)
  marker.list <- tmp[, 1]
  names(marker.list) <- tmp[, 7]
  names(marker.list) <- gsub(' ','_',tmp[,7])
  names(marker.list) <- gsub('Dendritic_Cell', 'DC', names(marker.list))
  names(marker.list) <- gsub('Neutrophil', 'Neutrophils', names(marker.list))
  gsub('Cell', 'cell', names(marker.list)) -> names(marker.list)
  return(marker.list)
}

LoadImmuneMarkerGene <- function() {
  ## Load immune marker genes from Abbas et al., 2005
  ## Convert the result of LoadImmuneMarkerProbe() to gene names with the help of immune gene expression data

  ## Args:
  ##    ref: The immune gene expression data which is generated by LoadImmuneGeneExpression()
  ## Returns:
  ##    Vector of immune marker gene names


  ref <- get(load(paste(baseDir,'/data/immune_datasets/HPCTimmune.Rdata',sep='')))
  tmp <- strsplit(rownames(ref), ';')
  AffyIDtoGenes <- sapply(tmp, function(x) x[[1]])
  names(AffyIDtoGenes) <- sapply(tmp, function(x) x[[2]])

  marker.list <- LoadImmuneMarkerProbe()
  marker.list.genes <- AffyIDtoGenes[marker.list]
  return(marker.list.genes)
}

LoadCancerTumorPurity <- function(cancer) {
  ## Load and process tumor purity data

  ## Args:
  ##   cancer: String of cancer type
  ##
  ## Return:
  ##    A dataframe of tumor purity data

  AGP <- read.table(paste(baseDir, '/data/AGP/AGP-', cancer, '.txt', sep=''),
                    sep='\t', header=T)
  AGP <- AGP[which(AGP[, 'PoP'] > 0.01), ]
  rownames(AGP) <- GetCancerSampleID(AGP[, 1], 4)
  return(AGP)
}




##----- function to select genes with expression values negatively correlated with tumor purity -----##
GetPurityGenes <- function(dd, AGP, thr.p=0.05, thr.c=0, mode='env'){
  tmp.ss <- intersect(colnames(dd), rownames(AGP))
  tmp.dd <- dd[, tmp.ss]
  ## 's' means Spearman Correlation
  tmp <- lapply(rownames(tmp.dd),
                function(x) {
                  cor.test(tmp.dd[x, ],
                           as.numeric(AGP[colnames(tmp.dd), 2]),
                           method='s')})

  tmp.pp <- sapply(tmp, function(x)x$p.value)
  tmp.cor <- sapply(tmp, function(x)x$estimate)
  names(tmp.pp) <- names(tmp.cor) <- rownames(dd)

  if (mode == 'env') {
    geneNames <- names(which(tmp.pp <=thr.p&tmp.cor < thr.c))
  } else if (mode == 'tumor') {
    geneNames <- names(which(tmp.pp <=thr.p&tmp.cor > thr.c))
  } else {
    stop("invalid mode for purity-based gene selection")
  }
  return(geneNames)
}

##----- remove outlier genes whose expression may drive the colinearity of similar covariates in the regression -----##
RemoveOutliers <- function(vv, ref.dd, thr.q=0.99){
  ## removes upper thr.q quantile for every reference feature
  remove.vv=c()
  for(i in 1:ncol(ref.dd)){
    tmp=quantile(ref.dd[vv,i],thr.q)[1]
    tmp.vv=which(ref.dd[vv,i]>tmp)
    remove.vv=c(remove.vv,tmp.vv)
  }
  remove.vv=unique(remove.vv)
  return(vv[-remove.vv])
}




##----- setup parameters and establish the output file -----##

main <- function() {
  for(cc in cancers) {
    if (cc == 'skcm') {
      cc.type <- '06A'
    } else {
      cc.type <- '01A'
    }

    ccDir = paste(baseDir, 'results', cc, sep='/')
    dir.create(ccDir, showWarnings=FALSE, recursive=TRUE)

    options(scipen=6)


    write(paste(cc, ' output\n'),
          file=paste(baseDir, '/results/', cc, '/output-statistics.txt', sep=''))


    cancer.geneExpression <- LoadCancerGeneExpression(cancer=cc)
    cancer.tumorPurity <- LoadCancerTumorPurity(cancer=cc)

    immune <- LoadImmuneGeneExpression()
    immune.geneExpression <- immune$genes
    immune.cellTypes <- immune$celltypes

    immune.markerGene <- LoadImmuneMarkerGene()

    TimerINFO(paste("Removing the batch effect of", cc))
    tmp <- RemoveBatchEffect(cancer.geneExpression, immune.geneExpression, immune.cellTypes)
    cancer.expNorm <- tmp[[1]]
    ## immune.expNorm <- tmp[[2]]
    immune.expNormMedian <- tmp[[3]]

    gene.purity.neg <- GetPurityGenes(cancer.geneExpression, cancer.tumorPurity,
                                      thr.p = 0.05, thr.c = -0.2)
    gene.purity.neg <- intersect(gene.purity.neg, rownames(immune.expNormMedian))

    gene.selected.marker <- intersect(gene.purity.neg, immune.markerGene)


    ##---- calculate differences between the correlations of reference immune cells using Pearson's or Spearman's correlations -----##
    tmp.diff <-
      sum(sum(
        abs(cor(immune.expNormMedian[gene.selected.marker, ], method='p') -
              cor(immune.expNormMedian[gene.selected.marker, ], method='s'))))

    if (tmp.diff >= -10000) {
      gene.selected.marker <- RemoveOutliers(gene.selected.marker,
                                             immune.expNormMedian[, -4])
    }

    cat("Number of genes inversely correlated with purity is ",
        length(gene.purity.neg), '\n\n',
        sep='', file='output-statistics.txt', append=T)

    cat("Number of immune genes inversely correlated with purity is ",
        length(gene.selected.marker), '\n\n',
        sep='', file='output-statistics.txt', append=T)

    ##----- calculate the significance of enrichment for purity selected genes to immune marker genes -----##
    gene.all=intersect(rownames(immune.expNormMedian), rownames(cancer.expNorm))
    n.immune=length(intersect(immune.markerGene, gene.all))

    cat("Test if immune genes are enriched for inverse correlation with purity: \n\n", file='output-statistics.txt', append=T)

    geneCalculated <- paste(baseDir, '/data/precalculated/genes_', cc, '.RData', sep='')
    save(gene.selected.marker, file=geneCalculated)

    sink(file=paste(baseDir, '/results/', cc, '/output-statistics.txt', sep=''), append=T)
    print(
      fisher.test(
        matrix(
          c(
            length(gene.selected.marker),
            length(gene.purity.neg)-length(gene.selected.marker),
            n.immune,
            length(gene.all)-n.immune),
          2, 2)))
    sink()
  }
}

main()
hanfeisun/TIMER documentation built on May 17, 2019, 2:28 p.m.