A noise constrained Recursive Feature Extraction algorithm for robust deconvolution of cell-types mixture from molecular signatures
Since the significant impact of immunotherapy in cancer, the estimation of the immune cell-type proportions present in a tumor becomes crucial. Currently, the deconvolution of the cell mixture content of a tumor is carried out by different analytic tools, yet the accuracy of inferred cell type proportions has room for improvement. We improve tumor immune environment characterization developing MIXTURE, an analytical method based on a noise constrained recursive variable selection for a support vector regression
install.packages("devtools") library(devtools) install_github("elmerfer/MIXTURE")
In this example we will use the LM22 signature matrix as a 22 subject expression matrix cohort
library(MIXTURE) ##Load signature matrix data(LM22) ## Run the self test on LM22 signature mix.test <- MIXTURE(expressionMatrix = LM22, #N x ncol(signatureMatrix) gene expression ##matrix to evaluate ##rownames(M) should be the GeneSymbols signatureMatrix = LM22, #the gene signature matrix (W) such that ##M = W*betas' #(i.e the LM22 from Newman et al) functionMixture = nu.svm.robust.RFE, #cibersort, nu.svm.robust.rfe, ls.rfe.abbas, useCores = 10L, #cores for parallel processing verbose = TRUE, #TRUE or FALSE messages nullDist = "PopulationBased" ) # Showing the predicted proportions head(GetMixture(mix.test)[,1:3]) # Showing the predicted absolute coefficients head(GetMixture(mix.test, type = "absolute")[,1:3]) # Showing the slots names of the MIXTURE object names(mix.test)
A self test analysis is intended to evaluate if the signature profiles can be accuratly predicted by using MIXTURE. This means that each signature profiles (Columns) will be assumed as a pure cell mixture profile, thus, the algorithm should only provide one coefficients as being 1 for each signature profile
library(MIXTURE) # Load the TIL10 signature from Finotello et al. data(TIL10) # Signature Format head(TIL10) SelfTest(TIL10)
# Run the MIXTURE on simulated samples built from the given signature res <- SimulationTest(signatureMatrix = TIL10, maxCoefs = 6, #maximum number of cell types to include in the mixture (from 2 to maxCoefs) maxSamples = 100, noisy = TRUE, #Add a noisy profile to the simulated pure mix useCores=3L) # Getting simulated data # Simulated Samples dim(res$SimulatedData$M) head(res$SimulatedData$M[, 1:4]) #Simulated betas (coefficients) head(res$SimulatedData$B) #Retrieving MIXTURE results dim(GetMixture(res$MIXTURE)) # Displaying the cell type proportions p <- ProportionPlot(res$MIXTURE) p # Since ProportionPlot returns a ggplot object, we can change the characteristics of the plot beyond those relative to the bars and colors. p + theme(axis.text.x = element_text(angle = 45, hjust = 1, vjust=0.5))
library(data.table) library(openxlsx) library(org.Hs.eg.db) library(limma) #download and unzip expression from url <- "https://www.cancerrxgene.org/gdsc1000/GDSC1000_WebResources/ /Data/preprocessed/Cell_line_RMA_proc_basalExp.txt.zip" fname <- basename(url) # this will save the downloaded file in your working directory download.file(url = url, destfile = fname, method = "auto") unzip(fname) #load expression matrix a.data<- fread("Cell_line_RMA_proc_basalExp.txt") #update annotation b.annot<- a.data[,1:2] colnames(b.annot)<- c("symbol", "name") columns(org.Hs.eg.db) b.entrezids <- mapIds(org.Hs.eg.db, keys=b.annot$symbol, column="ENTREZID", keytype="SYMBOL", multiVals="first") b.entrezids[sapply(b.entrezids, is.null)]<- NA b.annot$entrezid<- unlist(b.entrezids) #fix colnames colnames(a.data)<- gsub("DATA.", "", colnames(a.data)) #make elist b.elist<- new("EList", list(E=a.data[,-c(1,2)], genes= b.annot)) dim(b.elist) #remove missing entrezid b.elist<- b.elist[which(!is.na(b.elist$genes$entrezid)),] #combine repeated entrezid expression b.elist<- avereps(x = b.elist, ID = b.elist$genes$entrezid) # this will save the cell lines gene expression matrix in your working directory saveRDS(b.elist, file = "celllines.rds")
Once the file has been downloaded and stored in your hard disk, the you can proceed to analyze
# we will run over the first 10 cell lines library(limma) library(MIXTURE) cells <- readRDS("celllines.rds") # gene expression cell line data is in log2 scale # so we anti log the data M <- 2^cells$E cn <- colnames(cells$E) cn[1:10] rownames(M) <- cells$genes$symbol # we will only process the first ten cell lines cells.mix <- MIXTURE(expressionMatrix = M[, 1:10], signatureMatrix = LM22, useCores = 3L) head(GetMixture(cells.mix))
#Download FPKM data with TCGAbioLinks library(TCGAbiolinks) library(SummarizedExperiment) query_all_brca_fpkm <- GDCquery(project = "TCGA-BRCA", data.category = "Transcriptome Profiling", data.type = "Gene Expression Quantification", experimental.strategy = "RNA-Seq", workflow.type = "HTSeq - FPKM") GDCdownload(query_all_brca_fpkm) data_all_brca_fpkm <- GDCprepare(query_all_brca_fpkm, save = TRUE, save.filename = "BRCA.All.FPKM.rda") ##Chage the directory according to wahre you save your downloaded data
path <- "/home/elmer/Dropbox/IDEAS/cibersort/MIXTURE/" library(TCGAbiolinks) library(SummarizedExperiment)
```r
load(file.path(path,"BRCA.All.FPKM.rda"))
# Prepare your gene expression data matrix gene_annot_all <- rowData(data) exp.all <- assay(data) target.data.all <- as.data.frame(colData(data)) tumor.type <- do.call(rbind,str_split(as.character(target.data.all$barcode),"-")) # Tissue type distribution table(tumor.type[,4]) rownames(exp.all)[1:10] # only ensembl gene ids exp.all <- exp.all[gene_annot_all$ensembl_gene_id,] #MIXTURE use rownames to identify genes and macth with the molecular signature rownames(exp.all) <- gene_annot_all$external_gene_name rownames(exp.all)[1:10] #an E list from limma package a.data <- new("EList",list(E = exp.all, genes = gene_annot_all, targets = target.data.all)) a.data$targets$TissueType <- do.call(rbind,str_split(as.character(target.data.all$barcode),"-"))[,4] # processing with MIXTURE brca.mix <- MIXTURE(expressionMatrix = a.data$E, signatureMatrix = LM22, useCores = 6L)
round(apply(GetMixture(brca.mix),2,function(x) sum(x>0))/nrow(GetMixture(brca.mix)),2) brca.prop <- GetMixture(brca.mix) df <- data.frame(Proportions = c(brca.prop), CellTypes = rep(colnames(brca.prop), each = nrow(brca.prop)))
ggplot(df,aes(x=CellTypes, y=Proportions, fill = CellTypes)) + geom_boxplot(outlier.size = 0.7) + theme(axis.text.x = element_text(angle=45, vjust = 1,hjust = 1)) ```
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.