knitr::opts_chunk$set( comment = "#>", message = FALSE, warning = TRUE, collapse = TRUE, out.height="75%", out.width="75%" )
One of the widely used exploratory data analysis methods is PCA and a PCA plot can provide a quick overview of sample composition and distribution. However, the interpretation of different PCs is not readily available in the conventional PCA. We couple PCs from new data with GSEA annotation of RAVmodel and enable the instant interpretation of PCA results. Here, we show this example using a microarray dataset from isolated immune cells (E-MTAB-2452) and RAVmodel annotated with three priors from the PLIER package (RAVmodel_PLIERpriors).
suppressPackageStartupMessages({ library(dplyr) library(GenomicSuperSignature) library(EBImage) })
E-MTAB-2452 contains sorted peripheral blood cells (CD4+ T cells, CD14+ monocytes, CD16+ neutrophils) profiled on microarray from several autoimmune diseases.
The expression data of E-MTAB-2452 is pre-processed by Greene lab and available here.
annot.dat <- readr::read_tsv("data/E-MTAB-2452_hugene11st_SCANfast_with_GeneSymbol.pcl") %>% as.data.frame rownames(annot.dat) <- annot.dat[, 2] dataset <- as.matrix(annot.dat[, 3:ncol(annot.dat)]) rownames(dataset) <- annot.dat$GeneSymbol dataset[1:3, 1:3]
Each sample is labeled with the known cell type.
cellType <- gsub("_.*$", "", colnames(dataset)) cellType <- gsub("CD4", "CD4,T cell", cellType) cellType <- gsub("CD14", "CD14,monocyte", cellType) cellType <- gsub("CD16", "CD16,neutrophil", cellType) names(cellType) <- colnames(dataset)
Because E-MTAB-2452 data is comprised of isolated immune subsets and was analyzed in MultiPLIER paper with three priors implemented in PLIER method by default, we used the RAVmodel annotated with the same PLIER priors.
## If GenomicSuperSignaturePaper is built locally with RAVmodel in inst/extdata data.dir <- system.file("extdata", package = "GenomicSuperSignaturePaper") RAVmodel <- readRDS(file.path(data.dir, "RAVmodel_PLIERpriors.rds"))
RAVmodel <- getModel("PLIERpriors", load=TRUE)
RAVmodel
version(RAVmodel)
We checked the enriched pathways for top eight PCs of E-MTAB-2452 data. By
default, RAVs with the validation score above 0.5 are returned from
annotatePC
function. Here, only PC1 and PC2 have the associated RAV under
the default condition.
val_all <- validate(dataset, RAVmodel) annotatePC(1:8, val_all, RAVmodel, n = 5, simplify = TRUE)
We lowered the validation score cutoff to 0 (scoreCutoff = 0
) and the output
returned with more enriched pathways. Top four PCs of E-MTAB-2452 data are
associated with RAV23/ RAV1552/ RAV1387/ RAV2766.
annotatePC(1:8, val_all, RAVmodel, n = 5, simplify = TRUE, scoreCutoff = 0)
To compare which PC separates the different cell types the best, we draw the pairs plot with the top four PCs.
## Data cleaning library(tibble) res <- stats::prcomp(dataset)$rotation %>% as.data.frame res <- tibble::rownames_to_column(res, "SampleID") ct <- data.frame(cellType) ct <- tibble::rownames_to_column(ct, "SampleID") res.df <- dplyr::full_join(res, ct, by = "SampleID") %>% dplyr::mutate(PC1 = as.numeric(as.character(PC1)), PC2 = as.numeric(as.character(PC2)), PC3 = as.numeric(as.character(PC3)), PC4 = as.numeric(as.character(PC4)), Dataset = factor(cellType, levels = c("CD14,monocyte", "CD16,neutrophil", "CD4,T cell"))) ## Pairs plot PC1 <- res.df[,"PC1"] PC2 <- res.df[,"PC2"] PC3 <- res.df[,"PC3"] PC4 <- res.df[,"PC4"] data <- data.frame(PC1, PC2, PC3, PC4) group <- as.factor(res.df$cellType) pairs(data, col = group, oma=c(10,3,3,3), pch = c(19), cex = 1) par(xpd = TRUE) legend("bottom", fill = unique(group), legend = c(levels(group)), cex = 0.7)
Tried GGally::ggpairs
...
library(GGally) ggpairs(data, aes(color = group), columns = 1:4, upper = "blank", legend = c(1,1))
Top three PCs seem to separate different cell types well. So we draw annotated PCA plot with the different pairs of the top three PCs below.
plotAnnotatedPCA(dataset, RAVmodel, PCnum = c(1,2), val_all = val_all, scoreCutoff = 0.3, color_by = cellType, color_lab = "Cell Type", trimed_pathway_len = 45)
plotAnnotatedPCA(dataset, RAVmodel, PCnum = c(1,3), val_all = val_all, scoreCutoff = 0.3, color_by = cellType, color_lab = "Cell Type", trimed_pathway_len = 45)
plotAnnotatedPCA(dataset, RAVmodel, PCnum = c(2,3), val_all = val_all, scoreCutoff = 0.3, color_by = cellType, color_lab = "Cell Type", trimed_pathway_len = 45)
Among the RAVs associated with the top four PCs of E-MTAB-2452 data, two are validated with high score: RAV23 and RAV1552.
heatmapTable(val_all, RAVmodel)
drawWordcloud(RAVmodel, 23)
drawWordcloud(RAVmodel, 1552)
drawWordcloud(RAVmodel, 1387)
findStudiesInCluster(RAVmodel, 23, studyTitle = TRUE)
findStudiesInCluster(RAVmodel, 1552, studyTitle = TRUE)
findStudiesInCluster(RAVmodel, 1387, studyTitle = TRUE)
x <- calculateScore(dataset, RAVmodel) top_PC_annot <- c(23,1552,1387,684,338,299,21,312) Fig1B_topleft <- sampleScoreHeatmap(x[,top_PC_annot], dataName = "E-MTAB-2452", modelName = "RAVs for top 8 PCs", row_names_gp = 5, column_names_gp = 7, cluster_columns = FALSE, cluster_row = FALSE) Fig1B_topleft
## Save the Figure 1B top-left png("Figures/manuscript_Fig1B_topleft.png", width = 1500, height = 1500, res = 300) Fig1B_topleft dev.off()
## SampleScore heatmap img <- readImage("Figures/manuscript_Fig1B_topleft.png") display(img, method = "raster")
Output from drawWordcloud(RAVmodel, 1551)
.
## Save the Figure 1B bottom-left png("Figures/manuscript_Fig1B_bottomleft.png", width = 1500, height = 1500, res = 300) drawWordcloud(RAVmodel, 1551) dev.off()
## wordcloud img <- readImage("Figures/manuscript_Fig1B_bottomleft.png") display(img, method = "raster")
a14 <- annotatePC(1:4, val_all, RAVmodel, n = 5, simplify = TRUE, scoreCutoff = 0) a58 <- annotatePC(5:8, val_all, RAVmodel, n = 5, simplify = TRUE, scoreCutoff = 0) b <- plotAnnotatedPCA(dataset, RAVmodel, PCnum = c(1,3), val_all = val_all, scoreCutoff = 0.3, color_by = cellType, color_lab = "Cell Type", trimed_pathway_len = 45) library(flextable) set_flextable_defaults(na_str = "NA") a14_ft <- flextable(a14) %>% width(., width = 2.5) a14_ft <- grid::rasterGrob(as_raster(a14_ft)) a58_ft <- flextable(a58) %>% width(., width = 1.5) a58_ft <- grid::rasterGrob(as_raster(a58_ft)) # ggpubr::ggarrange(a14_ft, a58_ft, labels = c("A", " "), # hjust = 0.1, vjust = 1.5, align = "hv", # nrow = 2, # heights = c(0.8, 1), # font.label = list(size = 15))
Fig1B_bottomright <- plotAnnotatedPCA(dataset, RAVmodel, PCnum = c(2,3), val_all = val_all, scoreCutoff = 0.3, color_by = cellType, color_lab = "Cell Type", trimed_pathway_len = 45) Fig1B_bottomright
## Save the Figure 1B bottom-right png("Figures/manuscript_Fig1B_bottomright.png", width = 1500, height = 1500, res = 300) Fig1B_bottomright dev.off()
## Annotated PCA plot img <- readImage("Figures/manuscript_Fig1B_bottomright.png") display(img, method = "raster")
PCA result of leukocyte gene expression data (E-MTAB-2452) is displayed in A) a table or B) a scatter plot. PCA is done on a centered, but not scaled, input dataset by default. Different cutoff parameters for GSEA annotation, such as minimum validation score or NES, can be set.
supFig8 <- ggpubr::ggarrange(a14_ft, b, labels = c("a", "b"), nrow = 2, heights = c(1.7, 5), align = "hv") supFig8
## Save the Supplementary Figure 8 png("Figures/manuscript_Sup_Fig8.png", width = 650, height = 950) supFig8 dev.off()
img <- readImage("Figures/manuscript_Sup_Fig8.png") display(img, method = "raster")
sessionInfo()
# PCA ## Directly on E-MTAB-2452 gene_common <- intersect(rownames(RAVmodel), rownames(dataset)) prcomRes <- stats::prcomp(t(dataset[gene_common,])) loadings <- prcomRes$rotation[, 1:8] ### GSEA on top 8 PCs library(PLIER) library(clusterProfiler) data(canonicalPathways) data(bloodCellMarkersIRISDMAP) data(svmMarkers) allPaths <- combinePaths(canonicalPathways, bloodCellMarkersIRISDMAP, svmMarkers) source('~/data2/[archive]GenomicSuperSignature/R/gmtToMatrix.R') term2gene <- matrixToTERM2GENE(allPaths) res_all <- list() for (i in seq_len(ncol(loadings))) { ## Run GSEA on ranked gene list geneList <- loadings[,i] geneList <- sort(geneList, decreasing = TRUE) res <- GSEA(geneList, TERM2GENE = term2gene, pvalueCutoff = 0.05) ## Subset to the most significant pathways res <- res[which(res$qvalues == min(res$qvalues)), c("Description", "NES", "pvalue", "qvalues"), drop = FALSE] resName <- paste0("PC", i) res_all[[resName]] <- res res_all[[i]] <- res } summary <- list() for (i in 1:8) { annotatedPC <- res_all[[i]] topAnnotation <- annotatedPC[order(abs(annotatedPC$NES), decreasing = TRUE),][1:5,] rownames(topAnnotation) <- NULL summary[[i]] <- topAnnotation names(summary)[i] <- paste0("PC", i) } # Simple version of the output - only witt the description simple_summary <- sapply(summary, function(x) x$Description) %>% as.data.frame simple_summary
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.