knitr::opts_chunk$set(
  comment = "#>", message = FALSE, warning = TRUE, collapse = TRUE,
  out.height="75%", out.width="75%"
)

Setup

One of the widely used exploratory data analysis methods is PCA and a PCA plot can provide a quick overview of sample composition and distribution. However, the interpretation of different PCs is not readily available in the conventional PCA. We couple PCs from new data with GSEA annotation of RAVmodel and enable the instant interpretation of PCA results. Here, we show this example using a microarray dataset from isolated immune cells (E-MTAB-2452) and RAVmodel annotated with three priors from the PLIER package (RAVmodel_PLIERpriors).

Load packages

suppressPackageStartupMessages({
  library(dplyr)
  library(GenomicSuperSignature)
  library(EBImage)
})

E-MTAB-2452

E-MTAB-2452 contains sorted peripheral blood cells (CD4+ T cells, CD14+ monocytes, CD16+ neutrophils) profiled on microarray from several autoimmune diseases.

The expression data of E-MTAB-2452 is pre-processed by Greene lab and available here.

annot.dat <- readr::read_tsv("data/E-MTAB-2452_hugene11st_SCANfast_with_GeneSymbol.pcl") %>% as.data.frame
rownames(annot.dat) <- annot.dat[, 2]

dataset <- as.matrix(annot.dat[, 3:ncol(annot.dat)])
rownames(dataset) <- annot.dat$GeneSymbol
dataset[1:3, 1:3]

Each sample is labeled with the known cell type.

cellType <- gsub("_.*$", "", colnames(dataset))
cellType <- gsub("CD4", "CD4,T cell", cellType)
cellType <- gsub("CD14", "CD14,monocyte", cellType)
cellType <- gsub("CD16", "CD16,neutrophil", cellType)
names(cellType) <- colnames(dataset)

RAVmodel

Because E-MTAB-2452 data is comprised of isolated immune subsets and was analyzed in MultiPLIER paper with three priors implemented in PLIER method by default, we used the RAVmodel annotated with the same PLIER priors.

## If GenomicSuperSignaturePaper is built locally with RAVmodel in inst/extdata
data.dir <- system.file("extdata", package = "GenomicSuperSignaturePaper")
RAVmodel <- readRDS(file.path(data.dir, "RAVmodel_PLIERpriors.rds"))
RAVmodel <- getModel("PLIERpriors", load=TRUE)
RAVmodel

version(RAVmodel)

Annotate top PCs

We checked the enriched pathways for top eight PCs of E-MTAB-2452 data. By default, RAVs with the validation score above 0.5 are returned from annotatePC function. Here, only PC1 and PC2 have the associated RAV under the default condition.

val_all <- validate(dataset, RAVmodel)
annotatePC(1:8, val_all, RAVmodel, n = 5, simplify = TRUE)  

We lowered the validation score cutoff to 0 (scoreCutoff = 0) and the output returned with more enriched pathways. Top four PCs of E-MTAB-2452 data are associated with RAV23/ RAV1552/ RAV1387/ RAV2766.

annotatePC(1:8, val_all, RAVmodel, n = 5, simplify = TRUE, scoreCutoff = 0)

Annotated PCA plot

Pairs plot

To compare which PC separates the different cell types the best, we draw the pairs plot with the top four PCs.

## Data cleaning
library(tibble)
res <- stats::prcomp(dataset)$rotation %>% as.data.frame
res <- tibble::rownames_to_column(res, "SampleID")
ct <- data.frame(cellType)
ct <- tibble::rownames_to_column(ct, "SampleID")

res.df <- dplyr::full_join(res, ct, by = "SampleID") %>%
  dplyr::mutate(PC1 = as.numeric(as.character(PC1)),
                PC2 = as.numeric(as.character(PC2)),
                PC3 = as.numeric(as.character(PC3)),
                PC4 = as.numeric(as.character(PC4)),
                Dataset = factor(cellType, levels = c("CD14,monocyte", 
                                                      "CD16,neutrophil", 
                                                      "CD4,T cell")))

## Pairs plot
PC1 <- res.df[,"PC1"]
PC2 <- res.df[,"PC2"]
PC3 <- res.df[,"PC3"]
PC4 <- res.df[,"PC4"]
data <- data.frame(PC1, PC2, PC3, PC4)
group <- as.factor(res.df$cellType)

pairs(data, col = group, oma=c(10,3,3,3), pch = c(19), cex = 1)
par(xpd = TRUE)
legend("bottom", fill = unique(group), legend = c(levels(group)), cex = 0.7)


Tried GGally::ggpairs...

library(GGally)
ggpairs(data, aes(color = group), columns = 1:4, upper = "blank",
        legend = c(1,1))


Top three PCs seem to separate different cell types well. So we draw annotated PCA plot with the different pairs of the top three PCs below.

PC1 vs. PC2

plotAnnotatedPCA(dataset, RAVmodel, PCnum = c(1,2), val_all = val_all, 
                 scoreCutoff = 0.3, color_by = cellType, 
                 color_lab = "Cell Type", trimed_pathway_len = 45)

PC1 vs. PC3

plotAnnotatedPCA(dataset, RAVmodel, PCnum = c(1,3), val_all = val_all, 
                 scoreCutoff = 0.3, color_by = cellType, 
                 color_lab = "Cell Type", trimed_pathway_len = 45)

PC2 vs. PC3

plotAnnotatedPCA(dataset, RAVmodel, PCnum = c(2,3), val_all = val_all, 
                 scoreCutoff = 0.3, color_by = cellType, 
                 color_lab = "Cell Type", trimed_pathway_len = 45)

Explore knowledge graph

Validated RAVs

Among the RAVs associated with the top four PCs of E-MTAB-2452 data, two are validated with high score: RAV23 and RAV1552.

heatmapTable(val_all, RAVmodel)

Associated MeSH terms

drawWordcloud(RAVmodel, 23)
drawWordcloud(RAVmodel, 1552)
drawWordcloud(RAVmodel, 1387)

Associated studies

findStudiesInCluster(RAVmodel, 23, studyTitle = TRUE)
findStudiesInCluster(RAVmodel, 1552, studyTitle = TRUE)
findStudiesInCluster(RAVmodel, 1387, studyTitle = TRUE)

Manuscript Figures

Figure 1

Figure 1B top-left

x <- calculateScore(dataset, RAVmodel)
top_PC_annot <- c(23,1552,1387,684,338,299,21,312)
Fig1B_topleft <- sampleScoreHeatmap(x[,top_PC_annot],                    
                                    dataName = "E-MTAB-2452", 
                                    modelName = "RAVs for top 8 PCs", 
                                    row_names_gp = 5, column_names_gp = 7, 
                                    cluster_columns = FALSE, cluster_row = FALSE)
Fig1B_topleft
## Save the Figure 1B top-left
png("Figures/manuscript_Fig1B_topleft.png", width = 1500, height = 1500, res = 300)
Fig1B_topleft
dev.off()
## SampleScore heatmap
img <- readImage("Figures/manuscript_Fig1B_topleft.png")
display(img, method = "raster")

Figure 1B bottom-left

Output from drawWordcloud(RAVmodel, 1551).

## Save the Figure 1B bottom-left
png("Figures/manuscript_Fig1B_bottomleft.png", width = 1500, height = 1500, res = 300)
drawWordcloud(RAVmodel, 1551)
dev.off()
## wordcloud
img <- readImage("Figures/manuscript_Fig1B_bottomleft.png")
display(img, method = "raster")

Figure 1B bottom-right

a14 <- annotatePC(1:4, val_all, RAVmodel, n = 5, simplify = TRUE, scoreCutoff = 0)
a58 <- annotatePC(5:8, val_all, RAVmodel, n = 5, simplify = TRUE, scoreCutoff = 0)
b <- plotAnnotatedPCA(dataset, RAVmodel, PCnum = c(1,3), val_all = val_all, 
                      scoreCutoff = 0.3, color_by = cellType, 
                      color_lab = "Cell Type", trimed_pathway_len = 45)

library(flextable)
set_flextable_defaults(na_str = "NA")
a14_ft <- flextable(a14) %>% width(., width = 2.5)
a14_ft <- grid::rasterGrob(as_raster(a14_ft))

a58_ft <- flextable(a58) %>% width(., width = 1.5)
a58_ft <- grid::rasterGrob(as_raster(a58_ft))

# ggpubr::ggarrange(a14_ft, a58_ft, labels = c("A", " "), 
#                   hjust = 0.1, vjust = 1.5, align = "hv",
#                   nrow = 2, 
#                   heights = c(0.8, 1),
#                   font.label = list(size = 15)) 
Fig1B_bottomright <- plotAnnotatedPCA(dataset, RAVmodel, PCnum = c(2,3), 
                                      val_all = val_all, scoreCutoff = 0.3, 
                                      color_by = cellType, 
                                      color_lab = "Cell Type", 
                                      trimed_pathway_len = 45)
Fig1B_bottomright
## Save the Figure 1B bottom-right
png("Figures/manuscript_Fig1B_bottomright.png", width = 1500, height = 1500, res = 300)
Fig1B_bottomright
dev.off()
## Annotated PCA plot
img <- readImage("Figures/manuscript_Fig1B_bottomright.png")
display(img, method = "raster")

Supplementary Figure 10

PCA result of leukocyte gene expression data (E-MTAB-2452) is displayed in A) a table or B) a scatter plot. PCA is done on a centered, but not scaled, input dataset by default. Different cutoff parameters for GSEA annotation, such as minimum validation score or NES, can be set.

supFig8 <- ggpubr::ggarrange(a14_ft, b, labels = c("a", "b"),
                             nrow = 2, heights = c(1.7, 5), align = "hv") 

supFig8
## Save the Supplementary Figure 8
png("Figures/manuscript_Sup_Fig8.png", width = 650, height = 950)
supFig8
dev.off()
img <- readImage("Figures/manuscript_Sup_Fig8.png")
display(img, method = "raster")

Session Info

sessionInfo()

# PCA
## Directly on E-MTAB-2452
gene_common <- intersect(rownames(RAVmodel), rownames(dataset))
prcomRes <- stats::prcomp(t(dataset[gene_common,]))
loadings <- prcomRes$rotation[, 1:8]

### GSEA on top 8 PCs
library(PLIER)
library(clusterProfiler)

data(canonicalPathways)
data(bloodCellMarkersIRISDMAP)
data(svmMarkers)
allPaths <- combinePaths(canonicalPathways, bloodCellMarkersIRISDMAP, svmMarkers)

source('~/data2/[archive]GenomicSuperSignature/R/gmtToMatrix.R')
term2gene <- matrixToTERM2GENE(allPaths)

res_all <- list()
for (i in seq_len(ncol(loadings))) {
  ## Run GSEA on ranked gene list
  geneList <- loadings[,i]
  geneList <- sort(geneList, decreasing = TRUE)
  res <- GSEA(geneList, TERM2GENE = term2gene, pvalueCutoff = 0.05)
  ## Subset to the most significant pathways
  res <- res[which(res$qvalues == min(res$qvalues)), c("Description", "NES", "pvalue", "qvalues"), drop = FALSE]
  resName <- paste0("PC", i)
  res_all[[resName]] <- res
  res_all[[i]] <- res
}

summary <- list()
for (i in 1:8) {
  annotatedPC <- res_all[[i]]
  topAnnotation <- annotatedPC[order(abs(annotatedPC$NES), decreasing = TRUE),][1:5,]
  rownames(topAnnotation) <- NULL
  summary[[i]] <- topAnnotation
  names(summary)[i] <- paste0("PC", i)
}

# Simple version of the output - only witt the description
simple_summary <- sapply(summary, function(x) x$Description) %>% as.data.frame
simple_summary  


shbrief/GenomicSuperSignaturePaper documentation built on Aug. 2, 2022, 2:04 p.m.