knitr::opts_chunk$set(
  comment = "#>", message = FALSE, warning = FALSE, collapse = TRUE, 
  out.width="60%", out.height="60%", fig.align="center"
)

Setup

RAVs encompass biological signals applicable across different platforms and independent datasets. We demonstrate this transfer learning capacity of RAVs by identifying the neutrophil-associated RAV from systemic lupus erythematosus whole blood (SLE-WB) data and using the same RAV to analyze nasal brushing (NARES) dataset. SLE-WB part of the analysis is described in this vignette and NARES part of the analysis can be found here.

In this vignette, we reproduce Figure 3B and 3C of the MultiPLIER paper and expand the same analysis using GenomicSuperSignature. Analyses here are referencing scripts in this vignette.

Load packages

if (!"GenomicSuperSignaturePaper" %in% installed.packages())
    devtools::install_github("shbrief/GenomicSuperSignaturePaper")

suppressPackageStartupMessages({
  library(GenomicSuperSignature)
  library(GenomicSuperSignaturePaper)
  library(dplyr)
})

RAVmodel

To directly compare our new analysis with the results from MultiPLIER paper, we used the RAVmodel annotated with the same priors as MultiPLER: bloodCellMarkersIRISDMAP, svmMarkers, and canonicalPathways.

## If GenomicSuperSignaturePaper is built locally with RAVmodel in inst/extdata
data.dir <- system.file("extdata", package = "GenomicSuperSignaturePaper")
RAVmodel <- readRDS(file.path(data.dir, "RAVmodel_PLIERpriors.rds"))
RAVmodel <- getModel("PLIERpriors", load=TRUE)
RAVmodel

version(RAVmodel)

SLE-WB data (E-GEOD-65391)

Expression data

Pre-processed expression data is downloaded from here and saved under Results/SLE-WB/data directory.

fname <- "data/SLE_WB_all_microarray_QN_zto_before_with_GeneSymbol.pcl"
exprs <- readr::read_tsv(fname) %>% as.data.frame

rownames(exprs) <- exprs$GeneSymbol
dataset <- as.matrix(exprs[,3:ncol(exprs)])  # 15,825 genes x 1,640 samples
dim(dataset)

dataset[1:2, 1:4]

Metadata

Metadata is downloaded from here and saved under Results/SLE-WB/data directory.

meta <- read.table("data/E-GEOD-65391.sdrf.txt", sep = "\t", header = TRUE)
dim(meta)   # 966 samples have metadata

Neutrophil-related metadata

We searched for the keyword, "neutrophil", and found four columns with the neutrophil-related metadata.

ind <- grep("neutrophil", colnames(meta), ignore.case = TRUE)
colnames(meta)[ind]

Neutrophil counts information ("Characteristics..neutrophil_count.") is subset and cleaned.

neutrophilCount <- meta[,c("Source.Name", "Characteristics..neutrophil_count.")]

## Clean the `Source.Name` column
cleaned_name <- sapply(neutrophilCount[,1], 
                       function(x) stringr::str_split(x, " ")[[1]][1])
neutrophilCount[,1] <- cleaned_name

## 143 NAs were introduced by coercion due to the missing data
neutrophilCount[,2] <- as.numeric(neutrophilCount[,2])
na_ind <- which(is.na(as.numeric(neutrophilCount[,2]))) 

## 853 samples with metadata after clean-up
neutrophilCount <- neutrophilCount[-na_ind,]   
colnames(neutrophilCount)[2] <- "Neutrophil.Count"
dim(neutrophilCount)

head(neutrophilCount, 3)

Subset to the samples with metadata

cleaned_colnames <- sapply(colnames(dataset), 
                           function(x) stringr::str_split(x, "_")[[1]][1])
withMeta_ind <- which(cleaned_colnames %in% neutrophilCount$Source.Name)
dataset <- dataset[, withMeta_ind]   # 15,825 genes x 853 samples
dim(dataset)

MCPCounter

Neutrophil count showed a somewhat weak correlation with the scores assigned by LV (latent variable, an equivalent of RAV in MultiPLIER model) and the MultiPLIER authors suspected that it's likely because the neutrophils are terminally differentiated cells and using gene expression as a measure of it might be under-representative.

To confirm that the weak correlation is not a limitation intrinsic to PLIER models or the MultiPLIER approach, the authors used MCPcounter to estimate cell type abundance in solid tissues and we also took this approach.

if (!"MCPcounter" %in% installed.packages())
    devtools::install_github("ebecht/MCPcounter", 
                         ref = "a79614eee002c88c64725d69140c7653e7c379b4",
                         subdir = "Source",
                         dependencies = TRUE)
## Get cell type estimates with MCPcounter
mcp.results <- MCPcounter::MCPcounter.estimate(expression = dataset,
                                               featuresType = "HUGO_symbols")

## Subset only the neutrophil estimates
neutrophil.df <- reshape2::melt(mcp.results) %>%
  dplyr::filter(Var1 == "Neutrophils") %>% 
  dplyr::select(-Var1)
colnames(neutrophil.df) <- c("Sample", "Neutrophil_estimate")

head(neutrophil.df)

Identify the relavant RAV

In this exploratory data analysis section, we used three different ways to narrow down the RAVs that explain SLE-WB data and it's neutrophil count feature and identified RAV1551 as the best one. Here are the three approaches:

  1. Validation score
  2. Keyword 'neutrophil' in the enriched pathways
  3. Metadata association

Validation

We collected the top 10 validated RAVs with positive average silhouette width. Here, RAV1551 has the highest score with the positive average silhouette width.

val_all <- validate(dataset, RAVmodel) 
validated_ind <- validatedSignatures(val_all, num.out = 10, RAVmodel,
                                     swCutoff = 0, indexOnly = TRUE)

validated_ind
heatmapTable(val_all, RAVmodel, num.out = 10, swCutoff = 0)

Keyword search

We search the keyword, 'neutrophil', in the GSEA annotation of RAVmodel and select the top three enriched pathways (both up- and down- regulated). We found 13 RAVs where the two of the top three enriched pathways contain the keyword and RAV1551 is one of them.

## RAVs containing the keyword, "neutrophil", within top 3 enriched pathways 
findSignature(RAVmodel, "neutrophil", n = 3)

## RAVs with two keyword-containing pathways
sig_ind <- findSignature(RAVmodel, "neutrophil", n = 3, k = 2)
sig_ind

Metadata-associated

We used r-squared value to identify the metadata-associated RAV. (r-squared is the proportion of the variance in the dependent variable that is predictable from the independent variable.) rsq function takes a data frame (data argument) where each column represents different variables. rsq calculates the r-squared values between the two variables (lv and y.var arguments) and returns the numeric vector of them.

calculateRsq <- function (x, y) cor(x, y) ^ 2
rsq <- function(data, lv, y.var = "Neutrophil_estimate") {
  res <- calculateRsq(data[, lv], data[, y.var]) %>% round(., 3)
  return(res)
}

Sample Score

sampleScore <- calculateScore(dataset, RAVmodel)
dim(sampleScore)

sampleScore[1:4, 1:4]
## Plot sample score heatmap using only the RAVs with > 2 elements.
# ind <- which(metadata(RAVmodel)$size > 2)
# sampleScoreHeatmap(score = sampleScore[,ind], 
#                    dataName = "E-GEOD-65391", 
#                    modelName = "RAVmodel",
#                    show_column_names = FALSE, 
#                    row_names_gp = 4)

Neutrophil count

We selected the top ten r-squared values between the neutrophil count and all RAVs.

ss <- as.data.frame(sampleScore)
ss$Source.Name <- rownames(ss)

## Combine neutrophil count and sample scores
ss_count <- dplyr::left_join(neutrophilCount, ss, by = "Source.Name")

## Calculate r-squared value for all RAVs
rsq_count <- sapply(3:ncol(ss_count), 
                    function(x) {rsq(ss_count, x, y.var = "Neutrophil.Count")})
names(rsq_count) <- colnames(ss_count)[3:ncol(ss_count)]
rsq_count <- sort(rsq_count, decreasing = TRUE)

## RAVs with top 10 r-squared value
topRAVs <- head(rsq_count, 10)
topRAVs <- gsub("RAV", "", names(topRAVs)) %>% as.numeric

topRAVs
## r-squared value between the top validated RAVs and the neutrophil count. This
## is one way to narrow down validated RAVs.

## Samples scores from five validated RAVs
sampleScore_sub <- sampleScore[,validated_ind] %>% as.data.frame
sampleScore_sub$Source.Name <- rownames(sampleScore_sub)

## Combine neutrophil count and sample score from five validated RAVs
dat_n.count <- dplyr::left_join(neutrophilCount, 
                                sampleScore_sub, 
                                by = "Source.Name")

## RAV1551 shows the highest r-squared value.
rsq_valSub <- sapply(3:ncol(dat_n.count), 
                     function(x) {rsq(dat_n.count, 
                                      x, y.var = "Neutrophil.Count")})
names(rsq_valSub) <- colnames(dat_n.count)[3:ncol(dat_n.count)]
rsq_valSub <- sort(rsq_valSub, decreasing = TRUE)
rsq_valSub

Best RAV

We identified RAV1551 as the best RAV to explain SLE-WB data and it's neutrophil count feature because RAV1551 is repeatedly found in the three approaches above.

bestRAV <- intersect(validated_ind, sig_ind) %>% intersect(., topRAVs)
bestRAV

Neutrophil estimate

Above, we identified RAV1551 in the three different approaches. Here, RAV1551 shows the highest r-squared value among all RAVs when compared to neutrophil estimate. This confirms that RAV1551 best represents the neutrophil feature of SLE-WB dataset.

sampleScore.df <- sampleScore %>% as.data.frame(.) %>% 
  tibble::rownames_to_column(.)
colnames(sampleScore.df)[1] <- "Sample"

## Join all the scores with neutrophil estimates
dat_n.estimate <- dplyr::inner_join(neutrophil.df, sampleScore.df, by="Sample")
dim(dat_n.estimate)
dat_n.estimate[1:4, 1:4]

## RAVs with the high r-squared values with the neutrophil estimate
rsq_estimate <- sapply(3:ncol(dat_n.estimate), function(x) {rsq(dat_n.estimate,x)})
names(rsq_estimate) <- colnames(dat_n.estimate)[3:ncol(dat_n.estimate)]
rsq_estimate <- sort(rsq_estimate, decreasing = TRUE)

head(rsq_estimate)

Conclusion

We recovered RAV1551 as the neutrophil-associated signature through validation, GSEA, and metadata-association. This result is confirmed again with the highest r-squared value between RAV1551 score and the neutrophil estimate.

Neutrophil Count

count_plot <- LVScatter(ss_count, paste0("RAV", 1551), 
                        y.var = "Neutrophil.Count",
                        ylab = "Neutrophil Count",
                        title = "RAVmodel",
                        subtitle = "SLE WB Compendium")
count_plot
saveRDS(count_plot, "outputs/neutrophil_count.rds")
png("outputs/png/neutrophil_count.png", width = 400, height = 400)
count_plot
dev.off()

Neutrophil Estimate

estimate_plot <- LVScatter(dat_n.estimate, paste0("RAV", 1551), 
                           y.var = "Neutrophil_estimate",
                           ylab = "MCPcounter neutrophil estimate",
                           title = "RAVmodel",
                           subtitle = "SLE WB MCPcounter")
estimate_plot
saveRDS(estimate_plot, "outputs/neutrophil_estimate.rds")
png("outputs/png/neutrophil_estimate.png", width = 400, height = 400)
estimate_plot
dev.off()

Other EDA

Validation

heatmapTable(val_all, RAVmodel, 1551)

MeSH terms

drawWordcloud(RAVmodel, 1551)

GSEA

gseaRes <- gsea(RAVmodel)[[1551]]
gseaRes <- gseaRes[order(gseaRes$NES, decreasing = TRUE),]
keyword_ind <- grep("neutrophil", gseaRes$Description, ignore.case = TRUE)

All the enriched pathways for RAV1551 with the minimum p-value of r min(gseaRes$qvalues)

gseaRes$Description

We ordered the enriched pathways of RAV1551 based on NES and the keyword-containing pathways were placed r paste(paste(keyword_ind, collapse = ","), "out of", nrow(gseaRes)).

gseaRes[keyword_ind, c("Description", "NES", "qvalues")]

Annotate PCs

We checked how the top PCs of SLE-WB data are annotated using annotatePC.

# Top 8 PCs
annotatePC(1:8, val_all = val_all, RAVmodel = RAVmodel, scoreCutoff = 0)

# PC1
annotatePC(1, val_all = val_all, RAVmodel = RAVmodel, simplify = FALSE)

Session Info

sessionInfo()



shbrief/GenomicSuperSignaturePaper documentation built on Aug. 2, 2022, 2:04 p.m.