knitr::opts_chunk$set(echo = TRUE, collapse = TRUE, message = FALSE, eval=FALSE, out.height="50%", out.width="50%")
library(dplyr) library(ggplot2)
## If GenomicSuperSignaturePaper is built locally with RAVmodel in inst/extdata data.dir <- system.file("extdata", package = "GenomicSuperSignaturePaper") RAVmodel <- readRDS(file.path(data.dir, "RAVmodel_C2.rds"))
RAVmodel <- getModel("C2", load=TRUE)
RAVmodel
updateNote(RAVmodel)
pcaSummary <- trainingData(RAVmodel)$PCAsummary # PCA summary
pcaSummary_cumulative <- as.data.frame(matrix(nrow=20, ncol=length(pcaSummary))) rownames(pcaSummary_cumulative) <- paste0("PC", 1:20) colnames(pcaSummary_cumulative) <- names(pcaSummary) for (i in seq_along(pcaSummary)) { x <- pcaSummary[[i]]["Cumulative",] pcaSummary_cumulative[,i] <- x }
Accumulated variance explained at n-th PC
n <- 20 hist(as.numeric(pcaSummary_cumulative[n,]), xlim = c(0.2, 1), ylim = c(0, 80), xlab = "Cumulative Variance Explained", main = paste0("Cumulative Variance Explained at PC", n))
for (n in 5:20) { hist(as.numeric(pcaSummary_cumulative[n,]), xlim = c(0.2, 1), ylim = c(0, 200), xlab = "Cumulative Variance Explained", main = paste0("Cumulative Variance Explained at PC", n)) }
accVar_90
is the number of PCs for each study that achieves above 90%
cumulative variance explained. Out of 536 studies, 519 studies couldn't
explain 90% of the variance with top 20 PCs.
accVar_90 <- apply(pcaSummary_cumulative, 2, function(x) {sum(x <= 0.90)}) table(accVar_90)
There are two studies whose PC1 explains more than 90% of the variance (accVar_90 == 0).
sum(accVar_90 == 0)
accVar_95 <- apply(pcaSummary_cumulative, 2, function(x) {sum(x <= 0.95)}) table(accVar_95)
cutoff <- 0.7 var <- apply(pcaSummary_cumulative, 2, function(x) {sum(x <= cutoff)}) table(var) # hist(var)
pcaSummary_variance <- as.data.frame(matrix(nrow=20, ncol=length(pcaSummary))) rownames(pcaSummary_variance) <- paste0("PC", 1:20) colnames(pcaSummary_variance) <- names(pcaSummary) for (i in seq_along(pcaSummary)) { x <- pcaSummary[[i]]["Variance",] pcaSummary_variance[,i] <- x }
Plot variance explained by PCs from 536 studies used in RAVmodel building.
plot(pcaSummary_variance[,1], xlab = "PC number", ylab = "Variance explained by PC", ylim = c(0, 0.5)) for (i in 2:ncol(pcaSummary_variance)) { lines(pcaSummary_variance[,i]) } abline(h = 0.05, col = "red") abline(v = 8, col = "red")
## Variance explained by PC20 as.numeric(pcaSummary_variance[20,]) %>% summary ## Cumulative variance explained up to PC20 as.numeric(pcaSummary_cumulative[20,]) %>% summary
The number of PCs that explains more than 5% of variance within each study. For example, none of the PCs from 11 studies explain more than 5% of variance and two studies contain 7 PCs that explain more than 5% of variance.
var <- apply(pcaSummary_variance, 2, function(x) {sum(x >= 0.05)}) table(var)
Modify PLIER::num.pc
to match with our data processing steps: PLIER::num.pc
function does z-score normalization but our training datasets are z-score
normalized as a combined datasets (standard deviation and mean are calculated
from all samples combined). So we removed the internal z-score from the num.pc
function and used already-normalized data. This function is named as gss.num.pc
.
gss.num.pc <- function(data) { n <- ncol(data) m <- nrow(data) if (n < 500) {k = n} else {k = max(200, n/4)} if (k == n) {uu <- svd(data)} else {uu <- rsvd(data, k, q = 3)} x = smooth(xraw <- abs(diff(diff(uu$d))), twiceit = TRUE) nsv = which(x <= quantile(x, 0.5))[1] + 1 return(nsv) }
## Input parameters for RAVmodel_536 cg <- readRDS("~/GSS/model_building/data/topGenes_13934.rds") trainingDatasets <- "refinebioRseq" ## Working directories in.dir <- "~/Data/refinebio_processed/rna_seq_v2" ## Load the training dataset information dir <- system.file("extdata", package = "GenomicSuperSignature") studyMeta <- read.table(file.path(dir, "studyMeta.tsv.gz")) ind <- which(studyMeta$RAVmodel_536 == TRUE) allStudies <- studyMeta$studyName[ind] ## Standard deviation and mean wd_data <- file.path("~/GSS/GenomicSuperSignatureLibrary", trainingDatasets) wd <- file.path(wd_data, "RAVmodel_536") fname <- paste0(trainingDatasets, "_536study") sdmean <- readRDS(file.path(wd, paste0(fname, "_SdMean.rds"))) s <- sdmean$sd m <- sdmean$mean non_exp <- which(s == 0) %>% names
For our 536 training datasets, we calculate the significant number of PCs
through the elbow method and save the result in pcNumAll
object.
pcNum <- vector(mode = "list", length = length(allStudies)) names(pcNum) <- allStudies for (study in allStudies) { # Load the count matrix dir.path <- file.path(in.dir, study) x <- data.table::fread(file.path(dir.path, paste0(study, "_count.csv")), showProgress = FALSE) x <- data.frame(x[,-1], row.names = x$V1) %>% as.matrix x <- x[cg,,drop=FALSE] # Remove non-expressing genes in all samples (m == 0) x <- x[!rownames(x) %in% non_exp,] # Normalization x <- sweep(x, 1, m) x <- sweep(x, 1, s, "/") # Estimate the number of 'significant' PCs res <- gss.num.pc(x) pcNum[[study]] <- res } pcNumAll <- stack(pcNum) colnames(pcNumAll) <- c("pcNum", "Studies") write.table(pcNumAll, "pcNumAll.tsv", row.names = FALSE)
pcNumAll <- read.table("~/GSS/GenomicSuperSignaturePaper/Results/model/pcNumAll.tsv", header = TRUE) barplot(table(pcNumAll$pcNum), xlab = "the number of PCs", xlim = c(0, 45), main = "# of significant PCs for 536 studies") summary(pcNumAll$pcNum)
pcNum.max <- which.max(pcNumAll$pcNum) pcNum.min <- which.min(pcNumAll$pcNum) pcNumAll[c(pcNum.max, pcNum.min),]
Scree plot of the studies with maximum and minimum recommended number of PCs
studies <- pcNumAll$Studies[c(pcNum.max, pcNum.min)] n <- 50 # number of PCs to display in the screeplot (= x-axis range)
for (study in studies) { # Load the count matrix dir.path <- file.path(in.dir, study) x <- data.table::fread(file.path(dir.path, paste0(study, "_count.csv")), showProgress = FALSE) x <- data.frame(x[,-1], row.names = x$V1) %>% as.matrix x <- x[cg,,drop=FALSE] # Remove non-expressing genes in all samples (m == 0) x <- x[!rownames(x) %in% non_exp,] # Normalization x <- sweep(x, 1, m) x <- sweep(x, 1, s, "/") # PCA pca_res <- prcomp(t(x)) # screeplot(pca_res, npcs = 10, main = study, type = "lines") # Plotting #calculate total variance explained by each principal component var_explained = pca_res$sdev^2 / sum(pca_res$sdev^2) # create scree plot idealPcNum <- pcNumAll$pcNum[which(pcNumAll$Studies == study)] title <- paste0(study, " (recommended # of PCs is ",idealPcNum, ")") pl <- qplot(1:n, var_explained[1:n]) + geom_line() + xlab("Principal Component") + ylab("Variance Explained") + ggtitle(title) + ylim(0, 1) print(pl) }
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.