library(gridExtra) library(pheatmap) library(BiostatsALL) #devtools::install_github("mssm-msf-2019/BiostatsALL") library(affy) library(ggplot2) library(preprocessCore) library(limma) library(plyr) library(sigaR) # library(MergeMaid) library(stringr)
addPDataFeatures <- function(eset){ eset$pathogen_adjuvant <- interaction(eset$pathogen, eset$adjuvant) eset$time <- eset$study_time_collected eset$age_group <- ifelse(eset$age_imputed < 60, 'young', 'old') eset$sex <- eset$gender_imputed return(eset) } subsetToBaseline <- function(eset){ eset.baseline <- eset[, eset$study_time_collected == 0 ] # boxplot(exprs(eset.baseline), las = 2) return(eset.baseline) } getSelectedGenes <- function(eset){ hasCompleteData <- rowMeans(is.na(exprs(eset))) == 0 hasStdDeviation <- apply(exprs(eset), 1, sd) > 0 selectedGenes <- featureNames(eset)[ which(hasCompleteData & hasStdDeviation) ] } getPCAs <- function(eset, maxpc = 3){ if (!is.matrix(eset)) { ex <- exprs(eset) out <- pData(eset) } else { out <- NULL ex <- eset } pca.res <- prcomp(t(ex)) x <- pca.res$x[, 1:maxpc] colnames(x) <- paste("PC", 1:maxpc, sep = ".") varex = round(100 * summary(pca.res)$importance[2, ]) db <- cbind.data.frame(x) if (!is.null(out)) { db <- cbind(db, out) } return(list(db = as.data.frame(db), varex = varex)) } plotPCA <- function(pca.db, title = paste0('PCA'), color.var='study2', var.shape='pathogen', var.size='months2' ){ p1 <- ggplot(mutate(as.data.frame(pca.db$db), months = round((study_time_collected/28), 1), months2 = ifelse(months < 0, -1, months) + 2), aes_string(x = 'PC.1', y = 'PC.2', color = color.var, size = var.size, shape = var.shape)) + geom_jitter() + theme_bw() + scale_color_manual(values=ann.colors[[color.var]]) + labs(x = paste('PC-1 (', pca.db$varex[1], '%)', sep=''), y = paste('PC-2 (', pca.db$varex[2], '%)', sep=''), title = title) return(p1) } my.plot.pvca <- function (pvcaObj, cex.percentage = 1, fname = NULL, ht = 4, wd = 5, title = fname, race.vars = NULL) { title <- mgsub(c("/", ".", "PVCA"), rep("", 3), fixed = TRUE, fname) labels <- gsub(":", " x ", pvcaObj$label) labels <- gsub("_imputed", " ", labels) db <- data.frame(perc = t(pvcaObj$dat), labels = factor(labels, levels = labels)) if (!is.null(race.vars)) { db <- rbind(db, data.frame(perc = sum(subset(db, labels %in% race.vars)$perc), labels='race')) %>% subset(!(labels %in% race.vars)) %>% arrange(1*(labels == 'resid'), perc) } p <- ggplot(db, aes(x = reorder(labels, perc), y = perc)) + geom_bar(stat = "identity", fill = "blue") + theme_bw() + scale_y_continuous(limits = c(0, 1.1)) + geom_text(aes(x = labels, y = perc + 0.05, label = paste0(as.character(round(100 * perc, 1)), "%"))) + labs(x = "Effects", y = "Weighted average proportion variance", title = paste("PVCA ", title)) + scale_x_discrete(labels = function(x) str_wrap(x, width = 8)) + theme(axis.text.x = element_text(angle = 25, hjust = 1)) if (!is.null(fname)) { ggsave(file = paste0(fname, ".pdf"), height = ht, width = wd, plot = p) } return(p) } impute.na <- function(x){ x[is.na(x)] <- mean(x, na.rm=TRUE) return(x) }
dir.path <- here::here("data_cache", "2021_03_08") eset <- readRDS(file.path(dir.path, "2021_03_08_all_noNorm_eset.rds")) eset.n <- readRDS(file.path(dir.path,"2021_03_08_all_norm_eset.rds"))
Nfeatures.pca <- 5000 race.vars <- c('Hispanic','White','Black','Asian') standard.vars <- c('y_chrom_present','cell_type') scale_shape_values <- c(12,20,21:23,10,11,13:19) pvca_threshold <- 0.8
# load("/Users/maytesuarezfarinas/Desktop/DHIPC/SignaturesIOF_Rproj/adj_path_vt_colors.rda") # add for batch sc <- c("magenta","purple","orangered3","firebrick2","gray","green","lightblue4","yellow2", "cyan","firebrick4","royalblue3","darkorange2","brown",'olivedrab','aquamarine', 'red','gold','dodgerblue','darkgray','lightsalmon1','darkgreen','royalblue3','orchid4', 'tan3','sandybrown','moccasin','pink','magenta','black','green','blue','violetred','snow4') ann.colors <- list(y_chrom_present = c('red','blue'), study_accession = sc, study_accession2 = c(sc,'black'), featureSetName2 = sc, vaccine = sc[1:length(unique(eset$vaccine))]) # ann.colors <- c(adj_path_vt_colors, ann.colors) path <- c('Meningitis' = "#80B1D3", "Ebola" = 'gold', 'Hepatitis B' = 'purple', 'HIV' = 'red', 'Malaria' = 'cyan') ann.colors$pathogen <- c(path, ann.colors$pathogen[setdiff(names(ann.colors$pathogen), names(path))])
# No Normalization eset <- addPDataFeatures(eset) eset.baseline <- subsetToBaseline(eset) selectedGenes.noNorm <- getSelectedGenes(eset.baseline) eset.baseline.selected <- eset.baseline[ selectedGenes.noNorm, ] pcas.noNorm.baseline <- getPCAs(eset.baseline.selected) pcas.noNorm.baseline$db$size <- factor(1) # With Normalization eset.n <- addPDataFeatures(eset.n) eset.n.baseline <- subsetToBaseline(eset.n) selectedGenes.withNorm <- getSelectedGenes(eset.n.baseline) eset.n.baseline.selected <- eset.n.baseline[ selectedGenes.withNorm, ] pcas.withNorm.baseline <- getPCAs(eset.n.baseline.selected) pcas.withNorm.baseline$db$size <- factor(1)
p <- plotPCA(pcas.noNorm.baseline, title = 'PCA Baseline', color.var = 'study_accession', var.shape = 'featureSetName2', var.size = 'size') plotByStudy.noNorm <- p + scale_shape_manual(values = scale_shape_values) plotByStudy.noNorm p <- plotPCA(pcas.noNorm.baseline, title='PCA Baseline', color.var='featureSetName2', var.shape='cell_type', var.size='size') plotByPlatform.noNorm <- p + scale_shape_manual(values = scale_shape_values) plotByPlatform.noNorm
plotByPlatform.withNorm <- plotPCA(pcas.withNorm.baseline, title='PCA Baseline Norm+Batch', color.var='featureSetName2', var.shape='age_group', var.size='size') plotByPlatform.noNorm
eset.n.baseline.old <- eset.n[selectedGenes.withNorm, eset.n$age_group == "old"] pcas.withNorm.old <- getPCAs(eset.n.baseline.old) pcas.withNorm.old$db$size <- factor(1) p1 <- plotPCA(pcas.withNorm.old, title = paste('PCANorm+Batch old'), color.var = 'featureSetName2', var.shape = 'cell_type', var.size = 'size') p1 pcas.withNorm.old$db$time.disc <- cut(pcas.withNorm.old$db$study_time_collected, c(-8,-7,0,7,28,Inf)) p1 <- plotPCA(pcas.withNorm.old, title = 'PCANorm+Batch Old', color.var = 'pathogen', var.shape = 'adjuvant', var.size = 'time.disc') p1 <- p1 + scale_shape_manual(values = scale_shape_values) p1 p1 <- plotPCA(pcas.withNorm.old, title = 'PCANorm+Batch Old', color.var = 'pathogen', var.shape = 'adjuvant', var.size = 'size') p1 <- p1 + scale_shape_manual(values = scale_shape_values) p1
source("./functions pvca.R") eset.baseline.withNormFeatures <- eset.baseline[selectedGenes.withNorm,] eigen.baseline <- getEigen(eset.baseline[selectedGenes.noNorm,]) # should this be withNorm? batch.vars <- c(standard.vars, race.vars, 'age_group', 'study_accession') pvca.baseline <- pvcaBatchAssess.MSF2(eset.baseline.withNormFeatures, eigenData = eigen.baseline, batch.factors = batch.vars, include.inter = c( 'age_group:cell_type', 'y_chrom_present:cell_type'), threshold = pvca_threshold) p <- my.plot.pvca(pvca.baseline, fname = paste0('PVCA_4paper_baseline_all'), ht = 3, wd = 6.2, race.vars = race.vars) + theme(axis.text.x = element_text(angle = 25, hjust = 1)) ggsave("pvca_baseline_all.pdf", plot = p)
eset.n.selected <- eset.n[selectedGenes.withNorm,] eigen.all <- getEigen(eset.n.selected) batch.vars <- c(standard.vars, race.vars, 'participant_id', 'pathogen_adjuvant', 'time', 'age_group', 'study_accession') a <- pvcaBatchAssess.MSF2(eset.n.selected, eigenData = eigen.all, batch.factors = batch.vars, include.inter = 'none', threshold = pvca_threshold) p <- my.plot.pvca(a, fname = paste0('PVCA_AfterNorm4paper-all_data'), ht = 3, wd = 6.2, race.vars = race.vars)+ theme(axis.text.x = element_text(angle = 25, hjust = 1)) ggsave("pvca_afternorm.pdf", plot = p) pv <- arrange(data.frame(Factor=a$label, perc=a$dat[1,]), desc(perc)) print(subset(pv, perc > 0.01))
pvca.byAge <- function(age_group){ eset.n.selected <- eset.n[ selectedGenes.withNorm, eset.n$age_group == age_group] eigen <- getEigen(eset.n.selected) batch.vars <- c('participant_id', standard.vars, race.vars, 'pathogen_adjuvant', 'time') a <- pvcaBatchAssess.MSF2(eset.n.selected, eigenData = eigen, batch.factors = batch.vars, include.inter = c('pathogen_adjuvant:time', 'y_chrom_present:pathogen_adjuvant:time'), threshold = pvca_threshold) my.plot.pvca(a, fname = paste0('PVCA_AfterNorm4paper', age_group), ht = 3, wd = 5.5, race.vars = race.vars) } pvca.byAge("young") pvca.byAge("old")
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.