Create PCA plots

knitr::opts_chunk$set(echo =TRUE, 
                      message = FALSE,
                      warning = FALSE)

suppressPackageStartupMessages({
  library(Biobase)
  # library(gridExtra)
  # library(pheatmap)
  # library(BiostatsALL)
  # library(affy)
  library(ggplot2)
  # library(preprocessCore)
  library(limma)
  library(plyr)
  # library(sigaR)
  # library(MergeMaid)
})

outputDir <- params$outputDir
if (!dir.exists(outputDir)) dir.create(outputDir)
dataDir <- params$dataCacheDir
prefix <- params$timestamp



getPCAs<-function (eset, maxpc = 3) 
{
  if (!is.matrix(eset)) {
    ex <- exprs(eset)
    out <- pData(eset)
  }
  else {
    out <- NULL
    ex <- eset
  }
  pca.res <- prcomp(t(ex))
  x <- pca.res$x[, 1:maxpc]
  colnames(x) <- paste("PC", 1:maxpc, sep = ".")
  varex = round(100 * summary(pca.res)$importance[2, ])
  db <- cbind.data.frame(x)
  if (!is.null(out)) 
    db <- cbind(db, out)
  return(list(db = as.data.frame(db), varex = varex))
}
plotPCA<-function(pca.db,title=paste0('PCA'), 
                  color.var='study2',var.shape='pathogen',var.size='months2' ){

  p1<-ggplot(mutate(as.data.frame(pca.db$db),
                    months=round((study_time_collected/28),1),
                    months2=ifelse(months<0, -1, months)+2),
             aes_string(x='PC.1',y='PC.2', color=color.var,  size=var.size, shape=var.shape)) +
    geom_jitter() + theme_bw() +
    scale_color_manual(values=ann.colors[[color.var]]) +
    labs(x=paste('PC-1 (',pca.db$varex[1],'%)',sep=''), 
         y=paste('PC-2 (',pca.db$varex[2],'%)',sep=''), title=title) 
  return(p1)
}

my.plot.pvca <- function(pvcaObj, 
                         cex.percentage = 1, 
                         fname = NULL, 
                         ht = 4, 
                         wd = 5, 
                         title = fname, 
                         race.vars=NULL,
                         outputDir = NULL) {
  require(stringr)
  require(ggplot2)
  title <- gsub("/", "", fixed = TRUE, fname)
  title <- gsub(".", "", fixed = TRUE, fname)
  title <- gsub("PVCA", "", fixed = TRUE, fname)
  labels <- gsub(":", " x ", pvcaObj$label)
  labels <- gsub("_imputed", " ", labels)
  db <- data.frame(perc = t(pvcaObj$dat), labels = factor(labels, 
                                                          levels = labels))

  if (!is.null(race.vars)) {
    db<-rbind(db, 
              data.frame(perc=sum(subset(db, labels%in%race.vars)$perc), labels='race')) %>%
      subset(!(labels%in%race.vars)) %>% arrange(1*(labels=='resid'), perc)
  }

  p <- ggplot(db, aes(x = reorder(labels, perc), y = perc)) + 
    geom_bar(stat = "identity", fill = "blue") + theme_bw() + 
    scale_y_continuous(limits = c(0,  1.1)) + 
    geom_text(aes(x = labels, y = perc + 0.05, label = paste0(as.character(round(100 *  perc, 1)), "%"))) +
    labs(x = "Effects", y = "Weighted average proportion variance", title = paste("PVCA ", title)) + 
    scale_x_discrete(labels = function(x) str_wrap(x, width = 8))
  if (!is.null(fname)) {
    ggsave(file = file.path(outputDir, paste0(fname, ".pdf")), height = ht, width = wd, 
           plot = p)
  }
  return(p)
}


impute.na<-function(x){x[is.na(x)]<-mean(x,na.rm=T); return(x)}

Nfeatures.pca<-5000

race.vars <- c('Hispanic','White','Black','Asian')
eset_noNorm <- readRDS(file.path(dataDir, paste0(prefix, "all_noNorm_noResponse_eset.rds")))
# eset_crossStudyNorm <- readRDS(file.path(dataDir, paste0(prefix, "all_norm_noBatchCorrect_eset.rds")))
# eset_batchCorrected <- readRDS(file.path(dataDir, paste0(prefix, "all_norm_noResponse_eset.rds")))
# eset_noCelltype <- readRDS(file.path(dataDir, paste0(prefix, "all_norm_nocelltype_noResponse_eset.rds")))

# eset_old <- readRDS(file.path(dataDir, paste0(prefix, "old_norm_noResponse_eset.rds")))
# eset_all <- readRDS(file.path(dataDir, paste0(prefix, "all_norm_noResponse_eset.rds")))
# eset_young <- readRDS(file.path(dataDir, paste0(prefix, "young_norm_noResponse_eset.rds")))


# Nfeatures.pca<-5000

# eset_noNorm$pathogen_adjuvant <- interaction(eset_noNorm$pathogen, eset_noNorm$adjuvant)
# eset_noNorm$time <- eset_noNorm$study_time_collected
# eset_noNorm$age_group <- ifelse(eset_noNorm$age_imputed < 50,'young','old')
# eset_noNorm$sex <- eset_noNorm$y_chrom_present

# eset_crossStudyNorm$pathogen_adjuvant <- interaction(eset_crossStudyNorm$pathogen, eset_crossStudyNorm$adjuvant)
# eset_crossStudyNorm$time <- eset_crossStudyNorm$study_time_collected
# eset_crossStudyNorm$sex <- eset_crossStudyNorm$y_chrom_present
# eset_crossStudyNorm$age_group <- ifelse(eset_crossStudyNorm$age_imputed < 50,'young','old')

# eset_batchCorrected$pathogen_adjuvant <- interaction(eset_batchCorrected$pathogen, eset_batchCorrected$adjuvant)
# eset_batchCorrected$time <- eset_batchCorrected$study_time_collected
# eset_batchCorrected$sex <- eset_batchCorrected$y_chrom_present
# eset_batchCorrected$age_group <- ifelse(eset_batchCorrected$age_imputed < 50,'young','old')
# 
# eset_noCelltype$pathogen_adjuvant <- interaction(eset_noCelltype$pathogen, eset_noCelltype$adjuvant)
# eset_noCelltype$time <- eset_noCelltype$study_time_collected
# eset_noCelltype$sex <- eset_noCelltype$y_chrom_present
# eset_noCelltype$age_group <- ifelse(eset_noCelltype$age_imputed < 50,'young','old')
# 
# 
# race.vars <- c('Hispanic','White','Black','Asian')
load(here::here("vignettes", "original_code", "adj_path_vt_colors.rda"))
# add for batch
sc<-c("magenta","purple","orangered3","firebrick2","gray","green","lightblue4","yellow2", "cyan","firebrick4","royalblue3","darkorange2",
      "brown",'olivedrab','aquamarine','red','gold','dodgerblue','darkgray','lightsalmon1','darkgreen','royalblue3','orchid4',
      'tan3','sandybrown','moccasin','pink','magenta','black','green','blue','violetred','snow4')
ann.colors<-list(y_chrom_present=c('red','blue'),
                 study_accession=sc,
                 study_accession2=c(sc,'black'),
                 featureSetName2=sc,
                 vaccine=sc[1:length(unique(eset_noNorm$vaccine))])
ann.colors<-c(adj_path_vt_colors,ann.colors)
path<-c(Meningitis="#80B1D3","Ebola"='gold',
        'Hepatitis B'='purple', 'HIV'='red','Malaria'='cyan')
ann.colors$pathogen<-c(path, ann.colors$pathogen[setdiff(names(ann.colors$pathogen), names(path))])
Nfeatures.pca<-5000
eset_noNorm_baseline <- eset_noNorm[, eset_noNorm$study_time_collected == 0]

make_the_plots <- function(esetType) {

  eset <- get(paste0("eset_", esetType))
  pcaFileName <- file.path(outputDir, paste0("pca_", esetType, "_baseline.rds"))

  if (!file.exists(pcaFileName)) {
    eset_baseline <- eset[, eset$study_time_collected == 0]
    features.pca.0 <- featureNames(eset_baseline)[
      which( (rowMeans(is.na(exprs(eset_baseline))) == 0) &
               (apply(exprs(eset_baseline),1,sd) > 0) )
    ]
    pca_baseline <- getPCAs(eset_baseline[features.pca.0,])
    pca_baseline$db$size=factor(1)
    saveRDS(pca_baseline, pcaFileName)
  } else {
    pca_baseline <- readRDS(pcaFileName)
  }


  # PCA for baseline samples, colored by study and shaped by sequencing technology
  pcaPlot_baseline_v1 <- 
    plotPCA(pca_baseline, 
            title = paste0("PCA baseline ", esetType),
            color.var = 'study_accession',
            var.shape = 'featureSetName2',
            var.size = 'size') + 
    scale_shape_manual(values=c(12,20,21:23,10,11,13:19))

  ggsave(file = file.path(outputDir, paste0("pca_", esetType, "_baseline_v1.pdf")),
         plot = pcaPlot_baseline_v1, 
         height = 6,
         width = 9)

  # PCA for baseline samples, colored by technology and shape by sample type
  pcaPlot_baseline_v2 <- 
    plotPCA(pca_baseline,
            title = paste0("PCA baseline ", esetType),
            color.var = 'featureSetName2',
            var.shape = 'cell_type',
            var.size = 'size') + 
    scale_shape_manual(values=c(12,20,21:23,10,11,13:19))

  ggsave(file = file.path(outputDir, paste0("pca_", esetType, "_baseline_v2.pdf")),
         plot = pcaPlot_baseline_v2,
         height = 6,
         width = 9)

  pcaPlot_baseline_v3 <- 
    plotPCA(pca_baseline,
            title = paste0("PCA Baseline by batch ", esetType),
            color.var = 'featureSetName2',
            var.shape = 'age_group',
            var.size = 'size')
  ggsave(file.path(outputDir, paste0("pca_", esetType, "_baseline_v3.pdf")))


  print(pcaPlot_baseline_v1)
  print(pcaPlot_baseline_v2)
  print(pcaPlot_baseline_v3)
}
library(data.table)
eset_young <- readRDS("~/Projects/ImmuneSignatures2/data_cache/2021_02_11/2021_02_11_young_norm_noResponse_eset.rds")
eset_young$age_group <-  ifelse(eset_young$age_imputed < 50,'young','old')
make_the_plots("young")

eset_old <- readRDS("~/Projects/ImmuneSignatures2/data_cache/2021_02_11/2021_02_11_old_norm_noResponse_eset.rds")
eset_old$age_group <- "old"
make_the_plots("old")

pd_old <- pData(eset_old)
setDT(pd_old)
pd_young[, .N, .(featureSetName, featureSetName2)]
pd_young <- pData(eset_young)
setDT(pd_young)
pd_young[featureSetName2 == "HGU133_plus_PM", .N, .(study_accession)]

### Before batchGate ###

eset_young_pre <- readRDS("~/Projects/ImmuneSignatures2/data_cache/old/2020_10_27_young_norm_noResponse_eset.rds")
eset_old_pre <- readRDS("~/Projects/ImmuneSignatures2/data_cache/old/2020_10_27_old_norm_noResponse_eset.rds")
eset_young_pre$age_group <- "young"
eset_old_pre$age_group <- "old"
make_the_plots("young_pre")
make_the_plots("old_pre")

All cohorts, before cross-study normalization

make_the_plots("noNorm")

All cohorts, after cross-study normalization

# make_the_plots("crossStudyNorm")

All cohorts, batch corrected without cell type

make_the_plots("noCelltype")

All cohorts, after batch correction

make_the_plots("batchCorrected")

Young only, after batch correction

subjects age 18-50

eset_young_batchCorrected <- readRDS(file.path(dataDir, paste0(prefix, "young_norm_noResponse_eset.rds")))

eset_young_batchCorrected$pathogen_adjuvant <- interaction(eset_young_batchCorrected$pathogen, eset_young_batchCorrected$adjuvant)
eset_young_batchCorrected$time <- eset_young_batchCorrected$study_time_collected
eset_young_batchCorrected$sex <- eset_young_batchCorrected$y_chrom_present
eset_young_batchCorrected$age_group <- ifelse(eset_young_batchCorrected$age_imputed < 50,'young','old')
race.vars <- c('Hispanic','White','Black','Asian')

make_the_plots("young_batchCorrected")

extendedOld, after batch correction

Subjects age 50-91

eset_old2_batchCorrected <- readRDS(file.path(dataDir, paste0(prefix, "old_norm_noResponse_eset.rds")))

eset_old2_batchCorrected$pathogen_adjuvant <- interaction(eset_old2_batchCorrected$pathogen, eset_old2_batchCorrected$adjuvant)
eset_old2_batchCorrected$time <- eset_old2_batchCorrected$study_time_collected
eset_old2_batchCorrected$sex <- eset_old2_batchCorrected$y_chrom_present
eset_old2_batchCorrected$age_group <- ifelse(eset_old2_batchCorrected$age_imputed < 50,'young','old')

make_the_plots("old2_batchCorrected")

all, subsetted young vs old

eset_all_old <- eset_batchCorrected[,eset_batchCorrected$age_impute >= 60 ]
make_the_plots("all_old")
eset_all_young <- eset_batchCorrected[,eset_batchCorrected$age_imputed < 60 ]
make_the_plots("all_young")


RGLab/ImmuneSignatures2 documentation built on Dec. 9, 2022, 10:51 a.m.