knitr::opts_chunk$set( echo = TRUE, message = FALSE, warning = FALSE, cache = params$cache, cache.path = params$report_cache_dir )
Setup: Load libraries and helper functions
suppressPackageStartupMessages({ library(package = "Biobase") library(package = "tidyverse") library(package = "qusage") # Bioc install library(package = "boot") # Bioc install library(package = "ggthemes") library(package = "pheatmap") library(package = "matrixStats") library(package = "RColorBrewer") library(package = "reshape2") library(package = "lme4") library(package = "metap") library(package = "circlize") library(package = "corrplot") library(package = "ComplexHeatmap") library(package = "ggpubr") })
data_dir <- params$data_dir extra_data_dir <- params$extra_data_dir if (!dir.exists(extra_data_dir)) dir.create(extra_data_dir) date_prefix <- params$date_prefix
# For every gene set, find out in how many vaccines it is significantly up at # any one postvax time point. # The same but for down at any postvax time point calc_sharing_df <- function(x) { x %>% dplyr::mutate(direction = ifelse( test = activity_score > 0, yes = "up", no = "down"), is_significant = !is.na(FDR) & FDR < 0.05) %>% dplyr::select(pathogen, vaccine_type, geneset, direction, is_significant) %>% dplyr::distinct() %>% dplyr::group_by(geneset, direction) %>% dplyr::summarise(n = sum(is_significant)) %>% dplyr::group_by(geneset) %>% dplyr::arrange(-n) %>% dplyr::slice(1) %>% dplyr::ungroup() } # Qusage (version 2.18.0) runs very slow on certain datasets, because it uses # the convolve() function from stats which itself uses the fft() function. The # fft() function runs very slow on certain inputs (e.g. 3 minutes instead of # under 1 second with other fft implementations for the same input). # # Found a report of speed issues with the convolve() function at: # https://stat.ethz.ch/pipermail/r-help/2007-December/148777.html # # Found a solution for the slow convolve() (actually the fft() it uses) at: # https://stat.ethz.ch/pipermail/r-help/2007-December/148779.html # # The solution below requires the fftw library, which on ubuntu required the # installation of the fftw-dev package (sudo apt install fftw3-dev) before # install.packages("fftw") was successful. # # Here we override the convolve function to use a different (faster) fft # function. convolve <- function (x, y, conj = TRUE, type = c("circular", "open", "filter")) { type <- match.arg(type) n <- length(x) ny <- length(y) Real <- is.numeric(x) && is.numeric(y) if (type == "circular") { if (ny != n) stop("length mismatch in convolution") } else { n1 <- ny - 1 x <- c(rep.int(0, n1), x) n <- length(y <- c(y, rep.int(0, n - 1))) } # Original fft lines: #x <- fft(fft(x) * (if (conj) # Conj(fft(y)) # else fft(y)), inverse = TRUE) # Use fftw instead of fft x <- fftw::FFT(fftw::FFT(x) * (if (conj) Conj(fftw::FFT(y)) else fftw::FFT(y)), inverse = TRUE) if (type == "filter") (if (Real) Re(x) else x)[-c(1L:n1, (n - n1 + 1L):n)]/n else (if (Real) Re(x) else x)/n } create_signif_label <- function(x) { cut(x, c(-Inf, .001, .01, .05, Inf), labels = c("***", "**", "*", "ns")) } #' Keep only specified symbols in a given list of gene sets #' #' @param genesets.symbol list of character vectors #' @param symbols2keep character vector #' @param min_size minimum number of symbols to keep a gene set keep_symbols <- function(genesets.symbol, symbols2keep, min_size = 2) { stopifnot(is.character(symbols2keep)) stopifnot(is.list(genesets.symbol)) common_symbols <- intersect(Reduce(union, genesets.symbol), symbols2keep) purrr::map( .x = genesets.symbol, ~.x[.x %in% common_symbols])%>% purrr::discard(~length(.x) < min_size) } # NOTE: function below was copied from # https://github.com/RGLab/ImmuneSignatures2/blob/master/R/geneExpressionPreProcessing.R # #' Remove incomplete rows #' #' @param eset expressionSet #' @export #' removeAllNArows <- function(eset){ em <- Biobase::exprs(eset) allNARows <- apply(em, 1, function(x){ all(is.na(x)) }) eset <- eset[ !allNARows, ] } remove_any_NA_rows <- function(eset) { any_NA_rows <- apply(X = Biobase::exprs(eset), MARGIN = 1, FUN = function(x)any(is.na(x))) eset[!any_NA_rows,] } #' Keep most recent pre-vaccination time point for each participant #' #' @param eset expressionSet #' @param drop_postvax if TRUE, postvax time points will be discarded #' @export #' keep_most_recent_prevax_time_point <- function(eset, drop_postvax) { prevax_uids <- Biobase::pData(eset) %>% dplyr::filter(time_post_last_vax <= 0) %>% dplyr::arrange(-time_post_last_vax) %>% dplyr::group_by(participant_id) %>% dplyr::slice(1) %>% dplyr::ungroup() %>% dplyr::pull(uid) postvax_uids <- Biobase::pData(eset) %>% dplyr::filter(time_post_last_vax > 0) %>% dplyr::pull(uid) if (drop_postvax) uids <- prevax_uids else uids <- union(prevax_uids, postvax_uids) eset[, eset$uid %in% uids] } #' Add 'response_class' column containing maxRBA_p30 values for Influenza and #' MFC_p30 values for all other vaccines #' #' @param eset expressionSet #' @export #' add_response_class_column <- function(eset) { # Note: maxRBA measure is used to determine high and low responders for # Influenza, because many participants are unlikely to be naive for this virus # (resulting in high baseline titers and consequently lower fold changes which # maxRBA is designed to take into account) Biobase::pData(eset)$response_class <- as.character(Biobase::pData(eset)$MFC_p30) Biobase::pData(eset)$response_class[ Biobase::pData(eset)$pathogen == "Influenza"] <- as.character(Biobase::pData(eset)$maxRBA_p30[ Biobase::pData(eset)$pathogen == "Influenza"]) eset } #' Keep participants that are high- or low-responders #' #' @param eset expressionSet #' @param col name of column containing high or low responders #' @export #' keep_only_high_and_low_responders <- function(eset, col) { eset[, eset[[col]] %in% c("lowResponder", "highResponder")] } #' Perform qusage meta analysis (combinePDFs) on data frame #' #' @param x data frame with list column containing QSarray objects #' @param group_cols character vector with names of columns to perform grouping #' on for meta analysis #' @return x with additional column containing output from meta analysis meta_qusage <- function(x, group_cols) { require(qusage) if (find("convolve")[1] == "package:stats") { stop("stats::convolve is slow, please load convolve.R") } x %>% dplyr::group_by_at(group_cols) %>% tidyr::nest() %>% dplyr::ungroup() %>% dplyr::mutate( meta_qusage_output = purrr::map( .x = data, ~{ QSarray_list <- .x$qusage_output %>% purrr::keep(~class(.x) == "QSarray") if (length(QSarray_list) == 0) return(NULL) if (length(QSarray_list) > 1) { cat("Running combinePDFs\n") return(qusage::combinePDFs(QSarrayList = QSarray_list)) } else { return(QSarray_list[[1]]) } } )) %>% dplyr::select(-data) } #' Replace QSarray objects with a few informative columns #' #' @param x data frame containing list column with QSarray objects #' @param col string with name of list column #' @return x with col replaced with "p.value", "geneset", and "activity_score" #' columns tidy_qusage_output <- function(x, col) { require(qusage) x %>% dplyr::filter(purrr::map_lgl(.x = !!sym(col), ~class(.x) == "QSarray")) %>% dplyr::mutate( tidied = purrr::map( .x = !!sym(col), ~ tibble( p.value = pdf.pVal(.x), geneset = colnames(.x$path.PDF), activity_score = .x$path.mean) )) %>% dplyr::select(-!!sym(col)) %>% tidyr::unnest(tidied) } #' Get confidence intervals for every variable in a bootstrap. #' #' @param boot_result an object of class "boot" containing the output of a #' bootstrap calculation #' @param type character string representing the type of interval required tidy_boot_ci <- function(boot_result, type) { stopifnot(class(boot_result) == "boot") stopifnot(is.character(type)) purrr::map_df( .x = 1:ncol(boot_result$t), ~{ ci <- boot.ci(boot.out = boot_result, type = type, index = .x) # When using "perc" as type, the name in the output list ci is "percent" # Using grep to handle this inconsistency i <- grep(pattern = type, names(ci))[1] tibble(name = names(ci$t0), statistic = ci$t0, ci_low = ci[[i]][4], ci_high = ci[[i]][5]) } ) %>% dplyr::mutate(bootstrapped_mean_of_statistic = colMeans(boot_result$t)) } # Description: functions to extend pheatmap functionality pheatmap_add_labels <- function(p, annot_labels, gp = gpar(col = "#000000", fontsize = 10), angle_col = 0, angle_row = 270) { require(pheatmap) require(grid) require(gtable) # Get the column and row annotation rectangles (called "track" in pheatmap) tracks <- list() annot_names <- vector() for (annot_dir in c("col", "row")){ track_grob_name <- paste0(annot_dir, "_annotation") track_index <- which(p$gtable$layout$name == track_grob_name) if (length(track_index) == 0){ next } track_grob <- p$gtable$grobs[[track_index]] tracks[[annot_dir]] <- list("index" = track_index, "grob" = track_grob) annot_names <- c(annot_names, colnames(track_grob$gp$fill)) } # Get legend grob legend_index <- which(p$gtable$layout$name == "annotation_legend") legend_grob <- p$gtable$grobs[[legend_index]] # Get mapping between annotation labels and annotation colors elem2color <- list() color2label <- list() for (annot_name in annot_names) { # Retrieve from the legend the filled rectangles grob belonging to current annotation cur_grob <- getGrob(gTree = legend_grob, gPath = childNames(legend_grob)[[paste(annot_name, "r")]]) elem2color[[annot_name]] <- cur_grob$gp$fill cur_colors <- as.character(elem2color[[annot_name]]) cur_elems <- names(elem2color[[annot_name]]) if (annot_name %in% names(annot_labels)) { cur_labels <- as.character(annot_labels[[annot_name]][cur_elems]) } else { cur_labels <- rep("", length.out = length(cur_elems)) } color2label[[annot_name]] <- cur_labels names(color2label[[annot_name]]) <- cur_colors } # Add labels to annotation legend for (annot_name in annot_names) { target_child_name <- paste(annot_name, "r") cur_grob <- getGrob(gTree = legend_grob, gPath = childNames(legend_grob)[[target_child_name]]) legend_labels <- color2label[[annot_name]][as.character(cur_grob$gp$fill)] res <- gList() res[["rect"]] <- cur_grob res[["text"]] <- textGrob(x = cur_grob$x + 0.5 * cur_grob$width, y = cur_grob$y - 0.5 * cur_grob$height, label = legend_labels, gp = gp) legend_grob$children[[target_child_name]] <- gTree(children = res) } p$gtable$grobs[[legend_index]] <- gTree(children = legend_grob$children) # Add labels to annotation tracks for (track_dir in names(tracks)){ track_grob <- tracks[[track_dir]]$grob track_index <- tracks[[track_dir]]$index track_labels <- purrr::map( .x = colnames(track_grob$gp$fill), ~color2label[[.x]][track_grob$gp$fill[,.x]] ) %>% do.call(what = c, args = .) # Add labels to annotation track res <- gList() res[["rect"]] = track_grob res[["text"]] = textGrob(x = track_grob$x, y = track_grob$y, label = track_labels, rot = ifelse(track_dir == "col", yes = angle_col, no = angle_row), gp = gp) p$gtable$grobs[[track_index]] <- gTree(children = res) } return(p) } # Function based on (and slightly modified, removing the option to make # vertically positioned dendrograms) # https://github.com/cran/pheatmap/blob/master/R/pheatmap.r # on 2019_02_14 draw_dendrogram <- function(hc) { h = hc$height / max(hc$height) / 1.05 m = hc$merge o = hc$order n = length(o) m[m > 0] = n + m[m > 0] m[m < 0] = abs(m[m < 0]) dist = matrix(0, nrow = 2 * n - 1, ncol = 2, dimnames = list(NULL, c("x", "y"))) dist[1:n, 1] = 1 / n / 2 + (1 / n) * (match(1:n, o) - 1) for(i in 1:nrow(m)){ dist[n + i, 1] = (dist[m[i, 1], 1] + dist[m[i, 2], 1]) / 2 dist[n + i, 2] = h[i] } draw_connection = function(x1, x2, y1, y2, y){ res = list( x = c(x1, x1, x2, x2), y = c(y1, y, y, y2) ) return(res) } x = rep(NA, nrow(m) * 4) y = rep(NA, nrow(m) * 4) id = rep(1:nrow(m), rep(4, nrow(m))) for(i in 1:nrow(m)){ c = draw_connection(dist[m[i, 1], 1], dist[m[i, 2], 1], dist[m[i, 1], 2], dist[m[i, 2], 2], h[i]) k = (i - 1) * 4 + 1 x[k : (k + 3)] = c$x y[k : (k + 3)] = c$y } x = unit(x, "npc") y = unit(y, "npc") return(polylineGrob(x = x, y = y, id = id)) } # pheatmap variant enabling clustering of groups of columns. col_grouped_pheatmap <- function(mat, col_groups, col_clust_fun = "hclust", col_dist_fun = "dist", method = "euclidean", treeheight_col = 50, display_numbers = FALSE, silent = TRUE, ...) { require(pheatmap) require(grid) require(gtable) stopifnot(is.matrix(mat)) stopifnot(any(class(display_numbers) %in% c("logical", "matrix"))) col_clust_fun <- match.fun(col_clust_fun) col_dist_fun <- match.fun(col_dist_fun) # Cluster the columns and draw the dendrograms dendro_grobs <- list() for (cur_group in unique(col_groups)) { col_indexes.subset <- which(col_groups == cur_group) #cl <- hclust(d = dist(x = t(mat[, col_indexes.subset]), # method = method)) cl <- col_clust_fun(d = col_dist_fun(x = t(mat[, col_indexes.subset]), method = method)) col_indexes <- 1:ncol(mat) col_indexes[col_indexes.subset] <- NA col_indexes[is.na(col_indexes)] <- col_indexes.subset[cl$order] mat <- mat[, col_indexes] dendro_grobs[[length(dendro_grobs) + 1]] <- draw_dendrogram(cl) } # Get column numbers after which a gap occurs gaps_col <- which( purrr::map_lgl(2:length(col_groups), ~{col_groups[.x] != col_groups[.x-1]})) gap_width <- unit("4", "bigpts") matrix_cell_width <- (1 / ncol(mat)) * (unit(1, "npc") - length(gaps_col) * gap_width) dendro_widths <- base::diff(c(0, gaps_col, ncol(mat))) * matrix_cell_width gap_grob <- rectGrob(gp = gpar(fill = NA, col = NA), width = gap_width) my_grobs <- list(dendro_grobs[[1]]) my_widths <- dendro_widths[1] for (i in 1:length(gaps_col)) { my_grobs[[2*i]] <- gap_grob my_grobs[[2*i + 1]] <- dendro_grobs[[i+1]] my_widths <- unit.c(my_widths, gap_width, dendro_widths[i+1]) } combined_dendros_grob <- grobTree(gtable_matrix( name = "my_dendrograms", grobs = matrix(my_grobs, ncol = length(my_grobs)), widths = my_widths, heights = unit(1, "null"))) if (is.matrix(display_numbers)) { display_numbers <- display_numbers[rownames(mat), colnames(mat)] } p <- pheatmap::pheatmap(mat = mat, display_numbers = display_numbers, gaps_col = gaps_col, treeheight_col = treeheight_col, cluster_cols = FALSE, filename = NA, silent = silent, ...) p$gtable <- gtable_add_grob( x = p$gtable, grobs = combined_dendros_grob, t = 2, l = 3, name = "col_tree") p$gtable$heights[2] <- p$gtable$heights[2] + unit(treeheight_col, "bigpts") p$gtable$heights[4] <- p$gtable$heights[4] - unit(treeheight_col, "bigpts") return(p) } #Principal Variant Component Analysis (PVCA) function pvca <- function (abatch,expInfo) { theDataMatrix=abatch dataRowN <- nrow(theDataMatrix) dataColN <- ncol(theDataMatrix) ########## Center the data (center rows) ########## theDataMatrixCentered <- matrix(data = 0, nrow = dataRowN, ncol = dataColN) theDataMatrixCentered_transposed = apply(theDataMatrix, 1, scale, center = TRUE, scale = FALSE) theDataMatrixCentered = t(theDataMatrixCentered_transposed) ########## Compute correlation matrix & Obtain eigenvalues ########## theDataCor <- cor(theDataMatrixCentered) eigenData <- eigen(theDataCor) eigenValues = eigenData$values ev_n <- length(eigenValues) eigenVectorsMatrix = eigenData$vectors eigenValuesSum = sum(eigenValues) percents_PCs = eigenValues /eigenValuesSum ##=========================================== ## Getting the experimental information ##=========================================== exp_design <- as.data.frame(expInfo) expDesignRowN <- nrow(exp_design) expDesignColN <- ncol(exp_design) ########## Merge experimental file and eigenvectors for n components ########## ## pc_n is the number of principal components to model ## Use fixed pc_n pc_n=5 pc_data_matrix <- matrix(data = 0, nrow = (expDesignRowN*pc_n), ncol = 1) mycounter = 0 for (i in 1:pc_n){ for (j in 1:expDesignRowN){ mycounter <- mycounter + 1 pc_data_matrix[mycounter,1] = eigenVectorsMatrix[j,i] } } AAA <- exp_design[rep(1:expDesignRowN,pc_n),] Data <- cbind(AAA,pc_data_matrix) ####### Edit these variables according to your factors ####### variables <-c (colnames(exp_design)) for (i in 1:length(variables)) { Data$variables[i] <- as.factor(Data$variables[i] ) } ########## Mixed linear model ########## op <- options(warn = (-1)) effects_n = expDesignColN+1 #effects size without interaction terms randomEffectsMatrix <- matrix(data = 0, nrow = pc_n, ncol = effects_n) ##============================# ## Get model functions ##============================# model.func <- c() index <- 1 ## level-1 for (i in 1:length(variables)) { mod = paste("(1|", variables[i], ")", sep="") model.func[index] = mod index = index + 1 } function.mods <- paste (model.func , collapse = " + ") ##============================# ## Get random effects # ##============================# for (i in 1:pc_n){ y = (((i-1)*expDesignRowN)+1) funct <- paste ("pc_data_matrix", function.mods, sep =" ~ ") Rm1ML <- lmer( funct , Data[y:(((i-1)*expDesignRowN)+expDesignRowN),], REML = TRUE, verbose = FALSE, na.action = na.omit) randomEffects <- Rm1ML randomEffectsMatrix[i,] <- c(unlist(VarCorr(Rm1ML)),resid=sigma(Rm1ML)^2) } effectsNames <- c(names(getME(Rm1ML,"cnms")),"resid") ########## Standardize Variance ########## randomEffectsMatrixStdze <- matrix(data = 0, nrow = pc_n, ncol = effects_n) for (i in 1:pc_n){ mySum = sum(randomEffectsMatrix[i,]) for (j in 1:effects_n){ randomEffectsMatrixStdze[i,j] = randomEffectsMatrix[i,j]/mySum } } ########## Compute Weighted Proportions ########## randomEffectsMatrixWtProp <- matrix(data = 0, nrow = pc_n, ncol = effects_n) for (i in 1:pc_n){ weight = eigenValues[i]/eigenValuesSum for (j in 1:effects_n){ randomEffectsMatrixWtProp[i,j] = randomEffectsMatrixStdze[i,j]*weight } } ########## Compute Weighted Ave Proportions ########## randomEffectsSums <- matrix(data = 0, nrow = 1, ncol = effects_n) randomEffectsSums <-colSums(randomEffectsMatrixWtProp) totalSum = sum(randomEffectsSums) randomEffectsMatrixWtAveProp <- matrix(data = 0, nrow = 1, ncol = effects_n) for (j in 1:effects_n){ randomEffectsMatrixWtAveProp[j] = randomEffectsSums[j]/totalSum } return(list(dat=randomEffectsMatrixWtAveProp, label=effectsNames)) }
Setup: load dataset, create fold change data
FDR_THRESHOLD <- 0.05 # Determines if a gene set is significantly up/down NUM_PERMUTATIONS <- 10 # Colors and abbreviations load(file.path(data_dir, "adj_path_vt_colors_abbv_v3.RData")) young.noNorm.noResponse <- readRDS(file.path(data_dir, paste0(date_prefix, "young_noNorm_eset.rds"))) old.noNorm.noResponse <- readRDS(file.path(data_dir, paste0(date_prefix, "old_noNorm_eset.rds"))) young.noNorm.withResponse <- readRDS(file.path(data_dir, paste0(date_prefix, "young_noNorm_withResponse_eset.rds"))) old.noNorm.withResponse <- readRDS(file.path(data_dir, paste0(date_prefix, "old_noNorm_withResponse_eset.rds"))) #Merge pre/post vax samples into same matrix for SDY1529 young.noNorm.noResponse$matrix[young.noNorm.noResponse$study_accession=='SDY1529']='SDY1529_WholeBlood_HealthyAdults' young.noNorm.withResponse$matrix[young.noNorm.withResponse$study_accession=='SDY1529']='SDY1529_WholeBlood_HealthyAdults' #Add alum as an adjuvant to Hepatitis B young.noNorm.noResponse$adjuvant[young.noNorm.noResponse$pathogen=='Hepatitis A/B']='Alum' old.noNorm.noResponse$adjuvant[old.noNorm.noResponse$pathogen=='Hepatitis A/B']='Alum' young.noNorm.withResponse$adjuvant[young.noNorm.withResponse$pathogen=='Hepatitis A/B']='Alum' old.noNorm.withResponse$adjuvant[old.noNorm.withResponse$pathogen=='Hepatitis A/B']='Alum' #Abbreviate vaccine types young.noNorm.noResponse$vaccine_type_full=young.noNorm.noResponse$vaccine_type old.noNorm.noResponse$vaccine_type_full=old.noNorm.noResponse$vaccine_type young.noNorm.withResponse$vaccine_type_full=young.noNorm.withResponse$vaccine_type old.noNorm.withResponse$vaccine_type_full=old.noNorm.withResponse$vaccine_type vaccine_abrev=c('Live virus'='LV','Recombinant protein'='RP','Recombinant Viral Vector'='RVV', 'Polysaccharide'='PS','Conjugate'='CJ','Inactivated'='IN', 'Inactivated/Recombinant protein'='IN/RP') young.noNorm.noResponse$vaccine_type=str_replace_all(young.noNorm.noResponse$vaccine_type,vaccine_abrev) old.noNorm.noResponse$vaccine_type=str_replace_all(old.noNorm.noResponse$vaccine_type,vaccine_abrev) young.noNorm.withResponse$vaccine_type=str_replace_all(young.noNorm.withResponse$vaccine_type,vaccine_abrev) old.noNorm.withResponse$vaccine_type=str_replace_all(old.noNorm.withResponse$vaccine_type,vaccine_abrev) names(adj_path_vt_abbv$vaccine_type)=str_replace_all(names(adj_path_vt_abbv$vaccine_type), vaccine_abrev) names(adj_path_vt_colors$vaccine_type)=str_replace_all(names(adj_path_vt_abbv$vaccine_type), vaccine_abrev) # Combine young and old with response symbols <- intersect(rownames(young.noNorm.withResponse), rownames(old.noNorm.withResponse)) pd <- rbind(Biobase::pData(young.noNorm.withResponse) %>% dplyr::mutate(age_group = "young"), Biobase::pData(old.noNorm.withResponse) %>% dplyr::mutate(age_group = "old")) ge <- cbind(Biobase::exprs(young.noNorm.withResponse)[symbols, ], Biobase::exprs(old.noNorm.withResponse)[symbols, ])[, pd$uid] rownames(pd) <- pd$uid young_and_old.noNorm.withResponse <- new("ExpressionSet", exprs = ge, phenoData = new('AnnotatedDataFrame', pd)) # Combine young and old without response symbols <- intersect(rownames(young.noNorm.noResponse), rownames(old.noNorm.noResponse)) pd <- rbind(Biobase::pData(young.noNorm.noResponse) %>% dplyr::mutate(age_group = "young"), Biobase::pData(old.noNorm.noResponse) %>% dplyr::mutate(age_group = "old")) ge <- cbind(Biobase::exprs(young.noNorm.noResponse)[symbols, ], Biobase::exprs(old.noNorm.noResponse)[symbols, ])[, pd$uid] rownames(pd) <- pd$uid young_and_old.noNorm <- new("ExpressionSet", exprs = ge, phenoData = new('AnnotatedDataFrame', pd)) btm_list <- readRDS(file = file.path(data_dir, "btm_list_2020_12_23.rds")) ##Create fold change dataset for downstream use eset=readRDS(file.path(data_dir, paste0(date_prefix, "young_norm_eset.rds"))) colnames(eset)=eset$uid #Merge pre/post vax samples into same matrix for SDY1529 eset$matrix[eset$study_accession=='SDY1529']='SDY1529_WholeBlood_HealthyAdults' #Add alum as an adjuvant to Hepatitis B eset$adjuvant[eset$pathogen=='Hepatitis A/B']='Alum' #Abbreviate vaccine types eset$vaccine_type_full=eset$vaccine_type eset$vaccine_type=str_replace_all(eset$vaccine_type,vaccine_abrev) #Create combined vaccine type/pathogen column eset$pt=paste(eset$pathogen," (",eset$vaccine_type,")", sep='') #Create combined SDY/pathogen/vaccine type column eset$SDY_pt=paste(eset$study,eset$pt) #Find samples at timepoints >=0 tp_int=sort(unique(eset$time_post_last_vax[which(eset$time_post_last_vax>=0)])) ind=lapply(tp_int, function(x) which(eset$time_post_last_vax==x)) #Combine indices of all timepoints of interest ind_all=Reduce(union,ind) #Retain only samples from timepoints of interest eset=eset[,ind_all] #Recompute timepoint indices after removing extraneous timepoints ind=lapply(tp_int, function(x) which(eset$time_post_last_vax==x)) #Remove samples from a single study with fewer than sample_cutoff samples at any timepoint sample_cutoff=3 matrix_uni_tp=lapply(ind,function(x) unique(eset[,x]$matrix)) matrix_ind=lapply(1:length(ind),function(x) lapply(1:length(matrix_uni_tp[[x]]), function(y) which(matrix_uni_tp[[x]][[y]]==eset[,ind[[x]]]$matrix))) ind_cut_all=vector() for (i in 1:length(matrix_ind)) { ind_cut=which(sapply(matrix_ind[[i]],length)<sample_cutoff) if (is_empty(ind_cut)==FALSE) { for (j in 1:length(ind_cut)){ ind_cut_all=c(ind_cut_all,ind[[i]][matrix_ind[[i]][[ind_cut[j]]]]) } } } if (is_empty(ind_cut_all)==FALSE) { eset=eset[,-ind_cut_all] } #Create unique list of studies matrix_uni=unique(eset$matrix) #Remove matrices with only D0 expression matrix_tp=lapply(matrix_uni, function(x) unique(eset$time_post_last_vax[eset$matrix==x])) matrix_d0_only=matrix_uni[sapply(matrix_tp, function(x) identical(x,0))] eset=eset[,!(eset$matrix %in% matrix_d0_only)] #Remove genes with NA eset=eset[complete.cases(exprs(eset)),] #Recompute timepoint indices after removing samples tp_int=sort(unique(eset$time_post_last_vax[which(eset$time_post_last_vax>=0)])) ind=lapply(tp_int, function(x) which(eset$time_post_last_vax==x)) #Create BTM-level eset #Remove BTMs which have no matching genes in dataset btm_list=btm_list[!sapply(btm_list, function(x) is_empty(intersect(x,rownames(eset))))] #Collapse - find arithmetic mean of genes in each BTM for each sample eset_BTM=do.call(rbind, lapply(btm_list, function(x) colMeans(exprs(eset[na.omit(match(x,rownames(eset))),]),na.rm=TRUE))) #Create BTM-level eset eset_BTM=ExpressionSet(as.matrix(eset_BTM), eset@phenoData) #Compute D0 normalized FC ind_D0=which(0==eset$time_post_last_vax) common=lapply(2:length(ind),function(x) intersect(eset$participant_id[ind[[x]]],eset$participant_id[ind_D0])) ia=lapply(2:length(ind),function(x) na.omit(match(common[[x-1]],eset$participant_id[ind[[x]]]))) ib=lapply(2:length(ind),function(x) na.omit(match(common[[x-1]],eset$participant_id[ind_D0]))) exp_FC=lapply(2:length(ind),function(x) eset[,ind[[x]][ia[[x-1]]]]) exp_FC=lapply(2:length(ind),function(x) {exprs(exp_FC[[x-1]])=exprs(exp_FC[[x-1]])-exprs(eset[,ind_D0[ib[[x-1]]]]); exp_FC[[x-1]]}) exp_FC_BTM=lapply(2:length(ind),function(x) eset_BTM[,ind[[x]][ia[[x-1]]]]) exp_FC_BTM=lapply(2:length(ind),function(x) {exprs(exp_FC_BTM[[x-1]])=exprs(exp_FC_BTM[[x-1]])-exprs(eset_BTM[,ind_D0[ib[[x-1]]]]); exp_FC_BTM[[x-1]]}) tp_FC=tp_int[-1] #Store timepoints for FC data (remove Day 0)
Figure 1b: Histogram of the number of samples included per vaccine at each timepoint. Day 0 represents Day of vaccination.
#Generate unique list of studies/vaccine type/pathogen/timepoint matrices=unique(eset$study) studies=unique(eset$study) vaccine_type=unique(eset$vaccine_type) pathogen=unique(eset$pathogen) timepoint=round(unique(eset$time_post_last_vax)*100)/100 pathogen_type=unique(eset$pt) #For each timepoint, identify pathogen, then count studies/samples per pathogen/TP TP_path_sample_count=matrix(NA,length(timepoint), length(pathogen_type)) for (i in 1:length(timepoint)) { #Find pathogen/type TP_ind=which(timepoint[i]==eset$time_post_last_vax) pathogen_TP=unique(eset$pt[TP_ind]) #For each pathogen at given TP for (j in 1:length(pathogen_TP)) { #Find pathogen in overall pathogen list pathogen_type_ind=which(pathogen_TP[j]==pathogen_type) #Count studies with given TP/pathogen combo pathogen_TP_ind=which(pathogen_TP[j]==eset$pt[TP_ind]) #Count samples with given TP/pathogen combo if (length(pathogen_TP_ind)>1) { TP_path_sample_count[i,pathogen_type_ind]=length(pathogen_TP_ind) } } } #Convert counts to data frame, add timepoint column and pathogen names TP_path_sample_count=data.frame(TP_path_sample_count) colnames(TP_path_sample_count)=pathogen_type TP_path_sample_count$Day=timepoint #Order alphabetically TP_path_sample_count=TP_path_sample_count[,order(names(TP_path_sample_count))] #Convert days to factor TP_path_sample_count$Day=factor(TP_path_sample_count$Day, levels=sort(TP_path_sample_count$Day)) #Create plot df=pivot_longer(TP_path_sample_count, !Day, names_to='Vaccine', values_to = 'Samples') p=ggplot(df, aes(x=Day, y=Samples, fill=Vaccine))+ geom_bar(stat="identity")+ scale_fill_manual(values=colorRampPalette(brewer.pal(name="Paired", n = 8))(14))+ xlab("Day post-vaccination")+ ylab("# of Samples")+ labs(fill='Vaccine')+ theme_minimal()+ theme(text=element_text(size=16),panel.grid.minor.x=element_blank(), legend.position="bottom") print(p)
Figure 1c: Age distribution of participants in the Immune Signatures Data Resource by vaccine. Shape of points denotes the subject’s inferred sex based on Y chromosome-specific gene expression. For participants with missing age data, ages were estimated using RAPToR. Center line, median; box limits, upper and lower quartiles; whiskers, 1.5x interquartile range.
df=pData(eset[,match(unique(eset$participant_id), eset$participant_id)]) p=ggplot(df, aes(x=pt, y=age_imputed, fill=pt))+ geom_boxplot(outlier.shape=NA,width=0.5,notch=FALSE, alpha=0.6)+ geom_jitter(aes(shape=y_chrom_present), alpha=0.6, width=0.1)+ scale_fill_manual(values=colorRampPalette(brewer.pal(name="Paired", n = 8))(14))+ xlab("Vaccine")+ ylab("Age")+ labs(fill='Vaccine', shape='Sex (Imputed)')+ theme_minimal()+ theme(text=element_text(size=16),panel.grid.minor.x=element_blank(), legend.position="bottom", axis.text.x = element_text(angle = 45, hjust=1)) print(p)
Figure 1d: Bar plot representing the proportion of variance in post-vaccination transcriptional responses that can be attributed to clinical (age, sex, ethnicity) and experimental variables (time after vaccination, vaccine) via Principal Component Variance Analysis. The residual represents the proportion of the variance that could not be explained by any of the included variables.
#Merge FC data into one eset exp_FC_all=do.call(Biobase::combine,exp_FC) exp_FC_all_pData=pData(exp_FC_all) #Collect variables for PVCA TP_variables=data.frame(Timepoint=exp_FC_all_pData$study_time_collected, Vaccine=exp_FC_all_pData$pt, Gender=exp_FC_all_pData$gender, Age=exp_FC_all_pData$age_reported, Ethnicity=exp_FC_all_pData$ethnicity) rownames(TP_variables)=rownames(exp_FC_all_pData) #Call PVCA function pvcaObj=pvca(exprs(exp_FC_all),TP_variables) #Create plot par(mar=c(9,6,4,1)) p=barplot(pvcaObj$dat, ylab = "Proportion variance explained", ylim= c(0,1.1),col = c("blue"), las=2, cex.axis=1, cex.names=1) axis(1, at = p, labels = pvcaObj$label, cex.axis = 1, las=2) print(p)
Prepare data 1. Create data frame with gene set enrichment scores for postvax vs prevax time points in young adult cohort.
longitudinal_qusage_young_df_path <- file.path(extra_data_dir, "longitudinal_qusage_young_df.rds") if (file.exists(longitudinal_qusage_young_df_path)) { longitudinal_qusage_young_df <- readRDS(longitudinal_qusage_young_df_path) } else { young.noNorm.noResponse.filtered <- young.noNorm.noResponse %>% keep_most_recent_prevax_time_point(drop_postvax = FALSE) %>% remove_any_NA_rows() # Check that there are no repeated measures at any time point stopifnot(Biobase::pData(young.noNorm.noResponse.filtered) %>% dplyr::group_by(participant_id, time_post_last_vax) %>% dplyr::filter(n() > 1) %>% nrow() == 0) # Check that every participant has a baseline time point stopifnot(Biobase::pData(young.noNorm.noResponse.filtered) %>% dplyr::group_by(participant_id) %>% dplyr::filter(!any(time_post_last_vax <= 0)) %>% nrow() == 0) btm_list.filtered <- keep_symbols( genesets.symbol = btm_list, symbols2keep = rownames(young.noNorm.noResponse.filtered)) # Gene set enrichment analysis: postvax vs prevax time points ------------------ analysis_df <- Biobase::pData(young.noNorm.noResponse.filtered) %>% dplyr::filter(time_post_last_vax > 0) %>% dplyr::select(pathogen, vaccine_type, time_post_last_vax, study_accession) %>% dplyr::arrange(pathogen, vaccine_type, time_post_last_vax, study_accession) %>% dplyr::distinct() %>% dplyr::mutate(n_participants = NA, qusage_contrast = NA, qusage_output = list(NULL), error = list(NULL)) %>% dplyr::as_tibble() safe_qusage <- purrr::safely(.f = qusage::qusage, otherwise = NULL, quiet = TRUE) for (row_nr in 1:nrow(analysis_df)) { row <- analysis_df[row_nr,, drop = FALSE] cat(paste0("Row ", row_nr, "/", nrow(analysis_df), ": ", row$pathogen, " ", row$vaccine_type, " ", row$time_post_last_vax, " ", row$study_accession, "\n")) cur_eset <- young.noNorm.noResponse.filtered[, young.noNorm.noResponse.filtered$pathogen == row$pathogen & young.noNorm.noResponse.filtered$vaccine_type == row$vaccine_type & young.noNorm.noResponse.filtered$study_accession == row$study_accession & (young.noNorm.noResponse.filtered$time_post_last_vax <= 0 | young.noNorm.noResponse.filtered$time_post_last_vax == row$time_post_last_vax)] prevax_string <- "baseline" postvax_string <- paste0("Day", row$time_post_last_vax) qusage.contrast <- paste0(postvax_string, "-", prevax_string) qusage.labels <- ifelse( test = cur_eset$time_post_last_vax == row$time_post_last_vax, yes = postvax_string, no = prevax_string) result <- safe_qusage(eset = cur_eset, labels = qusage.labels, contrast = qusage.contrast, geneSets = btm_list.filtered, pairVector = cur_eset$participant_id) analysis_df$n_participants[row_nr] <- length(unique(cur_eset$participant_id)) analysis_df$qusage_contrast[row_nr] <- qusage.contrast if (!is.null(result$result)) analysis_df$qusage_output[[row_nr]] <- result$result if (!is.null(result$error)) analysis_df$error[[row_nr]] <- result$error } saveRDS(analysis_df, longitudinal_qusage_young_df_path) longitudinal_qusage_young_df <- analysis_df }
longitudinal_qusage_young_meta_df_path <- file.path(extra_data_dir, "longitudinal_qusage_young_meta_df.rds") if (file.exists(longitudinal_qusage_young_meta_df_path)) { longitudinal_qusage_young_meta_df <- readRDS(longitudinal_qusage_young_meta_df_path) } else { longitudinal_qusage_young_meta_df <- meta_qusage(longitudinal_qusage_young_df, group_cols = c("pathogen", "vaccine_type", "time_post_last_vax")) saveRDS(longitudinal_qusage_young_meta_df, longitudinal_qusage_young_meta_df_path) } tidy_longitudinal_qusage_young_meta_df <- longitudinal_qusage_young_meta_df %>% tidy_qusage_output(col = "meta_qusage_output") %>% dplyr::mutate(FDR = p.adjust(p.value, method = "BH"), FDR_signif_label = create_signif_label(FDR))
# Calculate null distribution of gene set sharing between vaccines ------------- geneset_sharing_young_null_boot_path <- file.path(extra_data_dir, "geneset_sharing_young_null_boot.rds") if (file.exists(geneset_sharing_young_null_boot_path)) { geneset_sharing_young_null_boot <- readRDS(geneset_sharing_young_null_boot_path) } else { nbins <- tidy_longitudinal_qusage_young_meta_df %>% dplyr::select(pathogen, vaccine_type) %>% dplyr::distinct() %>% nrow() + 1 strata <- tidy_longitudinal_qusage_young_meta_df %>% dplyr::group_by(pathogen, vaccine_type, time_post_last_vax) %>% dplyr::group_indices() ncpus <- parallel::detectCores() geneset_sharing_young_null_boot <- boot( data = tidy_longitudinal_qusage_young_meta_df, statistic = function(data, indices){ data %>% dplyr::mutate(geneset = geneset[indices]) %>% calc_sharing_df() %>% dplyr::pull(n) %>% `+`(1) %>% tabulate(nbins = nbins) }, R = NUM_PERMUTATIONS, sim = "permutation", stype = "i", strata = strata, parallel = "multicore", ncpus = ncpus) saveRDS(object = geneset_sharing_young_null_boot, file = geneset_sharing_young_null_boot_path) }
sharing_young_df <- tidy_longitudinal_qusage_young_meta_df %>% calc_sharing_df() genesets_shared_young_common <- sharing_young_df %>% dplyr::filter(n >= quantile(x = n, probs = 0.95)) %>% dplyr::pull(geneset)
Ext. Data. Figure 1a. Histogram of overlap in DEGs. A gene is shared with another vaccine if it is significantly (FDR < 0.05) regulated in the same direction, irrespective of timepoint. Blue bars, number of genesets shared (y-axis) between the same number of vaccines (x-axis). Grey bars represent the null distribution generated by n = 10,000 permutations of gene labels within vaccine + timepoint groups. Data are presented as mean values + /− 95% confidence interval.
exp_noNA <- na.omit(exprs(eset)) pheno <- pData(eset) ################################################### # Find DEGS for every vaccine at every time point # ################################################### sample_cutoff <- 3 # Remove study time points that contain less than sample_cutoff samples cur_phenoData <- pheno %>% dplyr::group_by(study_accession, matrix, time_post_last_vax) %>% dplyr::filter(n() >= sample_cutoff) %>% dplyr::ungroup() cur_expressionData <- exp_noNA[, cur_phenoData$uid] # Keep only subjects that have both pre- and postvax data cur_phenoData <- cur_phenoData %>% dplyr::group_by(participant_id) %>% dplyr::filter(any(time_post_last_vax <= 0) & any(time_post_last_vax > 0)) %>% dplyr::ungroup() # For subjects with more than one prevax time point, take the one closest to day 0 cur_phenoData_prevax <- cur_phenoData %>% dplyr::filter(time_post_last_vax <= 0) %>% dplyr::group_by(participant_id) %>% dplyr::arrange(-time_post_last_vax) %>% dplyr::slice(1) %>% dplyr::ungroup() cur_phenoData_postvax <- cur_phenoData %>% dplyr::filter(time_post_last_vax > 0) cur_phenoData <- rbind(cur_phenoData_prevax, cur_phenoData_postvax) # Check that we do not have repeated measurements stopifnot(cur_phenoData %>% dplyr::group_by(participant_id, time_post_last_vax) %>% dplyr::filter(n() > 1) %>% nrow() == 0) # Check that uid is unique stopifnot(!any(duplicated(cur_phenoData$uid))) # Calculate fold change with baseline for every gene at every postvax time point exp_fc <- cur_phenoData %>% dplyr::group_by(participant_id) %>% tidyr::nest() %>% dplyr::ungroup() %>% dplyr::mutate( fc_data = purrr::map( .x = data, ~sweep(x = exp_noNA[, .x$uid[.x$time_post_last_vax > 0], drop = FALSE], MARGIN = 1, STATS = as.array(exp_noNA[, .x$uid[.x$time_post_last_vax <= 0], drop = FALSE]), FUN = "-") )) %>% dplyr::pull(fc_data) %>% do.call(what = cbind, args = .) # Calculate Student t value and associated one-sided p values for every gene at every postvax time point df <- cur_phenoData %>% dplyr::filter(time_post_last_vax > 0) %>% dplyr::group_by(study_accession, matrix, time_post_last_vax) %>% tidyr::nest() %>% dplyr::ungroup() %>% dplyr::mutate( deg_data = purrr::map( .x = data, ~{ fc_data <- exp_fc[, .x$uid, drop = FALSE] tibble( gene = rownames(fc_data), avg_fc = rowMeans(fc_data), n = ncol(fc_data), s = apply(fc_data, 1, sd), t = avg_fc / (s / sqrt(ncol(fc_data))), p_less = pt(q = t, df = ncol(fc_data) - 1), p_greater = 1 - p_less # it is faster to run the above calculation than the base R t.test function (result is the same) #p_less = apply(fc_data, 1, function(x)t.test(x = x, alternative = "less")$p.value), #p_greater = apply(fc_data, 1, function(x)t.test(x = x, alternative = "greater")$p.value), ) } ) ) df <- df %>% dplyr::select(-data) %>% tidyr::unnest(deg_data) %>% # Replace NaN p values with 0.9999 dplyr::mutate_at(c("p_less", "p_greater"), function(x)replace(x, is.nan(x), 0.9999)) %>% # Replace 1 p values with 0.9999 (for compatibility with sumz) dplyr::mutate_at(c("p_less", "p_greater"), function(x) replace(x, x > 0.9999, 0.9999)) %>% # Replace 0 p values with 0.00001 (for compatibility with sumz) dplyr::mutate_at(c("p_less", "p_greater"), function(x) replace(x, x <= 0, 0.00001)) ## Compute DEGs by pathogen/vaccine type # Integrate p values across multiple studies by Stouffer's method df <- df %>% dplyr::left_join(cur_phenoData %>% dplyr::select(study_accession, matrix, pathogen, vaccine_type) %>% dplyr::distinct(), by = c("study_accession", "matrix")) %>% dplyr::mutate(pvt = paste0(pathogen, " (", vaccine_type, ")")) fn <- file.path(extra_data_dir, "deg_df.rds") if (file.exists(fn)) { deg_df <- readRDS(file = fn) } else{ deg_df <- df %>% dplyr::select(-s, -t) %>% dplyr::group_by(pathogen, vaccine_type, pvt, time_post_last_vax) %>% tidyr::nest() %>% dplyr::ungroup() %>% dplyr::mutate( deg_data = purrr::map( .x = data, ~{ if(length(unique(.x$matrix)) == 1){ return(tibble( gene = unique(.x$gene), weighted_avg_fc = .x$avg_fc, meta_p_less = .x$p_less, meta_p_greater = .x$p_greater )) } wide_avg_fc_df <- .x %>% dplyr::select(matrix, gene, avg_fc, n) %>% tidyr::pivot_wider(names_from = "gene", values_from = "avg_fc") weighted_avg_fc <- sweep(wide_avg_fc_df %>% dplyr::select(-matrix, -n), MARGIN = 1, STATS = wide_avg_fc_df$n, FUN = "*") %>% colSums() %>% `/`(sum(wide_avg_fc_df$n)) wide_p_less_df <- .x %>% dplyr::select(matrix, gene, p_less, n) %>% tidyr::pivot_wider(names_from = "gene", values_from = "p_less") meta_p_less <- apply(X = wide_p_less_df %>% dplyr::select(-matrix, -n), MARGIN = 2, function(x)sumz(p = x, weights = sqrt(wide_p_less_df$n))$p) wide_p_greater_df <- .x %>% dplyr::select(matrix, gene, p_greater, n) %>% tidyr::pivot_wider(names_from = "gene", values_from = "p_greater") meta_p_greater <- apply(X = wide_p_greater_df %>% dplyr::select(-matrix, -n), MARGIN = 2, function(x)sumz(p = x, weights = sqrt(wide_p_greater_df$n))$p) tibble(gene = names(weighted_avg_fc), weighted_avg_fc = weighted_avg_fc, meta_p_less = meta_p_less, meta_p_greater = meta_p_greater) } ) ) deg_df <- deg_df %>% dplyr::select(-data) %>% tidyr::unnest(deg_data) %>% dplyr::mutate( meta_p = 2*base::pmin(meta_p_less, meta_p_greater), meta_p = ifelse(test = meta_p > 1, yes = 1, no = meta_p), FDR = p.adjust(meta_p, method = "BH"), FDR_signif_label = cut(FDR, c(0, .001, .01, .05, 1), labels = c("***", "**", "*", "")) ) saveRDS(object = deg_df, file = fn) } ########################################################################## # Figure: bar graph of sharing of postvax gene activity between vaccines # ########################################################################## # For every gene, find out in how many vaccines it is significantly up at any one postvax time point # The same but for down at any postvax time point calc_sharing_gene_df <- function(x) { x %>% dplyr::mutate(direction = ifelse( test = weighted_avg_fc > 0, yes = "up", no = "down"), is_significant = !is.na(FDR) & FDR < 0.05) %>% dplyr::select(pvt, gene, direction, is_significant) %>% dplyr::distinct() %>% dplyr::group_by(gene, direction) %>% dplyr::summarise(n = sum(is_significant)) %>% dplyr::group_by(gene) %>% dplyr::arrange(-n) %>% dplyr::slice(1) %>% dplyr::ungroup() } sharing_null2mat <- function(sharing_null) { sharing_null_mat <- do.call(rbind, lapply(sharing_null, `length<-`, max(lengths(sharing_null)))) sharing_null_mat[is.na(sharing_null_mat)] <- 0 return(sharing_null_mat) } sharing_null2df <- function(sharing_null_mat) { tibble( incidence = 0:(ncol(sharing_null_mat)-1), incidence_count = colMeans(sharing_null_mat), type = "null" ) } fn <- file.path(extra_data_dir, "gene_sharing_null_list.rds") if (file.exists(fn)) { sharing_null_list <- readRDS(file = fn) } else { sharing_null_list <- lapply( 1:NUM_PERMUTATIONS, function(x){ cat("Permutation ", x, '/', NUM_PERMUTATIONS, "\n") deg_df %>% dplyr::group_by(pvt, time_post_last_vax) %>% dplyr::mutate(gene = sample(gene, replace = FALSE)) %>% dplyr::ungroup() %>% calc_sharing_gene_df() %>% dplyr::pull(n) %>% `+`(1) %>% tabulate() }) saveRDS(object = sharing_null_list, file = fn) } sharing_null_mat <- sharing_null2mat(sharing_null_list) sharing_null_counts_df <- sharing_null2df(sharing_null_mat) obs_sharing_df <- calc_sharing_gene_df(deg_df) obs_sharing_counts_df <- obs_sharing_df %>% dplyr::pull(n) %>% `+`(1) %>% tabulate() %>% as_tibble() %>% dplyr::mutate(incidence = dplyr::row_number() - 1) %>% dplyr::rename(incidence_count = value) %>% dplyr::mutate(type = "observed") %>% dplyr::select(incidence, incidence_count, type) plot_df <- dplyr::bind_rows(obs_sharing_counts_df, sharing_null_counts_df) %>% tidyr::complete(incidence, tidyr::nesting(type), fill = list(incidence_count = 0)) quantile_df <- purrr::map_df( .x = 1:ncol(sharing_null_mat), ~quantile(x = sharing_null_mat[, .x], probs = c(.025, 0.975)) %>% as.list() %>% tibble::as_tibble() ) quantile_df$incidence <- 0:(nrow(quantile_df)-1) quantile_df$type <- "null" p <- ggplot(plot_df, aes(x = incidence)) + geom_bar(stat = "identity", position = position_dodge(), aes(y = incidence_count, fill = type)) + geom_segment(data = quantile_df, aes(x = incidence-.25, xend = incidence-.25, y = `2.5%`, yend = `97.5%`)) + geom_segment(data = quantile_df, aes(x = incidence-.35, xend = incidence-.15, y = `2.5%`, yend = `2.5%`)) + geom_segment(data = quantile_df, aes(x = incidence-.35, xend = incidence-.15, y = `97.5%`, yend = `97.5%`)) + xlab("Number of different vaccines") + ylab("Number of genes") + scale_fill_manual(values = c("null" = "lightgrey", "observed" = "navy")) + scale_x_continuous(breaks = plot_df$incidence) + theme_bw(base_size = 22) print(p)
Ext. Data Figure 1b. Histogram of overlap in differentially expressed modules. A module is shared with another vaccine if it is significantly (FDR < 0.05) regulated in the same direction, irrespective of timepoint. Blue bars, number of genesets shared (y-axis) between the same number of vaccines (x-axis). Grey bars represent the null distribution generated by n = 10,000 permutations of module labels within vaccine + timepoint groups. Data are presented as mean values + /− 95% confidence interval.
nbins <- tidy_longitudinal_qusage_young_meta_df %>% dplyr::select(pathogen, vaccine_type) %>% dplyr::distinct() %>% nrow() + 1 obs_sharing_young_df <- calc_sharing_df( tidy_longitudinal_qusage_young_meta_df) %>% dplyr::pull(n) %>% `+`(1) %>% tabulate(nbins = nbins) %>% dplyr::as_tibble() %>% dplyr::mutate(incidence = dplyr::row_number() - 1) %>% dplyr::rename(incidence_count = value) %>% dplyr::mutate(ymin = NA, ymax = NA, type = "observed") %>% dplyr::select(incidence, incidence_count, ymin, ymax, type) probs <- c(0.025, 0.975) null_sharing_young_df <- geneset_sharing_young_null_boot$t %>% as.data.frame() %>% dplyr::summarise(dplyr::across(.fns = function(col) c(median(col), quantile(col , probs = probs)))) %>% t() %>% as.data.frame() colnames(null_sharing_young_df) <- c("incidence_count", "ymin", "ymax") null_sharing_young_df <- null_sharing_young_df %>% dplyr::mutate(incidence = dplyr::row_number() - 1, type = "null") %>% dplyr::select(incidence, incidence_count, ymin, ymax, type) plot_df <- dplyr::bind_rows(obs_sharing_young_df, null_sharing_young_df) %>% dplyr::filter(!(incidence > 1 & incidence_count < 0.1)) ggplot(plot_df, aes(x = incidence, y = incidence_count, fill = type)) + geom_bar(stat = "identity", position = position_dodge(), width = 0.9) + geom_errorbar(aes(ymin = ymin, ymax = ymax), position = position_dodge(width = 0.9), width = 0.4) + xlab("Number of different vaccines") + ylab("Number of BTMs") + scale_fill_manual(values = c("null" = "lightgrey", "observed" = "navy")) + scale_x_continuous(breaks = plot_df$incidence) + theme_bw(base_size = 22) + theme(panel.grid = element_blank())
Figure 2a. Heatmap of common differentially expressed modules (regulated in 7 or more vaccines) over time (*FDR<0.05). Color represents the QuSAGE activity score. Clustering on columns was performed separately for Days 1, 3, 7, 14, and 21 post-vaccination. TBA – To be annotated.
n_clusters=4 #Number of module clusters common_tp=c(1,3,7,14,21) #Common timepoints to plot time_windows <- c(0, 1, 4, 10, 15, Inf) #Subset qusage results to most common timepoints tidy_longitudinal_qusage_young_meta_df_commontp=tidy_longitudinal_qusage_young_meta_df[tidy_longitudinal_qusage_young_meta_df$time_post_last_vax %in% common_tp,] plot_df <- tidy_longitudinal_qusage_young_meta_df_commontp %>% dplyr::filter(geneset %in% genesets_shared_young_common) %>% dplyr::mutate(pvt_tp = paste0(pathogen, " (", vaccine_type, ")", " Day ", time_post_last_vax)) %>% dplyr::left_join(Biobase::pData(young.noNorm.noResponse) %>% dplyr::select(pathogen, vaccine_type, adjuvant) %>% dplyr::distinct(), by = c("pathogen", "vaccine_type")) m <- plot_df %>% dplyr::select(pvt_tp, geneset, activity_score) %>% tidyr::pivot_wider(names_from = "pvt_tp", values_from = "activity_score") %>% tibble::column_to_rownames(var = "geneset") %>% as.matrix() m_fdr <- plot_df %>% dplyr::select(pvt_tp, geneset, FDR) %>% tidyr::pivot_wider(names_from = "pvt_tp", values_from = "FDR") %>% tibble::column_to_rownames(var = "geneset") %>% as.matrix() # Setup color gradient paletteLength <- 50 myColor <- colorRampPalette(c("navy", "white", "firebrick3"))(paletteLength) myBreaks <- c( seq(-max(abs(m)), 0, length.out=ceiling(paletteLength/2) + 1), seq(max(abs(m))/paletteLength, max(m), length.out=floor(paletteLength/2))) # Column annotations myColAnnot <- plot_df %>% dplyr::select(time_post_last_vax, pvt_tp, pathogen, vaccine_type, adjuvant) %>% dplyr::distinct() %>% dplyr::arrange(time_post_last_vax, pathogen, vaccine_type, adjuvant) %>% tibble::column_to_rownames(var = "pvt_tp") m <- m[, rownames(myColAnnot)] cor.dist <- function(x, method = NA) { stopifnot(is.na(method)) stopifnot(is.matrix(x)) as.dist((1-cor(t(x)))/2) } cluster_rows <- hclust(d = cor.dist(m)) cluster_df <- cutree(tree = cluster_rows, k = n_clusters) %>% as.data.frame() %>% tibble:::rownames_to_column(var = "geneset") %>% dplyr::rename(cluster_nr = ".") vaccine_type <- unique(plot_df$vaccine_type) pathogen <- unique(plot_df$pathogen) adjuvant <- unique(plot_df$adjuvant) myAnnotColors <- list( vaccine_type = adj_path_vt_colors$vaccine_type[vaccine_type], pathogen = adj_path_vt_colors$pathogen[pathogen], adjuvant = adj_path_vt_colors$adjuvant[adjuvant], cluster = setNames(ptol_pal()(n_clusters), nm = as.character(1:n_clusters)) ) #Manually change cluster labels for aesthetics cluster_df$cluster_nr[cluster_df$cluster_nr==4]=5 cluster_df$cluster_nr[cluster_df$cluster_nr==2]=4 cluster_df$cluster_nr[cluster_df$cluster_nr==3]=2 cluster_df$cluster_nr[cluster_df$cluster_nr==5]=3 p <- col_grouped_pheatmap( mat = m, col_groups = cut(myColAnnot$time_post_last_vax, time_windows), col_dist_fun = cor.dist, method = NA, color = myColor, breaks = myBreaks, cluster_rows = cluster_rows, display_numbers = ifelse(test = is.na(m_fdr) | m_fdr >= FDR_THRESHOLD, yes = " ", no = "*"), annotation_col = myColAnnot %>% dplyr::select(pathogen, vaccine_type, adjuvant), annotation_row = cluster_df %>% dplyr::rename(cluster = cluster_nr) %>% dplyr::mutate(cluster = as.character(cluster)) %>% tibble::column_to_rownames(var = "geneset"), annotation_colors = myAnnotColors, main = "", fontsize = 22, fontsize_row = 18, fontsize_col = 18, fontsize_number = 22) p <- pheatmap_add_labels( p = p, annot_labels = adj_path_vt_abbv) print(p)
Figure 2b-c, Ext. Data Figure 1c-d. Kinetics of the mean FC of module clusters across vaccines.
#Create color/linetype paired list for pathogen/vaccine type combination #Reorder vaccines to keep consistent with prior versions pt_order=c(1,3,4,5,2,6,10,7,11,12,13,8,9) cols=colorRampPalette(brewer.pal(ceiling(length(unique(eset$pt))/3), "Set1")) pt_line_color_anno=list('color'=setNames(rep(cols(ceiling(length(unique(eset$pt))/3)),3), unique(eset$pt)[pt_order]), 'linetype'=setNames(rep(c('solid','dashed','dotted'),ceiling(length(unique(eset$pt))/3)), unique(eset$pt)[pt_order])) #Create list from BTM clustering in common BTM heatmap BTM_clust=lapply(unique(cluster_df$cluster_nr), function(x) cluster_df$geneset[which(cluster_df$cluster_nr==x)]) names(BTM_clust)=unique(cluster_df$cluster_nr) #Get BTM FC data for common timepoints common_tp=c(1,3,7,14,21) exp_FC_BTM_commonTP=exp_FC_BTM[tp_FC %in% common_tp] #Collapse BTM FC data to cluster-level exp_FC_BTM_cluster=lapply(exp_FC_BTM_commonTP, function(x) {z=do.call(rbind, lapply(BTM_clust, function(y) colMeans(exprs(x[match(y,rownames(x)),])))); ExpressionSet(as.matrix(z), x@phenoData)}) #For each study, average expression across all subjects matrix_uni_tp=lapply(exp_FC_BTM_cluster,function(x) x$matrix[!duplicated(x$matrix)]) #Store study metadata matrix_uni_tp_metaData=lapply(exp_FC_BTM_cluster,function(x) pData(x)[!duplicated(x$matrix),] %>% rownames_to_column() %>% dplyr::select(matrix, vaccine, vaccine_type, pathogen, adjuvant, pt) %>% column_to_rownames(var = "matrix")) #Find sample indices per matrix matrix_ind=lapply(1:length(exp_FC_BTM_cluster), function(x) lapply(1:length(matrix_uni_tp[[x]]), function(y) which(matrix_uni_tp[[x]][[y]]==exp_FC_BTM_cluster[[x]]$matrix))) #Study size exp_FC_BTM_cluster_n=lapply(1:length(exp_FC_BTM_cluster), function(x) sapply(1:length(matrix_uni_tp[[x]]), function(y) length(matrix_ind[[x]][[y]]))) #Average FC exp_FC_BTM_cluster_mean=lapply(1:length(exp_FC_BTM_cluster), function(x) sapply(1:length(matrix_uni_tp[[x]]), function(y) rowMeans(exprs(exp_FC_BTM_cluster[[x]][,matrix_ind[[x]][[y]]])))) #Find unique pathogen/vaccine types by timepoint pt_uni_tp=lapply(matrix_uni_tp_metaData,function(x) x$pt[!duplicated(x$pt)]) pt_metaData=lapply(1:length(matrix_uni_tp_metaData), function(x) dplyr::select( matrix_uni_tp_metaData[[x]][!duplicated(matrix_uni_tp_metaData[[x]]$pt),], vaccine_type, pathogen, adjuvant)) #Vaccine size exp_FC_BTM_cluster_pt_n=lapply(1:length(exp_FC_BTM_cluster), function(x) sapply(1:length(pt_uni_tp[[x]]), function(y) length(which(exp_FC_BTM_cluster[[x]]$pt==pt_uni_tp[[x]][y])))) #Add weight (1/vaccine sample size) to pData for all samples n_weight=lapply(1:length(exp_FC_BTM_cluster), function(x) sapply(1:ncol(exp_FC_BTM_cluster[[x]]), function(y) exp_FC_BTM_cluster_pt_n[[x]][which(pt_uni_tp[[x]]==exp_FC_BTM_cluster[[x]]$pt[y])])) for (i in 1:length(exp_FC_BTM_cluster)) { exp_FC_BTM_cluster[[i]]$n_weight=1/n_weight[[i]] } #Compute mean FC and p values per cluster per vaccine (pt-pathogen/vaccine type) pt_ind=vector("list",length(exp_FC_BTM_cluster)) pt_FC_mean=vector("list",length(exp_FC_BTM_cluster)) for (i in 1:length(exp_FC_BTM_cluster)) { #For each vaccine pt_FC_mean[[i]]=matrix(NA, nrow=nrow(exp_FC_BTM_cluster[[i]]), ncol=length(pt_uni_tp[[i]])) for (j in 1:length(pt_uni_tp[[i]])) { pt_ind[[i]][[j]]=which(pt_uni_tp[[i]][[j]]==matrix_uni_tp_metaData[[i]]$pt) #If there is more than 1 study, compute weighted average if (length(pt_ind[[i]][[j]])>1) { #Average FCs (weighted by n) pt_FC_mean[[i]][,j]=rowWeightedMeans(exp_FC_BTM_cluster_mean[[i]][,pt_ind[[i]][[j]]], w=exp_FC_BTM_cluster_n[[i]][pt_ind[[i]][[j]]]) #Otherwise store single value } else { pt_FC_mean[[i]][,j]=exp_FC_BTM_cluster_mean[[i]][,pt_ind[[i]][[j]]] } } colnames(pt_FC_mean[[i]])=pt_uni_tp[[i]] rownames(pt_FC_mean[[i]])=rownames(exp_FC_BTM_cluster_mean[[i]]) } #Set clusters of interest, plot kinetics BTM_clust_int=names(BTM_clust) for (i in 1:length(BTM_clust_int)) { #Lookup BTM ind=which(rownames(exp_FC_BTM_cluster_mean[[1]])==BTM_clust_int[i]) #Build matrix of avg common gene exp over time per vaccine BTM_int_FC_mat=array(NA, dim=c(length(unique(eset$pt)),ncol=length(exp_FC_BTM_cluster)+1), dimnames=list(unique(eset$pt),c(0,common_tp))) #Set D0 FC to 0 BTM_int_FC_mat[,1]=0 #Add vaccine FCs at each timepoine for (j in 1:length(exp_FC_BTM_cluster_mean)) { ind_pt=match(pt_uni_tp[[j]],unique(eset$pt)) BTM_int_FC_mat[ind_pt,j+1]=pt_FC_mean[[j]][ind,] } #Plot line graph of most common DEG avg per vaccine #Convert matrix to long format BTM_int_FC_mat=data.frame(BTM_int_FC_mat) %>% tibble::rownames_to_column(var = "Vaccine") %>% pivot_longer(-Vaccine, names_to='Timepoint', values_to='FC') #Convert timepoint back to numeric BTM_int_FC_mat$Timepoint=as.numeric(str_replace(BTM_int_FC_mat$Timepoint, 'X', '')) #Plot p=ggplot(BTM_int_FC_mat[!is.na(BTM_int_FC_mat$FC),], aes(x=Timepoint, y=FC, group=Vaccine)) + geom_line(aes(linetype=Vaccine, color=Vaccine), size=1.07)+ theme_classic()+ scale_color_manual(values=pt_line_color_anno$color)+ scale_linetype_manual(values=pt_line_color_anno$linetype)+ scale_x_continuous(breaks=c(0,1,3,7,14,21))+ xlab("Day post-last vaccination")+ ylab("Mean FC (log2)")+ ggtitle(paste0('Cluster ',rownames(exp_FC_BTM_cluster_mean[[1]])[ind]))+ theme(plot.title = element_text(hjust = 0.5), text = element_text(size=20)) print(p) }
Figure 3a-c. Circos plots of the overlap in differentially expressed modules (FDR < 0.05) across vaccines on days 1 (a), 3 (b) and 7 (c). Each segment of the circle represents one vaccine, and each point in a segment represents a single module. Heat maps in the outermost ring represent the correlation with day 28 antibody responses, and bars in the middle ring represent the activity score of differentially expressed modules. Lines connect modules with a significant positive score shared between vaccines. Boxes and line colors in the innermost ring represent the functional groups of the modules. ECM, extracellular matrix.
corrMethod='pearson' tp_int=c(1,3,7) ab_response_col='MFC' #Load processed qusage results qusage_df=tidy_longitudinal_qusage_young_meta_df #Load BTM groups load(file.path(extra_data_dir,'BTM_groups.rda')) #Optional:Remove BTMs without subgroup BTM_groups=BTM_groups[sapply(BTM_groups$SUBGROUP, function(x) nchar(x)>0),] #Set BTM group as factor and manually set order BTM_groups$SUBGROUP=factor(BTM_groups$SUBGROUP, levels=c('ANTIGEN PRESENTATION', 'INFLAMMATORY/TLR/CHEMOKINES','INTERFERON/ANTIVIRAL SENSING', 'MONOCYTES', 'DC ACTIVATION', 'NEUTROPHILS', 'SIGNAL TRANSDUCTION', 'ECM AND MIGRATION', 'ENERGY METABOLISM', 'CELL CYCLE', 'NK CELLS', 'T CELLS', 'B CELLS', 'PLASMA CELLS')) #Sort BTM list by subgroup BTM_groups=BTM_groups[order(BTM_groups$SUBGROUP),] rownames(BTM_groups)=BTM_groups$NAME #Match qusage BTM names with list common_BTMs=intersect(sub('.*\\(S','S',sub('.*\\(M','M',BTM_groups$NAME)), sub('.*\\(S','S',sub('.*\\(M','M',unique(qusage_df$geneset)))) #Remove qusage BTMs without group qusage_df=qusage_df[sub('.*\\(S','S',sub('.*\\(M','M',qusage_df$geneset)) %in% common_BTMs,] #Rename to match BTM_groups qusage_df$geneset=BTM_groups$NAME[match(sub('.*\\(S','S',sub('.*\\(M','M',qusage_df$geneset)),sub('.*\\(S','S',sub('.*\\(M','M',BTM_groups$NAME)))] #Remove BTMs missing from qusage results BTM_groups=BTM_groups[sub('.*\\(S','S',sub('.*\\(M','M',BTM_groups$NAME)) %in% common_BTMs,] #Add pathogen_type (vaccine) column qusage_df$pt=paste(qusage_df$pathogen," (",qusage_df$vaccine_type,")", sep='') #Subset data to timepoints of interest qusage_df=qusage_df[qusage_df$time_post_last_vax %in% tp_int,] pt_uni=unique(qusage_df$pt) pt_uni_tp=lapply(tp_int, function(x) unique(qusage_df$pt[qusage_df$time_post_last_vax==x])) #Get vaccines with data at each timepoint #Split by timepoint qusage_df=lapply(tp_int, function(x) qusage_df[qusage_df$time_post_last_vax==x,]) #Convert to wide format pt_FC_NES=lapply(qusage_df, function(x) data.frame(spread(x[,c('pt','geneset','activity_score')], pt, activity_score))) pt_FC_NES=lapply(pt_FC_NES, function(x) {rownames(x)=x$geneset; x[,-1]}) pt_FC_NES=lapply(1:length(pt_FC_NES), function(x) {colnames(pt_FC_NES[[x]])=pt_uni_tp[[x]]; pt_FC_NES[[x]]}) pt_FC_p=lapply(qusage_df, function(x) data.frame(spread(x[,c('pt','geneset','FDR')], pt, FDR))) pt_FC_p=lapply(pt_FC_p, function(x) {rownames(x)=x$geneset; x[,-1]}) pt_FC_p=lapply(1:length(pt_FC_p), function(x) {colnames(pt_FC_p[[x]])=pt_uni_tp[[x]]; pt_FC_p[[x]]}) #Reorder qusage results to match BTM_groups pt_FC_NES=lapply(pt_FC_NES, function(x) x[match(BTM_groups$NAME,rownames(x)),]) pt_FC_p=lapply(pt_FC_p, function(x) x[match(BTM_groups$NAME,rownames(x)),]) #Generate Ab response correlation data #Load GE eset_wAb=young.noNorm.withResponse colnames(eset_wAb)=eset_wAb$uid #Create combined vaccine type/pathogen column eset_wAb$pt=paste(eset_wAb$pathogen," (",eset_wAb$vaccine_type,")", sep='') #Label columns by uid colnames(eset_wAb)=eset_wAb$uid #Find samples from timepoints of interest tp_int=c(0,tp_int) ind=lapply(tp_int, function(x) which(eset_wAb$study_time_collected==x)) #Combine indices of all timepoints of interest ind_all=Reduce(union,ind) #Retain only samples from timepoints of interest eset_wAb=eset_wAb[,ind_all] #Recompute timepoint indices after removing extraneous timepoints ind=lapply(tp_int, function(x) which(eset_wAb$study_time_collected==x)) #Remove samples from a single study with fewer than sample_cutoff samples at any timepoint sample_cutoff=3 matrix_uni_tp=lapply(ind,function(x) unique(eset_wAb[,x]$matrix)) matrix_ind=lapply(1:length(ind),function(x) lapply(1:length(matrix_uni_tp[[x]]), function(y) which(matrix_uni_tp[[x]][[y]]==eset_wAb[,ind[[x]]]$matrix))) ind_cut_all=vector() for (i in 1:length(matrix_ind)) { ind_cut=which(sapply(matrix_ind[[i]],length)<sample_cutoff) if (is_empty(ind_cut)==FALSE) { for (j in 1:length(ind_cut)){ ind_cut_all=c(ind_cut_all,ind[[i]][matrix_ind[[i]][[ind_cut[j]]]]) } } } if (is_empty(ind_cut_all)==FALSE) { eset_wAb=eset_wAb[,-ind_cut_all] } #Recompute timepoint indices after removing samples tp_int=unique(eset_wAb$study_time_collected[which(eset_wAb$study_time_collected>=0)]) ind=lapply(tp_int, function(x) which(eset_wAb$study_time_collected==x)) #Remove genes with NA eset_wAb=eset_wAb[complete.cases(exprs(eset_wAb)),] #Create unique list of studies matrix_uni=unique(eset_wAb$matrix) #Collapse to BTM level #Load BTM list BTM_list=readRDS(file = file.path(data_dir, "btm_list_2020_12_23.rds")) #match with BTM groups/qusage data BTM_list=BTM_list[na.omit(match(sub('.*\\(S','S',sub('.*\\(M','M',BTM_groups$NAME)),sub('.*\\(S','S',sub('.*\\(M','M',names(BTM_list)))))] #Collapse - find arithmetic mean of genes in each BTM for each sample exp_BTM=do.call(rbind, lapply(BTM_list, function(x) colMeans(exprs(eset_wAb[na.omit(match(x,rownames(eset_wAb))),]),na.rm=TRUE))) #Create BTM eset_wAb eset_wAb_BTM=ExpressionSet(exp_BTM, eset_wAb@phenoData) #Replace eset_wAb with BTM eset_wAb eset_wAb=eset_wAb_BTM rm(eset_wAb_BTM) #Compute D0 normalized FC ind_D0=which(0==eset_wAb$study_time_collected) common=lapply(2:length(ind),function(x) intersect(eset_wAb$participant_id[ind[[x]]],eset_wAb$participant_id[ind_D0])) ia=lapply(2:length(ind),function(x) na.omit(match(common[[x-1]],eset_wAb$participant_id[ind[[x]]]))) ib=lapply(2:length(ind),function(x) na.omit(match(common[[x-1]],eset_wAb$participant_id[ind_D0]))) exp_FC_wAb=lapply(2:length(ind),function(x) eset_wAb[,ind[[x]][ia[[x-1]]]]) exp_FC_wAb=lapply(2:length(ind),function(x) {exprs(exp_FC_wAb[[x-1]])=exprs(exp_FC_wAb[[x-1]])-exprs(eset_wAb[,ind_D0[ib[[x-1]]]]); exp_FC_wAb[[x-1]]}) #For each study, average expression across all subjects matrix_uni_tp=lapply(exp_FC_wAb,function(x) x$matrix[!duplicated(x$matrix)]) #Store study metadata matrix_uni_tp_metaData=lapply(exp_FC_wAb,function(x) pData(x)[!duplicated(x$matrix),] %>% rownames_to_column() %>% dplyr::select(matrix, vaccine, vaccine_type, pathogen, adjuvant, pt) %>% column_to_rownames(var = "matrix")) #Indices for each study matrix_ind=lapply(1:length(exp_FC_wAb), function(x) lapply(1:length(matrix_uni_tp[[x]]), function(y) which(matrix_uni_tp[[x]][[y]]==exp_FC_wAb[[x]]$matrix))) #Study size exp_FC_wAb_n=lapply(1:length(exp_FC_wAb), function(x) sapply(1:length(matrix_uni_tp[[x]]), function(y) length(matrix_ind[[x]][[y]]))) #Correlation with Ab response exp_FC_wAb_corr_stat= lapply(1:length(exp_FC_wAb), #for each timepoint function(x) sapply(1:length(matrix_uni_tp[[x]]), #for each study function(y) apply(exprs(exp_FC_wAb[[x]][,matrix_ind[[x]][[y]]]), 1, #for each geneset function(z) cor.test(z, pData(exp_FC_wAb[[x]][,matrix_ind[[x]][[y]]])[[ab_response_col]], method = corrMethod)$statistic))) #Merge mean FC, correlation/t-test test statistics by vaccine (weighted average by study size) #Find unique pathogen/vaccine types by timepoint pt_uni_tp=lapply(matrix_uni_tp_metaData,function(x) x$pt[!duplicated(x$pt)]) pt_metaData=lapply(1:length(matrix_uni_tp_metaData), function(x) dplyr::select( matrix_uni_tp_metaData[[x]][!duplicated(matrix_uni_tp_metaData[[x]]$pt),], vaccine_type, pathogen, adjuvant)) #Sample indices per vaccine pt_ind=lapply(1:length(exp_FC_wAb), function(x) sapply(1:length(pt_uni_tp[[x]]), function(y) which(pt_uni_tp[[x]][[y]]==matrix_uni_tp_metaData[[x]]$pt))) pt_corr_stat= lapply(1:length(exp_FC_wAb), function(x) sapply(1:length(pt_uni_tp[[x]]), function(y) if(length(pt_ind[[x]][[y]])>1) rowWeightedMeans(exp_FC_wAb_corr_stat[[x]][,pt_ind[[x]][[y]]], w=exp_FC_wAb_n[[x]][pt_ind[[x]][[y]]], na.rm=TRUE) else exp_FC_wAb_corr_stat[[x]][,pt_ind[[x]][[y]]])) pt_corr_stat=lapply(1:length(pt_corr_stat), function(x) {rownames(pt_corr_stat[[x]])=BTM_groups$NAME; colnames(pt_corr_stat[[x]])=pt_uni_tp[[x]]; pt_corr_stat[[x]]}) #Add vaccines with missing data at each timepoint pt_FC_NES_full=vector("list",length(pt_FC_NES)) pt_FC_p_full=vector("list",length(pt_FC_NES)) pt_corr_stat_full=vector("list",length(pt_corr_stat)) for (i in 1:length(pt_corr_stat)) { pt_FC_NES_full[[i]]=matrix(NA, ncol=length(pt_uni), nrow=nrow(pt_FC_NES[[i]]), dimnames = list(rownames(pt_FC_NES[[i]]), pt_uni)) pt_FC_NES_full[[i]][rownames(pt_FC_NES[[i]]), colnames(pt_FC_NES[[i]])]=unlist(pt_FC_NES[[i]]) pt_FC_p_full[[i]]=matrix(NA, ncol=length(pt_uni), nrow=nrow(pt_FC_p[[i]]), dimnames = list(rownames(pt_FC_p[[i]]), pt_uni)) pt_FC_p_full[[i]][rownames(pt_FC_p[[i]]), colnames(pt_FC_p[[i]])]=unlist(pt_FC_p[[i]]) pt_corr_stat_full[[i]]=matrix(NA, ncol=length(pt_uni), nrow=nrow(pt_corr_stat[[i]]), dimnames = list(rownames(pt_corr_stat[[i]]), pt_uni)) pt_corr_stat_full[[i]][rownames(pt_corr_stat[[i]]), colnames(pt_corr_stat[[i]])]=unlist(pt_corr_stat[[i]]) } pt_FC_NES=pt_FC_NES_full pt_FC_p=pt_FC_p_full pt_corr_stat=pt_corr_stat_full rm(pt_corr_stat_full, pt_FC_NES_full, pt_FC_p_full) #Update vaccine per timepoint variable pt_uni_tp=lapply(1:length(pt_FC_NES), function(x) pt_uni) #Compute overlap in sig BTMs comb=vector("list",length(pt_FC_NES)) pt_BTM_ind_sig_NES=vector("list",length(pt_FC_NES)) pt_BTM_ind_sig_up=vector("list",length(pt_FC_NES)) pt_BTM_ind_sig_down=vector("list",length(pt_FC_NES)) pt_BTM_up_overlap=vector("list",length(pt_FC_NES)) pt_BTM_down_overlap=vector("list",length(pt_FC_NES)) for (i in 1:length(pt_FC_NES)) { pt_BTM_ind_sig_NES[[i]]=vector("list",ncol(pt_FC_NES[[i]])) names(pt_BTM_ind_sig_NES[[i]])=colnames(pt_FC_NES[[i]]) for (j in 1:ncol(pt_FC_NES[[i]])) { #Find signifiant pathways pt_BTM_ind_sig_NES[[i]][[j]]=which(pt_FC_p[[i]][,j]<FDR_THRESHOLD) pt_BTM_ind_sig_up[[i]][[j]]=intersect(pt_BTM_ind_sig_NES[[i]][[j]],which(pt_FC_NES[[i]][,j]>0)) pt_BTM_ind_sig_down[[i]][[j]]=intersect(pt_BTM_ind_sig_NES[[i]][[j]],which(pt_FC_NES[[i]][,j]<0)) } #Pairwise combinations comb[[i]]=combn(seq.int(1,length(pt_uni_tp[[i]])),2,simplify = FALSE) pt_BTM_up_overlap[[i]]= lapply(comb[[i]],function(x) intersect(pt_BTM_ind_sig_up[[i]][[x[1]]] , pt_BTM_ind_sig_up[[i]][[x[2]]])) pt_BTM_down_overlap[[i]]= lapply(comb[[i]],function(x) intersect(pt_BTM_ind_sig_down[[i]][[x[1]]] , pt_BTM_ind_sig_down[[i]][[x[2]]])) } #Circos plot of BTMs #Set vaccine colors cols=colorRampPalette(brewer.pal(length(pt_uni), "Set3")) mycolors_vac=cols(length(pt_uni)) #Create link colors by BTM group cols=colorRampPalette(brewer.pal(length(unique(BTM_groups$SUBGROUP)), "Set1")) mycolors_BTM=cols(length(unique(BTM_groups$SUBGROUP))) #Set barplot limits # bar_lim=ceiling(max(abs(do.call(cbind,pt_FC_NES)), na.rm=TRUE)*2)/2 # map_lim=ceiling(max(abs(do.call(cbind,pt_corr_stat)), na.rm=TRUE)*2)/2 #Set barplot limits manually (alternate) bar_lim=1 map_lim=2.6 #Replace nonsignificant values with 0 pt_FC_NES=lapply(1:length(pt_FC_NES), function(x) {pt_FC_NES[[x]][pt_FC_p[[x]]>=FDR_THRESHOLD]=0; pt_FC_NES[[x]]}) #Replace values outside limits with limit pt_FC_NES=lapply(1:length(pt_FC_NES), function(x) {pt_FC_NES[[x]][pt_FC_NES[[x]]<(-bar_lim)]=-bar_lim; pt_FC_NES[[x]]}) pt_FC_NES=lapply(1:length(pt_FC_NES), function(x) {pt_FC_NES[[x]][pt_FC_NES[[x]]>bar_lim]=bar_lim; pt_FC_NES[[x]]}) pt_corr_stat=lapply(1:length(pt_corr_stat), function(x) {pt_corr_stat[[x]][pt_corr_stat[[x]]<(-map_lim)]=-map_lim; pt_corr_stat[[x]]}) pt_corr_stat=lapply(1:length(pt_corr_stat), function(x) {pt_corr_stat[[x]][pt_corr_stat[[x]]>map_lim]=map_lim; pt_corr_stat[[x]]}) #Set heatmap colors map_col=colorRamp2(c(-map_lim, 0, map_lim), c("black", "white", "orange")) for (i in 1:length(pt_FC_NES)) { #Create df with sectors df_sector_list=lapply(1:length(pt_uni_tp[[i]]), function(y) data.frame(factors=replicate(nrow(pt_FC_NES[[i]]),pt_uni_tp[[i]][[y]]), x=seq.int(1,nrow(pt_FC_NES[[i]])))) #Combine into one df df=bind_rows(df_sector_list) #Draw sectors circos.par(cell.padding = c(0.02, 0.1, 0.02, 0.1), circle.margin=c(0.01,0.4,0.01,0.01), track.margin=c(0,0)) #Margins with legend #circos.par(cell.padding = c(0.02, 0.1, 0.02, 0.1), circle.margin=c(0.01,0.01,0.01,0.01), track.margin=c(0,0)) #Margins without legend circos.initialize(factors = df$factors, x = df$x) circos.track(ylim = c(0, 1), track.height=0.15, bg.border=NA, panel.fun=function(x, y) { #chr=CELL_META$sector.index xlim=CELL_META$xlim ylim=CELL_META$ylim circos.rect(xlim[1], 0, xlim[2], 1, col = mycolors_vac[which(pt_uni_tp[[i]][CELL_META$sector.numeric.index]==pt_uni)]) #Add labels circos.text(nrow(pt_FC_NES[[i]])/2, 0.5, CELL_META$sector.index, CELL_META$sector.index, 1, facing = "bending.outside", niceFacing=TRUE, cex = 1.5) }) #Add Ab correlation heatmap circos.track(ylim = c(0.25,0.75), track.height=0.05, track.margin=c(0.01,0), bg.border=NA, panel.fun=function(x, y) { m=t(data.matrix(pt_corr_stat[[i]][,CELL_META$sector.numeric.index])) col_mat=map_col(m) nc=ncol(m) circos.rect(1:nc, rep(0, nc), 1:nc, rep(1, nc), border=col_mat, col=col_mat) }) #Add barplots circos.track(ylim=c(-bar_lim, bar_lim), track.height=0.1, panel.fun=function(x, y) { value=pt_FC_NES[[i]][,CELL_META$sector.numeric.index] circos.barplot(value, 1:nrow(pt_FC_NES[[i]]), col = ifelse(value > 0, '#FF0000', '#0000FF'), border = NA) #circos.yaxis(side='right') }) #Add BTM group color blocks circos.track(ylim = c(0, 1), track.height=0.05, bg.border=NA, panel.fun=function(x, y) { for (j in 1:length(unique(BTM_groups$SUBGROUP))) { #Get start/ending indices of BTM group start=match(unique(BTM_groups$SUBGROUP)[j],BTM_groups$SUBGROUP) end=length(BTM_groups$SUBGROUP)-match(unique(BTM_groups$SUBGROUP)[j],rev(BTM_groups$SUBGROUP))+1 circos.rect(start, 0, end, 1, col=mycolors_BTM[j]) } }) #Draw common DEG links (up) for (j in 1:length(pt_BTM_up_overlap[[i]])) { if (length(pt_BTM_up_overlap[[i]][[j]])>0){ for (k in 1:length(pt_BTM_up_overlap[[i]][[j]])) { if (!is.na(pt_BTM_up_overlap[[i]][[j]][k])) { circos.link(pt_uni_tp[[i]][[comb[[i]][[j]][1]]], pt_BTM_up_overlap[[i]][[j]][k], pt_uni_tp[[i]][[comb[[i]][[j]][2]]], pt_BTM_up_overlap[[i]][[j]][k], col=mycolors_BTM[[which(BTM_groups$SUBGROUP[pt_BTM_up_overlap[[i]][[j]][k]]==unique(BTM_groups$SUBGROUP))]]) } } } } #Legend lgd_points = Legend(at = pt_uni_tp[[i]], type = "points",pch=15, legend_gp = gpar(col = mycolors_vac,fill = mycolors_vac), title_position = "topleft", labels_gp = gpar(fontsize = 12), title = "Vaccine") lgd_lines = Legend(at = unique(BTM_groups$SUBGROUP), type = "lines", legend_gp = gpar(col = mycolors_BTM, lwd = 4), title_position = "topleft", labels_gp= gpar(fontsize = 14), title = "BTM", background = NA) #lgd_list_vertical = packLegend(lgd_points, lgd_lines) lgd_list_vertical = packLegend(lgd_lines) pushViewport(viewport(x = unit(40, "cm"), y = unit(40, "cm"), width = widthDetails(lgd_list_vertical), height = heightDetails(lgd_list_vertical), just = c("left", "bottom"))) grid.draw(lgd_list_vertical) upViewport() circos.clear() }
Figure 4a, Ext. Data Figure 2a-b. Correlation matrices of pairwise Spearman correlations of gene-level fold changes between vaccines at each timepoint.
#Get FC data for common timepoints common_tp=c(1,3,7,14,21) exp_FC_commonTP=exp_FC[tp_FC %in% common_tp] #For each study, average expression across all subjects matrix_uni_tp=lapply(exp_FC_commonTP,function(x) x$matrix[!duplicated(x$matrix)]) #Store study metadata matrix_uni_tp_metaData=lapply(exp_FC_commonTP,function(x) pData(x)[!duplicated(x$matrix),] %>% rownames_to_column() %>% dplyr::select(matrix, vaccine, vaccine_type, pathogen, adjuvant, pt) %>% column_to_rownames(var = "matrix")) #Find sample indices per matrix matrix_ind=lapply(1:length(exp_FC_commonTP), function(x) lapply(1:length(matrix_uni_tp[[x]]), function(y) which(matrix_uni_tp[[x]][[y]]==exp_FC_commonTP[[x]]$matrix))) #Study size exp_FC_commonTP_n=lapply(1:length(exp_FC_commonTP), function(x) sapply(1:length(matrix_uni_tp[[x]]), function(y) length(matrix_ind[[x]][[y]]))) #Average FC exp_FC_commonTP_mean=lapply(1:length(exp_FC_commonTP), function(x) sapply(1:length(matrix_uni_tp[[x]]), function(y) rowMeans(exprs(exp_FC_commonTP[[x]][,matrix_ind[[x]][[y]]])))) #Find unique pathogen/vaccine types by timepoint pt_uni_tp=lapply(matrix_uni_tp_metaData,function(x) x$pt[!duplicated(x$pt)]) pt_metaData=lapply(1:length(matrix_uni_tp_metaData), function(x) dplyr::select( matrix_uni_tp_metaData[[x]][!duplicated(matrix_uni_tp_metaData[[x]]$pt),], vaccine_type, pathogen, adjuvant)) #Vaccine size exp_FC_commonTP_pt_n=lapply(1:length(exp_FC_commonTP), function(x) sapply(1:length(pt_uni_tp[[x]]), function(y) length(which(exp_FC_commonTP[[x]]$pt==pt_uni_tp[[x]][y])))) #Add weight (1/vaccine sample size) to pData for all samples n_weight=lapply(1:length(exp_FC_commonTP), function(x) sapply(1:ncol(exp_FC_commonTP[[x]]), function(y) exp_FC_commonTP_pt_n[[x]][which(pt_uni_tp[[x]]==exp_FC_commonTP[[x]]$pt[y])])) for (i in 1:length(exp_FC_commonTP)) { exp_FC_commonTP[[i]]$n_weight=1/n_weight[[i]] } #Compute mean FC and p values per cluster per vaccine (pt-pathogen/vaccine type) pt_ind=vector("list",length(exp_FC_commonTP)) pt_FC_mean=vector("list",length(exp_FC_commonTP)) for (i in 1:length(exp_FC_commonTP)) { #For each vaccine pt_FC_mean[[i]]=matrix(NA, nrow=nrow(exp_FC_commonTP[[i]]), ncol=length(pt_uni_tp[[i]])) for (j in 1:length(pt_uni_tp[[i]])) { pt_ind[[i]][[j]]=which(pt_uni_tp[[i]][[j]]==matrix_uni_tp_metaData[[i]]$pt) #If there is more than 1 study, compute weighted average if (length(pt_ind[[i]][[j]])>1) { #Average FCs (weighted by n) pt_FC_mean[[i]][,j]=rowWeightedMeans(exp_FC_commonTP_mean[[i]][,pt_ind[[i]][[j]]], w=exp_FC_commonTP_n[[i]][pt_ind[[i]][[j]]]) #Otherwise store single value } else { pt_FC_mean[[i]][,j]=exp_FC_commonTP_mean[[i]][,pt_ind[[i]][[j]]] } } colnames(pt_FC_mean[[i]])=pt_uni_tp[[i]] rownames(pt_FC_mean[[i]])=rownames(exp_FC_commonTP_mean[[i]]) } #Compute rank-based correlation matrices at each timepoint using gene FC pt_FC_cor=lapply(1:length(pt_FC_mean), function(x) cor(pt_FC_mean[[x]], method='spearman')) #Plot correlation matrices colorLS=colorRampPalette(colors = c("blue", "white", "red"))(n = 100) breakLS=seq(from = -1, to = 1, length.out = 100) tp_plot=c(1,3,7) for (i in 1:length(tp_plot)) { df=pt_FC_cor[[which(common_tp==tp_plot[i])]] diag(df)=0 df_lim=1 p=corrplot(df, order='hclust', col=colorLS, tl.col='black', cl.lim = c(-df_lim, df_lim), #smaller label version is.corr = FALSE, tl.cex=1, cl.cex=1, title=paste0('Day ',tp_plot[i]), mar=c(0,0,2,0)) }
Figure S2c-d. Scatterplots of Day 1 gene FCs between vaccines
#Plot example scatterplots for YF vs Pnemococcus (best negative) and Malaria vs HIV (best positive) pt_plot_pairs=list(c('Yellow Fever','Pneumococcus'), c('Malaria','HIV')) for (i in 1:length(pt_plot_pairs)) { ind1=grep(pt_plot_pairs[[i]][1],pt_uni_tp[[1]]) ind2=grep(pt_plot_pairs[[i]][2], pt_uni_tp[[1]]) df=setNames(data.frame(pt_FC_mean[[1]][,c(ind1,ind2)]), c('X','Y')) p=ggscatter(df, x='X', y='Y', xlab=paste(pt_uni_tp[[1]][ind1],'Day 1 log2 FC'), ylab=paste(pt_uni_tp[[1]][ind2],'Day 1 log2 FC'), add = "reg.line", add.params = list(color = "blue", fill = "lightgray")) # Add correlation coefficient, set scales p+stat_cor(method = "pearson")+theme(text = element_text(size=20)) print(p) }
Figure 4b-c. Activity score heatmaps of modules differentially expressed in response to YF vaccination on day 1 (b, QuSAGE FDR<0.2) and day 7 (c, QuSAGE FDR<0.05, activity score>0.2)
#Collect processed qusage results qusage_df=tidy_longitudinal_qusage_young_meta_df #Subset data to timepoints of interest tp_plot=c(1,7) qusage_df=qusage_df[qusage_df$time_post_last_vax %in% tp_plot,] qusage_df$pt=paste(qusage_df$pathogen," (",qusage_df$vaccine_type,")", sep='') pt_uni=unique(qusage_df$pt) pt_uni_tp=lapply(tp_plot, function(x) unique(qusage_df$pt[qusage_df$time_post_last_vax==x])) #Split by timepoint qusage_df=lapply(tp_plot, function(x) qusage_df[qusage_df$time_post_last_vax==x,]) #Convert to wide format pt_FC_NES=lapply(qusage_df, function(x) data.frame(spread(x[,c('pt','geneset','activity_score')], pt, activity_score))) pt_FC_NES=lapply(pt_FC_NES, function(x) {rownames(x)=x$geneset; x[,-1]}) pt_FC_NES=lapply(1:length(pt_FC_NES), function(x) {colnames(pt_FC_NES[[x]])=pt_uni_tp[[x]]; pt_FC_NES[[x]]}) pt_FC_p=lapply(qusage_df, function(x) data.frame(spread(x[,c('pt','geneset','FDR')], pt, FDR))) pt_FC_p=lapply(pt_FC_p, function(x) {rownames(x)=x$geneset; x[,-1]}) pt_FC_p=lapply(1:length(pt_FC_p), function(x) {colnames(pt_FC_p[[x]])=pt_uni_tp[[x]]; pt_FC_p[[x]]}) #Reorder qusage results to match BTM_groups pt_FC_NES=lapply(pt_FC_NES, function(x) x[match(BTM_groups$NAME,rownames(x)),]) pt_FC_p=lapply(pt_FC_p, function(x) x[match(BTM_groups$NAME,rownames(x)),]) #Find significant BTMs ind_sig=vector("list",length(pt_FC_NES)) for (i in 1:length(pt_FC_NES)) { ind_sig[[i]]=vector("list",ncol(pt_FC_NES[[i]])) names(ind_sig[[i]])=colnames(pt_FC_NES[[i]]) for (j in 1:ncol(pt_FC_NES[[i]])) { if (tp_plot[i]==1) { ind=which(pt_FC_p[[i]][,j]<0.2) ind_sig[[i]][[j]]=intersect(ind,which(abs(pt_FC_NES[[i]][,j])>0)) } else if (tp_plot[i]==7) ind=which(pt_FC_p[[i]][,j]<0.05) ind_sig[[i]][[j]]=intersect(ind,which(abs(pt_FC_NES[[i]][,j])>0.2)) } } #Find all significant BTMs at each timepoint ind_sig_pertp=lapply(ind_sig, function(x) Reduce(union,x)) #Use YF only to define sig BTMs ind_sig_pertp=lapply(ind_sig, function(x) unlist(x['Yellow Fever (LV)'])) #Plot heatmap of significant BTMs for (i in 1:length(pt_FC_NES)) { df=pt_FC_NES[[i]][ind_sig_pertp[[i]],] #df_anno=pathogen_type_metaData[[i]][,c('vaccine_type', 'pathogen')] ph_lim=ceiling(max(abs(df), na.rm = TRUE)) #ha=HeatmapAnnotation(df=df_anno, col=path_vt_colors) p=Heatmap(df, #top_annotation=ha, #column_labels=pathogen_type_metaData[[i]][,'adjuvant'], column_labels=colnames(df), column_names_side="top", name='Activity Score', col=colorRamp2(c(-ph_lim, 0, ph_lim), c("blue", "white", "red")), row_names_gp = gpar(fontsize = 12), column_names_gp = gpar(fontsize = 12)) print(p) }
Figure 4D. Kinetics of the mean FC of module M75 across vaccines.
#For each study, average expression across all subjects matrix_uni_tp=lapply(exp_FC_BTM_commonTP,function(x) x$matrix[!duplicated(x$matrix)]) #Store study metadata matrix_uni_tp_metaData=lapply(exp_FC_BTM_commonTP,function(x) pData(x)[!duplicated(x$matrix),] %>% rownames_to_column() %>% dplyr::select(matrix, vaccine, vaccine_type, pathogen, adjuvant, pt) %>% column_to_rownames(var = "matrix")) #Get study indices matrix_ind=lapply(1:length(exp_FC_BTM_commonTP), function(x) lapply(1:length(matrix_uni_tp[[x]]), function(y) which(matrix_uni_tp[[x]][[y]]==exp_FC_BTM_commonTP[[x]]$matrix))) #Study size exp_FC_BTM_commonTP_n=lapply(1:length(exp_FC_BTM_commonTP), function(x) sapply(1:length(matrix_uni_tp[[x]]), function(y) length(matrix_ind[[x]][[y]]))) #Average FC exp_FC_BTM_commonTP_mean=lapply(1:length(exp_FC_BTM_commonTP), function(x) sapply(1:length(matrix_uni_tp[[x]]), function(y) rowMeans(exprs(exp_FC_BTM_commonTP[[x]][,matrix_ind[[x]][[y]]])))) #Find unique pathogen/vaccine types by timepoint pt_uni_tp=lapply(matrix_uni_tp_metaData,function(x) x$pt[!duplicated(x$pt)]) pt_ind=lapply(1:length(pt_uni_tp), function(x) lapply(1:length(pt_uni_tp[[x]]), function(y) which(pt_uni_tp[[x]][[y]]==matrix_uni_tp_metaData[[x]]$pt))) #Compute average expression for each vaccine across studies (weighted by sample size) pt_FC_mean=lapply(1:length(exp_FC_BTM_commonTP), function(x) sapply(setNames(1:length(pt_uni_tp[[x]]), pt_uni_tp[[x]]), function(y) if(length(pt_ind[[x]][[y]])>1) rowWeightedMeans(exp_FC_BTM_commonTP_mean[[x]][,pt_ind[[x]][[y]]], w=exp_FC_BTM_commonTP_n[[x]][pt_ind[[x]][[y]]], useNames=T) else exp_FC_BTM_commonTP_mean[[x]][,pt_ind[[x]][[y]]])) #Set BTM of interest list, plot kinetics BTM_int=c('M75)') #Lookup BTM ind=grep(BTM_int, rownames(exp_FC_BTM_commonTP[[1]]), fixed=TRUE) #Build matrix of avg common gene exp over time per vaccine BTM_int_FC_mat=array(NA, dim=c(length(unique(eset$pt)),ncol=length(exp_FC_BTM_commonTP)+1), dimnames=list(unique(eset$pt),c(0,common_tp))) #Set D0 FC to 0 BTM_int_FC_mat[,1]=0 for (j in 1:length(exp_FC_BTM_commonTP)) { ind_pt=match(pt_uni_tp[[j]],unique(eset$pt)) BTM_int_FC_mat[ind_pt,j+1]=pt_FC_mean[[j]][ind,] } #Convert matrix to long format BTM_int_FC_mat=data.frame(BTM_int_FC_mat) %>% tibble::rownames_to_column(var = "Vaccine") %>% pivot_longer(-Vaccine, names_to='Timepoint', values_to='FC') #Convert timepoint back to numeric BTM_int_FC_mat$Timepoint=as.numeric(str_replace(BTM_int_FC_mat$Timepoint, 'X', '')) #Plot p=ggplot(BTM_int_FC_mat[!is.na(BTM_int_FC_mat$FC),], aes(x=Timepoint, y=FC, group=Vaccine)) + geom_line(aes(linetype=Vaccine, color=Vaccine), size=1.07)+ theme_classic()+ scale_color_manual(values=pt_line_color_anno$color)+ scale_linetype_manual(values=pt_line_color_anno$linetype)+ scale_x_continuous(breaks=c(0,common_tp))+ xlab("Day post-last vaccination")+ ylab("Mean FC (log2)")+ ggtitle(rownames(exp_FC_BTM[[1]][ind,]))+ theme(plot.title = element_text(hjust = 0.5), text = element_text(size=12)) print(p)
Figure 4e. Heatmaps of the post-vaccination FC of genes in module M75
#Get FC data for common timepoints common_tp=c(1,3,7,14,21) exp_FC_commonTP=exp_FC[tp_FC %in% common_tp] #For each study, average expression across all subjects matrix_uni_tp=lapply(exp_FC_commonTP,function(x) x$matrix[!duplicated(x$matrix)]) #Store study metadata matrix_uni_tp_metaData=lapply(exp_FC_commonTP,function(x) pData(x)[!duplicated(x$matrix),] %>% rownames_to_column() %>% dplyr::select(matrix, vaccine, vaccine_type, pathogen, adjuvant, pt) %>% column_to_rownames(var = "matrix")) #Find sample indices per matrix matrix_ind=lapply(1:length(exp_FC_commonTP), function(x) lapply(1:length(matrix_uni_tp[[x]]), function(y) which(matrix_uni_tp[[x]][[y]]==exp_FC_commonTP[[x]]$matrix))) #Study size exp_FC_commonTP_n=lapply(1:length(exp_FC_commonTP), function(x) sapply(1:length(matrix_uni_tp[[x]]), function(y) length(matrix_ind[[x]][[y]]))) #Average FC exp_FC_commonTP_mean=lapply(1:length(exp_FC_commonTP), function(x) sapply(1:length(matrix_uni_tp[[x]]), function(y) rowMeans(exprs(exp_FC_commonTP[[x]][,matrix_ind[[x]][[y]]])))) #Find unique pathogen/vaccine types by timepoint pt_uni_tp=lapply(matrix_uni_tp_metaData,function(x) x$pt[!duplicated(x$pt)]) pt_metaData=lapply(1:length(matrix_uni_tp_metaData), function(x) dplyr::select( matrix_uni_tp_metaData[[x]][!duplicated(matrix_uni_tp_metaData[[x]]$pt),], vaccine_type, pathogen, adjuvant)) #Vaccine size exp_FC_commonTP_pt_n=lapply(1:length(exp_FC_commonTP), function(x) sapply(1:length(pt_uni_tp[[x]]), function(y) length(which(exp_FC_commonTP[[x]]$pt==pt_uni_tp[[x]][y])))) #Compute mean FC and p values per cluster per vaccine (pt-pathogen/vaccine type) pt_ind=vector("list",length(exp_FC_commonTP)) pt_FC_mean=vector("list",length(exp_FC_commonTP)) for (i in 1:length(exp_FC_commonTP)) { #For each vaccine pt_FC_mean[[i]]=matrix(NA, nrow=nrow(exp_FC_commonTP[[i]]), ncol=length(pt_uni_tp[[i]])) for (j in 1:length(pt_uni_tp[[i]])) { pt_ind[[i]][[j]]=which(pt_uni_tp[[i]][[j]]==matrix_uni_tp_metaData[[i]]$pt) #If there is more than 1 study, compute weighted average if (length(pt_ind[[i]][[j]])>1) { #Average FCs (weighted by n) pt_FC_mean[[i]][,j]=rowWeightedMeans(exp_FC_commonTP_mean[[i]][,pt_ind[[i]][[j]]], w=exp_FC_commonTP_n[[i]][pt_ind[[i]][[j]]]) #Otherwise store single value } else { pt_FC_mean[[i]][,j]=exp_FC_commonTP_mean[[i]][,pt_ind[[i]][[j]]] } } colnames(pt_FC_mean[[i]])=pt_uni_tp[[i]] rownames(pt_FC_mean[[i]])=rownames(exp_FC_commonTP_mean[[i]]) rownames(pt_metaData[[i]])=pt_uni_tp[[i]] } #Plot BTM genes BTM_int='M75' for (i in 1:length(pt_FC_mean)) { ind=with(pt_metaData[[i]], order(vaccine_type, adjuvant, pathogen)) gene_int=btm_list[[grep(BTM_int,names(btm_list))]] #Create data frame and annotation df=pt_FC_mean[[i]][na.omit(match(gene_int,rownames(eset))),ind] df_anno=pt_metaData[[i]][ind, c('vaccine_type', 'pathogen', 'adjuvant')] ph_lim=ceiling(max(abs(df), na.rm = TRUE)) ph_lim=2 colorLS=colorRampPalette(colors = c("blue", "white", "red"))(n = 100) breakLS=seq(from = -ph_lim, to = ph_lim, length.out = 100) #Plotting with pheatmap p=pheatmap::pheatmap(mat = df, color = colorLS, breaks = breakLS, cluster_cols = FALSE, cluster_rows = FALSE, clustering_distance_rows = "euclidean", clustering_distance_cols = "euclidean", clustering_method = "ward.D2", show_colnames = TRUE, show_rownames = TRUE, annotation_col = df_anno, annotation_colors = adj_path_vt_colors, fontsize = 10, cellheight = 10, cellwidth = 15, main=paste0('Day ',common_tp[i])) }
Ext. Data Figure 2e. Boxplot of Day 1 FCs in xCell estimated B cell frequencies across vaccines
common_tp=c(0,1,3,7,14,21) cellFreq_p_cutoff=0.2 #cutoff for discarding cell frequency estimations below a confidence threshold #Load xCell data freqDF=read.delim(file.path(extra_data_dir,'xcell_Vax_noResponse.csv'), sep=',', row.names=1) freqDF_p=read.delim(file.path(extra_data_dir,'pval_xcell_Vax_noResponse.csv'), sep=',', row.names=1) #Replace nonsignificant cell scores with NA freqDF[freqDF_p>=cellFreq_p_cutoff]=NA #Select only subjects with both pData and est freq, create expressionset common=intersect(colnames(eset), colnames(freqDF)) eset_freq=ExpressionSet(as.matrix(log2(freqDF[,match(common,colnames(freqDF))])), eset@phenoData[match(common,colnames(eset))]) #Recompute timepoint indices after removing samples ind=lapply(common_tp, function(x) which(eset_freq$time_post_last_vax==x)) #Compute D0 normalized FC ind_D0=which(0==eset_freq$time_post_last_vax) common=lapply(2:length(ind),function(x) intersect(eset_freq$participant_id[ind[[x]]],eset_freq$participant_id[ind_D0])) ia=lapply(2:length(ind),function(x) na.omit(match(common[[x-1]],eset_freq$participant_id[ind[[x]]]))) ib=lapply(2:length(ind),function(x) na.omit(match(common[[x-1]],eset_freq$participant_id[ind_D0]))) exp_FC_freq=lapply(2:length(ind),function(x) eset_freq[,ind[[x]][ia[[x-1]]]]) exp_FC_freq=lapply(2:length(ind),function(x) {exprs(exp_FC_freq[[x-1]])=exprs(exp_FC_freq[[x-1]])-exprs(eset_freq[,ind_D0[ib[[x-1]]]]); exp_FC_freq[[x-1]]}) #For each study, average expression across all subjects matrix_uni_tp_freq=lapply(exp_FC_freq,function(x) x$matrix[!duplicated(x$matrix)]) #Store study metadata matrix_uni_tp_metaData_freq=lapply(exp_FC_freq,function(x) pData(x)[!duplicated(x$matrix),] %>% rownames_to_column() %>% dplyr::select(matrix, vaccine, vaccine_type, pathogen, adjuvant, pt) %>% column_to_rownames(var = "matrix")) #Compute average expression/t-statistic matrix_ind_freq=lapply(1:length(exp_FC_freq), function(x) lapply(1:length(matrix_uni_tp_freq[[x]]), function(y) which(matrix_uni_tp_freq[[x]][[y]]==exp_FC_freq[[x]]$matrix))) #Study size exp_FC_freq_n=lapply(1:length(exp_FC_freq), function(x) sapply(1:length(matrix_uni_tp_freq[[x]]), function(y) length(matrix_ind_freq[[x]][[y]]))) #Average FC exp_FC_freq_mean=lapply(1:length(exp_FC_freq), function(x) sapply(1:length(matrix_uni_tp_freq[[x]]), function(y) rowMeans(exprs(exp_FC_freq[[x]][,matrix_ind_freq[[x]][[y]]])))) #t-test/p value exp_FC_freq_p_less= lapply(1:length(exp_FC_freq), function(x) sapply(1:length(matrix_uni_tp_freq[[x]]), function(y) apply(exprs(exp_FC_freq[[x]][,matrix_ind_freq[[x]][[y]]]), 1, function(z) if(length(which(!is.na(z)))>2) t.test(z, alternative='less')$p.value else NA))) exp_FC_freq_p_greater= lapply(1:length(exp_FC_freq), function(x) sapply(1:length(matrix_uni_tp_freq[[x]]), function(y) apply(exprs(exp_FC_freq[[x]][,matrix_ind_freq[[x]][[y]]]), 1, function(z) if(length(which(!is.na(z)))>2) t.test(z, alternative='greater')$p.value else NA))) #Replace NaN p values with 0.9999 exp_FC_freq_p_less=lapply(exp_FC_freq_p_less, function(x) replace(x,is.nan(x),0.9999)) exp_FC_freq_p_greater=lapply(exp_FC_freq_p_greater, function(x) replace(x,is.nan(x),0.9999)) #Replace 1 p values with 0.9999 (for compatibility with sumz) exp_FC_freq_p_less=lapply(exp_FC_freq_p_less, function(x) replace(x,x>0.9999,0.9999)) exp_FC_freq_p_greater=lapply(exp_FC_freq_p_greater, function(x) replace(x,x>0.9999,0.9999)) ##Compute DE cells by pathogen/vaccine type #Integrate p values across multiple studies by Stouffer's method #Find unique pathogen/vaccine types by timepoint pt_uni_tp_freq=lapply(matrix_uni_tp_metaData_freq,function(x) x$pt[!duplicated(x$pt)]) pt_metaData_freq=lapply(1:length(matrix_uni_tp_metaData_freq), function(x) dplyr::select( matrix_uni_tp_metaData_freq[[x]][!duplicated(matrix_uni_tp_metaData_freq[[x]]$pt),], vaccine_type, pathogen, adjuvant)) #Vaccine size exp_FC_freq_pt_n=lapply(1:length(exp_FC_freq), function(x) sapply(1:length(pt_uni_tp_freq[[x]]), function(y) length(which(exp_FC_freq[[x]]$pt==pt_uni_tp_freq[[x]][y])))) #Add weight (1/vaccine sample size) to pData for all samples n_weight_freq=lapply(1:length(exp_FC_freq), function(x) sapply(1:ncol(exp_FC_freq[[x]]), function(y) exp_FC_freq_pt_n[[x]][which(pt_uni_tp_freq[[x]]==exp_FC_freq[[x]]$pt[y])])) for (i in 1:length(exp_FC_freq)) { exp_FC_freq[[i]]$n_weight_freq=1/n_weight_freq[[i]] } pt_ind_freq=vector("list",length(exp_FC_freq)) pt_FC_mean_freq=vector("list",length(exp_FC_freq)) pt_p_freq=vector("list",length(exp_FC_freq)) pt_q_freq=vector("list",length(exp_FC_freq)) for (i in 1:length(exp_FC_freq)) { #For each pathogen/type combo pt_FC_mean_freq[[i]]=matrix(NA, nrow=nrow(eset_freq), ncol=length(pt_uni_tp_freq[[i]])) pt_p_freq[[i]]=matrix(NA, nrow=nrow(eset_freq), ncol=length(pt_uni_tp_freq[[i]])) pt_q_freq[[i]]=matrix(NA, nrow=nrow(eset_freq), ncol=length(pt_uni_tp_freq[[i]])) for (j in 1:length(pt_uni_tp_freq[[i]])) { pt_ind_freq[[i]][[j]]=which(pt_uni_tp_freq[[i]][[j]]==matrix_uni_tp_metaData_freq[[i]]$pt) #If there is more than 1 study, integrate p values by Stouffer's method if (length(pt_ind_freq[[i]][[j]])>1) { #Average FCs (weighted by n) pt_FC_mean_freq[[i]][,j]=rowWeightedMeans(exp_FC_freq_mean[[i]][,pt_ind_freq[[i]][[j]]], w=exp_FC_freq_n[[i]][pt_ind_freq[[i]][[j]]], na.rm=T) #Directional Stouffer's method using sumz function (weighted by sqrt(n)) pt_p_freq_less=vapply(1:nrow(eset_freq), FUN.VALUE=1, function(x) sumz(exp_FC_freq_p_less[[i]][x,pt_ind_freq[[i]][[j]]],sqrt(exp_FC_freq_n[[i]][pt_ind_freq[[i]][[j]]]), na.action=na.omit)$p) pt_p_freq_greater=vapply(1:nrow(eset_freq), FUN.VALUE=1, function(x) sumz(exp_FC_freq_p_greater[[i]][x,pt_ind_freq[[i]][[j]]],sqrt(exp_FC_freq_n[[i]][pt_ind_freq[[i]][[j]]]), na.action=na.omit)$p) pt_p_freq[[i]][,j]=2*pmin(pt_p_freq_less,pt_p_freq_greater) #Otherwise store single p values } else { pt_FC_mean_freq[[i]][,j]=exp_FC_freq_mean[[i]][,pt_ind_freq[[i]][[j]]] pt_p_freq[[i]][,j]=2*pmin(exp_FC_freq_p_less[[i]][,pt_ind_freq[[i]][[j]]],exp_FC_freq_p_greater[[i]][,pt_ind_freq[[i]][[j]]]) } #Correct p values >1 pt_p_freq[[i]][(pt_p_freq[[i]][,j]>1),j]=1 #Convert p values to q values (FDR correction) pt_q_freq[[i]][,j]=p.adjust(pt_p_freq[[i]][,j],method="BH") #BH method } colnames(pt_FC_mean_freq[[i]])=pt_uni_tp_freq[[i]] rownames(pt_FC_mean_freq[[i]])=rownames(fData(eset_freq)) colnames(pt_p_freq[[i]])=pt_uni_tp_freq[[i]] rownames(pt_p_freq[[i]])=rownames(fData(eset_freq)) colnames(pt_q_freq[[i]])=pt_uni_tp_freq[[i]] rownames(pt_q_freq[[i]])=rownames(fData(eset_freq)) rownames(pt_metaData_freq[[i]])=pt_uni_tp_freq[[i]] } #Plot boxplot for total B cells on D1 cells_plot=c('B-cells') for (i in 1:length(cells_plot)) { ind=match(cells_plot[i], rownames(eset_freq)) df=data.frame(value=exprs(exp_FC_freq[[1]])[ind,], vaccine=exp_FC_freq[[1]]$pt) p=ggplot(df, aes(x=vaccine, y=value, fill=vaccine)) + geom_boxplot(outlier.shape=NA,width=0.5,notch=FALSE, alpha=0.3)+ geom_jitter(pch=21, width=0.1)+ ylab('log2(FC)')+ xlab('')+ ggtitle(paste0('Day 1 ',cells_plot[i]))+ theme_bw()+ theme(axis.text.x = element_text(angle = 90, vjust = 0.5, hjust=1), plot.title = element_text(hjust = 0.5), legend.position = "none") print(p) }
Ext. Data Figure 2f-g. Kinetics of the mean FC of modules M47.0 and M75 across Yellow Fever vaccine studies.
#Keep only YF studies from FC data exp_FC_BTM_YF=lapply(exp_FC_BTM, function(x){y=x[,x$pathogen=='Yellow Fever']; y}) #Add vaccine+study column to pData exp_FC_BTM_YF=lapply(exp_FC_BTM_YF, function(x){x$pt_sdy=paste(x$pt, x$study_accession); x}) #Keep only up to day 14 exp_FC_BTM_YF=exp_FC_BTM_YF[which(tp_FC %in% c(1,3,7,14))] #For each matrix, average expression across all subjects matrix_uni_tp_YF=lapply(exp_FC_BTM_YF,function(x) x$matrix[!duplicated(x$matrix)]) #Store matrix metadata matrix_uni_tp_YF_metaData=lapply(exp_FC_BTM_YF,function(x) pData(x)[!duplicated(x$matrix),] %>% rownames_to_column() %>% dplyr::select(matrix, vaccine, vaccine_type, pathogen, adjuvant, pt_sdy) %>% column_to_rownames(var = "matrix")) #Find matrix indices matrix_ind_YF=lapply(1:length(exp_FC_BTM_YF), function(x) lapply(1:length(matrix_uni_tp_YF[[x]]), function(y) which(matrix_uni_tp_YF[[x]][[y]]==exp_FC_BTM_YF[[x]]$matrix))) #Matrix size exp_FC_BTM_YF_n=lapply(1:length(exp_FC_BTM_YF), function(x) sapply(1:length(matrix_uni_tp_YF[[x]]), function(y) length(matrix_ind_YF[[x]][[y]]))) #Average FC exp_FC_BTM_YF_mean=lapply(1:length(exp_FC_BTM_YF), function(x) sapply(1:length(matrix_uni_tp_YF[[x]]), function(y) rowMeans(exprs(exp_FC_BTM_YF[[x]][,matrix_ind_YF[[x]][[y]]])))) #Find unique pathogen/vaccine types by timepoint pt_uni_tp_YF=lapply(matrix_uni_tp_YF_metaData,function(x) x$pt_sdy[!duplicated(x$pt_sdy)]) pt_metaData_YF=lapply(1:length(matrix_uni_tp_YF_metaData), function(x) dplyr::select( matrix_uni_tp_YF_metaData[[x]][!duplicated(matrix_uni_tp_YF_metaData[[x]]$pt_sdy),], vaccine_type, pathogen, adjuvant)) #Vaccine size exp_FC_BTM_YF_pt_n=lapply(1:length(exp_FC_BTM_YF), function(x) sapply(1:length(pt_uni_tp_YF[[x]]), function(y) length(which(exp_FC_BTM_YF[[x]]$pt_sdy==pt_uni_tp_YF[[x]][y])))) #Add weight (1/vaccine sample size) to pData for all samples n_weight=lapply(1:length(exp_FC_BTM_YF), function(x) sapply(1:ncol(exp_FC_BTM_YF[[x]]), function(y) exp_FC_BTM_YF_pt_n[[x]][which(pt_uni_tp_YF[[x]]==exp_FC_BTM_YF[[x]]$pt_sdy[y])])) for (i in 1:length(exp_FC_BTM_YF)) { exp_FC_BTM_YF[[i]]$n_weight=1/n_weight[[i]] } pt_ind_YF=vector("list",length(exp_FC_BTM_YF)) pt_FC_mean_BTM_YF=vector("list",length(exp_FC_BTM_YF)) for (i in 1:length(exp_FC_BTM_YF)) { #For each pathogen/type combo pt_FC_mean_BTM_YF[[i]]=matrix(NA, nrow=nrow(exp_FC_BTM_YF[[1]]), ncol=length(pt_uni_tp_YF[[i]])) for (j in 1:length(pt_uni_tp_YF[[i]])) { pt_ind_YF[[i]][[j]]=which(pt_uni_tp_YF[[i]][[j]]==matrix_uni_tp_YF_metaData[[i]]$pt_sdy) #If there is more than 1 study if (length(pt_ind_YF[[i]][[j]])>1) { #Average FCs (weighted by n) pt_FC_mean_BTM_YF[[i]][,j]=rowWeightedMeans(exp_FC_BTM_YF_mean[[i]][,pt_ind_YF[[i]][[j]]], w=exp_FC_BTM_YF_n[[i]][pt_ind_YF[[i]][[j]]]) #Otherwise store single p values } else { pt_FC_mean_BTM_YF[[i]][,j]=exp_FC_BTM_YF_mean[[i]][,pt_ind_YF[[i]][[j]]] } } colnames(pt_FC_mean_BTM_YF[[i]])=pt_uni_tp_YF[[i]] rownames(pt_FC_mean_BTM_YF[[i]])=rownames(rownames(exp_FC_BTM_YF[[1]])) rownames(pt_metaData_YF[[i]])=pt_uni_tp_YF[[i]] } #Set BTM of interest list, plot kinetics BTM_int=c('M47.0)', 'M75)') for (i in 1:length(BTM_int)) { #Lookup BTM ind=grep(BTM_int[i], names(btm_list), fixed=TRUE) #Build matrix of avg common gene exp over time per vaccine BTM_int_FC_mat=array(NA, dim=c(length(unique(exp_FC_BTM_YF[[2]]$pt_sdy)),ncol=length(exp_FC_BTM_YF)+1), dimnames=list(unique(exp_FC_BTM_YF[[2]]$pt_sdy),c(0,1,3,7,14))) #Set D0 FC to 0 BTM_int_FC_mat[,1]=0 #Lookup values for other timepoints for (j in 1:length(exp_FC_BTM_YF)) { ind_pt=match(pt_uni_tp_YF[[j]],unique(exp_FC_BTM_YF[[2]]$pt_sdy)) BTM_int_FC_mat[ind_pt,j+1]=pt_FC_mean_BTM_YF[[j]][ind,] } #Plot line graph #Convert matrix to long format BTM_int_FC_mat=data.frame(BTM_int_FC_mat) %>% tibble::rownames_to_column(var = "Study") %>% pivot_longer(-Study, names_to='Timepoint', values_to='FC') #Convert timepoint back to numeric BTM_int_FC_mat$Timepoint=as.numeric(str_replace(BTM_int_FC_mat$Timepoint, 'X', '')) #Add vaccine column BTM_int_FC_mat$Vaccine=gsub(' SDY.*','',BTM_int_FC_mat$Study) #Fix study labels BTM_int_FC_mat$Study=str_replace_all(BTM_int_FC_mat$Study, 'Yellow Fever \\(Live attenuated\\) ', '') #Plot p=ggplot(BTM_int_FC_mat, aes(x=Timepoint, y=FC, group=Study)) + geom_line(aes(color=Study), size=1.07)+ theme_classic()+ scale_x_continuous(breaks=c(0,1,3,7,14))+ xlab("Day post-last vaccination")+ ylab("Mean FC (log2)")+ ggtitle(names(btm_list[ind]))+ theme(plot.title = element_text(hjust = 0.5), text = element_text(size=20)) print(p) }
Prepare data
basetiter_high_cut=0.7 #Baseline titer quantile cutoffs basetiter_low_cut=1-basetiter_high_cut #Geometric mean function gm_mean = function(x, na.rm=TRUE, zero.propagate = FALSE){ if(any(x < 0, na.rm = TRUE)){ return(NaN) } if(zero.propagate){ if(any(x == 0, na.rm = TRUE)){ return(0) } exp(mean(log(x), na.rm = na.rm)) } else { exp(sum(log(x[x > 0]), na.rm=na.rm) / length(x)) } } #Load BTMs/groups BTM_list=readRDS(file = file.path(data_dir, "btm_list_2020_12_23.rds")) #Load BTM groups load(file.path(extra_data_dir,'BTM_groups.rda')) #Match names with list BTM_groups=BTM_groups[match(sub('.*\\(S','S',sub('.*\\(M','M',names(BTM_list))),sub('.*\\(S','S',sub('.*\\(M','M',BTM_groups$NAME))),] names(BTM_list)=BTM_groups$NAME #Set BTM group as factor and manually set order BTM_groups$SUBGROUP=factor(BTM_groups$SUBGROUP, levels=c('ANTIGEN PRESENTATION', 'INFLAMMATORY/TLR/CHEMOKINES','INTERFERON/ANTIVIRAL SENSING', 'MONOCYTES', 'DC ACTIVATION', 'NEUTROPHILS', 'SIGNAL TRANSDUCTION', 'ECM AND MIGRATION', 'ENERGY METABOLISM', 'CELL CYCLE', 'NK CELLS', 'T CELLS', 'B CELLS', 'PLASMA CELLS')) #Sort BTM list by subgroup BTM_list=BTM_list[order(BTM_groups$SUBGROUP)] BTM_groups=BTM_groups[order(BTM_groups$SUBGROUP),] rownames(BTM_groups)=BTM_groups$NAME #Set BTM colors cols=colorRampPalette(brewer.pal(length(levels(BTM_groups$SUBGROUP)), "Set2")) mycolors_BTM=setNames(cols(length(levels(BTM_groups$SUBGROUP))), levels(BTM_groups$SUBGROUP)) #Load datasets eset=readRDS(file.path(data_dir, paste0(date_prefix, "young_noNorm_eset.rds"))) eset_norm=readRDS(file.path(data_dir, paste0(date_prefix, "young_norm_eset.rds"))) #Clean data #Create combined vaccine type/pathogen column eset$pt=paste(eset$pathogen," (",eset$vaccine_type,")", sep='') eset_norm$pt=paste(eset_norm$pathogen," (",eset_norm$vaccine_type,")", sep='') #Subset to flu only eset=eset[,eset$pt=='Influenza (Inactivated)'] eset_norm=eset_norm[,eset_norm$pt=='Influenza (Inactivated)'] #Set SDY1276 cohorts as separate SDY for processing eset$study_accession[eset$arm_accession=='ARM4431']='SDY1276_V' eset$study_accession[eset$arm_accession=='ARM4428']='SDY1276_D' eset_norm$study_accession[eset_norm$arm_accession=='ARM4431']='SDY1276_V' eset_norm$study_accession[eset_norm$arm_accession=='ARM4428']='SDY1276_D' #Set SDY180 groups as the same matrix eset$matrix[eset$matrix=='SDY180_WholeBlood_Grp2Fluzone_Geo']='SDY180_WholeBlood_Grp1Fluzone_Geo' eset_norm$matrix[eset_norm$matrix=='SDY180_WholeBlood_Grp2Fluzone_Geo']='SDY180_WholeBlood_Grp1Fluzone_Geo' #Load titer data immdata=readRDS(file.path(data_dir, paste0(date_prefix, "immdata_all.rds"))) #Convert day -7 to 0 for easy matching immdata=lapply(immdata, function(x) {x$study_time_collected[x$study_time_collected==-7]=0; x}) #Set SDY1276 cohorts as separate immdata=lapply(immdata, function(x) {x$study_accession[x$arm_accession=='ARM4431']='SDY1276_V'; x}) immdata=lapply(immdata, function(x) {x$study_accession[x$arm_accession=='ARM4428']='SDY1276_D'; x}) #Retain only D0 titers immdata=lapply(immdata, function(x) x[x$study_time_collected==0]) #For each study, ID Ab assay and assign geom mean baseline titer/group for each subject sdy_uni=unique(eset$study_accession) ab_d0_gmt_all=data.frame() for (i in 1:length(sdy_uni)) { #Check for hai titers, then nAb, then elisa if (sdy_uni[i] %in% immdata$hai$study_accession) { assay_mode='hai' } else if (sdy_uni[i] %in% immdata$neut_ab_titer$study_accession) { assay_mode='neut_ab_titer' } else if (sdy_uni[i] %in% immdata$elisa$study_accession) { assay_mode='elisa' } else { assay_mode=NA } if (is.na(assay_mode)==F) { #Get titers ab_data=immdata[[assay_mode]][immdata[[assay_mode]]$study_accession==sdy_uni[i]] %>% dplyr::select(participant_id, virus, value_preferred) #For each subject, get GMT ab_d0_gmt=ab_data %>% group_by(participant_id) %>% summarise(d0_gmt=gm_mean(as.numeric(value_preferred))) #Define baseline titer groups ab_d0_gmt$basetiter_group=NA ab_d0_gmt$basetiter_group[ab_d0_gmt$d0_gmt>=quantile(ab_d0_gmt$d0_gmt,basetiter_high_cut)]='High' ab_d0_gmt$basetiter_group[ab_d0_gmt$d0_gmt<quantile(ab_d0_gmt$d0_gmt,basetiter_low_cut)]='Low' #Add to full list ab_d0_gmt_all=rbind(ab_d0_gmt_all,ab_d0_gmt) } } #Add to pData pData(eset)=left_join(pData(eset),ab_d0_gmt_all) pData(eset_norm)=left_join(pData(eset_norm),ab_d0_gmt_all) colnames(eset)=eset$uid colnames(eset_norm)=eset_norm$uid #Remove intermediate subjects/subjects without titer data eset=eset[,!is.na(eset$basetiter_group)] eset_norm=eset_norm[,!is.na(eset_norm$basetiter_group)] #Find samples from timepoints of interest tp_int=c(0,1,3,7,14) ind=lapply(tp_int, function(x) which(eset$study_time_collected==x)) #Combine indices of all timepoints of interest ind_all=Reduce(union,ind) #Retain only samples from timepoints of interest eset=eset[,ind_all] eset_norm=eset_norm[,ind_all] #Recompute timepoint indices after removing samples ind=lapply(tp_int, function(x) which(eset$study_time_collected==x)) #Remove samples from a single study with fewer than sc high and low baseline titer subjects at any timepoint sc=3 matrix_uni_tp=lapply(ind,function(x) unique(eset[,x]$matrix)) matrix_ind=lapply(1:length(ind),function(x) lapply(1:length(matrix_uni_tp[[x]]), function(y) which(matrix_uni_tp[[x]][[y]]==eset[,ind[[x]]]$matrix))) matrix_H_ind=lapply(1:length(ind), function(x) lapply(1:length(matrix_uni_tp[[x]]), function(y) which(pData(eset[,ind[[x]][matrix_ind[[x]][[y]]]])['basetiter_group']=='High'))) matrix_L_ind=lapply(1:length(ind), function(x) lapply(1:length(matrix_uni_tp[[x]]), function(y) which(pData(eset[,ind[[x]][matrix_ind[[x]][[y]]]])['basetiter_group']=='Low'))) ind_cut_all=vector() for (i in 1:length(matrix_ind)) { ind_cut_H=which(sapply(matrix_H_ind[[i]],length)<sc) ind_cut_L=which(sapply(matrix_L_ind[[i]],length)<sc) ind_cut=union(ind_cut_H,ind_cut_L) if (is_empty(ind_cut)==FALSE) { ind_cut_all=c(ind_cut_all,ind[[i]][unlist(matrix_ind[[i]][ind_cut])]) } } if (is_empty(ind_cut_all)==FALSE) { eset=eset[,-ind_cut_all] eset_norm=eset_norm[,-ind_cut_all] } #Recompute timepoint indices after removing samples tp_int=unique(eset$study_time_collected[which(eset$study_time_collected>=0)]) ind=lapply(tp_int, function(x) which(eset$study_time_collected==x)) #Remove genes with NA eset=eset[complete.cases(exprs(eset)),] eset_norm=eset_norm[complete.cases(exprs(eset_norm)),] #Collapse to BTM level #Remove BTMs which have no matching genes in dataset BTM_groups=BTM_groups[!sapply(BTM_list, function(x) is_empty(intersect(x,rownames(eset)))),] BTM_list=BTM_list[!sapply(BTM_list, function(x) is_empty(intersect(x,rownames(eset))))] #Collapse - find arithmetic mean of genes in each BTM for each sample exp_BTM=do.call(rbind, lapply(BTM_list, function(x) colMeans(exprs(eset[na.omit(match(x,rownames(eset))),]),na.rm=TRUE))) #Create BTM eset eset=ExpressionSet(exp_BTM, eset@phenoData) rm(exp_BTM) exp_BTM=do.call(rbind, lapply(BTM_list, function(x) colMeans(exprs(eset_norm[na.omit(match(x,rownames(eset_norm))),]),na.rm=TRUE))) eset_norm=ExpressionSet(exp_BTM, eset_norm@phenoData) rm(exp_BTM) #Compute D0 normalized FC ind_D0=which(0==eset$study_time_collected) common=lapply(2:length(ind),function(x) intersect(eset$participant_id[ind[[x]]],eset$participant_id[ind_D0])) ia=lapply(2:length(ind),function(x) na.omit(match(common[[x-1]],eset$participant_id[ind[[x]]]))) ib=lapply(2:length(ind),function(x) na.omit(match(common[[x-1]],eset$participant_id[ind_D0]))) exp_FC=lapply(2:length(ind),function(x) eset[,ind[[x]][ia[[x-1]]]]) exp_FC=lapply(2:length(ind),function(x) {exprs(exp_FC[[x-1]])=exprs(exp_FC[[x-1]])-exprs(eset[,ind_D0[ib[[x-1]]]]); exp_FC[[x-1]]}) exp_FC_norm=lapply(2:length(ind),function(x) eset_norm[,ind[[x]][ia[[x-1]]]]) exp_FC_norm=lapply(2:length(ind),function(x) {exprs(exp_FC_norm[[x-1]])=exprs(exp_FC_norm[[x-1]])-exprs(eset_norm[,ind_D0[ib[[x-1]]]]); exp_FC_norm[[x-1]]})
Ext. Data Figure 3a-b. Differentially expressed modules at Day 1 (a) and Day 7 (b) (FDR < 0.05, t-test between mean fold changes) between participants with high and low baseline antibody titers (negative score indicates increased expression in the low baseline antibody group).
#For each study, average expression across all subjects matrix_uni_tp=lapply(exp_FC,function(x) x$matrix[!duplicated(x$matrix)]) #Store study metadata matrix_uni_tp_metaData=lapply(exp_FC,function(x) pData(x)[!duplicated(x$matrix),] %>% rownames_to_column() %>% dplyr::select(matrix, vaccine, vaccine_type, pathogen, adjuvant, pt) %>% column_to_rownames(var = "matrix")) #Indices for each study matrix_ind=lapply(1:length(exp_FC), function(x) lapply(1:length(matrix_uni_tp[[x]]), function(y) which(matrix_uni_tp[[x]][[y]]==exp_FC[[x]]$matrix))) #High/low baseline titer group within each study matrix_H_ind=lapply(1:length(exp_FC), function(x) lapply(1:length(matrix_uni_tp[[x]]), function(y) which(pData(exp_FC[[x]][,matrix_ind[[x]][[y]]])['basetiter_group']=='High'))) matrix_L_ind=lapply(1:length(exp_FC), function(x) lapply(1:length(matrix_uni_tp[[x]]), function(y) which(pData(exp_FC[[x]][,matrix_ind[[x]][[y]]])['basetiter_group']=='Low'))) #Study size exp_FC_n=lapply(1:length(exp_FC), function(x) sapply(1:length(matrix_uni_tp[[x]]), function(y) length(matrix_ind[[x]][[y]]))) #T-test and mean FC between high/low baseline titer group #If there are less than sc high/low titer subjects in a given study, return NA for all t-tests exp_FC_mean= lapply(1:length(exp_FC), function(x) sapply(1:length(matrix_uni_tp[[x]]), function(y) if(length(matrix_H_ind[[x]][[y]])>=sc & length(matrix_L_ind[[x]][[y]])>=sc) rowMeans(exprs(exp_FC[[x]][,matrix_ind[[x]][[y]][matrix_H_ind[[x]][[y]]]]))-rowMeans(exprs(exp_FC[[x]][,matrix_ind[[x]][[y]][matrix_L_ind[[x]][[y]]]])) else rep(NA, nrow(exp_FC[[x]])))) exp_FC_t_stat= lapply(1:length(exp_FC), function(x) sapply(1:length(matrix_uni_tp[[x]]), function(y) if(length(matrix_H_ind[[x]][[y]])>=sc & length(matrix_L_ind[[x]][[y]])>=sc) apply(exprs(exp_FC[[x]][,matrix_ind[[x]][[y]]]), 1, function(z) t.test(z[matrix_H_ind[[x]][[y]]], z[matrix_L_ind[[x]][[y]]])$statistic) else rep(NA, nrow(exp_FC[[x]])))) exp_FC_p_less= lapply(1:length(exp_FC), function(x) sapply(1:length(matrix_uni_tp[[x]]), function(y) if(length(matrix_H_ind[[x]][[y]])>=sc & length(matrix_L_ind[[x]][[y]])>=sc) apply(exprs(exp_FC[[x]][,matrix_ind[[x]][[y]]]), 1, function(z) t.test(z[matrix_H_ind[[x]][[y]]], z[matrix_L_ind[[x]][[y]]], alternative='less')$p.value) else rep(NA, nrow(exp_FC[[x]])))) exp_FC_p_greater= lapply(1:length(exp_FC), function(x) sapply(1:length(matrix_uni_tp[[x]]), function(y) if(length(matrix_H_ind[[x]][[y]])>=sc & length(matrix_L_ind[[x]][[y]])>=sc) apply(exprs(exp_FC[[x]][,matrix_ind[[x]][[y]]]), 1, function(z) t.test(z[matrix_H_ind[[x]][[y]]], z[matrix_L_ind[[x]][[y]]], alternative='greater')$p.value) else rep(NA, nrow(exp_FC[[x]])))) exp_FC_mean= lapply(1:length(exp_FC), function(x) sapply(1:length(matrix_uni_tp[[x]]), function(y) if(length(matrix_H_ind[[x]][[y]])>=sc & length(matrix_L_ind[[x]][[y]])>=sc) rowMeans(exprs(exp_FC[[x]][,matrix_ind[[x]][[y]][matrix_H_ind[[x]][[y]]]]))-rowMeans(exprs(exp_FC[[x]][,matrix_ind[[x]][[y]][matrix_L_ind[[x]][[y]]]])) else rep(NA, nrow(exp_FC[[x]])))) #Replace NaN p values with 0.9999 exp_FC_p_less=lapply(exp_FC_p_less, function(x) replace(x,is.nan(x),0.9999)) exp_FC_p_greater=lapply(exp_FC_p_greater, function(x) replace(x,is.nan(x),0.9999)) #Replace 1 p values with 0.9999 (for compatibility with sumz) exp_FC_p_less=lapply(exp_FC_p_less, function(x) replace(x,x>0.9999,0.9999)) exp_FC_p_greater=lapply(exp_FC_p_greater, function(x) replace(x,x>0.9999,0.9999)) ##Compute DE features by pathogen/vaccine type (only inactivated influenza here) #Integrate p values across multiple studies by Stouffer's method #Find unique pathogen/vaccine types by timepoint pt_uni_tp=lapply(matrix_uni_tp_metaData,function(x) x$pt[!duplicated(x$pt)]) pt_metaData=lapply(1:length(matrix_uni_tp_metaData), function(x) dplyr::select( matrix_uni_tp_metaData[[x]][!duplicated(matrix_uni_tp_metaData[[x]]$pt),], vaccine_type, pathogen, adjuvant)) pt_ind=vector("list",length(exp_FC)) pt_FC_mean=vector("list",length(exp_FC)) pt_p=vector("list",length(exp_FC)) pt_q=vector("list",length(exp_FC)) pt_up_down_logic=vector("list",length(exp_FC)) pt_DEG_up=vector("list",length(exp_FC)) pt_DEG_down=vector("list",length(exp_FC)) pt_DEG_total_num=vector("list",length(exp_FC)) pt_DEG_up_overlap=vector("list",length(exp_FC)) pt_DEG_down_overlap=vector("list",length(exp_FC)) for (i in 1:length(exp_FC)) { #For each pathogen/type combo (only inactivated influenza here) pt_FC_mean[[i]]=matrix(NA, nrow=nrow(eset), ncol=length(pt_uni_tp[[i]])) pt_p[[i]]=matrix(NA, nrow=nrow(eset), ncol=length(pt_uni_tp[[i]])) pt_q[[i]]=matrix(NA, nrow=nrow(eset), ncol=length(pt_uni_tp[[i]])) pt_up_down_logic[[i]]=matrix(NA, nrow=nrow(eset), ncol=length(pt_uni_tp[[i]])) pt_ind[[i]]=vector("list",length(pt_uni_tp[[i]])) pt_DEG_up[[i]]=vector("list",length(pt_uni_tp[[i]])) pt_DEG_down[[i]]=vector("list",length(pt_uni_tp[[i]])) pt_DEG_total_num[[i]]=matrix(NA, nrow=1, ncol=length(pt_uni_tp[[i]])) for (j in 1:length(pt_uni_tp[[i]])) { pt_ind[[i]][[j]]=which(pt_uni_tp[[i]][[j]]==matrix_uni_tp_metaData[[i]]$pt) #If there is more than 1 study with values, integrate p values by Stouffer's method if (length(pt_ind[[i]][[j]])>1) { #Average FCs (weighted by n) pt_FC_mean[[i]][,j]=rowWeightedMeans(exp_FC_mean[[i]][,pt_ind[[i]][[j]]], w=exp_FC_n[[i]][pt_ind[[i]][[j]]]) #Directional Stouffer's method using sumz function (weighted by sqrt(n)) pt_p_less=vapply(1:nrow(eset), FUN.VALUE=1, function(x) sumz(exp_FC_p_less[[i]][x,pt_ind[[i]][[j]]],sqrt(exp_FC_n[[i]][pt_ind[[i]][[j]]]), na.action=na.omit)$p) pt_p_greater=vapply(1:nrow(eset), FUN.VALUE=1, function(x) sumz(exp_FC_p_greater[[i]][x,pt_ind[[i]][[j]]],sqrt(exp_FC_n[[i]][pt_ind[[i]][[j]]]), na.action=na.omit)$p) pt_up_down_logic[[i]][,j]=pt_p_greater<pt_p_less #logical variable, true if upregulation p value is smallest pt_p[[i]][,j]=2*pmin(pt_p_less,pt_p_greater) #Otherwise store single p values } else { pt_FC_mean[[i]][,j]=exp_FC_mean[[i]][,pt_ind[[i]][[j]]] pt_p[[i]][,j]=2*pmin(exp_FC_p_less[[i]][,pt_ind[[i]][[j]]],exp_FC_p_greater[[i]][,pt_ind[[i]][[j]]]) pt_up_down_logic[[i]][,j]=exp_FC_p_greater[[i]][,pt_ind[[i]][[j]]]<exp_FC_p_less[[i]][,pt_ind[[i]][[j]]] } #Correct p values >1 pt_p[[i]][(pt_p[[i]][,j]>1),j]=1 #Convert p values to q values (FDR correction) pt_q[[i]][,j]=p.adjust(pt_p[[i]][,j],method="BH") #BH method #Find DE features ind_up=which(pt_up_down_logic[[i]][,j]==TRUE) ind_down=which(pt_up_down_logic[[i]][,j]==FALSE) ind_sig=which(pt_q[[i]][,j]<FDR_THRESHOLD) pt_DEG_up[[i]][[j]]=intersect(ind_up,ind_sig) pt_DEG_down[[i]][[j]]=intersect(ind_down,ind_sig) pt_DEG_total_num[[i]][j]=length(ind_sig) } colnames(pt_FC_mean[[i]])=pt_uni_tp[[i]] rownames(pt_FC_mean[[i]])=rownames(fData(eset)) colnames(pt_p[[i]])=pt_uni_tp[[i]] colnames(pt_q[[i]])=pt_uni_tp[[i]] rownames(pt_metaData[[i]])=pt_uni_tp[[i]] } pt_t_stat= lapply(1:length(exp_FC), function(x) sapply(1:length(pt_uni_tp[[x]]), function(y) if(length(pt_ind[[x]][[y]])>1) rowWeightedMeans(exp_FC_t_stat[[x]][,pt_ind[[x]][[y]]], w=exp_FC_n[[x]][pt_ind[[x]][[y]]], na.rm=TRUE) else exp_FC_t_stat[[x]][,pt_ind[[x]][[y]]])) #Plot DE BTMs as bar chart tp_plot=c(1,7) for (i in 1:length(tp_plot)) { tp_ind=which(tp_int==tp_plot[i])-1 DE_feat=c(c(pt_DEG_up[[tp_ind]][[1]],pt_DEG_down[[tp_ind]][[1]])) df=data.frame(pathway=names(BTM_list[DE_feat]), t=pt_t_stat[[tp_ind]][DE_feat,1]) #Order df=df[order(-df$t),] #Add subgroup df$subgroup=BTM_groups$SUBGROUP[match(df$pathway,BTM_groups$NAME)] df$pathway=factor(df$pathway, levels=rev(as.character(df$pathway))) p=ggplot(df,aes(x=pathway,y=t, fill=subgroup))+ geom_bar(stat="identity",width=0.5)+ scale_fill_manual(values=mycolors_BTM, na.value='grey')+ coord_flip()+ xlab("BTM")+ ylab("t-statistic")+ theme_minimal()+ theme(text = element_text(size=12)) print(p) }
Ext. Data Figure 3c-d. Boxplots of (c) IFN signature module M75 expression at Day 1 and (d) plasma cell module M156.1 expression at Day 7 between high and low baseline antibody groups at Day 1. Center line, median; box limits, upper and lower quartiles; whiskers, 1.5x interquartile range.
#For BTMs of interest, plot FC boxplots between baseline Ab groups BTM_plot=c('M75', 'M156.1') tp_plot=c(1,7) for (i in 1:length(BTM_plot)) { tp_ind=which(tp_int==tp_plot[i])-1 df=data.frame(Exp=exprs(exp_FC_norm[[tp_ind]])[match(BTM_plot[i],BTM_groups$BTM),], Study=exp_FC_norm[[tp_ind]]$study_accession, Group=exp_FC_norm[[tp_ind]]$basetiter_group) #Plot p=ggplot(df, aes(x=Study, y=Exp, fill=Group)) + geom_boxplot(outlier.shape=NA,position=position_dodge(0.7),width=0.5,notch=FALSE)+ geom_point(pch = 21, position = position_jitterdodge(jitter.width = 0.1))+ stat_compare_means(method = "t.test", hide.ns=TRUE, label='p.signif')+ ylab('log2 FC')+ ggtitle(paste0(BTM_groups$NAME[match(BTM_plot[i],BTM_groups$BTM)],' Day ',tp_plot[i]))+ theme_classic()+ theme(plot.title = element_text(hjust = 0.5)) print(p) }
Ext. Data Figure 3e-f. Line graphs of (e) M75 and (f) M156.1 expression across time in high and low baseline antibody groups. Error bars represent standard error of the mean.
#Create merged FC eset exp_FC_all=do.call(Biobase::combine,exp_FC_norm) #Add 0 FC for all subjects at Day 0 exp_FC_d0=eset_norm[,eset_norm$time_post_last_vax==0] exprs(exp_FC_d0)=matrix(0,nrow(exp_FC_d0),ncol(exp_FC_d0)) exp_FC_all=Biobase::combine(exp_FC_d0,exp_FC_all) #Plot BTM FC kinetics between high/low groups BTM_plot=c('M75', 'M156.1') for (i in 1:length(BTM_plot)) { df=data.frame(Exp=exprs(exp_FC_all)[match(BTM_plot[i],BTM_groups$BTM),], Timepoint=exp_FC_all$time_post_last_vax, Group=exp_FC_all$basetiter_group) #Plot p=ggline(df, x='Timepoint', y='Exp', color='Group', add='mean_se') + stat_compare_means(aes(group=Group), method = "t.test", hide.ns=TRUE, label='p.signif', label.y=0.8)+ ggtitle(BTM_groups$NAME[match(BTM_plot[i],BTM_groups$BTM)])+ theme(plot.title = element_text(hjust = 0.5)) print(p) }
Load packages/functions
require(Biobase) require(tidyverse) require(GSVA) require(reshape2) require(data.table) require(limma) require(caret) require(caretEnsemble) require(WeightedROC) require(nlme) require(emmeans) require(plotROC) LDAprojection<-function (xs = xs, ys = ys) { M = as.matrix(xs[[1]]) for (i in 2:length(xs)) { M = cbind(M, xs[[i]]) } extractWT <- function(vec, y = ys) { x = matrix(vec, nrow = length(y)) x = as.data.frame(x) if (ncol(x) == 2) { d1 = diff(c(by(unlist(x[, 1]), y, mean))) d2 = diff(c(by(unlist(x[, 2]), y, mean))) if (d1 == 0 | d2 == 0) { rs = matrix(0.5, nrow = ncol(x)) } else { rs = MASS::lda(x, y)$scaling } } else { rs = MASS::lda(x, y)$scaling } return(rs[, 1]) } wt = t(apply(M, 1, extractWT, ys)) return(wt) } multiplyFunction<-function (xlist, par) { tmp = xlist[[1]] * par[, 1] for (i in 2:ncol(par)) { tmp = tmp + xlist[[i]] * par[, i] } x.return = matrix(unlist(tmp), nrow = nrow(par)) }
Prepare data for model training/prediction 1. Calculate ssGSEA scores for each BTM, followed by fold change calculation of ssGSEA scores
exp_FC_wAb_path <- file.path(extra_data_dir, "FCH_eset_wAbList.BTM.rda") eset_ssGSEA_wAb_path <- file.path(extra_data_dir, "eset_wAbList.BTM.rda") eset_wAb <- readRDS(file.path(data_dir, paste0(date_prefix, "young_norm_withResponse_eset.rds"))) #Merge pre and post-vax samples to same matrix for SDY1529 eset_wAb$matrix[eset_wAb$study_accession=='SDY1529']='SDY1529_WholeBlood_HealthyAdults' #Abbreviate vaccine types eset_wAb$vaccine_type_full=eset_wAb$vaccine_type eset_wAb$vaccine_type=str_replace_all(eset_wAb$vaccine_type,vaccine_abrev) # Code below adds row-names equal to uid (uid is columnID for the Expression Data) rownames(pData(eset_wAb))<-pData(eset_wAb)$uid # ensuring order of pheno-data matchs expression data. pData(eset_wAb)<-pData(eset_wAb)[colnames(Biobase::exprs(eset_wAb)),] #Create column with combined vaccine type/pathogen eset_wAb$pt=paste0(eset_wAb$pathogen,"_",eset_wAb$vaccine_type) # Time points to calculate tp_int=c(0, 3, 7, 14, 21) #Remove subjects with Inf/NA MFC measurements eset_wAb=eset_wAb[,!is.na(eset_wAb$MFC)] eset_wAb=eset_wAb[,!(eset_wAb$MFC == Inf)] #For subjects with D-7 but no D0 measurement, convert to D0 for FC calculation sub_noD0=setdiff(eset_wAb[,which(eset_wAb$study_time_collected==-7)]$participant_id,eset_wAb[,which(eset_wAb$study_time_collected==0)]$participant_id) eset_wAb$study_time_collected[intersect(which(eset_wAb$study_time_collected==-7),which(eset_wAb$participant_id %in% sub_noD0))]=0 #Find samples from timepoints of interest ind=lapply(tp_int, function(x) which(eset_wAb$study_time_collected==x)) #Retain only samples from timepoints of interest eset_wAb=eset_wAb[,Reduce(union,ind)] #Recompute timepoint indices after removing extraneous timepoints ind=lapply(tp_int, function(x) which(eset_wAb$study_time_collected==x)) #Remove samples from a single study with fewer than sample_cutoff samples at any timepoint matrix_uni_tp=lapply(ind,function(x) unique(eset_wAb[,x]$matrix)) matrix_ind=lapply(1:length(ind),function(x) lapply(1:length(matrix_uni_tp[[x]]), function(y) which(matrix_uni_tp[[x]][[y]]==eset_wAb[,ind[[x]]]$matrix))) ind_cut_all=vector() for (i in 1:length(matrix_ind)) { ind_cut=which(sapply(matrix_ind[[i]],length)<sample_cutoff) if (is_empty(ind_cut)==FALSE) { for (j in 1:length(ind_cut)) { ind_cut_all=c(ind_cut_all,ind[[i]][matrix_ind[[i]][[ind_cut[j]]]]) } } } if (is_empty(ind_cut_all)==FALSE) { eset_wAb=eset_wAb[,-ind_cut_all] } #Recompute timepoint indices after removing samples tp_int=unique(eset_wAb$study_time_collected[which(eset_wAb$study_time_collected>=0)]) ind=lapply(tp_int, function(x) which(eset_wAb$study_time_collected==x)) PD<-pData(eset_wAb) eset_wAb=eset_wAb[complete.cases(Biobase::exprs(eset_wAb)),] #Convert to ssGSEA scores and compute FC, unless cache already exists if (file.exists(eset_ssGSEA_wAb_path)) { load(exp_FC_wAb_path) load(eset_ssGSEA_wAb_path) } else { BTM_list <- readRDS(file = file.path(data_dir, "btm_list_2020_12_23.rds")) # Gene names GeneNames<-featureNames(eset_wAb) #Remove BTMs which have no matching genes in dataset BTM_list=BTM_list[!sapply(BTM_list, function(x){is_empty(intersect(x, GeneNames))})] eset_wAb_BTM<-gsva(expr=eset_wAb, ## Gene-expression data, rows genes, cols samples BTM_list, ## PTW list verbose=TRUE, method="ssgsea") #Replace eset_wAb with BTM eset_wAb eset_wAb=eset_wAb_BTM rm(eset_wAb_BTM) #Compute D0 normalized FC - for individual genes/BTMs - exp_FC_wAb has 3 large matrix with D1, D3, D7 timepoints ind_D0=which(0==eset_wAb$study_time_collected) common=lapply(2:length(ind),function(x) intersect(eset_wAb$participant_id[ind[[x]]],eset_wAb$participant_id[ind_D0])) #get participant id with timepoint and D0 data ia=lapply(2:length(ind),function(x) na.omit(match(common[[x-1]],eset_wAb$participant_id[ind[[x]]]))) #included participant IDs match with participant IDs at diff timepoints in eset_wAb ib=lapply(2:length(ind),function(x) na.omit(match(common[[x-1]],eset_wAb$participant_id[ind_D0]))) #participant IDs match with D0 to get indices in ind_D0 exp_FC_wAb=lapply(2:length(ind),function(x) eset_wAb[,ind[[x]][ia[[x-1]]]]) #get expression data of the 3 timepoints exp_FC_wAb=lapply(2:length(ind), function(x) {Biobase::exprs(exp_FC_wAb[[x-1]])=Biobase::exprs(exp_FC_wAb[[x-1]])-Biobase::exprs(eset_wAb[,ind_D0[ib[[x-1]]]]) exp_FC_wAb[[x-1]] }) #compute FC names(exp_FC_wAb)<-paste0("Day", tp_int[-1]) save(eset_wAb, file=eset_ssGSEA_wAb_path) save(exp_FC_wAb, file=exp_FC_wAb_path) }
lapply(names(exp_FC_wAb), function(DAY){ eset_wAb<-exp_FC_wAb[[DAY]] PD<-pData(eset_wAb) # remove patients with duplicate time points PD$SubjTime<-paste0(PD$participant_id,"_",PD$study_time_collected) tab<-table(PD$SubjTime) patientsDuplic<-names(tab[tab!=1]) ## Keep only patients with 0, 1 and 3 days numbrero<-as.numeric(gsub("Day", "", DAY)) PD.sub<-subset(PD, study_time_collected%in%c(numbrero)) PD.use<-PD.sub PD.use<-droplevels(PD.use) UID<-PD.use$uid ## Prepare expression data EXP<-Biobase::exprs(eset_wAb) ## Figure out a good filter SDVec<-apply(EXP, 1, sd) GENES<-names(SDVec[SDVec>=quantile(SDVec,0.75)]) ## I will keep genes with an SD>= the median SD ## Need to wrangle this into ML format: Gene_Time by PTID EXP.melt<-reshape2::melt(EXP) colnames(EXP.melt)[2]="uid" EXP.PD<-inner_join(x=EXP.melt[EXP.melt$uid%in%PD.use$uid,], y=PD.use[,c("uid","participant_id","study_time_collected","time_post_last_vax", "age_imputed",'study_accession', "gender","pathogen","vaccine_type","maxRBA_p30","MFC_p30","MFC","maxRBA")], by="uid") EXP.PD$Gene_Time<-paste0(EXP.PD$Var1, "_",EXP.PD$study_time_collected) EXP.PD$pt<-paste0(EXP.PD$pathogen, "_", EXP.PD$vaccine_type) data.ML<-as.data.frame(dcast.data.table(as.data.table(EXP.PD), formula=Var1+Gene_Time+study_time_collected~participant_id, value.var = "value")) rownames(data.ML)<-gsub(paste0("_",numbrero,"$"), paste0("_D",numbrero,"FCH"), data.ML$Gene_Time) data.ML.filter<-data.ML[data.ML$Var1%in%GENES,-c(1:3)] save(data.ML.filter, file=paste0(extra_data_dir,"/ML.Data.", paste0("D",numbrero,"FCH."),"BTM.SDFilter75.rda")) data.ML<-data.ML[,-c(1:3)] save(data.ML, file=paste0(extra_data_dir,"/ML.Data.", paste0("D",numbrero,"FCH."),"BTM.NoFilter.rda")) PatientInfo<-EXP.PD[!duplicated(EXP.PD$participant_id), c('age_imputed','gender','pt','study_accession', 'maxRBA_p30','MFC_p30','participant_id','maxRBA','MFC')] rownames(PatientInfo)<-PatientInfo$participant_id PatientInfo<-as.data.frame(PatientInfo) save(PatientInfo, file=paste0(extra_data_dir, "/PD.Master.", paste0("D", numbrero, "FCH."), "BTM.rda")) })
Ext. Data Figure 4a
#Subset to day 0 eset_wAb_D0=eset_wAb[,eset_wAb$time_post_last_vax==0] #Subset to within 7 days of day 28 for ab response, then set to day 28 eset_wAb_D0=eset_wAb_D0[,eset_wAb_D0$ImmResp_postVax_timepoint_MFC>=21 & eset_wAb_D0$ImmResp_postVax_timepoint_MFC<=35] eset_wAb_D0$ImmResp_postVax_timepoint_MFC=28 #Set baseline timepoint to 0 eset_wAb_D0$ImmResp_baseline_timepoint_MFC=0 #For each vaccine, only use one ab type (most common) pt_uni=unique(eset_wAb_D0$pt) for (i in 1:length(pt_uni)) { most_ab_type=tail(names(sort(table(eset_wAb_D0$assay[eset_wAb_D0$pt==pt_uni[i]]))), 1) ind_rem=which(eset_wAb_D0$pt==pt_uni[i] & eset_wAb_D0$assay!=most_ab_type) if (is_empty(ind_rem)==FALSE) { eset_wAb_D0=eset_wAb_D0[,-ind_rem] } } #Create dataframe from pdata df_pre=data.frame(eset_wAb_D0$pt, eset_wAb_D0$ImmResp_baseline_timepoint_MFC, eset_wAb_D0$ImmResp_baseline_value_MFC) colnames(df_pre)=c('Vaccine','Timepoint', 'Ab_titer') df_post=data.frame(eset_wAb_D0$pt, eset_wAb_D0$ImmResp_postVax_timepoint_MFC, eset_wAb_D0$ImmResp_postVax_value_MFC) colnames(df_post)=c('Vaccine','Timepoint', 'Ab_titer') df=rbind(df_pre,df_post) df$Timepoint=as.character(df$Timepoint) #Replace <0 with 0 df$Ab_titer[df$Ab_titer<0]=0 #Plot p=ggplot(df, aes(x=Timepoint, y=Ab_titer, fill=Timepoint)) + geom_boxplot(outlier.shape=NA,width=0.5,notch=FALSE, alpha=0.3)+ geom_jitter(pch=21, width=0.1)+ ylab('Antibody Response - log2(Ab Titer)')+ xlab('Day post-vaccination')+ theme_bw()+ theme(text=element_text(size=16),legend.position="none", plot.margin = margin(3,3,3,3, "cm")) p + facet_wrap(~Vaccine, scales = 'free')
Figure 5a. AUC bar plot of antibody response prediction performance per dataset for the elastic-net classifier trained on inactivated influenza datasets only. Data are presented as mean values ± 95% confidence intervals (CIs). n = 2000 bootstrap replicates.
load(paste0(extra_data_dir,"/ML.Data.D7FCH.BTM.SDFilter75.rda")) load(paste0(extra_data_dir,"/PD.Master.D7FCH.BTM.rda")) PatientInfo<-subset(PatientInfo, MFC_p30%in%c("lowResponder","highResponder")) PatientInfo<-subset(PatientInfo, pt=="Influenza_IN") PatientInfo<-droplevels(PatientInfo) PatientInfo$MFC_p30<-factor(PatientInfo$MFC_p30, levels = c("highResponder", "lowResponder")) MOD.LIST<-lapply(unique(PatientInfo$study_accession), function(SDY){ PatientInfo.Temp<-filter(PatientInfo, study_accession!=SDY) DevID<-rownames(PatientInfo.Temp) DevID<-intersect(DevID, colnames(data.ML.filter)) X.Dev<-data.ML.filter[,DevID] rownames(X.Dev)<-strsplit2(rownames(X.Dev), "_")[,1] Y.Dev<-PatientInfo[DevID, 'MFC_p30'] names(Y.Dev)<-DevID X.Dev <- t(X.Dev) SEED<-1987 set.seed(SEED) ctrl=trainControl(method = "cv", number = 10, #repeats = 1, selectionFunction = 'best', returnResamp = "final", classProbs = TRUE, returnData = F, verboseIter = F, summaryFunction = twoClassSummary, savePredictions = "final") set.seed(SEED) model_list <- caretList( x=X.Dev, y=Y.Dev[rownames(X.Dev)], trControl=ctrl, methodList=c("glmnet"), metric="ROC", tuneLength=5, maximize=T, preProc = c("center", "scale")) return(model_list)}) names(MOD.LIST)<-unique(PatientInfo$study_accession) ### Get 10-CV preds ### db.preds.test<-do.call("rbind.data.frame", lapply(names(MOD.LIST), function(SDY){ MOD.TEMP <- MOD.LIST[[SDY]] PatientInfo.Temp<-subset(PatientInfo, study_accession==SDY) PatientInfo.Temp<-droplevels(PatientInfo.Temp) PatientInfo.Temp$MFC_p30<-factor(PatientInfo.Temp$MFC_p30, levels = c("highResponder", "lowResponder")) DevID<-rownames(PatientInfo.Temp) DevID<-intersect(DevID, colnames(data.ML.filter)) X.Dev<-data.ML.filter[, DevID] rownames(X.Dev)<-strsplit2(rownames(X.Dev), "_")[,1] Y.Dev<-PatientInfo.Temp[DevID, 'MFC_p30'] names(Y.Dev)<-DevID model_preds <- lapply(MOD.TEMP, predict, newdata=t(X.Dev), type="prob") model_preds <- lapply(model_preds, function(x) x[,"highResponder"]) model_preds <- data.frame(model_preds) Preds.Test<-mutate(model_preds, pred=ifelse(glmnet>=0.5, "highResponder", "lowResponder"), PTID=colnames(X.Dev), highResponder=glmnet, obs=as.character(Y.Dev[colnames(X.Dev)])) CONF.MAT<-caret::confusionMatrix(data=factor(Preds.Test$pred, levels = c("highResponder", "lowResponder")), reference=factor(Preds.Test$obs, levels = c("highResponder", "lowResponder"))) ROC.OBJ<-pROC::roc(response = Preds.Test$obs, levels=c("lowResponder", "highResponder"), direction="<", predictor = Preds.Test$highResponder) AUC.CI<-as.vector(pROC::ci.auc(ROC.OBJ, method="bootstrap", boot.n=2000)) names(AUC.CI)<-c("AUC.CI.LOW", "AUC", "AUC.CI.HIGH") N = length(Preds.Test$PTID) VEC=t(as.data.frame(c(CONF.MAT$overall, CONF.MAT$byClass, AUC.CI, n_vac=N))) %>% as.data.frame() %>% mutate(SDY_OUT=SDY, SET="TEST") rownames(VEC)<-NULL return(VEC) })) p<-ggplot(db.preds.test, aes(x=SDY_OUT, y=as.numeric(AUC)))+ geom_bar(stat='identity', width = 0.5)+ geom_errorbar(aes(ymin = AUC.CI.LOW, ymax = AUC.CI.HIGH), width = 0.25)+ labs(y="AUC (CI)", x=NULL)+ theme_bw()+ geom_hline(yintercept = 0.5, linetype="dashed")+ theme(axis.text.x = element_text(angle = 45, hjust = 1, size=8))+ geom_label(aes(y=0.05, label=n_vac), show.legend = F, color="black", size=2)+ ylim(0,1) print(p)
Figure 5b. Heat map of high-versus-low antibody responder difference across vaccines of modules differentially expressed (FDR < 0.05) between high and low antibody responders to inactivated influenza vaccination.
#Load GE eset=readRDS(file.path(data_dir, paste0(date_prefix, "young_noNorm_withResponse_eset.rds"))) colnames(eset)=eset$uid #Merge pre and post-vax samples to same matrix for SDY1529 eset$matrix[eset$study_accession=='SDY1529']='SDY1529_WholeBlood_HealthyAdults' #Abbreviate vaccine type eset$vaccine_type_full=eset$vaccine_type eset$vaccine_type=str_replace_all(eset$vaccine_type,vaccine_abrev) responder_col='MFC_p30' #Create combined vaccine type/pathogen column eset$pt=paste(eset$pathogen," (",eset$vaccine_type,")", sep='') #Label columns by uid colnames(eset)=eset$uid #Find samples from timepoints of interest tp_int=c(0,1,3,7,14,21) #tp_int=unique(eset$study_time_collected[which(eset$study_time_collected>=0)]) #Alternate: use all timepoints (>=0) ind=lapply(tp_int, function(x) which(eset$study_time_collected==x)) #Combine indices of all timepoints of interest ind_all=Reduce(union,ind) #Retain only samples from timepoints of interest eset=eset[,ind_all] #Recompute timepoint indices after removing extraneous timepoints ind=lapply(tp_int, function(x) which(eset$study_time_collected==x)) #Remove samples from a single study with fewer than sample_cutoff HR and LR at any timepoint sample_cutoff=2 matrix_uni_tp=lapply(ind,function(x) unique(eset[,x]$matrix)) matrix_ind=lapply(1:length(ind),function(x) lapply(1:length(matrix_uni_tp[[x]]), function(y) which(matrix_uni_tp[[x]][[y]]==eset[,ind[[x]]]$matrix))) matrix_HR_ind=lapply(1:length(ind), function(x) lapply(1:length(matrix_uni_tp[[x]]), function(y) which(pData(eset[,ind[[x]][matrix_ind[[x]][[y]]]])[responder_col]=='highResponder'))) matrix_LR_ind=lapply(1:length(ind), function(x) lapply(1:length(matrix_uni_tp[[x]]), function(y) which(pData(eset[,ind[[x]][matrix_ind[[x]][[y]]]])[responder_col]=='lowResponder'))) ind_cut_all=vector() for (i in 1:length(matrix_ind)) { ind_cut_HR=which(sapply(matrix_HR_ind[[i]],length)<sample_cutoff) ind_cut_LR=which(sapply(matrix_LR_ind[[i]],length)<sample_cutoff) ind_cut=union(ind_cut_HR,ind_cut_LR) if (is_empty(ind_cut)==FALSE) { ind_cut_all=c(ind_cut_all,ind[[i]][unlist(matrix_ind[[i]][ind_cut])]) } } if (is_empty(ind_cut_all)==FALSE) { eset=eset[,-ind_cut_all] } #Recompute timepoint indices after removing samples tp_int=unique(eset$study_time_collected[which(eset$study_time_collected>=0)]) ind=lapply(tp_int, function(x) which(eset$study_time_collected==x)) #Remove genes with NA eset=eset[complete.cases(exprs(eset)),] #Create combined SDY/pathogen/vaccine type column eset$SDY_pt=paste(eset$study,eset$pt) #Create unique list of studies matrix_uni=unique(eset$matrix) #Collapse to BTM level #Remove BTMs which have no matching genes in dataset btm_list=btm_list[!sapply(btm_list, function(x) is_empty(intersect(x,rownames(eset))))] #Collapse - find arithmetic mean of genes in each BTM for each sample exp_BTM=do.call(rbind, lapply(btm_list, function(x) colMeans(exprs(eset[na.omit(match(x,rownames(eset))),]),na.rm=TRUE))) #Create BTM eset (maybe not proper approach but it works) eset_BTM=eset[1:nrow(exp_BTM),] rownames(eset_BTM)=rownames(exp_BTM) exprs(eset_BTM)=exp_BTM #Replace eset with BTM eset eset=eset_BTM rm(eset_BTM) #Compute D0 normalized FC ind_D0=which(0==eset$study_time_collected) common=lapply(2:length(ind),function(x) intersect(eset$participant_id[ind[[x]]],eset$participant_id[ind_D0])) ia=lapply(2:length(ind),function(x) na.omit(match(common[[x-1]],eset$participant_id[ind[[x]]]))) ib=lapply(2:length(ind),function(x) na.omit(match(common[[x-1]],eset$participant_id[ind_D0]))) exp_FC=lapply(2:length(ind),function(x) eset[,ind[[x]][ia[[x-1]]]]) exp_FC=lapply(2:length(ind),function(x) {exprs(exp_FC[[x-1]])=exprs(exp_FC[[x-1]])-exprs(eset[,ind_D0[ib[[x-1]]]]); exp_FC[[x-1]]}) #For each study, average expression across all subjects matrix_uni_tp=lapply(exp_FC,function(x) x$matrix[!duplicated(x$matrix)]) #Store study metadata matrix_uni_tp_metaData=lapply(exp_FC,function(x) pData(x)[!duplicated(x$matrix),] %>% rownames_to_column() %>% dplyr::select(matrix, vaccine, vaccine_type, pathogen, adjuvant, pt) %>% column_to_rownames(var = "matrix")) #Indices for each study matrix_ind=lapply(1:length(exp_FC), function(x) lapply(1:length(matrix_uni_tp[[x]]), function(y) which(matrix_uni_tp[[x]][[y]]==exp_FC[[x]]$matrix))) #High responder indices within each study matrix_HR_ind=lapply(1:length(exp_FC), function(x) lapply(1:length(matrix_uni_tp[[x]]), function(y) which(pData(exp_FC[[x]][,matrix_ind[[x]][[y]]])[responder_col]=='highResponder'))) matrix_LR_ind=lapply(1:length(exp_FC), function(x) lapply(1:length(matrix_uni_tp[[x]]), function(y) which(pData(exp_FC[[x]][,matrix_ind[[x]][[y]]])[responder_col]=='lowResponder'))) #Study size exp_FC_n=lapply(1:length(exp_FC), function(x) sapply(1:length(matrix_uni_tp[[x]]), function(y) length(matrix_ind[[x]][[y]]))) #Average FC exp_FC_mean=lapply(1:length(exp_FC), function(x) sapply(1:length(matrix_uni_tp[[x]]), function(y) rowMeans(exprs(exp_FC[[x]][,matrix_ind[[x]][[y]]])))) #T-test and mean FC between high/low responder #If there are less than 2 HR/LR in a given study, return NA for all t-tests exp_FC_p_less= lapply(1:length(exp_FC), function(x) sapply(1:length(matrix_uni_tp[[x]]), function(y) if(length(matrix_HR_ind[[x]][[y]])>=2 & length(matrix_LR_ind[[x]][[y]])>=2) apply(exprs(exp_FC[[x]][,matrix_ind[[x]][[y]]]), 1, function(z) t.test(z[matrix_HR_ind[[x]][[y]]], z[matrix_LR_ind[[x]][[y]]], alternative='less')$p.value) else rep(NA, nrow(exp_FC[[x]])))) exp_FC_p_greater= lapply(1:length(exp_FC), function(x) sapply(1:length(matrix_uni_tp[[x]]), function(y) if(length(matrix_HR_ind[[x]][[y]])>=2 & length(matrix_LR_ind[[x]][[y]])>=2) apply(exprs(exp_FC[[x]][,matrix_ind[[x]][[y]]]), 1, function(z) t.test(z[matrix_HR_ind[[x]][[y]]], z[matrix_LR_ind[[x]][[y]]], alternative='greater')$p.value) else rep(NA, nrow(exp_FC[[x]])))) exp_FC_mean= lapply(1:length(exp_FC), function(x) sapply(1:length(matrix_uni_tp[[x]]), function(y) if(length(matrix_HR_ind[[x]][[y]])>=2 & length(matrix_LR_ind[[x]][[y]])>=2) rowMeans(exprs(exp_FC[[x]][,matrix_ind[[x]][[y]][matrix_HR_ind[[x]][[y]]]]))-rowMeans(exprs(exp_FC[[x]][,matrix_ind[[x]][[y]][matrix_LR_ind[[x]][[y]]]])) else rep(NA, nrow(exp_FC[[x]])))) exp_FC_t_stat= lapply(1:length(exp_FC), function(x) sapply(1:length(matrix_uni_tp[[x]]), function(y) if(length(matrix_HR_ind[[x]][[y]])>=2 & length(matrix_LR_ind[[x]][[y]])>=2) apply(exprs(exp_FC[[x]][,matrix_ind[[x]][[y]]]), 1, function(z) t.test(z[matrix_HR_ind[[x]][[y]]], z[matrix_LR_ind[[x]][[y]]])$statistic) else rep(NA, nrow(exp_FC[[x]])))) #Replace NaN p values with 0.9999 exp_FC_p_less=lapply(exp_FC_p_less, function(x) replace(x,is.nan(x),0.9999)) exp_FC_p_greater=lapply(exp_FC_p_greater, function(x) replace(x,is.nan(x),0.9999)) #Replace 1 p values with 0.9999 (for compatibility with sumz) exp_FC_p_less=lapply(exp_FC_p_less, function(x) replace(x,x>0.9999,0.9999)) exp_FC_p_greater=lapply(exp_FC_p_greater, function(x) replace(x,x>0.9999,0.9999)) #Merge mean FC, correlation/t-test test statistics by vaccine (weighted average by study size) #Find unique pathogen/vaccine types by timepoint pt_uni_tp=lapply(matrix_uni_tp_metaData,function(x) x$pt[!duplicated(x$pt)]) pt_metaData=lapply(1:length(matrix_uni_tp_metaData), function(x) dplyr::select( matrix_uni_tp_metaData[[x]][!duplicated(matrix_uni_tp_metaData[[x]]$pt),], vaccine_type, pathogen, adjuvant)) #Sample indices per vaccine pt_ind=lapply(1:length(exp_FC), function(x) sapply(1:length(pt_uni_tp[[x]]), function(y) which(pt_uni_tp[[x]][[y]]==matrix_uni_tp_metaData[[x]]$pt))) pt_meanFC= lapply(1:length(exp_FC), function(x) sapply(1:length(pt_uni_tp[[x]]), function(y) if(length(pt_ind[[x]][[y]])>1) rowWeightedMeans(exp_FC_mean[[x]][,pt_ind[[x]][[y]]], w=exp_FC_n[[x]][pt_ind[[x]][[y]]], na.rm=TRUE) else exp_FC_mean[[x]][,pt_ind[[x]][[y]]])) pt_t_stat= lapply(1:length(exp_FC), function(x) sapply(1:length(pt_uni_tp[[x]]), function(y) if(length(pt_ind[[x]][[y]])>1) rowWeightedMeans(exp_FC_t_stat[[x]][,pt_ind[[x]][[y]]], w=exp_FC_n[[x]][pt_ind[[x]][[y]]], na.rm=TRUE) else exp_FC_t_stat[[x]][,pt_ind[[x]][[y]]])) ##Compute DEGs by pathogen/vaccine type #Integrate p values across multiple studies by Stouffer's method pt_p=vector("list",length(exp_FC)) pt_q=vector("list",length(exp_FC)) pt_up_down_logic=vector("list",length(exp_FC)) pt_DEG_up=vector("list",length(exp_FC)) pt_DEG_down=vector("list",length(exp_FC)) pt_DEG_total_num=vector("list",length(exp_FC)) for (i in 1:length(exp_FC)) { #For each pathogen/type combo pt_p[[i]]=matrix(NA, nrow=nrow(eset), ncol=length(pt_uni_tp[[i]])) pt_q[[i]]=matrix(NA, nrow=nrow(eset), ncol=length(pt_uni_tp[[i]])) pt_up_down_logic[[i]]=matrix(NA, nrow=nrow(eset), ncol=length(pt_uni_tp[[i]])) pt_DEG_up[[i]]=vector("list",length(pt_uni_tp[[i]])) pt_DEG_down[[i]]=vector("list",length(pt_uni_tp[[i]])) pt_DEG_total_num[[i]]=matrix(NA, nrow=1, ncol=length(pt_uni_tp[[i]])) for (j in 1:length(pt_uni_tp[[i]])) { #If there is more than 1 study with values, integrate p values by Stouffer's method if (length(pt_ind[[i]][[j]])>1) { #Directional Stouffer's method using sumz function (weighted by sqrt(n)) #t-test pt_p_less=vapply(1:nrow(eset), FUN.VALUE=1, function(x) sumz(exp_FC_p_less[[i]][x,pt_ind[[i]][[j]]],sqrt(exp_FC_n[[i]][pt_ind[[i]][[j]]]), na.action=na.omit)$p) pt_p_greater=vapply(1:nrow(eset), FUN.VALUE=1, function(x) sumz(exp_FC_p_greater[[i]][x,pt_ind[[i]][[j]]],sqrt(exp_FC_n[[i]][pt_ind[[i]][[j]]]), na.action=na.omit)$p) pt_up_down_logic[[i]][,j]=pt_p_greater<pt_p_less #logical variable, true if upregulation p value is smallest pt_p[[i]][,j]=2*pmin(pt_p_less,pt_p_greater) #Otherwise store single p values } else { #t-test pt_p[[i]][,j]=2*pmin(exp_FC_p_less[[i]][,pt_ind[[i]][[j]]],exp_FC_p_greater[[i]][,pt_ind[[i]][[j]]]) pt_up_down_logic[[i]][,j]=exp_FC_p_greater[[i]][,pt_ind[[i]][[j]]]<exp_FC_p_less[[i]][,pt_ind[[i]][[j]]] } #Correct p values >1 pt_p[[i]][(pt_p[[i]][,j]>1),j]=1 #Convert p values to q values (FDR correction) pt_q[[i]][,j]=p.adjust(pt_p[[i]][,j],method="BH") #BH method #Find DE BTMs #t-test ind_up=which(pt_up_down_logic[[i]][,j]==TRUE) ind_down=which(pt_up_down_logic[[i]][,j]==FALSE) ind_sig=which(pt_q[[i]][,j]<FDR_THRESHOLD) pt_DEG_up[[i]][[j]]=intersect(ind_up,ind_sig) pt_DEG_down[[i]][[j]]=intersect(ind_down,ind_sig) pt_DEG_total_num[[i]][j]=length(ind_sig) } #Set column/row names colnames(pt_p[[i]])=pt_uni_tp[[i]] colnames(pt_q[[i]])=pt_uni_tp[[i]] rownames(pt_metaData[[i]])=pt_uni_tp[[i]] rownames(pt_meanFC[[i]])=names(btm_list) rownames(pt_t_stat[[i]])=names(btm_list) colnames(pt_meanFC[[i]])=pt_uni_tp[[i]] colnames(pt_t_stat[[i]])=pt_uni_tp[[i]] } #Plot Day 7 t stat for TIV BTMs tp_plot=7 BTM_ind=pt_DEG_up[[which(tp_int==tp_plot)-1]][[which(pt_uni_tp[[which(tp_int==tp_plot)-1]]=='Influenza (IN)')]] df=pt_t_stat[[which(tp_int==tp_plot)-1]][BTM_ind,] df_anno=pt_metaData[[which(tp_int==tp_plot)-1]][, c('vaccine_type', 'pathogen', 'adjuvant')] ph_lim=ceiling(max(abs(df), na.rm = TRUE)) #ph_lim=3 colorLS=colorRampPalette(colors = c("blue", "white", "red"))(n = 100) breakLS=seq(from = -ph_lim, to = ph_lim, length.out = 100) #Plot result=pheatmap::pheatmap(mat = df, color = colorLS, breaks = breakLS, cluster_cols = TRUE, cluster_rows = TRUE, clustering_distance_rows = "euclidean", clustering_distance_cols = "euclidean", clustering_method = "ward.D2", show_colnames = TRUE, show_rownames = TRUE, fontsize = 10, cellheight = 10, cellwidth = 10)
Ext. Data Figure 4b-c. b) Barplot of feature importance for the GLM classifier trained on inactivated influenza datasets only. c) AUC barplot of antibody response prediction performance across vaccines for the GLM classifier trained on inactivated influenza datasets only. Data are presented as mean values + /- 95% confidence interval. n = 2000 bootstrap replicates.
load(paste0(extra_data_dir, "/ML.Data.D7FCH.BTM.SDFilter75.rda")) load(paste0(extra_data_dir ,"/PD.Master.D7FCH.BTM.rda")) PatientInfo<-subset(PatientInfo, MFC_p30%in%c("lowResponder","highResponder")) PatientInfo<-subset(PatientInfo, pt=="Influenza_IN") PatientInfo<-droplevels(PatientInfo) PatientInfo$MFC_p30<-factor(PatientInfo$MFC_p30, levels = c("highResponder", "lowResponder")) DevID<-rownames(PatientInfo) DevID<-intersect(DevID, colnames(data.ML.filter)) X.Dev<-data.ML.filter[,DevID] rownames(X.Dev)<-strsplit2(rownames(X.Dev), "_")[,1] Y.Dev<-PatientInfo[DevID, 'MFC_p30'] names(Y.Dev)<-DevID X.Dev <- t(X.Dev) SEED<-1987 set.seed(SEED) ctrl=trainControl(method = "cv", selectionFunction = 'best', returnResamp = "final", classProbs = TRUE, returnData = F, verboseIter = F, summaryFunction = twoClassSummary, savePredictions = "final") set.seed(SEED) model_list <- caretList( x=X.Dev, y=Y.Dev[rownames(X.Dev)], trControl=ctrl, methodList=c("glmnet"), metric="ROC", #tuneLength=5, maximize=T, preProc = c("center", "scale")) whichTwoPct <- best(model_list$glmnet$results, metric = "ROC", maximize = TRUE) model_list$glmnet$results[whichTwoPct,] ROC.OBJ<-pROC::roc(response = model_list$glmnet$pred$obs, levels=c("lowResponder", "highResponder"), direction="<", predictor = model_list$glmnet$pred$highResponder, plot=F, print.auc=F, ci=F, auc.polygon=FALSE, legacy.axes=FALSE, xlab= "Specificity (%) True Neg", ylab= "Sensitivity (%) True Pos", percent=TRUE, col="forestgreen", lwd=3) AUC.CI<-as.vector(pROC::ci.auc(ROC.OBJ, method="bootstrap", boot.n=2000)) names(AUC.CI)<-c("AUC.CI.LOW", "AUC", "AUC.CI.HIGH") db.imp<-arrange(varImp(model_list$glmnet)$importance, desc(Overall)) #Plot feature importance db.imp$BTM=rownames(db.imp) colnames(db.imp)=c('Importance','BTM') db.imp=db.imp[db.imp$Importance>0,] #re-order the levels in the order of appearance in the data.frame db.imp$BTM<-factor(db.imp$BTM,levels=db.imp$BTM[order(db.imp$Importance)]) #Plot p=ggplot(db.imp,aes(x=BTM,y=Importance))+ geom_bar(stat="identity",width=0.7)+ #scale_fill_manual(values = mycolors_BTM, na.value = "grey50")+ coord_flip()+ #labs(title=paste0('Timepoint ',tp[i]))+ xlab("")+ ylab("Importance")+ theme_minimal()+ theme(text = element_text(size=20), plot.title = element_text(hjust = 0.5)) p ### Test in other vaccines ### load(paste0(extra_data_dir,"/PD.Master.D7FCH.BTM.rda")) PatientInfo<-subset(PatientInfo, MFC_p30%in%c("lowResponder","highResponder")) PatientInfo<-subset(PatientInfo, pt!="Influenza_IN") PatientInfo<-droplevels(PatientInfo) PatientInfo$MFC_p30<-factor(PatientInfo$MFC_p30, levels = c("highResponder", "lowResponder")) ### DD ply to get preds ### db.preds.test<-do.call("rbind.data.frame", lapply(unique(PatientInfo$pt), function(Vaccine){ PatientInfo<-subset(PatientInfo, pt==Vaccine) PatientInfo<-droplevels(PatientInfo) PatientInfo$MFC_p30<-factor(PatientInfo$MFC_p30, levels = c("highResponder", "lowResponder")) DevID<-rownames(PatientInfo) DevID<-intersect(DevID, colnames(data.ML.filter)) X.Dev<-data.ML.filter[, DevID] rownames(X.Dev)<-strsplit2(rownames(X.Dev), "_")[,1] Y.Dev<-PatientInfo[DevID, 'MFC_p30'] names(Y.Dev)<-DevID model_preds <- lapply(model_list, predict, newdata=t(X.Dev), type="prob") model_preds <- lapply(model_preds, function(x) x[,"highResponder"]) model_preds <- data.frame(model_preds) Preds.Test<-mutate(model_preds, pt=Vaccine, #PredBinary=ifelse(rf>=0.5, "highResponder", "lowResponder"), PTID=colnames(X.Dev), Real.Resp=as.character(Y.Dev[colnames(X.Dev)])) return(Preds.Test) })) db.preds.test<-melt(db.preds.test, measure.vars = c("glmnet")) perf.stats.by.vaccine.time<-plyr::ddply(.data=db.preds.test, .variable=plyr::.(pt, variable), .fun=function(df){ CONF.MAT<-caret::confusionMatrix(data=factor(ifelse(df$value>=0.5, "highResponder", "lowResponder"), levels = c("highResponder", "lowResponder")), reference=factor(df$Real.Resp, levels = c("highResponder", "lowResponder"))) ROC.OBJ<-pROC::roc(response = df$Real.Resp, levels=c("lowResponder", "highResponder"), direction="<", predictor = df$value) AUC.CI<-as.vector(pROC::ci.auc(ROC.OBJ, method="bootstrap", boot.n=2000)) names(AUC.CI)<-c("AUC.CI.LOW", "AUC", "AUC.CI.HIGH") N = length(df$pt) VEC=t(as.data.frame(c(CONF.MAT$overall, CONF.MAT$byClass, AUC.CI, n_vac=N))) rownames(VEC)<-NULL return(VEC)}) perf.stats.by.vaccine.time<-arrange(perf.stats.by.vaccine.time, desc(AUC)) #Plot AUC lapply(unique(perf.stats.by.vaccine.time$variable), function(METH){ p<-ggplot(subset(perf.stats.by.vaccine.time, variable==METH), aes(x=pt, y=as.numeric(AUC), fill=pt))+ geom_hline(yintercept = 0.5, linetype="dashed")+ geom_bar(stat='identity', show.legend = F)+ scale_fill_brewer(palette = "Set3")+ geom_errorbar(aes(ymin = AUC.CI.LOW, ymax = AUC.CI.HIGH), width = 0.25)+ labs(y="AUC (CI)", title = paste0(METH), x=NULL)+ theme_bw()+ theme(axis.text.x = element_text(angle = 45, hjust = 1, size=8))+ geom_label(aes(y=0.05, label=n_vac), show.legend = F, color="black", size=2)+ ylim(0,1) p })
Ext. Data Figure 4d. AUC barplot of antibody response prediction performance of the leave-one-vaccine-out GLM classifier. Data are presented as mean values + /- 95% confidence interval. n = 2000 bootstrap replicates.
#Load GE load(eset_ssGSEA_wAb_path) #Keep only days 0,3,7 eset.LDA.BL.D3.D7=eset_wAb[,eset_wAb$time_post_last_vax %in% c(0,3,7)] PD<-pData(eset.LDA.BL.D3.D7) VAL.LIST <- list(Val1="Influenza_LV", Val2="Influenza_IN", Val3=c("Meningococcus_CJ", "Meningococcus_PS"), Val4=c("Tuberculosis_RVV", "Pneumococcus_PS", "Smallpox_LV"), Val5="Varicella Zoster_LV", Val6="Yellow Fever_LV") ### Start the LDA thing HighLow<-rownames(PD[PD$MFC_p30%in%c("lowResponder","highResponder"),]) eset.HighLow<-eset.LDA.BL.D3.D7[,eset.LDA.BL.D3.D7$uid%in%HighLow] EXP<-Biobase::exprs(eset.HighLow) PD<-pData(eset.HighLow) ALL <- table(PD$participant_id) KEEP <- names(ALL[ALL==3]) PD <- filter(PD, participant_id %in% KEEP) Samp2PatID<-PD$participant_id names(Samp2PatID)<-PD$uid DATA.LIST <- lapply(VAL.LIST, function(PATH){ PD.Train <- filter(PD, !(pt %in% PATH)) PD.Val <- filter(PD, pt %in% PATH) X.ls.Train<-lapply(c(0, 3, 7), function(i){ df<-EXP[,PD.Train[PD.Train$study_time_collected==i,"uid"]] colnames(df)<-Samp2PatID[colnames(df)] return(df)}) lapply(X.ls.Train, dim) names(X.ls.Train)<-c("X.0", "X.3", "X.7") X.ls.Val<-lapply(c(0, 3, 7), function(i){ df<-EXP[,PD.Val[PD.Val$study_time_collected==i,"uid"]] colnames(df)<-Samp2PatID[colnames(df)] return(df)}) names(X.ls.Val)<-c("X.0", "X.3", "X.7") Y.Train<-PD.Train[!duplicated(PD.Train$participant_id),c("participant_id", "MFC_p30","pt")] rownames(Y.Train)<-Y.Train$participant_id Y.Train<-Y.Train[colnames(X.ls.Train$X.0),] Y.Train<-droplevels(Y.Train) # need sample cols, gene rows wt<-LDAprojection(xs=X.ls.Train, ys=Y.Train$MFC_p30) colnames(wt)<-names(X.ls.Train) # projected matrix X.proj.Train<-multiplyFunction(xlist=X.ls.Train, par=wt) rownames(X.proj.Train)<-rownames(wt) colnames(X.proj.Train)<-colnames(X.ls.Train$X.0) X.proj.Val<-multiplyFunction(xlist=X.ls.Val, par=wt) rownames(X.proj.Val)<-rownames(wt) colnames(X.proj.Val)<-colnames(X.ls.Val$X.0) PHENO.TRAIN=filter(PD.Train, participant_id %in% colnames(X.proj.Train), study_time_collected==0) PHENO.VAL=filter(PD.Val, participant_id %in% colnames(X.proj.Val), study_time_collected==0) DATA.LS <- list(TRAIN=list(DATA.TRAIN=X.proj.Train, PHENO.TRAIN=PHENO.TRAIN), VAL=list(DATA.VAL=X.proj.Val, PHENO.VAL=PHENO.VAL)) return(DATA.LS)}) names(DATA.LIST) <- names(VAL.LIST) #save(DATA.LIST, file=paste0(extra_data_dir,"/LOVO_LDA_DataList.rda")) MOD.LIST <- lapply(DATA.LIST, function(TEMP){ PatientInfo <- TEMP$TRAIN$PHENO.TRAIN rownames(PatientInfo) <- PatientInfo$participant_id X.Dev <- TEMP$TRAIN$DATA.TRAIN Y.Dev <- PatientInfo[colnames(X.Dev), 'MFC_p30'] names(Y.Dev) <- colnames(X.Dev) Y.Dev <- droplevels(Y.Dev) length(Y.Dev) dim(X.Dev) X.Dev <- t(X.Dev) SEED<-1987 set.seed(SEED) ctrl=trainControl(method = "repeatedcv", number = 10, repeats = 5, selectionFunction = 'best', returnResamp = "final", classProbs = TRUE, returnData = F, verboseIter = F, summaryFunction = twoClassSummary, savePredictions = "final") set.seed(SEED) model_list <- caretList( x=X.Dev, y=Y.Dev[rownames(X.Dev)], trControl=ctrl, methodList=c("glmnet"), metric="ROC", tuneLength=5, maximize=T, preProc = c("center", "scale")) return(model_list) }) names(MOD.LIST) <- names(DATA.LIST) ## Get 10-CV preds ## Perf10CV <- do.call("rbind.data.frame", lapply(names(MOD.LIST), function(VAL){ Preds.CV <- MOD.LIST[[VAL]]$glmnet$pred CONF.MAT<-caret::confusionMatrix(data=factor(Preds.CV$pred, levels = c("highResponder", "lowResponder")), reference=factor(Preds.CV$obs, levels = c("highResponder", "lowResponder"))) ROC.OBJ<-pROC::roc(response = Preds.CV$obs, levels=c("lowResponder", "highResponder"), direction="<", predictor = Preds.CV$highResponder) AUC.CI<-as.vector(pROC::ci.auc(ROC.OBJ, method="bootstrap", boot.n=2000)) names(AUC.CI)<-c("AUC.CI.LOW", "AUC", "AUC.CI.HIGH") N = length(Preds.CV$PTID) VEC=t(as.data.frame(c(CONF.MAT$overall, CONF.MAT$byClass, AUC.CI, n_vac=N))) %>% as.data.frame() %>% mutate(VAL.NUM=VAL, SET="10-CV") rownames(VEC)<-NULL return(VEC)})) PerfVal <- do.call("rbind.data.frame", lapply(names(MOD.LIST), function(VAL){ PatientInfo.Val<-DATA.LIST[[VAL]]$VAL$PHENO.VAL rownames(PatientInfo.Val) <- PatientInfo.Val$participant_id X.Val<-DATA.LIST[[VAL]]$VAL$DATA.VAL Y.Val<-PatientInfo.Val[rownames(PatientInfo.Val), 'MFC_p30'] names(Y.Val)<-rownames(PatientInfo.Val) Preds.Test<-mutate(predict(MOD.LIST[[VAL]]$glmnet, newdata=t(X.Val), type="prob"), pred=ifelse(highResponder>=0.5, "highResponder", "lowResponder"), PTID=colnames(X.Val), obs=as.character(droplevels(Y.Val[colnames(X.Val)]))) CONF.MAT<-caret::confusionMatrix(data=factor(Preds.Test$pred, levels = c("highResponder", "lowResponder")), reference=factor(Preds.Test$obs, levels = c("highResponder", "lowResponder"))) ROC.OBJ<-pROC::roc(response = Preds.Test$obs, levels=c("lowResponder", "highResponder"), direction="<", predictor = Preds.Test$highResponder) AUC.CI<-as.vector(pROC::ci.auc(ROC.OBJ, method="bootstrap", boot.n=2000)) names(AUC.CI)<-c("AUC.CI.LOW", "AUC", "AUC.CI.HIGH") N = length(Preds.Test$PTID) VEC=t(as.data.frame(c(CONF.MAT$overall, CONF.MAT$byClass, AUC.CI, n_vac=N))) %>% as.data.frame() %>% mutate(VAL.NUM=VAL, SET="VAL") rownames(VEC)<-NULL return(VEC) })) db.performance.All <- full_join(x=Perf10CV, y=PerfVal) pd<-position_dodge(1) p<-ggplot(db.performance.All, aes(x=VAL.NUM, y=as.numeric(AUC), fill=SET, group=interaction(VAL.NUM, SET)))+ geom_bar(stat='identity', show.legend = T, position = pd)+ scale_fill_brewer(palette = "Set3")+ geom_errorbar(aes(ymin = AUC.CI.LOW, ymax = AUC.CI.HIGH), width = 0.25, position = pd)+ labs(y="AUC (CI)", x=NULL)+ theme_bw()+ geom_hline(yintercept = 0.5, linetype="dashed")+ theme(axis.text.x = element_text(angle = 90, vjust = 0.5, hjust=1 ,size=8))+ scale_x_discrete(labels=c(Val1="Influenza (IN)", Val2="Influenza (LV)", Val3="Meningococcus (CJ)\nMeningococcus (PS)", Val4="Tuberculosis (RVV)\nPneumococcus (PS)\nSmallpox (LV)", Val5="Varicella Zoster (LV)", Val6="Yellow Fever (LV)"))+ ylim(0,1) p
Ext. Data Figure 4e. AUC barplot of antibody response prediction performance of the 10-fold cross-validation GLM classifier. Data are presented as mean values + /- 95% confidence interval. n = 2000 bootstrap replicates.
#Generate D3 results load(paste0(extra_data_dir,"/ML.Data.D3FCH.BTM.NoFilter.rda")) load(paste0(extra_data_dir,"/PD.Master.D3FCH.BTM.rda")) PatientInfo<-subset(PatientInfo, MFC_p30%in%c("lowResponder","highResponder")) PatientInfo<-droplevels(PatientInfo) PatientInfo$MFC_p30<-factor(PatientInfo$MFC_p30, levels = c("highResponder", "lowResponder")) DevID<-rownames(PatientInfo) DevID<-intersect(DevID, colnames(data.ML)) X.Dev<-data.ML[,DevID] rownames(X.Dev)<-strsplit2(rownames(X.Dev), "_")[,1] Y.Dev<-PatientInfo[DevID, 'MFC_p30'] names(Y.Dev)<-DevID X.Dev <- t(X.Dev) SEED<-1987 set.seed(SEED) ctrl=trainControl(method = "cv", number = 10, selectionFunction = 'best', returnResamp = "final", classProbs = TRUE, returnData = F, verboseIter = F, summaryFunction = twoClassSummary, savePredictions = "final") set.seed(SEED) model_list <- caretList( x=X.Dev, y=Y.Dev[rownames(X.Dev)], trControl=ctrl, methodList=c("glmnet"), metric="ROC", #tuneLength=5, maximize=T, preProc = c("center", "scale")) whichTwoPct <- best(model_list$glmnet$results, metric = "ROC", maximize = TRUE) model_list$glmnet$results[whichTwoPct,] ROC.OBJ<-pROC::roc(response = model_list$glmnet$pred$obs, levels=c("lowResponder", "highResponder"), direction="<", predictor = model_list$glmnet$pred$highResponder, plot=F, print.auc=F) AUC.CI<-as.vector(pROC::ci.auc(ROC.OBJ, method="bootstrap", boot.n=2000)) names(AUC.CI)<-c("AUC.CI.LOW", "AUC", "AUC.CI.HIGH") #save(model_list, file=paste0(extra_data_dir,"/FinalModelD3FCH.rda")) db.preds.all <- model_list$glmnet$pred %>% mutate(participant_id=rownames(X.Dev)[rowIndex]) %>% left_join(y=PatientInfo %>% dplyr::select(participant_id, pt)) perf.stats.by.vaccine<-plyr::ddply(.data=db.preds.all, .variable=c("pt"), .fun=function(df){ if(nrow(df)!=0 & length(unique(df$obs))==2){ ROC.OBJ<-pROC::roc(response = df$obs, levels=c("lowResponder", "highResponder"), direction="<", predictor = df$highResponder) AUC.CI<-as.vector(pROC::ci.auc(ROC.OBJ, method="bootstrap", boot.n=2000)) names(AUC.CI)<-c("AUC.CI.LOW", "AUC", "AUC.CI.HIGH") DB2 <- data.frame(AUC=ROC.OBJ$auc[1], AUC.CI.LOW=AUC.CI['AUC.CI.LOW'], AUC.CI.HIGH=AUC.CI['AUC.CI.HIGH']) return(DB2)}else{return(NULL)}}, .parallel=F) perf.stats.by.vaccine<-arrange(perf.stats.by.vaccine, desc(AUC)) perf.stats.by.vaccine.master <- perf.stats.by.vaccine %>% mutate(TimePoint="Day3-fold-change") #Generate D7 results load(paste0(extra_data_dir,"/ML.Data.D7FCH.BTM.NoFilter.rda")) load(paste0(extra_data_dir,"/PD.Master.D7FCH.BTM.rda")) PatientInfo<-subset(PatientInfo, MFC_p30%in%c("lowResponder","highResponder")) PatientInfo<-droplevels(PatientInfo) PatientInfo$MFC_p30<-factor(PatientInfo$MFC_p30, levels = c("highResponder", "lowResponder")) DevID<-rownames(PatientInfo) DevID<-intersect(DevID, colnames(data.ML)) X.Dev<-data.ML[,DevID] rownames(X.Dev)<-strsplit2(rownames(X.Dev), "_")[,1] Y.Dev<-PatientInfo[DevID, 'MFC_p30'] names(Y.Dev)<-DevID X.Dev <- t(X.Dev) SEED<-1987 set.seed(SEED) ctrl=trainControl(method = "cv", number = 10, selectionFunction = 'best', returnResamp = "final", classProbs = TRUE, returnData = F, verboseIter = F, summaryFunction = twoClassSummary, savePredictions = "final") set.seed(SEED) model_list <- caretList( x=X.Dev, y=Y.Dev[rownames(X.Dev)], trControl=ctrl, methodList=c("glmnet"), metric="ROC", #tuneLength=5, maximize=T, preProc = c("center", "scale")) whichTwoPct <- best(model_list$glmnet$results, metric = "ROC", maximize = TRUE) model_list$glmnet$results[whichTwoPct,] ROC.OBJ<-pROC::roc(response = model_list$glmnet$pred$obs, levels=c("lowResponder", "highResponder"), direction="<", predictor = model_list$glmnet$pred$highResponder, plot=F, print.auc=F) AUC.CI<-as.vector(pROC::ci.auc(ROC.OBJ, method="bootstrap", boot.n=2000)) names(AUC.CI)<-c("AUC.CI.LOW", "AUC", "AUC.CI.HIGH") #save(model_list, file=paste0(extra_data_dir,"/FinalModelD7FCH.rda") db.preds.all <- model_list$glmnet$pred %>% mutate(participant_id=rownames(X.Dev)[rowIndex]) %>% left_join(y=PatientInfo %>% dplyr::select(participant_id, pt)) perf.stats.by.vaccine<-plyr::ddply(.data=db.preds.all, .variable=c("pt"), .fun=function(df){ if(nrow(df)!=0 & length(unique(df$obs))==2){ ROC.OBJ<-pROC::roc(response = df$obs, levels=c("lowResponder", "highResponder"), direction="<", predictor = df$highResponder) AUC.CI<-as.vector(pROC::ci.auc(ROC.OBJ, method="bootstrap", boot.n=2000)) names(AUC.CI)<-c("AUC.CI.LOW", "AUC", "AUC.CI.HIGH") DB2 <- data.frame(AUC=ROC.OBJ$auc[1], AUC.CI.LOW=AUC.CI['AUC.CI.LOW'], AUC.CI.HIGH=AUC.CI['AUC.CI.HIGH']) return(DB2)}else{return(NULL)}}, .parallel=F) perf.stats.by.vaccine<-arrange(perf.stats.by.vaccine, desc(AUC)) perf.stats.by.vaccine %>% mutate(TimePoint="Day7-fold-change") %>% full_join(y=perf.stats.by.vaccine.master) -> perf.stats.by.vaccine.master p<-ggplot(perf.stats.by.vaccine.master, aes(x=pt, y=AUC, fill=pt))+ geom_bar(stat='identity', show.legend = T, position = pd)+ #scale_fill_brewer(palette = "Set3")+ geom_errorbar(aes(ymin = AUC.CI.LOW, ymax = AUC.CI.HIGH), width = 0.25, position = pd)+ labs(y="Mean AUC (95% CI)", x=NULL)+ facet_wrap(~TimePoint, nrow=2)+ theme_bw()+ theme(axis.text.x = element_text(angle = 60, hjust = 1, size=8))+ geom_hline(yintercept = 0.5, linetype="dashed", color="red")+ ylim(0,1) p
Figure 4c. Kinetics of the predictive power of M156.1 across vaccines. For each vaccine/time point combination, the AUC was computed based on difference in the geometric mean of the fold changes of the genes in the M156.1 between high and low responders
target_genesets <- c("plasma cells, immunoglobulins (M156.1)") eset <- young_and_old.noNorm.withResponse %>% keep_most_recent_prevax_time_point(drop_postvax = FALSE) %>% add_response_class_column() %>% keep_only_high_and_low_responders(col = "response_class") %>% remove_any_NA_rows() exprs_df <- Biobase::exprs(eset) pdata_df <- Biobase::pData(eset) # Make sure order of uids in pdata_df and exprs_df match stopifnot(all(rownames(pdata_df) == pdata_df$uid)) pdata_df <- pdata_df[colnames(exprs_df), ] # Match every sample in exprs_df with the corresponding baseline uid baseline_uids <- pdata_df %>% dplyr::group_by(pathogen, vaccine_type, study_accession, participant_id) %>% dplyr::mutate(baseline_uid = uid[time_post_last_vax <= 0]) %>% dplyr::pull(baseline_uid) baseline_exprs_df <- exprs_df[, baseline_uids] postvax_cols <- !(colnames(exprs_df) %in% unique(baseline_uids)) temporal_log2fc_withResponse_exprs <- exprs_df[, postvax_cols] - baseline_exprs_df[, postvax_cols] temporal_log2fc_withResponse_pdata <- Biobase::pData(eset)[ colnames(temporal_log2fc_withResponse_exprs), ] temporal_log2fc_withResponse_eset <- new("ExpressionSet", exprs = temporal_log2fc_withResponse_exprs, phenoData = new('AnnotatedDataFrame', temporal_log2fc_withResponse_pdata)) # Keep only study time points that have both high- and low-responders temporal_log2fc_withResponse_eset.filtered_pd <- Biobase::pData(temporal_log2fc_withResponse_eset) %>% dplyr::group_by(age_group, pathogen, vaccine_type, study_accession, time_post_last_vax) %>% dplyr::filter(all(c("lowResponder", "highResponder") %in% response_class)) %>% dplyr::ungroup() %>% tibble::column_to_rownames(var = "uid") temporal_log2fc_withResponse_eset.filtered_pd$uid <- rownames(temporal_log2fc_withResponse_eset.filtered_pd) temporal_log2fc_withResponse_eset.filtered_exprs <- Biobase::exprs(temporal_log2fc_withResponse_eset)[ , rownames(temporal_log2fc_withResponse_eset.filtered_pd)] temporal_log2fc_withResponse_eset.filtered <- new("ExpressionSet", exprs = temporal_log2fc_withResponse_eset.filtered_exprs, phenoData = new('AnnotatedDataFrame', temporal_log2fc_withResponse_eset.filtered_pd)) btm_list.filtered <- keep_symbols( genesets.symbol = btm_list, symbols2keep = rownames(temporal_log2fc_withResponse_eset.filtered)) calc_sig_df <- function(eset, sig_genes_up, sig_genes_down) { sig_scores_up <- rep(0, ncol(eset)) sig_scores_down <- rep(0, ncol(eset)) if (length(sig_genes_up) > 0) sig_scores_up <- Biobase::esApply( X = eset[sig_genes_up, ], MARGIN = 2, FUN = mean) if (length(sig_genes_down) > 0) sig_scores_down <- Biobase::esApply( X = eset[sig_genes_down, ], MARGIN = 2, FUN = mean) tibble::tibble( uid = colnames(eset), sig_score = sig_scores_up - sig_scores_down, labels = as.numeric(Biobase::pData(eset)$response_class == "highResponder")) } calc_auc_df <- function(x) { x %>% dplyr::group_by(pathogen, vaccine_type, study_accession, time_post_last_vax) %>% tidyr::nest() %>% dplyr::mutate( auc_data = purrr::map( .x = data, ~{ res <- MetaIntegrator::calculateROC(labels = .x$labels, predictions = .x$sig_score) data.frame( n_HR = sum(.x$response_class == "highResponder"), n_LR = sum(.x$response_class == "lowResponder"), n = nrow(.x), AUC = res$auc, AUC_low = res$auc.CI[1], AUC_high = res$auc.CI[2]) } ) ) %>% dplyr::ungroup() %>% dplyr::select(-data) %>% tidyr::unnest(auc_data) } auc_df <- NULL for (target_geneset in target_genesets) { target_genes <- btm_list.filtered[[target_geneset]] sig_df <- calc_sig_df(eset = temporal_log2fc_withResponse_eset.filtered, sig_genes_up = target_genes, sig_genes_down = c()) pd_sig_df <- dplyr::left_join( Biobase::pData(temporal_log2fc_withResponse_eset.filtered), sig_df, by = "uid") auc_df <- rbind(auc_df, calc_auc_df(x = pd_sig_df) %>% dplyr::mutate(GeneSet = target_geneset)) } plot_df <- auc_df %>% dplyr::group_by(pathogen, vaccine_type, GeneSet, time_post_last_vax) %>% dplyr::summarise(mean_AUC = sum(n * AUC) / sum(n), n = sum(n)) %>% dplyr::mutate(pvt = paste0(pathogen, " (", vaccine_type, ")")) for (cur_geneset in target_genesets) { p <- ggplot(plot_df %>% dplyr::filter(n >= 5, time_post_last_vax <= 30, GeneSet == cur_geneset), aes(x = time_post_last_vax, y = mean_AUC, color = pvt)) + geom_hline(yintercept = .5, color = "gray", linetype = "dashed") + geom_vline(xintercept = 7, color = "gray", linetype = "dashed") + geom_line() + geom_point() + scale_color_ptol() + theme_bw() + facet_wrap(vars(GeneSet), ncol = 1) print(p) }
Figure 4d-e. Weighted ROC curves and per-vaccine AUC bar plot for a logistic regression classifier using M156.1 expression either at day 7 in all vaccines (day 7) or at the vaccine-specific peak expression time point. FPR, false positive rate; TPR, true positive rate. Data are presented as mean values ± 95% CIs. n = 2,000 bootstrap replicates.
#Set peak timepoints Peak.List <- list( D7FCH=c("Meningococcus_CJ", "Hepatitis A/B_IN/RP", "Influenza_LV", "Influenza_IN", "Tuberculosis_RVV", "Meningococcus_PS", "Pneumococcus_PS", "Varicella Zoster_LV"), D14FCH=c("Yellow Fever_LV"), D21FCH=c("Smallpox_LV")) data.ML.final <- do.call("cbind.data.frame", lapply(names(Peak.List), function(TIME){ load(paste0(extra_data_dir,"/",paste0("ML.Data.",TIME,".BTM.NoFilter.rda"))) load(paste0(extra_data_dir,"/",paste0("PD.Master.",TIME,".BTM.rda"))) keep.id<-subset(PatientInfo, MFC_p30%in%c("lowResponder","highResponder") & pt%in%Peak.List[[TIME]])$participant_id data.ML.temp<-data.ML[,keep.id] return(data.ML.temp) })) PatientInfo.Final <- do.call("rbind.data.frame", lapply(names(Peak.List), function(TIME){ load(paste0(extra_data_dir,"/",paste0("ML.Data.",TIME,".BTM.NoFilter.rda"))) load(paste0(extra_data_dir,"/",paste0("PD.Master.",TIME,".BTM.rda"))) keep.id<-subset(PatientInfo, MFC_p30%in%c("lowResponder","highResponder") & pt%in%Peak.List[[TIME]])$participant_id data.ML.temp<-PatientInfo[keep.id,] return(data.ML.temp) })) PatientInfo.Final<-droplevels(PatientInfo.Final) PatientInfo.Final$MFC_p30<-factor(PatientInfo.Final$MFC_p30, levels = c("highResponder", "lowResponder")) save(PatientInfo.Final, data.ML.final, file=paste0(extra_data_dir,"/ML.Data.D7FCH.BTM.SELECT.PEAK.NoFilter.rda")) TIMES<-c("D7FCH.BTM", "D7FCH.BTM.SELECT") VERSION<-c(".PEAK") db.run<-expand.grid(TIMES, VERSION) MODEL.LIST<-lapply(1:nrow(db.run), function(i){ Time=db.run[i, "Var1"] # eg BL.GENES, BL.BTM VERSION<-db.run[i, "Var2"] #SEED=1987 SEED=42 # Load Data ### Get ID with Both ### Added from prior ImmuneSpace version ID.UNIFORM<-get(load(paste0(extra_data_dir,"/ML.Data.D7FCH.BTM.NoFilter.rda"))) load(paste0(extra_data_dir,"/ML.Data.D7FCH.BTM.SELECT.PEAK.NoFilter.rda")) ID.SELECT <- data.ML.final ID.KEEP<-intersect(colnames(ID.UNIFORM), colnames(ID.SELECT)) if(grepl("SELECT",Time)==FALSE){ load(paste0(extra_data_dir,"/ML.Data.",Time,".NoFilter.rda")) load(paste0(extra_data_dir,"/PD.Master.",Time,".rda")) data.ML.filter=data.ML} if(grepl("SELECT",Time)==TRUE){ Time2<-paste0(Time, VERSION) load(paste0(extra_data_dir,"/ML.Data.",Time2,".NoFilter.rda")) data.ML.filter=data.ML.final PatientInfo=PatientInfo.Final} PatientInfo<-subset(PatientInfo, MFC_p30%in%c("lowResponder","highResponder")) PatientInfo<-droplevels(PatientInfo) PatientInfo$MFC_p30<-factor(PatientInfo$MFC_p30, levels = c("highResponder", "lowResponder")) DevID<-rownames(PatientInfo) DevID<-intersect(DevID, colnames(data.ML.filter)) X.Dev<-data.ML.filter[,DevID] rownames(X.Dev)<-strsplit2(rownames(X.Dev), "_")[,1] Y.Dev<-PatientInfo[DevID, 'MFC_p30'] names(Y.Dev)<-DevID keep.genes<-grep("M156.1", rownames(X.Dev), value = T) X.Train.t1<-t(X.Dev[keep.genes,]) Y.Train1<-PatientInfo[rownames(X.Train.t1), 'MFC_p30'] names(Y.Train1)<-rownames(X.Train.t1) X.Train.t1<-X.Train.t1[intersect(ID.KEEP, rownames(X.Train.t1)),,drop=F] set.seed(SEED) ctrl=trainControl(method = "LGOCV", number = 200, p=0.8, selectionFunction = 'best', search="grid", returnResamp = "final", classProbs = TRUE, returnData = F, verboseIter = F, summaryFunction = twoClassSummary, sampling=NULL, savePredictions = "final") set.seed(SEED) model_list <- train( x=X.Train.t1, y=Y.Train1[rownames(X.Train.t1)], trControl=ctrl, method=c("glm"), metric="ROC", maximize=TRUE, preProc = c("center", "scale")) for(i in 1:length(model_list)){ model_list$pred$participant_id<-rownames(X.Train.t1)[model_list$pred$rowIndex] model_list$pred$pt=PatientInfo[model_list$pred$participant_id,"pt"] } return(model_list)}) names(MODEL.LIST)<-paste0(db.run$Var1, db.run$Var2) # save(MODEL.LIST, # file=paste0(extra_data_dir,"/PeakPredictions_Master_ModelList.rda")) # load(paste0(extra_data_dir,"/PeakPredictions_Master_ModelList.rda")) db.preds.D7FCH <- MODEL.LIST$D7FCH.BTM.PEAK$pred %>% mutate(TIME="D7FCH") db.preds.PEAK <- MODEL.LIST$D7FCH.BTM.SELECT.PEAK$pred %>% mutate(TIME="PEAK.SELECT") db.preds.all <- full_join(db.preds.D7FCH, db.preds.PEAK) db.ROCw<-plyr::ddply(db.preds.all, c("Resample", "TIME"), function(df){ tab1<-table(df$pt) tab1<-1/tab1 df$WT<-tab1[df$pt] ROCw<-WeightedROC(guess=df$highResponder, label=ifelse(df$obs=="highResponder", 1,0), weight = df$WT) db.res <-data.frame(WeightedAUC=WeightedAUC(ROCw)) }) Fig.D7FCH <- lme(WeightedAUC~1, random = ~1|Resample, data=filter(db.ROCw, TIME=="D7FCH")) MEAN.CI.D7FCH <- as.data.frame(emmeans::emmeans(Fig.D7FCH, ~1)) %>% mutate(TIME="D7FCH") Fig.PEAK.SELECT <- lme(WeightedAUC~1, random = ~1|Resample, data=filter(db.ROCw, TIME=="PEAK.SELECT")) MEAN.CI.PEAK.SELECT <- as.data.frame(emmeans::emmeans(Fig.PEAK.SELECT, ~1)) %>% mutate(TIME="PEAK.SELECT") MEAN.CI <- full_join(MEAN.CI.D7FCH, MEAN.CI.PEAK.SELECT) ### Weighted ROC ### tab1<-table(db.preds.D7FCH$pt) tab1<-1/tab1 db.preds.D7FCH$WT<-tab1[db.preds.D7FCH$pt] ROC.UNIFORM<-WeightedROC(guess=db.preds.D7FCH$highResponder, label=ifelse(db.preds.D7FCH$obs=="highResponder", 1,0), weight = db.preds.D7FCH$WT) ROC.UNIFORM$TYPE="Day7" ROC.UNIFORM$WeightedAUC=WeightedAUC(ROC.UNIFORM) tab1<-table(db.preds.PEAK$pt) tab1<-1/tab1 db.preds.PEAK$WT<-tab1[db.preds.PEAK$pt] ROC.PEAK<-WeightedROC(guess=db.preds.PEAK$highResponder, label=ifelse(db.preds.PEAK$obs=="highResponder", 1,0), weight = db.preds.PEAK$WT) ROC.PEAK$TYPE="Peak" ROC.PEAK$WeightedAUC=WeightedAUC(ROC.PEAK) ROC.db<-rbind.data.frame(ROC.PEAK, ROC.UNIFORM) p=ggplot(ROC.db, aes(x=FPR, y=TPR, color=TYPE))+ geom_path()+ scale_color_manual(values=c(Day7="gray60", Peak="black"))+ coord_equal() + theme_classic()+ theme(legend.position = "bottom") p ### ROC per Vaccine ### perf.stats.by.vaccine<-plyr::ddply(.data=db.preds.all, .variable=c("pt", "TIME", "Resample"), .fun=function(df){ if(nrow(df)!=0 & length(unique(df$obs))==2){ ROC.OBJ<-pROC::roc(response = df$obs, levels=c("lowResponder", "highResponder"), direction="<", predictor = df$highResponder) DB2 <- data.frame(AUC=ROC.OBJ$auc) return(DB2)}else{return(NULL)}}, .parallel=F) perf.stats.by.vaccine<-arrange(perf.stats.by.vaccine, desc(AUC)) plyr::ddply(perf.stats.by.vaccine, c("TIME", "pt"), function(df){ print(range(df$AUC)) if(min(df$AUC)!=1){ FIT<-lme(AUC~1, random = ~1|Resample, data=df) MEAN.CI <- as.data.frame(emmeans::emmeans(FIT, ~1)) } if(min(df$AUC)==1){MEAN.CI<-data.frame(`1`="overall", emmean=1, SE=NA, df=NA, lower.CL=1, upper.CL=1)} return(MEAN.CI) }) %>% arrange(pt) -> db.plot.ByVac pd<-position_dodge(0.9) p<-ggplot(db.plot.ByVac, aes(x=reorder(pt, -emmean), y=emmean, fill=TIME))+ geom_bar(stat='identity', show.legend = T, position = pd)+ scale_fill_brewer(palette = "Set3")+ geom_errorbar(aes(ymin = lower.CL, ymax = upper.CL), width = 0.25, position = pd)+ labs(y="AUC (CI 95%)", x=NULL)+ theme_bw()+ theme(axis.text.x = element_text(angle = 60, hjust = 1, size=8))+ ylim(0,1) p
Figure 6a. Box plots of day 30 antibody responses to vaccination in young (≤50 years) and older (≥60 years) participants across vaccines. The center line represents the median, box limits represent upper and lower quartiles, and whiskers indicate 1.5 times the interquartile range.
eset=young_and_old.noNorm.withResponse #Create combined vaccine type/pathogen column eset$pt=paste(eset$pathogen," (",eset$vaccine_type,")", sep='') #Subset to day 0 eset=eset[,eset$time_post_last_vax==0] #Subset to within 7 days of day 28 for ab response, then set to day 28 eset=eset[,eset$ImmResp_postVax_timepoint_MFC>=21 & eset$ImmResp_postVax_timepoint_MFC<=35] eset$ImmResp_postVax_timepoint_MFC=28 #Set baseline timepoint to 0 eset$ImmResp_baseline_timepoint_MFC=0 #For each vaccine, only use one ab type (most common) pt_uni=unique(eset$pt) for (i in 1:length(pt_uni)) { most_ab_type=tail(names(sort(table(eset$assay[eset$pt==pt_uni[i]]))), 1) ind_rem=which(eset$pt==pt_uni[i] & eset$assay!=most_ab_type) if (is_empty(ind_rem)==FALSE) { eset=eset[,-ind_rem] } } #Keep only studies with young and elderly sdy_y_e=names(which(sapply(unique(eset$study_accession), function(x) length(unique(eset$age_group[eset$study_accession==x]))>1))) eset=eset[,eset$study_accession %in% sdy_y_e] #Create dataframe from pdata df=data.frame(Group=eset$age_group, Vaccine=eset$pt, MFC=eset$MFC) df$Group=factor(df$Group, levels=unique(df$Group)) #Plot p=ggplot(df, aes(x=Vaccine, y=MFC, fill=Group)) + geom_boxplot(position = position_dodge(0.6), outlier.shape=NA, width=0.5, notch=FALSE, alpha=1)+ geom_point(position=position_jitterdodge(jitter.width=0.2, dodge.width=0.6), pch=21)+ stat_compare_means(aes(group=Group), hide.ns=TRUE, label='p.signif')+ ylab('log2(Ab FC)')+ xlab('')+ theme_bw()+ theme(text=element_text(size=16), axis.text.x=element_text(angle=45, hjust=1)) p
Figure 6b. Modules differentially expressed between young and older participants in response to inactivated influenza vaccination (QuSAGE FDR < 0.05). (Figure 6c created in Cytoscape)
#Generate young vs old QuSAGE results longitudinal_qusage_young_vs_old_df_path <- file.path(extra_data_dir, "longitudinal_qusage_young_vs_old_df.rds") if (file.exists(longitudinal_qusage_young_vs_old_df_path)) { longitudinal_qusage_young_vs_old_df <- readRDS(longitudinal_qusage_young_vs_old_df_path) } else { # Filter eset and gene sets ---------------------------------------------------- eset <- young_and_old.noNorm %>% keep_most_recent_prevax_time_point(drop_postvax = FALSE) %>% remove_any_NA_rows() btm_list.filtered <- keep_symbols(genesets.symbol = btm_list, symbols2keep = rownames(eset), min_size = 2) # Longitudinal GSEA of young vs old -------------------------------------------- analysis_df <- Biobase::pData(eset) %>% dplyr::filter(time_post_last_vax > 0) %>% dplyr::select(pathogen, vaccine_type, time_post_last_vax, study_accession) %>% dplyr::arrange(pathogen, vaccine_type, time_post_last_vax, study_accession) %>% dplyr::distinct() %>% dplyr::mutate(qusage_contrast = NA, qusage_output = list(NULL), error = list(NULL)) %>% dplyr::as_tibble() safe_qusage <- purrr::safely(.f = qusage::qusage, otherwise = NULL, quiet = TRUE) for (row_nr in 1:nrow(analysis_df)) { row <- analysis_df[row_nr,, drop = FALSE] cat(paste0("Row ", row_nr, "/", nrow(analysis_df), ": ", row$pathogen, " ", row$vaccine_type, " ", row$time_post_last_vax, " ", row$study_accession, "\n")) cur_eset <- eset[, eset$pathogen == row$pathogen & eset$vaccine_type == row$vaccine_type & eset$study_accession == row$study_accession & (eset$time_post_last_vax <= 0 | eset$time_post_last_vax == row$time_post_last_vax)] prevax_string <- "baseline" postvax_string <- paste0("Day", row$time_post_last_vax) qusage.labels <- paste0( cur_eset$age_group, ".", ifelse(test = cur_eset$time_post_last_vax == row$time_post_last_vax, yes = postvax_string, no = prevax_string)) qusage.contrast <- paste0( "(old.", postvax_string, "-old.", prevax_string, ") - ", "(young.", postvax_string, "-young.", prevax_string, ")") result <- safe_qusage(eset = cur_eset, labels = qusage.labels, contrast = qusage.contrast, geneSets = btm_list.filtered, pairVector = cur_eset$participant_id) if (!is.null(result$result)) analysis_df$qusage_output[[row_nr]] <- result$result analysis_df$qusage_contrast[row_nr] <- qusage.contrast if (!is.null(result$error)) analysis_df$error[[row_nr]] <- result$error } saveRDS(analysis_df, longitudinal_qusage_young_vs_old_df_path) longitudinal_qusage_young_vs_old_df=analysis_df } #Combine results across vaccines longitudinal_qusage_young_vs_old_meta_df_path <- file.path(extra_data_dir, "longitudinal_qusage_young_vs_old_meta_df.rds") if (file.exists(longitudinal_qusage_young_vs_old_meta_df_path)) { longitudinal_qusage_young_vs_old_meta_df <- readRDS(longitudinal_qusage_young_vs_old_meta_df_path) } else { longitudinal_qusage_young_vs_old_meta_df <- longitudinal_qusage_young_vs_old_df %>% meta_qusage(group_cols = c("pathogen", "vaccine_type", "time_post_last_vax")) saveRDS(longitudinal_qusage_young_vs_old_meta_df, longitudinal_qusage_young_vs_old_meta_df_path) } #Tidy results tidy_longitudinal_qusage_young_vs_old_meta_df <- longitudinal_qusage_young_vs_old_meta_df %>% tidy_qusage_output(col = "meta_qusage_output") %>% dplyr::mutate(FDR = p.adjust(p.value, method = "BH"), FDR_signif_label = create_signif_label(FDR)) #Plot plot_df <- tidy_longitudinal_qusage_young_vs_old_meta_df %>% dplyr::filter(FDR < FDR_THRESHOLD) %>% dplyr::arrange(activity_score) %>% dplyr::mutate(geneset = factor(geneset, levels = unique(geneset)), pvt = paste0(pathogen, " (", vaccine_type, ")")) pvts <- plot_df %>% dplyr::pull(pvt) %>% unique() for (cur_pvt in pvts) { p <- ggplot(plot_df %>% dplyr::filter(pvt == cur_pvt), aes(x = geneset, y = activity_score, fill = activity_score > 0)) + labs(x = "BTM", y = "Activity score") + geom_bar(stat = "identity") + geom_text(aes(label = FDR_signif_label), size = 6, y = 0.5) + ylim(c(NA, 0.6)) + coord_flip() + scale_fill_manual(values = c("TRUE" = "firebrick", "FALSE" = "navy")) + theme_bw() + theme(legend.position = "none") + facet_grid(.~time_post_last_vax) + ggtitle(cur_pvt) print(p) }
Figure 6d. Bar plot of the day 7 AUC of module M156.1 across vaccines in young and older participants. Data are mean values ± 95% CIs. n = 2,000 bootstrap replicates.
exp_FC_old_path <- file.path(extra_data_dir, "Old_FCH_esetList.BTM.rda") eset <- readRDS(file.path(data_dir, paste0(date_prefix, "old_norm_batchCorrectedFromYoung_withResponse_eset.rds"))) #Abbreviate vaccine types eset$vaccine_type_full=eset$vaccine_type eset$vaccine_type=str_replace_all(eset$vaccine_type,vaccine_abrev) #Merge pre and post-vax samples to same matrix for SDY1529 eset$matrix[eset$study_accession=='SDY1529']='SDY1529_WholeBlood_HealthyAdults' # Code below adds row-names equal to uid (uid is columnID for the Expression Data) rownames(pData(eset))<-pData(eset)$uid # ensuring order of pheno-data matchs expression data. pData(eset)<-pData(eset)[colnames(Biobase::exprs(eset)),] #Create column with combined vaccine type/pathogen eset$pt=paste0(eset$pathogen,"_",eset$vaccine_type) # Time points to calculate tp_int=c(0, 7) #Create combined vaccine type/pathogen column eset$pt=paste0(eset$pathogen,"_",eset$vaccine_type) #Remove subjects with Inf/NA MFC measurements eset=eset[,!is.na(eset$MFC)] eset=eset[,!(eset$MFC == Inf)] #For subjects with D-7 but no D0 measurement, convert to D0 for FC calculation sub_noD0=setdiff(eset[,which(eset$study_time_collected==-7)]$participant_id,eset[,which(eset$study_time_collected==0)]$participant_id) eset$study_time_collected[intersect(which(eset$study_time_collected==-7),which(eset$participant_id %in% sub_noD0))]=0 #Find samples from timepoints of interest ind=lapply(tp_int, function(x) which(eset$study_time_collected==x)) #Retain only samples from timepoints of interest eset=eset[,Reduce(union,ind)] #Recompute timepoint indices after removing extraneous timepoints ind=lapply(tp_int, function(x) which(eset$study_time_collected==x)) #Remove samples from a single study with fewer than sample_cutoff samples at any timepoint matrix_uni_tp=lapply(ind,function(x) unique(eset[,x]$matrix)) matrix_ind=lapply(1:length(ind),function(x) lapply(1:length(matrix_uni_tp[[x]]), function(y) which(matrix_uni_tp[[x]][[y]]==eset[,ind[[x]]]$matrix))) ind_cut_all=vector() for (i in 1:length(matrix_ind)) { ind_cut=which(sapply(matrix_ind[[i]],length)<sample_cutoff) if (is_empty(ind_cut)==FALSE) { for (j in 1:length(ind_cut)) { ind_cut_all=c(ind_cut_all,ind[[i]][matrix_ind[[i]][[ind_cut[j]]]]) } } } if (is_empty(ind_cut_all)==FALSE) { eset=eset[,-ind_cut_all] } #Recompute timepoint indices after removing samples tp_int=unique(eset$study_time_collected[which(eset$study_time_collected>=0)]) ind=lapply(tp_int, function(x) which(eset$study_time_collected==x)) PD<-pData(eset) unique(subset(PD,participant_id%in%participant_id[ind[[2]]])$study_time_collected) eset=eset[complete.cases(Biobase::exprs(eset)),] #Convert to ssGSEA scores and compute FC, unless cache alr if (file.exists(exp_FC_old_path)) { load(exp_FC_old_path) } else { BTM_list <- readRDS(file = file.path(data_dir, "btm_list_2020_12_23.rds")) # Gene names GeneNames<-featureNames(eset) #Remove BTMs which have no matching genes in dataset BTM_list=BTM_list[!sapply(BTM_list, function(x){is_empty(intersect(x, GeneNames))})] eset_BTM<-gsva(expr=eset, ## Gene-expression data, rows genes, cols samples BTM_list, ## PTW list verbose=TRUE, method="ssgsea") #Replace eset with BTM eset eset=eset_BTM rm(eset_BTM) #Compute D0 normalized FC - for individual genes/BTMs - exp_FC has 3 large matrix with D1, D3, D7 timepoints ind_D0=which(0==eset$study_time_collected) common=lapply(2:length(ind),function(x) intersect(eset$participant_id[ind[[x]]],eset$participant_id[ind_D0])) #get participant id with timepoint and D0 data ia=lapply(2:length(ind),function(x) na.omit(match(common[[x-1]],eset$participant_id[ind[[x]]]))) #included participant IDs match with participant IDs at diff timepoints in eset ib=lapply(2:length(ind),function(x) na.omit(match(common[[x-1]],eset$participant_id[ind_D0]))) #participant IDs match with D0 to get indices in ind_D0 exp_FC=lapply(2:length(ind),function(x) eset[,ind[[x]][ia[[x-1]]]]) #get expression data of the 3 timepoints exp_FC=lapply(2:length(ind), function(x) {Biobase::exprs(exp_FC[[x-1]])=Biobase::exprs(exp_FC[[x-1]])-Biobase::exprs(eset[,ind_D0[ib[[x-1]]]]) exp_FC[[x-1]] }) #compute FC names(exp_FC)<-paste0("Day", tp_int[-1]) exp_FC_old <- exp_FC save(exp_FC_old, file=exp_Fold_path) } #Wrangle data to ML format lapply(names(exp_FC_old), function(DAY){ eset<-exp_FC_old[[DAY]] PD<-pData(eset) # remove patients with duplicate time points PD$SubjTime<-paste0(PD$participant_id,"_",PD$study_time_collected) tab<-table(PD$SubjTime) patientsDuplic<-names(tab[tab!=1]) ## Keep only patients with 0, 1 and 3 days numbrero<-as.numeric(gsub("Day", "", DAY)) PD.sub<-subset(PD, study_time_collected%in%c(numbrero)) PD.use<-PD.sub PD.use<-droplevels(PD.use) UID<-PD.use$uid ## Prepare expression data EXP<-Biobase::exprs(eset) ## Figure out a good filter SDVec<-apply(EXP, 1, sd) GENES<-names(SDVec[SDVec>=quantile(SDVec,0.75)]) ## I will keep genes with an SD>= the median SD ## Need to wrangle this into ML format: Gene_Time by PTID EXP.melt<-reshape2::melt(EXP) colnames(EXP.melt)[2]="uid" EXP.PD<-inner_join(x=EXP.melt[EXP.melt$uid%in%PD.use$uid,], y=PD.use[,c("uid","participant_id","study_time_collected","time_post_last_vax", "age_imputed",'study_accession', "gender","pathogen","vaccine_type","maxRBA_p30","MFC_p30","MFC","maxRBA")], by="uid") EXP.PD$Gene_Time<-paste0(EXP.PD$Var1, "_",EXP.PD$study_time_collected) EXP.PD$pt<-paste0(EXP.PD$pathogen, "_", EXP.PD$vaccine_type) data.ML<-as.data.frame(dcast.data.table(as.data.table(EXP.PD), formula=Var1+Gene_Time+study_time_collected~participant_id, value.var = "value")) rownames(data.ML)<-gsub(paste0("_",numbrero,"$"), paste0("_D",numbrero,"FCH"), data.ML$Gene_Time) data.ML.filter<-data.ML[data.ML$Var1%in%GENES,-c(1:3)] OLD_data.ML.filter <- data.ML.filter save(OLD_data.ML.filter, file=paste0(extra_data_dir,"/OLD_ML.Data.", paste0("D",numbrero,"FCH."),"BTM.SDFilter75.rda")) data.ML<-data.ML[,-c(1:3)] OLD_data.ML <- data.ML save(OLD_data.ML, file=paste0(extra_data_dir,"/OLD_ML.Data.", paste0("D",numbrero,"FCH."),"BTM.NoFilter.rda")) PatientInfo<-EXP.PD[!duplicated(EXP.PD$participant_id), c('age_imputed','gender','pt','study_accession', 'maxRBA_p30','MFC_p30','participant_id','maxRBA','MFC')] rownames(PatientInfo)<-PatientInfo$participant_id PatientInfo<-as.data.frame(PatientInfo) OLD_PatientInfo <- PatientInfo save(OLD_PatientInfo, file=paste0(extra_data_dir,"/OLD_PD.Master.", paste0("D", numbrero, "FCH."),"BTM.rda")) }) #Prior to testing in elderly, need to pull out young data for comparison (does not get stored above as it is within a loop) i=1 Time=db.run[i, "Var1"] # eg BL.GENES, BL.BTM VERSION<-db.run[i, "Var2"] #SEED=1987 SEED=42 #Load data ### Get ID with Both ### Added from prior ImmuneSpace version ID.UNIFORM<-get(load(paste0(extra_data_dir,"/ML.Data.D7FCH.BTM.NoFilter.rda"))) load(paste0(extra_data_dir,"/ML.Data.D7FCH.BTM.SELECT.PEAK.NoFilter.rda")) ID.SELECT <- data.ML.final ID.KEEP<-intersect(colnames(ID.UNIFORM), colnames(ID.SELECT)) if(grepl("SELECT",Time)==FALSE){ load(paste0(extra_data_dir,"/ML.Data.",Time,".NoFilter.rda")) load(paste0(extra_data_dir,"/PD.Master.",Time,".rda")) data.ML.filter=data.ML} if(grepl("SELECT",Time)==TRUE){ Time2<-paste0(Time, VERSION) load(paste0(extra_data_dir,"/ML.Data.",Time2,".NoFilter.rda")) data.ML.filter=data.ML.final PatientInfo=PatientInfo.Final} PatientInfo<-subset(PatientInfo, MFC_p30%in%c("lowResponder","highResponder")) PatientInfo<-droplevels(PatientInfo) PatientInfo$MFC_p30<-factor(PatientInfo$MFC_p30, levels = c("highResponder", "lowResponder")) DevID<-rownames(PatientInfo) DevID<-intersect(DevID, colnames(data.ML.filter)) X.Dev<-data.ML.filter[,DevID] rownames(X.Dev)<-strsplit2(rownames(X.Dev), "_")[,1] Y.Dev<-PatientInfo[DevID, 'MFC_p30'] names(Y.Dev)<-DevID keep.genes<-grep("M156.1", rownames(X.Dev), value = T) X.Train.t1<-t(X.Dev[keep.genes,]) Y.Train1<-PatientInfo[rownames(X.Train.t1), 'MFC_p30'] names(Y.Train1)<-rownames(X.Train.t1) X.Train.t1<-X.Train.t1[intersect(ID.KEEP, rownames(X.Train.t1)),,drop=F] #Test predictions in elderly load(paste0(extra_data_dir,"/OLD_ML.Data.D7FCH.BTM.NoFilter.rda")) load(paste0(extra_data_dir,"/OLD_PD.Master.D7FCH.BTM.rda")) PTID_HIGH_LOW <- rownames(OLD_PatientInfo[OLD_PatientInfo$MFC_p30 %in% c("lowResponder", "highResponder"),]) MODEL_TO_USE <- MODEL.LIST$D7FCH.BTM.SELECT.PEAK DATA <- t(OLD_data.ML[,PTID_HIGH_LOW]) colnames(DATA) <- gsub("_D7FCH", "", colnames(DATA)) DATA <- DATA[,"plasma cells, immunoglobulins (M156.1)", drop=F] PREDS_OLD <- predict(MODEL_TO_USE, DATA, type = "prob") %>% rownames_to_column('participant_id') %>% full_join(y=dplyr::select(OLD_PatientInfo[PTID_HIGH_LOW,], participant_id, pt, MFC_p30)) %>% mutate(Age_Group="Older") PREDS_YOUNG <- predict(MODEL_TO_USE, X.Train.t1, type = "prob") %>% rownames_to_column('participant_id') %>% full_join(y=dplyr::select(PatientInfo[rownames(X.Train.t1),], participant_id, pt, MFC_p30)) %>% filter(pt %in% PREDS_OLD$pt) %>% mutate(Age_Group="Young") PREDS_ALL <- full_join(PREDS_OLD, PREDS_YOUNG) perf.stats.by.vaccine.OLD<-plyr::ddply(.data=PREDS_ALL, .variable=c("pt", "Age_Group"), .fun=function(df){ if(nrow(df)!=0 & length(unique(df$MFC_p30))==2){ ROC.OBJ<-pROC::roc(response = df$MFC_p30, levels=c("lowResponder", "highResponder"), direction="<", predictor = df$highResponder) CI <- pROC::ci.auc(ROC.OBJ) DB2 <- data.frame(AUC=ROC.OBJ$auc, CI_UP=CI[3], CI_DOWN=CI[1]) return(DB2)}else{return(NULL)}}, .parallel=F) perf.stats.by.vaccine.OLD<-arrange(perf.stats.by.vaccine.OLD, desc(AUC)) pd<-position_dodge(0.9) perf.stats.by.vaccine.OLD$AUC=as.numeric(perf.stats.by.vaccine.OLD$AUC) p<-ggplot(perf.stats.by.vaccine.OLD %>% mutate(Age_Group=factor(Age_Group, levels = c("Young", "Older"))), aes(x=reorder(pt, -AUC), y=AUC, fill=Age_Group))+ geom_bar(stat='identity', show.legend = T, position = pd)+ scale_fill_brewer(palette = "Set3")+ geom_errorbar(aes(ymin = CI_DOWN, ymax = CI_UP), width = 0.25, position = pd)+ labs(y="AUC (CI 95%)", x=NULL, fill="Age Group")+ theme_bw()+ theme(axis.text.x = element_text(angle = 60, hjust = 1, size=8))+ ylim(0,1) print(p)
Ext. Data Figure 5a. Scatterplots of module activity scores in each vaccine among young (x-axis) and older participants (y-axis) of the most commonly expressed modules (Fig. 2a) on days 1–7. Correlation coefficient and p value determined via Pearson correlation.
#Create data frame with gene set enrichment scores for postvax vs prevax time points in older adult cohort. longitudinal_qusage_old_df_path <- file.path(extra_data_dir, "longitudinal_qusage_old_df.rds") if (file.exists(longitudinal_qusage_old_df_path)) { longitudinal_qusage_old_df <- readRDS(longitudinal_qusage_old_df_path) } else { old.noNorm.noResponse.filtered <- old.noNorm.noResponse %>% keep_most_recent_prevax_time_point(drop_postvax = FALSE) %>% remove_any_NA_rows() # Check that there are no repeated measures at any time point stopifnot(Biobase::pData(old.noNorm.noResponse.filtered) %>% dplyr::group_by(participant_id, time_post_last_vax) %>% dplyr::filter(n() > 1) %>% nrow() == 0) # Check that every participant has a baseline time point stopifnot(Biobase::pData(old.noNorm.noResponse.filtered) %>% dplyr::group_by(participant_id) %>% dplyr::filter(!any(time_post_last_vax <= 0)) %>% nrow() == 0) btm_list.filtered <- keep_symbols( genesets.symbol = btm_list, symbols2keep = rownames(old.noNorm.noResponse.filtered)) # Gene set enrichment analysis: postvax vs prevax time points ------------------ eset <- old.noNorm.noResponse.filtered analysis_df <- Biobase::pData(eset) %>% dplyr::filter(time_post_last_vax > 0) %>% dplyr::select(pathogen, vaccine_type, time_post_last_vax, study_accession) %>% dplyr::arrange(pathogen, vaccine_type, time_post_last_vax, study_accession) %>% dplyr::distinct() %>% dplyr::mutate(n_participants = NA, qusage_contrast = NA, qusage_output = list(NULL), error = list(NULL)) %>% dplyr::as_tibble() safe_qusage <- purrr::safely(.f = qusage::qusage, otherwise = NULL, quiet = TRUE) for (row_nr in 1:nrow(analysis_df)) { row <- analysis_df[row_nr,, drop = FALSE] cat(paste0("Row ", row_nr, "/", nrow(analysis_df), ": ", row$pathogen, " ", row$vaccine_type, " ", row$time_post_last_vax, " ", row$study_accession, "\n")) cur_eset <- eset[, eset$pathogen == row$pathogen & eset$vaccine_type == row$vaccine_type & eset$study_accession == row$study_accession & (eset$time_post_last_vax <= 0 | eset$time_post_last_vax == row$time_post_last_vax)] prevax_string <- "baseline" postvax_string <- paste0("Day", row$time_post_last_vax) qusage.contrast <- paste0(postvax_string, "-", prevax_string) qusage.labels <- ifelse( test = cur_eset$time_post_last_vax == row$time_post_last_vax, yes = postvax_string, no = prevax_string) result <- safe_qusage(eset = cur_eset, labels = qusage.labels, contrast = qusage.contrast, geneSets = btm_list.filtered, pairVector = cur_eset$participant_id) analysis_df$n_participants[row_nr] <- length(unique(cur_eset$participant_id)) analysis_df$qusage_contrast[row_nr] <- qusage.contrast if (!is.null(result$result)) analysis_df$qusage_output[[row_nr]] <- result$result if (!is.null(result$error)) analysis_df$error[[row_nr]] <- result$error } saveRDS(analysis_df, longitudinal_qusage_old_df_path) longitudinal_qusage_old_df <- analysis_df } #Create data frame with meta analysis results of gene set enrichment scores from postvax vs prevax time point of young adults. longitudinal_qusage_old_meta_df_path <- file.path(extra_data_dir, "longitudinal_qusage_old_meta_df.rds") if (file.exists(longitudinal_qusage_old_meta_df_path)) { longitudinal_qusage_old_meta_df <- readRDS(longitudinal_qusage_old_meta_df_path) } else { longitudinal_qusage_old_meta_df <- longitudinal_qusage_old_df %>% meta_qusage(group_cols = c("pathogen", "vaccine_type", "time_post_last_vax")) saveRDS(longitudinal_qusage_old_meta_df, longitudinal_qusage_old_meta_df_path) } tidy_longitudinal_qusage_old_meta_df <- longitudinal_qusage_old_meta_df %>% tidy_qusage_output(col = "meta_qusage_output") %>% dplyr::mutate(FDR = p.adjust(p.value, method = "BH"), FDR_signif_label = create_signif_label(FDR)) #Plot plot_df <- dplyr::inner_join( tidy_longitudinal_qusage_young_meta_df %>% dplyr::select(pathogen, vaccine_type, time_post_last_vax, geneset, activity_score), tidy_longitudinal_qusage_old_meta_df %>% dplyr::select(pathogen, vaccine_type, time_post_last_vax, geneset, activity_score), by = c("pathogen", "vaccine_type", "time_post_last_vax", "geneset"), suffix = c(".young", ".old")) %>% dplyr::filter(time_post_last_vax %in% c(1, 3, 7)) lm_df <- plot_df %>% dplyr::group_by(pathogen, vaccine_type, time_post_last_vax) %>% tidyr::nest() %>% dplyr::ungroup() %>% dplyr::mutate( lm = purrr::map( .x = data, ~lm(activity_score.old ~ activity_score.young, data = .x)), r2 = purrr::map( .x = lm, ~tibble( adj.r.squared = summary(.x)$adj.r.squared, r.squared = summary(.x)$r.squared, intercept = coef(.x)[1], slope = coef(.x)[2], p.value = broom::glance(.x)$p.value) ) ) %>% tidyr::unnest(r2) %>% dplyr::mutate(signif_label = cut(p.value, c(-Inf, .001, .01, .05, Inf), labels = c("***", "**", "*", "ns")), r2_label = sapply(adj.r.squared, function(x) substitute( expr = italic(R)^2~"="~r2, env = list(r2 = format(x, digits = 2))))) p <- ggplot(plot_df, aes(x = activity_score.young, y = activity_score.old, color = pathogen)) + geom_point() + geom_vline(xintercept= 0, color = "gray") + geom_hline(yintercept = 0, color = "gray") + geom_abline(intercept = 0, slope = 1, color = "lightgray") + geom_abline(data = lm_df, aes(intercept = intercept, slope = slope), color = "lightgray", linetype = "dashed") + geom_text(data = lm_df, x = -0.2, y = 0.9, aes(label = r2_label), color = "black", parse = TRUE, show.legend = FALSE) + geom_text(data = lm_df, x = -0.2, y = 0.6, aes(label = paste0("p = ", formatC(p.value, digits = 2, format = "e"), " (", signif_label, ")")), color = "black") + coord_equal(xlim = c(-1, 1), ylim = c(-1, 1)) + theme_bw(base_size = 24) + theme(axis.text.x = element_text(angle = 90, vjust = 0.5), legend.position = "bottom", legend.title = element_blank()) + facet_grid(pathogen + vaccine_type ~ paste0("Day ", time_post_last_vax)) + ggtitle("Common response module activity") print(p)
sessionInfo()
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.