knitr::opts_chunk$set(
  echo = TRUE, 
  message = FALSE,
  warning = FALSE, 
  cache = params$cache,
  cache.path = params$report_cache_dir
)

Setup: Load libraries and helper functions

suppressPackageStartupMessages({
  library(package = "Biobase")
  library(package = "tidyverse")
  library(package = "qusage") # Bioc install
  library(package = "boot") # Bioc install
  library(package = "ggthemes")
  library(package = "pheatmap")
  library(package = "matrixStats")
  library(package = "RColorBrewer")
  library(package = "reshape2")
  library(package = "lme4")
  library(package = "metap")
  library(package = "circlize")
  library(package = "corrplot")
  library(package = "ComplexHeatmap")
  library(package = "ggpubr")
})
data_dir <- params$data_dir
extra_data_dir <- params$extra_data_dir
if (!dir.exists(extra_data_dir)) dir.create(extra_data_dir)
date_prefix <- params$date_prefix
# For every gene set, find out in how many vaccines it is significantly up at
# any one postvax time point.
# The same but for down at any postvax time point
calc_sharing_df <- function(x) {
  x %>%
    dplyr::mutate(direction = ifelse(
      test = activity_score > 0,
      yes = "up",
      no = "down"),
      is_significant = !is.na(FDR) & FDR < 0.05) %>%
    dplyr::select(pathogen,
                  vaccine_type,
                  geneset,
                  direction,
                  is_significant) %>%
    dplyr::distinct() %>%
    dplyr::group_by(geneset, direction) %>%
    dplyr::summarise(n = sum(is_significant)) %>%
    dplyr::group_by(geneset) %>%
    dplyr::arrange(-n) %>%
    dplyr::slice(1) %>%
    dplyr::ungroup()
}
# Qusage (version 2.18.0) runs very slow on certain datasets, because it uses
# the convolve() function from stats which itself uses the fft() function. The
# fft() function runs very slow on certain inputs (e.g. 3 minutes instead of
# under 1 second with other fft implementations for the same input).
#
# Found a report of speed issues with the convolve() function at:
# https://stat.ethz.ch/pipermail/r-help/2007-December/148777.html
#
# Found a solution for the slow convolve() (actually the fft() it uses) at:
# https://stat.ethz.ch/pipermail/r-help/2007-December/148779.html
#
# The solution below requires the fftw library, which on ubuntu required the
# installation of the fftw-dev package (sudo apt install fftw3-dev) before
# install.packages("fftw") was successful.
#
# Here we override the convolve function to use a different (faster) fft
# function.
convolve <- function (x, y, conj = TRUE, type = c("circular", "open", "filter"))
{
  type <- match.arg(type)
  n <- length(x)
  ny <- length(y)
  Real <- is.numeric(x) && is.numeric(y)
  if (type == "circular") {
    if (ny != n)
      stop("length mismatch in convolution")
  }
  else {
    n1 <- ny - 1
    x <- c(rep.int(0, n1), x)
    n <- length(y <- c(y, rep.int(0, n - 1)))
  }
  # Original fft lines:
  #x <- fft(fft(x) * (if (conj)
  #  Conj(fft(y))
  #  else fft(y)), inverse = TRUE)

  # Use fftw instead of fft
  x <- fftw::FFT(fftw::FFT(x) * (if (conj)
    Conj(fftw::FFT(y))
    else fftw::FFT(y)), inverse = TRUE)

  if (type == "filter")
    (if (Real)
      Re(x)
     else x)[-c(1L:n1, (n - n1 + 1L):n)]/n
  else (if (Real)
    Re(x)
    else x)/n
}
create_signif_label <- function(x) {
  cut(x, c(-Inf, .001, .01, .05, Inf), labels = c("***", "**", "*", "ns"))
}
#' Keep only specified symbols in a given list of gene sets
#'
#' @param genesets.symbol list of character vectors
#' @param symbols2keep character vector
#' @param min_size minimum number of symbols to keep a gene set
keep_symbols <- function(genesets.symbol, symbols2keep,
                         min_size = 2) {
  stopifnot(is.character(symbols2keep))
  stopifnot(is.list(genesets.symbol))
  common_symbols <- intersect(Reduce(union, genesets.symbol), symbols2keep)
  purrr::map(
    .x = genesets.symbol,
    ~.x[.x %in% common_symbols])%>%
    purrr::discard(~length(.x) < min_size)
}
# NOTE: function below was copied from
# https://github.com/RGLab/ImmuneSignatures2/blob/master/R/geneExpressionPreProcessing.R
#
#' Remove incomplete rows
#'
#' @param eset expressionSet
#' @export
#'
removeAllNArows <- function(eset){
  em <- Biobase::exprs(eset)
  allNARows <- apply(em, 1, function(x){ all(is.na(x)) })
  eset <- eset[ !allNARows, ]
}
remove_any_NA_rows <- function(eset) {
  any_NA_rows <- apply(X = Biobase::exprs(eset),
                       MARGIN = 1,
                       FUN = function(x)any(is.na(x)))
  eset[!any_NA_rows,]
}
#' Keep most recent pre-vaccination time point for each participant
#'
#' @param eset expressionSet
#' @param drop_postvax if TRUE, postvax time points will be discarded
#' @export
#'
keep_most_recent_prevax_time_point <- function(eset, drop_postvax) {
  prevax_uids <- Biobase::pData(eset) %>%
    dplyr::filter(time_post_last_vax <= 0) %>%
    dplyr::arrange(-time_post_last_vax) %>%
    dplyr::group_by(participant_id) %>%
    dplyr::slice(1) %>%
    dplyr::ungroup() %>%
    dplyr::pull(uid)
  postvax_uids <- Biobase::pData(eset) %>%
    dplyr::filter(time_post_last_vax > 0) %>%
    dplyr::pull(uid)
  if (drop_postvax)
    uids <- prevax_uids
  else
    uids <- union(prevax_uids, postvax_uids)
  eset[, eset$uid %in% uids]
}
#' Add 'response_class' column containing maxRBA_p30 values for Influenza and
#' MFC_p30 values for all other vaccines
#'
#' @param eset expressionSet
#' @export
#'
add_response_class_column <- function(eset) {
  # Note: maxRBA measure is used to determine high and low responders for
  # Influenza, because many participants are unlikely to be naive for this virus
  # (resulting in high baseline titers and consequently lower fold changes which
  # maxRBA is designed to take into account)
  Biobase::pData(eset)$response_class <-
    as.character(Biobase::pData(eset)$MFC_p30)
  Biobase::pData(eset)$response_class[
    Biobase::pData(eset)$pathogen == "Influenza"] <-
    as.character(Biobase::pData(eset)$maxRBA_p30[
      Biobase::pData(eset)$pathogen == "Influenza"])
  eset
}
#' Keep participants that are high- or low-responders
#'
#' @param eset expressionSet
#' @param col name of column containing high or low responders
#' @export
#'
keep_only_high_and_low_responders <- function(eset, col) {
  eset[, eset[[col]] %in% c("lowResponder", "highResponder")]
}
#' Perform qusage meta analysis (combinePDFs) on data frame
#'
#' @param x data frame with list column containing QSarray objects
#' @param group_cols character vector with names of columns to perform grouping
#' on for meta analysis
#' @return x with additional column containing output from meta analysis
meta_qusage <- function(x, group_cols) {
  require(qusage)
  if (find("convolve")[1] == "package:stats") {
    stop("stats::convolve is slow, please load convolve.R")
  }

  x %>%
    dplyr::group_by_at(group_cols) %>%
    tidyr::nest() %>%
    dplyr::ungroup() %>%
    dplyr::mutate(
      meta_qusage_output = purrr::map(
        .x = data,
        ~{
          QSarray_list <- .x$qusage_output %>%
            purrr::keep(~class(.x) == "QSarray")
          if (length(QSarray_list) == 0)
            return(NULL)
          if (length(QSarray_list) > 1) {
            cat("Running combinePDFs\n")
            return(qusage::combinePDFs(QSarrayList = QSarray_list))
          } else {
            return(QSarray_list[[1]])
          }
        }
      )) %>%
    dplyr::select(-data)
}
#' Replace QSarray objects with a few informative columns
#'
#' @param x data frame containing list column with QSarray objects
#' @param col string with name of list column
#' @return x with col replaced with "p.value", "geneset", and "activity_score"
#' columns
tidy_qusage_output <- function(x, col) {
  require(qusage)
  x %>%
    dplyr::filter(purrr::map_lgl(.x = !!sym(col),
                                 ~class(.x) == "QSarray")) %>%
    dplyr::mutate(
      tidied =
        purrr::map(
          .x = !!sym(col),
          ~ tibble(
            p.value = pdf.pVal(.x),
            geneset = colnames(.x$path.PDF),
            activity_score = .x$path.mean)
        )) %>%
    dplyr::select(-!!sym(col)) %>%
    tidyr::unnest(tidied)
}
#' Get confidence intervals for every variable in a bootstrap.
#'
#' @param boot_result an object of class "boot" containing the output of a
#' bootstrap calculation
#' @param type character string representing the type of interval required
tidy_boot_ci <- function(boot_result, type) {
  stopifnot(class(boot_result) == "boot")
  stopifnot(is.character(type))
  purrr::map_df(
    .x = 1:ncol(boot_result$t),
    ~{
      ci <- boot.ci(boot.out = boot_result, type = type, index = .x)
      # When using "perc" as type, the name in the output list ci is "percent"
      # Using grep to handle this inconsistency
      i <- grep(pattern = type, names(ci))[1]
      tibble(name = names(ci$t0),
             statistic = ci$t0,
             ci_low = ci[[i]][4],
             ci_high = ci[[i]][5])
    }
  ) %>%
    dplyr::mutate(bootstrapped_mean_of_statistic = colMeans(boot_result$t))
}
# Description: functions to extend pheatmap functionality
pheatmap_add_labels <- function(p, annot_labels,
                                gp = gpar(col = "#000000", fontsize = 10),
                                angle_col = 0,
                                angle_row = 270) {
  require(pheatmap)
  require(grid)
  require(gtable)
  # Get the column and row annotation rectangles (called "track" in pheatmap)
  tracks <- list()
  annot_names <- vector()
  for (annot_dir in c("col", "row")){
    track_grob_name <- paste0(annot_dir, "_annotation")
    track_index <- which(p$gtable$layout$name == track_grob_name)

    if (length(track_index) == 0){
      next
    }
    track_grob <- p$gtable$grobs[[track_index]]
    tracks[[annot_dir]] <- list("index" = track_index,
                                "grob" = track_grob)
    annot_names <- c(annot_names, colnames(track_grob$gp$fill))
  }

  # Get legend grob
  legend_index <- which(p$gtable$layout$name == "annotation_legend")
  legend_grob <- p$gtable$grobs[[legend_index]]

  # Get mapping between annotation labels and annotation colors
  elem2color <- list()
  color2label <- list()
  for (annot_name in annot_names) {
    # Retrieve from the legend the filled rectangles grob belonging to current annotation
    cur_grob <- getGrob(gTree = legend_grob,
                        gPath = childNames(legend_grob)[[paste(annot_name, "r")]])
    elem2color[[annot_name]] <- cur_grob$gp$fill
    cur_colors <- as.character(elem2color[[annot_name]])
    cur_elems <- names(elem2color[[annot_name]])
    if (annot_name %in% names(annot_labels)) {
      cur_labels <- as.character(annot_labels[[annot_name]][cur_elems])
    } else {
      cur_labels <- rep("", length.out = length(cur_elems))
    }
    color2label[[annot_name]] <- cur_labels
    names(color2label[[annot_name]]) <- cur_colors
  }

  # Add labels to annotation legend
  for (annot_name in annot_names) {
    target_child_name <- paste(annot_name, "r")
    cur_grob <- getGrob(gTree = legend_grob,
                        gPath = childNames(legend_grob)[[target_child_name]])
    legend_labels <- color2label[[annot_name]][as.character(cur_grob$gp$fill)]

    res <- gList()
    res[["rect"]] <- cur_grob
    res[["text"]] <- textGrob(x = cur_grob$x + 0.5 * cur_grob$width,
                              y = cur_grob$y - 0.5 * cur_grob$height,
                              label = legend_labels,
                              gp = gp)
    legend_grob$children[[target_child_name]] <- gTree(children = res)
  }
  p$gtable$grobs[[legend_index]] <- gTree(children = legend_grob$children)

  # Add labels to annotation tracks
  for (track_dir in names(tracks)){
    track_grob <- tracks[[track_dir]]$grob
    track_index <- tracks[[track_dir]]$index

    track_labels <- purrr::map(
      .x = colnames(track_grob$gp$fill),
      ~color2label[[.x]][track_grob$gp$fill[,.x]]
    ) %>%
      do.call(what = c, args = .)

    # Add labels to annotation track
    res <- gList()
    res[["rect"]] = track_grob
    res[["text"]] = textGrob(x = track_grob$x,
                             y = track_grob$y,
                             label = track_labels,
                             rot = ifelse(track_dir == "col", yes = angle_col, no = angle_row),
                             gp = gp)
    p$gtable$grobs[[track_index]] <- gTree(children = res)
  }
  return(p)
}
# Function based on (and slightly modified, removing the option to make
# vertically positioned dendrograms)
# https://github.com/cran/pheatmap/blob/master/R/pheatmap.r
# on 2019_02_14
draw_dendrogram <- function(hc) {
  h = hc$height / max(hc$height) / 1.05
  m = hc$merge
  o = hc$order
  n = length(o)

  m[m > 0] = n + m[m > 0]
  m[m < 0] = abs(m[m < 0])

  dist = matrix(0, nrow = 2 * n - 1, ncol = 2, dimnames = list(NULL, c("x", "y")))
  dist[1:n, 1] = 1 / n / 2 + (1 / n) * (match(1:n, o) - 1)

  for(i in 1:nrow(m)){
    dist[n + i, 1] = (dist[m[i, 1], 1] + dist[m[i, 2], 1]) / 2
    dist[n + i, 2] = h[i]
  }

  draw_connection = function(x1, x2, y1, y2, y){
    res = list(
      x = c(x1, x1, x2, x2),
      y = c(y1, y, y, y2)
    )

    return(res)
  }

  x = rep(NA, nrow(m) * 4)
  y = rep(NA, nrow(m) * 4)
  id = rep(1:nrow(m), rep(4, nrow(m)))

  for(i in 1:nrow(m)){
    c = draw_connection(dist[m[i, 1], 1], dist[m[i, 2], 1], dist[m[i, 1], 2], dist[m[i, 2], 2], h[i])
    k = (i - 1) * 4 + 1
    x[k : (k + 3)] = c$x
    y[k : (k + 3)] = c$y
  }

  x = unit(x, "npc")
  y = unit(y, "npc")

  return(polylineGrob(x = x, y = y, id = id))
}
# pheatmap variant enabling clustering of groups of columns.
col_grouped_pheatmap <- function(mat,
                                 col_groups,
                                 col_clust_fun = "hclust",
                                 col_dist_fun = "dist",
                                 method = "euclidean",
                                 treeheight_col = 50,
                                 display_numbers = FALSE,
                                 silent = TRUE,
                                 ...) {
  require(pheatmap)
  require(grid)
  require(gtable)
  stopifnot(is.matrix(mat))
  stopifnot(any(class(display_numbers) %in% c("logical", "matrix")))

  col_clust_fun <- match.fun(col_clust_fun)
  col_dist_fun <- match.fun(col_dist_fun)

  # Cluster the columns and draw the dendrograms
  dendro_grobs <- list()
  for (cur_group in unique(col_groups)) {
    col_indexes.subset <- which(col_groups == cur_group)
    #cl <- hclust(d = dist(x = t(mat[, col_indexes.subset]),
    #                      method = method))
    cl <- col_clust_fun(d = col_dist_fun(x = t(mat[, col_indexes.subset]),
                                         method = method))

    col_indexes <- 1:ncol(mat)
    col_indexes[col_indexes.subset] <- NA
    col_indexes[is.na(col_indexes)] <- col_indexes.subset[cl$order]
    mat <- mat[, col_indexes]

    dendro_grobs[[length(dendro_grobs) + 1]] <- draw_dendrogram(cl)
  }

  # Get column numbers after which a gap occurs
  gaps_col <- which(
    purrr::map_lgl(2:length(col_groups),
                   ~{col_groups[.x] != col_groups[.x-1]}))

  gap_width <- unit("4", "bigpts")
  matrix_cell_width <- (1 / ncol(mat)) * (unit(1, "npc") - length(gaps_col) * gap_width)
  dendro_widths <- base::diff(c(0, gaps_col, ncol(mat))) * matrix_cell_width

  gap_grob <- rectGrob(gp = gpar(fill = NA, col = NA), width = gap_width)

  my_grobs <- list(dendro_grobs[[1]])
  my_widths <- dendro_widths[1]
  for (i in 1:length(gaps_col)) {
    my_grobs[[2*i]] <- gap_grob
    my_grobs[[2*i + 1]] <- dendro_grobs[[i+1]]
    my_widths <- unit.c(my_widths, gap_width, dendro_widths[i+1])
  }

  combined_dendros_grob <- grobTree(gtable_matrix(
    name = "my_dendrograms",
    grobs = matrix(my_grobs, ncol = length(my_grobs)),
    widths = my_widths,
    heights = unit(1, "null")))

  if (is.matrix(display_numbers)) {
    display_numbers <- display_numbers[rownames(mat), colnames(mat)]
  }

  p <- pheatmap::pheatmap(mat = mat,
                display_numbers = display_numbers,
                gaps_col = gaps_col,
                treeheight_col = treeheight_col,
                cluster_cols = FALSE,
                filename = NA,
                silent = silent,
                ...)

  p$gtable <- gtable_add_grob(
    x = p$gtable,
    grobs = combined_dendros_grob,
    t = 2,
    l = 3,
    name = "col_tree")
  p$gtable$heights[2] <- p$gtable$heights[2] + unit(treeheight_col, "bigpts")
  p$gtable$heights[4] <- p$gtable$heights[4] - unit(treeheight_col, "bigpts")
  return(p)
}
#Principal Variant Component Analysis (PVCA) function
pvca <- function (abatch,expInfo) {
    theDataMatrix=abatch
    dataRowN <- nrow(theDataMatrix)
    dataColN <- ncol(theDataMatrix)

    ########## Center the data (center rows) ##########
    theDataMatrixCentered <- matrix(data = 0, nrow = dataRowN, ncol = dataColN)
    theDataMatrixCentered_transposed = apply(theDataMatrix, 1, scale, center = TRUE, scale = FALSE)
    theDataMatrixCentered = t(theDataMatrixCentered_transposed)

    ########## Compute correlation matrix &  Obtain eigenvalues ##########

    theDataCor <- cor(theDataMatrixCentered)
    eigenData <- eigen(theDataCor)
    eigenValues = eigenData$values
    ev_n <- length(eigenValues)
    eigenVectorsMatrix = eigenData$vectors
    eigenValuesSum = sum(eigenValues)
    percents_PCs = eigenValues /eigenValuesSum

    ##===========================================
    ##  Getting the experimental information
    ##===========================================

    exp_design <- as.data.frame(expInfo)
    expDesignRowN <- nrow(exp_design)
    expDesignColN <- ncol(exp_design)

    ########## Merge experimental file and eigenvectors for n components ##########

    ## pc_n is the number of principal components to model

    ## Use fixed pc_n
    pc_n=5

    pc_data_matrix <- matrix(data = 0, nrow = (expDesignRowN*pc_n), ncol = 1)
    mycounter = 0
    for (i in 1:pc_n){
        for (j in 1:expDesignRowN){
            mycounter <- mycounter + 1
            pc_data_matrix[mycounter,1] = eigenVectorsMatrix[j,i]
        }
    }

    AAA <- exp_design[rep(1:expDesignRowN,pc_n),]
    Data <- cbind(AAA,pc_data_matrix)


    ####### Edit these variables according to your factors #######

    variables <-c (colnames(exp_design))
    for (i in 1:length(variables))
    {
        Data$variables[i] <- as.factor(Data$variables[i] )
    }


    ########## Mixed linear model ##########
    op <- options(warn = (-1))
    effects_n = expDesignColN+1 #effects size without interaction terms
    randomEffectsMatrix <- matrix(data = 0, nrow = pc_n, ncol = effects_n)

    ##============================#
    ##  Get model functions
    ##============================#
    model.func <- c()
    index <- 1

    ##  level-1
    for (i in 1:length(variables))
    {
        mod = paste("(1|", variables[i], ")",   sep="")
        model.func[index] = mod
        index = index + 1
    }

    function.mods <- paste (model.func , collapse = " + ")

    ##============================#
    ##  Get random effects  #
    ##============================#

    for (i in 1:pc_n){
        y = (((i-1)*expDesignRowN)+1)
        funct <- paste ("pc_data_matrix", function.mods, sep =" ~ ")
                Rm1ML <- lmer( funct ,
                              Data[y:(((i-1)*expDesignRowN)+expDesignRowN),],
                              REML = TRUE, verbose = FALSE, na.action = na.omit)
                randomEffects <- Rm1ML
                randomEffectsMatrix[i,] <- c(unlist(VarCorr(Rm1ML)),resid=sigma(Rm1ML)^2)
    }
    effectsNames <- c(names(getME(Rm1ML,"cnms")),"resid")

    ########## Standardize Variance ##########
    randomEffectsMatrixStdze <- matrix(data = 0, nrow = pc_n, ncol = effects_n)
    for (i in 1:pc_n){
        mySum = sum(randomEffectsMatrix[i,])
        for (j in 1:effects_n){
            randomEffectsMatrixStdze[i,j] = randomEffectsMatrix[i,j]/mySum
        }
    }

    ########## Compute Weighted Proportions ##########

    randomEffectsMatrixWtProp <- matrix(data = 0, nrow = pc_n, ncol = effects_n)
    for (i in 1:pc_n){
        weight = eigenValues[i]/eigenValuesSum
        for (j in 1:effects_n){
            randomEffectsMatrixWtProp[i,j] = randomEffectsMatrixStdze[i,j]*weight
        }
    }

    ########## Compute Weighted Ave Proportions ##########

    randomEffectsSums <- matrix(data = 0, nrow = 1, ncol = effects_n)
    randomEffectsSums <-colSums(randomEffectsMatrixWtProp)
    totalSum = sum(randomEffectsSums)
    randomEffectsMatrixWtAveProp <- matrix(data = 0, nrow = 1, ncol = effects_n)

    for (j in 1:effects_n){
        randomEffectsMatrixWtAveProp[j] = randomEffectsSums[j]/totalSum
    }
    return(list(dat=randomEffectsMatrixWtAveProp, label=effectsNames))
}

Setup: load dataset, create fold change data

FDR_THRESHOLD <- 0.05 # Determines if a gene set is significantly up/down
NUM_PERMUTATIONS <- 10

# Colors and abbreviations
load(file.path(data_dir, "adj_path_vt_colors_abbv_v3.RData"))

young.noNorm.noResponse <- readRDS(file.path(data_dir, paste0(date_prefix, "young_noNorm_eset.rds")))
old.noNorm.noResponse <- readRDS(file.path(data_dir, paste0(date_prefix, "old_noNorm_eset.rds")))
young.noNorm.withResponse <- readRDS(file.path(data_dir, paste0(date_prefix, "young_noNorm_withResponse_eset.rds")))
old.noNorm.withResponse <- readRDS(file.path(data_dir, paste0(date_prefix, "old_noNorm_withResponse_eset.rds")))

#Merge pre/post vax samples into same matrix for SDY1529
young.noNorm.noResponse$matrix[young.noNorm.noResponse$study_accession=='SDY1529']='SDY1529_WholeBlood_HealthyAdults'
young.noNorm.withResponse$matrix[young.noNorm.withResponse$study_accession=='SDY1529']='SDY1529_WholeBlood_HealthyAdults'
#Add alum as an adjuvant to Hepatitis B
young.noNorm.noResponse$adjuvant[young.noNorm.noResponse$pathogen=='Hepatitis A/B']='Alum'
old.noNorm.noResponse$adjuvant[old.noNorm.noResponse$pathogen=='Hepatitis A/B']='Alum'
young.noNorm.withResponse$adjuvant[young.noNorm.withResponse$pathogen=='Hepatitis A/B']='Alum'
old.noNorm.withResponse$adjuvant[old.noNorm.withResponse$pathogen=='Hepatitis A/B']='Alum'
#Abbreviate vaccine types
young.noNorm.noResponse$vaccine_type_full=young.noNorm.noResponse$vaccine_type
old.noNorm.noResponse$vaccine_type_full=old.noNorm.noResponse$vaccine_type
young.noNorm.withResponse$vaccine_type_full=young.noNorm.withResponse$vaccine_type
old.noNorm.withResponse$vaccine_type_full=old.noNorm.withResponse$vaccine_type
vaccine_abrev=c('Live virus'='LV','Recombinant protein'='RP','Recombinant Viral Vector'='RVV',
                'Polysaccharide'='PS','Conjugate'='CJ','Inactivated'='IN',
                'Inactivated/Recombinant protein'='IN/RP')
young.noNorm.noResponse$vaccine_type=str_replace_all(young.noNorm.noResponse$vaccine_type,vaccine_abrev)
old.noNorm.noResponse$vaccine_type=str_replace_all(old.noNorm.noResponse$vaccine_type,vaccine_abrev)
young.noNorm.withResponse$vaccine_type=str_replace_all(young.noNorm.withResponse$vaccine_type,vaccine_abrev)
old.noNorm.withResponse$vaccine_type=str_replace_all(old.noNorm.withResponse$vaccine_type,vaccine_abrev)
names(adj_path_vt_abbv$vaccine_type)=str_replace_all(names(adj_path_vt_abbv$vaccine_type), vaccine_abrev)
names(adj_path_vt_colors$vaccine_type)=str_replace_all(names(adj_path_vt_abbv$vaccine_type), vaccine_abrev)

# Combine young and old with response
symbols <- intersect(rownames(young.noNorm.withResponse),
                     rownames(old.noNorm.withResponse))
pd <- rbind(Biobase::pData(young.noNorm.withResponse) %>%
              dplyr::mutate(age_group = "young"),
            Biobase::pData(old.noNorm.withResponse) %>%
              dplyr::mutate(age_group = "old"))
ge <- cbind(Biobase::exprs(young.noNorm.withResponse)[symbols, ],
            Biobase::exprs(old.noNorm.withResponse)[symbols, ])[, pd$uid]
rownames(pd) <- pd$uid
young_and_old.noNorm.withResponse <- new("ExpressionSet",
            exprs = ge,
            phenoData = new('AnnotatedDataFrame', pd))
# Combine young and old without response
symbols <- intersect(rownames(young.noNorm.noResponse),
                     rownames(old.noNorm.noResponse))
pd <- rbind(Biobase::pData(young.noNorm.noResponse) %>%
              dplyr::mutate(age_group = "young"),
            Biobase::pData(old.noNorm.noResponse) %>%
              dplyr::mutate(age_group = "old"))
ge <- cbind(Biobase::exprs(young.noNorm.noResponse)[symbols, ],
            Biobase::exprs(old.noNorm.noResponse)[symbols, ])[, pd$uid]
rownames(pd) <- pd$uid
young_and_old.noNorm <- new("ExpressionSet", 
                                         exprs = ge,
                                         phenoData = new('AnnotatedDataFrame', pd))

btm_list <- readRDS(file = file.path(data_dir, "btm_list_2020_12_23.rds"))

##Create fold change dataset for downstream use
eset=readRDS(file.path(data_dir, paste0(date_prefix, "young_norm_eset.rds")))
colnames(eset)=eset$uid

#Merge pre/post vax samples into same matrix for SDY1529
eset$matrix[eset$study_accession=='SDY1529']='SDY1529_WholeBlood_HealthyAdults'
#Add alum as an adjuvant to Hepatitis B
eset$adjuvant[eset$pathogen=='Hepatitis A/B']='Alum'
#Abbreviate vaccine types
eset$vaccine_type_full=eset$vaccine_type
eset$vaccine_type=str_replace_all(eset$vaccine_type,vaccine_abrev)

#Create combined vaccine type/pathogen column
eset$pt=paste(eset$pathogen," (",eset$vaccine_type,")", sep='')
#Create combined SDY/pathogen/vaccine type column
eset$SDY_pt=paste(eset$study,eset$pt)
#Find samples at timepoints >=0
tp_int=sort(unique(eset$time_post_last_vax[which(eset$time_post_last_vax>=0)]))
ind=lapply(tp_int, function(x) which(eset$time_post_last_vax==x))
#Combine indices of all timepoints of interest
ind_all=Reduce(union,ind)
#Retain only samples from timepoints of interest
eset=eset[,ind_all]
#Recompute timepoint indices after removing extraneous timepoints
ind=lapply(tp_int, function(x) which(eset$time_post_last_vax==x))
#Remove samples from a single study with fewer than sample_cutoff samples at any timepoint
sample_cutoff=3
matrix_uni_tp=lapply(ind,function(x) unique(eset[,x]$matrix))
matrix_ind=lapply(1:length(ind),function(x)
  lapply(1:length(matrix_uni_tp[[x]]), function(y) which(matrix_uni_tp[[x]][[y]]==eset[,ind[[x]]]$matrix)))
ind_cut_all=vector()
for (i in 1:length(matrix_ind)) {
  ind_cut=which(sapply(matrix_ind[[i]],length)<sample_cutoff)
  if (is_empty(ind_cut)==FALSE) {
    for (j in 1:length(ind_cut)){
      ind_cut_all=c(ind_cut_all,ind[[i]][matrix_ind[[i]][[ind_cut[j]]]])
    }
  }
}
if (is_empty(ind_cut_all)==FALSE) {
  eset=eset[,-ind_cut_all]
}
#Create unique list of studies
matrix_uni=unique(eset$matrix)
#Remove matrices with only D0 expression
matrix_tp=lapply(matrix_uni, function(x) unique(eset$time_post_last_vax[eset$matrix==x]))
matrix_d0_only=matrix_uni[sapply(matrix_tp, function(x) identical(x,0))]
eset=eset[,!(eset$matrix %in% matrix_d0_only)]
#Remove genes with NA
eset=eset[complete.cases(exprs(eset)),]
#Recompute timepoint indices after removing samples
tp_int=sort(unique(eset$time_post_last_vax[which(eset$time_post_last_vax>=0)]))
ind=lapply(tp_int, function(x) which(eset$time_post_last_vax==x))
#Create BTM-level eset
#Remove BTMs which have no matching genes in dataset
btm_list=btm_list[!sapply(btm_list, function(x) is_empty(intersect(x,rownames(eset))))]
#Collapse - find arithmetic mean of genes in each BTM for each sample
eset_BTM=do.call(rbind, lapply(btm_list, function(x) colMeans(exprs(eset[na.omit(match(x,rownames(eset))),]),na.rm=TRUE)))
#Create BTM-level eset
eset_BTM=ExpressionSet(as.matrix(eset_BTM), eset@phenoData)

#Compute D0 normalized FC
ind_D0=which(0==eset$time_post_last_vax)
common=lapply(2:length(ind),function(x) intersect(eset$participant_id[ind[[x]]],eset$participant_id[ind_D0]))
ia=lapply(2:length(ind),function(x) na.omit(match(common[[x-1]],eset$participant_id[ind[[x]]])))
ib=lapply(2:length(ind),function(x) na.omit(match(common[[x-1]],eset$participant_id[ind_D0])))
exp_FC=lapply(2:length(ind),function(x) eset[,ind[[x]][ia[[x-1]]]])
exp_FC=lapply(2:length(ind),function(x) {exprs(exp_FC[[x-1]])=exprs(exp_FC[[x-1]])-exprs(eset[,ind_D0[ib[[x-1]]]]); exp_FC[[x-1]]})
exp_FC_BTM=lapply(2:length(ind),function(x) eset_BTM[,ind[[x]][ia[[x-1]]]])
exp_FC_BTM=lapply(2:length(ind),function(x) {exprs(exp_FC_BTM[[x-1]])=exprs(exp_FC_BTM[[x-1]])-exprs(eset_BTM[,ind_D0[ib[[x-1]]]]); exp_FC_BTM[[x-1]]})
tp_FC=tp_int[-1] #Store timepoints for FC data (remove Day 0)

Figure 1. An integrated database of transcriptional responses to vaccination

Figure 1b: Histogram of the number of samples included per vaccine at each timepoint. Day 0 represents Day of vaccination.

#Generate unique list of studies/vaccine type/pathogen/timepoint
matrices=unique(eset$study)
studies=unique(eset$study)
vaccine_type=unique(eset$vaccine_type)
pathogen=unique(eset$pathogen)
timepoint=round(unique(eset$time_post_last_vax)*100)/100
pathogen_type=unique(eset$pt)

#For each timepoint, identify pathogen, then count studies/samples per pathogen/TP
TP_path_sample_count=matrix(NA,length(timepoint), length(pathogen_type))
for (i in 1:length(timepoint)) {
  #Find pathogen/type
  TP_ind=which(timepoint[i]==eset$time_post_last_vax)
  pathogen_TP=unique(eset$pt[TP_ind])
  #For each pathogen at given TP
  for (j in 1:length(pathogen_TP)) {
    #Find pathogen in overall pathogen list
    pathogen_type_ind=which(pathogen_TP[j]==pathogen_type)
    #Count studies with given TP/pathogen combo
    pathogen_TP_ind=which(pathogen_TP[j]==eset$pt[TP_ind])
    #Count samples with given TP/pathogen combo
    if (length(pathogen_TP_ind)>1) {
      TP_path_sample_count[i,pathogen_type_ind]=length(pathogen_TP_ind)
    }
  }
}
#Convert counts to data frame, add timepoint column and pathogen names
TP_path_sample_count=data.frame(TP_path_sample_count)
colnames(TP_path_sample_count)=pathogen_type
TP_path_sample_count$Day=timepoint

#Order alphabetically
TP_path_sample_count=TP_path_sample_count[,order(names(TP_path_sample_count))]
#Convert days to factor
TP_path_sample_count$Day=factor(TP_path_sample_count$Day, levels=sort(TP_path_sample_count$Day))

#Create plot
df=pivot_longer(TP_path_sample_count, !Day, names_to='Vaccine', values_to = 'Samples')
p=ggplot(df, aes(x=Day, y=Samples, fill=Vaccine))+
  geom_bar(stat="identity")+
  scale_fill_manual(values=colorRampPalette(brewer.pal(name="Paired", n = 8))(14))+
  xlab("Day post-vaccination")+
  ylab("# of Samples")+
  labs(fill='Vaccine')+
  theme_minimal()+
  theme(text=element_text(size=16),panel.grid.minor.x=element_blank(), legend.position="bottom")
print(p)

Figure 1c: Age distribution of participants in the Immune Signatures Data Resource by vaccine. Shape of points denotes the subject’s inferred sex based on Y chromosome-specific gene expression. For participants with missing age data, ages were estimated using RAPToR. Center line, median; box limits, upper and lower quartiles; whiskers, 1.5x interquartile range.

df=pData(eset[,match(unique(eset$participant_id), eset$participant_id)])
p=ggplot(df, aes(x=pt, y=age_imputed, fill=pt))+
  geom_boxplot(outlier.shape=NA,width=0.5,notch=FALSE, alpha=0.6)+
  geom_jitter(aes(shape=y_chrom_present), alpha=0.6, width=0.1)+
  scale_fill_manual(values=colorRampPalette(brewer.pal(name="Paired", n = 8))(14))+
  xlab("Vaccine")+
  ylab("Age")+
  labs(fill='Vaccine', shape='Sex (Imputed)')+
  theme_minimal()+
  theme(text=element_text(size=16),panel.grid.minor.x=element_blank(), legend.position="bottom",
        axis.text.x = element_text(angle = 45, hjust=1))
print(p)

Figure 1d: Bar plot representing the proportion of variance in post-vaccination transcriptional responses that can be attributed to clinical (age, sex, ethnicity) and experimental variables (time after vaccination, vaccine) via Principal Component Variance Analysis. The residual represents the proportion of the variance that could not be explained by any of the included variables.

#Merge FC data into one eset
exp_FC_all=do.call(Biobase::combine,exp_FC)
exp_FC_all_pData=pData(exp_FC_all)

#Collect variables for PVCA
TP_variables=data.frame(Timepoint=exp_FC_all_pData$study_time_collected,
                        Vaccine=exp_FC_all_pData$pt,
                        Gender=exp_FC_all_pData$gender,
                        Age=exp_FC_all_pData$age_reported,
                        Ethnicity=exp_FC_all_pData$ethnicity)
rownames(TP_variables)=rownames(exp_FC_all_pData)

#Call PVCA function
pvcaObj=pvca(exprs(exp_FC_all),TP_variables)

#Create plot
par(mar=c(9,6,4,1))
p=barplot(pvcaObj$dat, ylab = "Proportion variance explained",
             ylim= c(0,1.1),col = c("blue"), las=2, cex.axis=1, cex.names=1)
axis(1, at = p, labels = pvcaObj$label, cex.axis = 1, las=2)
print(p)

Figure 2. Common and unique transcriptional responses across different vaccines.

Prepare data 1. Create data frame with gene set enrichment scores for postvax vs prevax time points in young adult cohort.

longitudinal_qusage_young_df_path <- file.path(extra_data_dir, "longitudinal_qusage_young_df.rds")
if (file.exists(longitudinal_qusage_young_df_path)) {
  longitudinal_qusage_young_df <- readRDS(longitudinal_qusage_young_df_path)
} else {
  young.noNorm.noResponse.filtered <- young.noNorm.noResponse %>%
    keep_most_recent_prevax_time_point(drop_postvax = FALSE) %>%
    remove_any_NA_rows()

  # Check that there are no repeated measures at any time point
  stopifnot(Biobase::pData(young.noNorm.noResponse.filtered) %>%
              dplyr::group_by(participant_id, time_post_last_vax) %>%
              dplyr::filter(n() > 1) %>%
              nrow() == 0)

  # Check that every participant has a baseline time point
  stopifnot(Biobase::pData(young.noNorm.noResponse.filtered) %>%
              dplyr::group_by(participant_id) %>%
              dplyr::filter(!any(time_post_last_vax <= 0)) %>%
              nrow() == 0)

  btm_list.filtered <- keep_symbols(
    genesets.symbol = btm_list,
    symbols2keep = rownames(young.noNorm.noResponse.filtered))

  # Gene set enrichment analysis: postvax vs prevax time points ------------------

  analysis_df <- Biobase::pData(young.noNorm.noResponse.filtered) %>%
    dplyr::filter(time_post_last_vax > 0) %>%
    dplyr::select(pathogen,
                  vaccine_type,
                  time_post_last_vax,
                  study_accession) %>%
    dplyr::arrange(pathogen, vaccine_type, time_post_last_vax,
                   study_accession) %>%
    dplyr::distinct() %>%
    dplyr::mutate(n_participants = NA,
                  qusage_contrast = NA,
                  qusage_output = list(NULL),
                  error = list(NULL)) %>%
    dplyr::as_tibble()

  safe_qusage <- purrr::safely(.f = qusage::qusage,
                               otherwise = NULL, quiet = TRUE)

  for (row_nr in 1:nrow(analysis_df)) {

    row <- analysis_df[row_nr,, drop = FALSE]
    cat(paste0("Row ", row_nr, "/", nrow(analysis_df),
               ": ", row$pathogen, " ",
               row$vaccine_type, " ", row$time_post_last_vax, " ",
               row$study_accession, "\n"))
    cur_eset <- young.noNorm.noResponse.filtered[, young.noNorm.noResponse.filtered$pathogen == row$pathogen &
                       young.noNorm.noResponse.filtered$vaccine_type == row$vaccine_type &
                       young.noNorm.noResponse.filtered$study_accession == row$study_accession &
                       (young.noNorm.noResponse.filtered$time_post_last_vax <= 0 |
                          young.noNorm.noResponse.filtered$time_post_last_vax == row$time_post_last_vax)]

    prevax_string <- "baseline"
    postvax_string <- paste0("Day", row$time_post_last_vax)

    qusage.contrast <- paste0(postvax_string, "-", prevax_string)

    qusage.labels <- ifelse(
      test = cur_eset$time_post_last_vax == row$time_post_last_vax,
      yes = postvax_string,
      no = prevax_string)
    result <- safe_qusage(eset = cur_eset,
                          labels = qusage.labels,
                          contrast = qusage.contrast,
                          geneSets = btm_list.filtered,
                          pairVector = cur_eset$participant_id)
    analysis_df$n_participants[row_nr] <-
      length(unique(cur_eset$participant_id))
    analysis_df$qusage_contrast[row_nr] <- qusage.contrast
    if (!is.null(result$result))
      analysis_df$qusage_output[[row_nr]] <- result$result

    if (!is.null(result$error))
      analysis_df$error[[row_nr]] <- result$error
  }
 saveRDS(analysis_df, longitudinal_qusage_young_df_path)

    longitudinal_qusage_young_df <- analysis_df
}
  1. Create data frame with meta analysis results of gene set enrichment scores from postvax vs prevax time point of young adults.
longitudinal_qusage_young_meta_df_path <- file.path(extra_data_dir, "longitudinal_qusage_young_meta_df.rds")
if (file.exists(longitudinal_qusage_young_meta_df_path)) {
  longitudinal_qusage_young_meta_df <- readRDS(longitudinal_qusage_young_meta_df_path)
} else {
  longitudinal_qusage_young_meta_df <- meta_qusage(longitudinal_qusage_young_df, group_cols = c("pathogen",
                                         "vaccine_type",
                                         "time_post_last_vax"))
  saveRDS(longitudinal_qusage_young_meta_df, longitudinal_qusage_young_meta_df_path)
}
tidy_longitudinal_qusage_young_meta_df <- longitudinal_qusage_young_meta_df %>%
  tidy_qusage_output(col = "meta_qusage_output") %>%
  dplyr::mutate(FDR = p.adjust(p.value, method = "BH"),
                FDR_signif_label = create_signif_label(FDR))
  1. Create null distribution of gene set sharing between vaccines
# Calculate null distribution of gene set sharing between vaccines -------------
geneset_sharing_young_null_boot_path <- file.path(extra_data_dir, "geneset_sharing_young_null_boot.rds")
if (file.exists(geneset_sharing_young_null_boot_path)) {
  geneset_sharing_young_null_boot <- readRDS(geneset_sharing_young_null_boot_path)
} else {
  nbins <- tidy_longitudinal_qusage_young_meta_df %>%
    dplyr::select(pathogen, vaccine_type) %>%
    dplyr::distinct() %>%
    nrow() + 1

  strata <- tidy_longitudinal_qusage_young_meta_df %>%
    dplyr::group_by(pathogen, vaccine_type, time_post_last_vax) %>%
    dplyr::group_indices()

  ncpus <- parallel::detectCores()

  geneset_sharing_young_null_boot <- boot(
    data = tidy_longitudinal_qusage_young_meta_df,
    statistic = function(data, indices){
      data %>%
        dplyr::mutate(geneset = geneset[indices]) %>%
        calc_sharing_df() %>%
        dplyr::pull(n) %>%
        `+`(1) %>%
        tabulate(nbins = nbins)
    },
    R = NUM_PERMUTATIONS,
    sim = "permutation",
    stype = "i",
    strata = strata,
    parallel = "multicore",
    ncpus = ncpus)

  saveRDS(object = geneset_sharing_young_null_boot,
          file = geneset_sharing_young_null_boot_path)
}
  1. Select which gene sets are common across vaccines in the young adult responses.
sharing_young_df <- tidy_longitudinal_qusage_young_meta_df %>%
  calc_sharing_df()
genesets_shared_young_common <- sharing_young_df %>%
  dplyr::filter(n >= quantile(x = n, probs = 0.95)) %>%
  dplyr::pull(geneset)

Ext. Data. Figure 1a. Histogram of overlap in DEGs. A gene is shared with another vaccine if it is significantly (FDR < 0.05) regulated in the same direction, irrespective of timepoint. Blue bars, number of genesets shared (y-axis) between the same number of vaccines (x-axis). Grey bars represent the null distribution generated by n = 10,000 permutations of gene labels within vaccine + timepoint groups. Data are presented as mean values + /− 95% confidence interval.

exp_noNA <- na.omit(exprs(eset))
pheno <- pData(eset)

###################################################
# Find DEGS for every vaccine at every time point #
###################################################

sample_cutoff <- 3

# Remove study time points that contain less than sample_cutoff samples
cur_phenoData <- pheno %>%
  dplyr::group_by(study_accession, matrix, time_post_last_vax) %>%
  dplyr::filter(n() >= sample_cutoff) %>%
  dplyr::ungroup()
cur_expressionData <- exp_noNA[, cur_phenoData$uid]

# Keep only subjects that have both pre- and postvax data
cur_phenoData <- cur_phenoData %>%
  dplyr::group_by(participant_id) %>%
  dplyr::filter(any(time_post_last_vax <= 0) & any(time_post_last_vax > 0)) %>%
  dplyr::ungroup()

# For subjects with more than one prevax time point, take the one closest to day 0
cur_phenoData_prevax <- cur_phenoData %>%
  dplyr::filter(time_post_last_vax <= 0) %>%
  dplyr::group_by(participant_id) %>%
  dplyr::arrange(-time_post_last_vax) %>%
  dplyr::slice(1) %>%
  dplyr::ungroup()

cur_phenoData_postvax <- cur_phenoData %>%
  dplyr::filter(time_post_last_vax > 0)

cur_phenoData <- rbind(cur_phenoData_prevax, cur_phenoData_postvax)

# Check that we do not have repeated measurements
stopifnot(cur_phenoData %>%
            dplyr::group_by(participant_id, time_post_last_vax) %>%
            dplyr::filter(n() > 1) %>%
            nrow() == 0)

# Check that uid is unique
stopifnot(!any(duplicated(cur_phenoData$uid)))

# Calculate fold change with baseline for every gene at every postvax time point
exp_fc <- cur_phenoData %>%
  dplyr::group_by(participant_id) %>%
  tidyr::nest() %>%
  dplyr::ungroup() %>%
  dplyr::mutate(
    fc_data = purrr::map(
      .x = data,
      ~sweep(x = exp_noNA[, .x$uid[.x$time_post_last_vax > 0], drop = FALSE],
             MARGIN = 1,
             STATS = as.array(exp_noNA[, .x$uid[.x$time_post_last_vax <= 0], drop = FALSE]),
             FUN = "-")
    )) %>%
  dplyr::pull(fc_data) %>%
  do.call(what = cbind, args = .)

# Calculate Student t value and associated one-sided p values for every gene at every postvax time point
df <-
  cur_phenoData %>%
  dplyr::filter(time_post_last_vax > 0) %>%
  dplyr::group_by(study_accession, matrix, time_post_last_vax) %>%
  tidyr::nest() %>%
  dplyr::ungroup() %>%
  dplyr::mutate(
    deg_data = purrr::map(
      .x = data,
      ~{
        fc_data <- exp_fc[, .x$uid, drop = FALSE]
        tibble(
          gene = rownames(fc_data),
          avg_fc = rowMeans(fc_data),
          n = ncol(fc_data),
          s = apply(fc_data, 1, sd),
          t = avg_fc / (s / sqrt(ncol(fc_data))),
          p_less = pt(q = t, df = ncol(fc_data) - 1),
          p_greater = 1 - p_less
          # it is faster to run the above calculation than the base R t.test function (result is the same)
          #p_less = apply(fc_data, 1, function(x)t.test(x = x, alternative = "less")$p.value),
          #p_greater = apply(fc_data, 1, function(x)t.test(x = x, alternative = "greater")$p.value),
        )
      }
    )
  )

df <- df %>%
  dplyr::select(-data) %>%
  tidyr::unnest(deg_data) %>%
  # Replace NaN p values with 0.9999
  dplyr::mutate_at(c("p_less", "p_greater"),
                   function(x)replace(x, is.nan(x), 0.9999)) %>%
  # Replace 1 p values with 0.9999 (for compatibility with sumz)
  dplyr::mutate_at(c("p_less", "p_greater"),
                   function(x) replace(x, x > 0.9999, 0.9999)) %>%
  # Replace 0 p values with 0.00001 (for compatibility with sumz)
  dplyr::mutate_at(c("p_less", "p_greater"),
                   function(x) replace(x, x <= 0, 0.00001))

## Compute DEGs by pathogen/vaccine type
# Integrate p values across multiple studies by Stouffer's method
df <- df %>%
  dplyr::left_join(cur_phenoData %>%
                     dplyr::select(study_accession, matrix, pathogen, vaccine_type) %>%
                     dplyr::distinct(),
                   by = c("study_accession", "matrix")) %>%
  dplyr::mutate(pvt = paste0(pathogen, " (", vaccine_type, ")"))

fn <- file.path(extra_data_dir, "deg_df.rds")
if (file.exists(fn)) {
  deg_df <- readRDS(file = fn)
} else{
  deg_df <- df %>%
    dplyr::select(-s, -t) %>%
    dplyr::group_by(pathogen, vaccine_type, pvt, time_post_last_vax) %>%
    tidyr::nest() %>%
    dplyr::ungroup() %>%
    dplyr::mutate(
      deg_data =
        purrr::map(
          .x = data,
          ~{
            if(length(unique(.x$matrix)) == 1){
              return(tibble(
                gene = unique(.x$gene),
                weighted_avg_fc = .x$avg_fc,
                meta_p_less = .x$p_less,
                meta_p_greater = .x$p_greater
              ))
            }
            wide_avg_fc_df <- .x %>%
              dplyr::select(matrix, gene, avg_fc, n) %>%
              tidyr::pivot_wider(names_from = "gene", values_from = "avg_fc")

            weighted_avg_fc <- sweep(wide_avg_fc_df %>% dplyr::select(-matrix, -n),
                                     MARGIN = 1,
                                     STATS = wide_avg_fc_df$n,
                                     FUN = "*") %>%
              colSums() %>%
              `/`(sum(wide_avg_fc_df$n))

            wide_p_less_df <- .x %>%
              dplyr::select(matrix, gene, p_less, n) %>%
              tidyr::pivot_wider(names_from = "gene", values_from = "p_less")
            meta_p_less <- apply(X = wide_p_less_df %>% dplyr::select(-matrix, -n),
                                 MARGIN = 2,
                                 function(x)sumz(p = x, weights = sqrt(wide_p_less_df$n))$p)

            wide_p_greater_df <- .x %>%
              dplyr::select(matrix, gene, p_greater, n) %>%
              tidyr::pivot_wider(names_from = "gene", values_from = "p_greater")
            meta_p_greater <- apply(X = wide_p_greater_df %>% dplyr::select(-matrix, -n),
                                    MARGIN = 2,
                                    function(x)sumz(p = x, weights = sqrt(wide_p_greater_df$n))$p)

            tibble(gene = names(weighted_avg_fc),
                   weighted_avg_fc = weighted_avg_fc,
                   meta_p_less = meta_p_less,
                   meta_p_greater = meta_p_greater)
          }
        )
    )
  deg_df <- deg_df %>%
    dplyr::select(-data) %>%
    tidyr::unnest(deg_data) %>%
    dplyr::mutate(
      meta_p = 2*base::pmin(meta_p_less, meta_p_greater),
      meta_p = ifelse(test = meta_p > 1, yes = 1, no = meta_p),
      FDR = p.adjust(meta_p, method = "BH"),
      FDR_signif_label = cut(FDR, c(0, .001, .01, .05, 1),
                             labels = c("***", "**", "*", ""))
    )
  saveRDS(object = deg_df,
          file = fn)
}

##########################################################################
# Figure: bar graph of sharing of postvax gene activity between vaccines #
##########################################################################

# For every gene, find out in how many vaccines it is significantly up at any one postvax time point
# The same but for down at any postvax time point
calc_sharing_gene_df <- function(x) {
  x %>%
    dplyr::mutate(direction = ifelse(
      test = weighted_avg_fc > 0,
      yes = "up",
      no = "down"),
      is_significant = !is.na(FDR) & FDR < 0.05) %>%
    dplyr::select(pvt, gene, direction, is_significant) %>%
    dplyr::distinct() %>%
    dplyr::group_by(gene, direction) %>%
    dplyr::summarise(n = sum(is_significant)) %>%
    dplyr::group_by(gene) %>%
    dplyr::arrange(-n) %>%
    dplyr::slice(1) %>%
    dplyr::ungroup()
}

sharing_null2mat <- function(sharing_null) {
  sharing_null_mat <- do.call(rbind, lapply(sharing_null, `length<-`,
                                            max(lengths(sharing_null))))
  sharing_null_mat[is.na(sharing_null_mat)] <- 0
  return(sharing_null_mat)
}

sharing_null2df <- function(sharing_null_mat) {
  tibble(
    incidence = 0:(ncol(sharing_null_mat)-1),
    incidence_count = colMeans(sharing_null_mat),
    type = "null"
  )
}

fn <- file.path(extra_data_dir, "gene_sharing_null_list.rds")
if (file.exists(fn)) {
  sharing_null_list <- readRDS(file = fn)
} else {
  sharing_null_list <- lapply(
    1:NUM_PERMUTATIONS,
    function(x){
      cat("Permutation ", x, '/', NUM_PERMUTATIONS, "\n")
      deg_df %>%
        dplyr::group_by(pvt, time_post_last_vax) %>%
        dplyr::mutate(gene = sample(gene, replace = FALSE)) %>%
        dplyr::ungroup() %>%
        calc_sharing_gene_df() %>%
        dplyr::pull(n) %>%
        `+`(1) %>%
        tabulate()
    })
  saveRDS(object = sharing_null_list, file = fn)
}

sharing_null_mat <- sharing_null2mat(sharing_null_list)
sharing_null_counts_df <- sharing_null2df(sharing_null_mat)

obs_sharing_df <- calc_sharing_gene_df(deg_df)
obs_sharing_counts_df <- obs_sharing_df %>%
  dplyr::pull(n) %>%
  `+`(1) %>%
  tabulate() %>%
  as_tibble() %>%
  dplyr::mutate(incidence = dplyr::row_number() - 1) %>%
  dplyr::rename(incidence_count = value) %>%
  dplyr::mutate(type = "observed") %>%
  dplyr::select(incidence, incidence_count, type)


plot_df <- dplyr::bind_rows(obs_sharing_counts_df,
                            sharing_null_counts_df) %>%
  tidyr::complete(incidence, tidyr::nesting(type),
                  fill = list(incidence_count = 0))

quantile_df <- purrr::map_df(
  .x = 1:ncol(sharing_null_mat),
  ~quantile(x = sharing_null_mat[, .x], probs = c(.025, 0.975)) %>%
    as.list() %>%
    tibble::as_tibble()
)

quantile_df$incidence <- 0:(nrow(quantile_df)-1)
quantile_df$type <- "null"

p <- ggplot(plot_df,
            aes(x = incidence)) +
  geom_bar(stat = "identity", position = position_dodge(),
           aes(y = incidence_count, fill = type)) +

  geom_segment(data = quantile_df, aes(x = incidence-.25, xend = incidence-.25,
                                       y = `2.5%`, yend = `97.5%`)) +
  geom_segment(data = quantile_df, aes(x = incidence-.35, xend = incidence-.15,
                                       y = `2.5%`, yend = `2.5%`)) +
  geom_segment(data = quantile_df, aes(x = incidence-.35, xend = incidence-.15,
                                       y = `97.5%`, yend = `97.5%`)) +
  xlab("Number of different vaccines") +
  ylab("Number of genes") +
  scale_fill_manual(values = c("null" = "lightgrey", "observed" = "navy")) +
  scale_x_continuous(breaks = plot_df$incidence) +
  theme_bw(base_size = 22)
print(p)

Ext. Data Figure 1b. Histogram of overlap in differentially expressed modules. A module is shared with another vaccine if it is significantly (FDR < 0.05) regulated in the same direction, irrespective of timepoint. Blue bars, number of genesets shared (y-axis) between the same number of vaccines (x-axis). Grey bars represent the null distribution generated by n = 10,000 permutations of module labels within vaccine + timepoint groups. Data are presented as mean values + /− 95% confidence interval.

nbins <- tidy_longitudinal_qusage_young_meta_df %>%
  dplyr::select(pathogen, vaccine_type) %>%
  dplyr::distinct() %>%
  nrow() + 1
obs_sharing_young_df <- calc_sharing_df(
  tidy_longitudinal_qusage_young_meta_df) %>%
  dplyr::pull(n) %>%
  `+`(1) %>%
  tabulate(nbins = nbins) %>%
  dplyr::as_tibble() %>%
  dplyr::mutate(incidence = dplyr::row_number() - 1) %>%
  dplyr::rename(incidence_count = value) %>%
  dplyr::mutate(ymin = NA, ymax = NA, type = "observed") %>%
  dplyr::select(incidence, incidence_count, ymin, ymax, type)
probs <- c(0.025, 0.975)
null_sharing_young_df <- geneset_sharing_young_null_boot$t %>%
  as.data.frame() %>%
  dplyr::summarise(dplyr::across(.fns = function(col)
    c(median(col), quantile(col , probs = probs)))) %>%
  t() %>%
  as.data.frame()
colnames(null_sharing_young_df) <- c("incidence_count", "ymin", "ymax")
null_sharing_young_df <- null_sharing_young_df %>%
  dplyr::mutate(incidence = dplyr::row_number() - 1,
                type = "null") %>%
  dplyr::select(incidence, incidence_count, ymin, ymax, type)
plot_df <- dplyr::bind_rows(obs_sharing_young_df, null_sharing_young_df) %>%
  dplyr::filter(!(incidence > 1 & incidence_count < 0.1))
ggplot(plot_df, aes(x = incidence, y = incidence_count, fill = type)) +
  geom_bar(stat = "identity", position = position_dodge(),
           width = 0.9) +
  geom_errorbar(aes(ymin = ymin, ymax = ymax),
                position = position_dodge(width = 0.9),
                width = 0.4) +
  xlab("Number of different vaccines") +
  ylab("Number of BTMs") +
  scale_fill_manual(values = c("null" = "lightgrey", "observed" = "navy")) +
  scale_x_continuous(breaks = plot_df$incidence) +
  theme_bw(base_size = 22) +
  theme(panel.grid = element_blank())

Figure 2a. Heatmap of common differentially expressed modules (regulated in 7 or more vaccines) over time (*FDR<0.05). Color represents the QuSAGE activity score. Clustering on columns was performed separately for Days 1, 3, 7, 14, and 21 post-vaccination. TBA – To be annotated.

n_clusters=4 #Number of module clusters
common_tp=c(1,3,7,14,21) #Common timepoints to plot
time_windows <- c(0, 1, 4, 10, 15, Inf)
#Subset qusage results to most common timepoints
tidy_longitudinal_qusage_young_meta_df_commontp=tidy_longitudinal_qusage_young_meta_df[tidy_longitudinal_qusage_young_meta_df$time_post_last_vax %in% common_tp,]
plot_df <- tidy_longitudinal_qusage_young_meta_df_commontp %>%
  dplyr::filter(geneset %in% genesets_shared_young_common) %>%
  dplyr::mutate(pvt_tp = paste0(pathogen, " (", vaccine_type, ")", " Day ",
                                time_post_last_vax)) %>%
  dplyr::left_join(Biobase::pData(young.noNorm.noResponse) %>%
                     dplyr::select(pathogen, vaccine_type, adjuvant) %>%
                     dplyr::distinct(),
                   by = c("pathogen", "vaccine_type"))
m <- plot_df %>%
  dplyr::select(pvt_tp, geneset, activity_score) %>%
  tidyr::pivot_wider(names_from = "pvt_tp", values_from = "activity_score") %>%
  tibble::column_to_rownames(var = "geneset") %>%
  as.matrix()
m_fdr <- plot_df %>%
  dplyr::select(pvt_tp, geneset, FDR) %>%
  tidyr::pivot_wider(names_from = "pvt_tp", values_from = "FDR") %>%
  tibble::column_to_rownames(var = "geneset") %>%
  as.matrix()
# Setup color gradient
paletteLength <- 50
myColor <- colorRampPalette(c("navy", "white", "firebrick3"))(paletteLength)
myBreaks <- c(
  seq(-max(abs(m)), 0, length.out=ceiling(paletteLength/2) + 1),
  seq(max(abs(m))/paletteLength, max(m), length.out=floor(paletteLength/2)))
# Column annotations
myColAnnot <- plot_df %>%
  dplyr::select(time_post_last_vax, pvt_tp, pathogen, vaccine_type, adjuvant) %>%
  dplyr::distinct() %>%
  dplyr::arrange(time_post_last_vax, pathogen, vaccine_type, adjuvant) %>%
  tibble::column_to_rownames(var = "pvt_tp")
m <- m[, rownames(myColAnnot)]
cor.dist <- function(x, method = NA) {
  stopifnot(is.na(method))
  stopifnot(is.matrix(x))
  as.dist((1-cor(t(x)))/2)
}
cluster_rows <- hclust(d = cor.dist(m))
cluster_df <- cutree(tree = cluster_rows, k = n_clusters) %>%
  as.data.frame() %>%
  tibble:::rownames_to_column(var = "geneset") %>%
  dplyr::rename(cluster_nr = ".")
vaccine_type <- unique(plot_df$vaccine_type)
pathogen <- unique(plot_df$pathogen)
adjuvant <- unique(plot_df$adjuvant)
myAnnotColors <- list(
  vaccine_type = adj_path_vt_colors$vaccine_type[vaccine_type],
  pathogen = adj_path_vt_colors$pathogen[pathogen],
  adjuvant = adj_path_vt_colors$adjuvant[adjuvant],
  cluster = setNames(ptol_pal()(n_clusters), nm = as.character(1:n_clusters))
)

#Manually change cluster labels for aesthetics
cluster_df$cluster_nr[cluster_df$cluster_nr==4]=5
cluster_df$cluster_nr[cluster_df$cluster_nr==2]=4
cluster_df$cluster_nr[cluster_df$cluster_nr==3]=2
cluster_df$cluster_nr[cluster_df$cluster_nr==5]=3

p <- col_grouped_pheatmap(
  mat = m,
  col_groups = cut(myColAnnot$time_post_last_vax, time_windows),
  col_dist_fun = cor.dist,
  method = NA,
  color = myColor,
  breaks = myBreaks,
  cluster_rows = cluster_rows,
  display_numbers =  ifelse(test = is.na(m_fdr) | m_fdr >= FDR_THRESHOLD,
                            yes = " ",
                            no = "*"),
  annotation_col = myColAnnot %>%
    dplyr::select(pathogen, vaccine_type, adjuvant),
  annotation_row = cluster_df %>%
    dplyr::rename(cluster = cluster_nr) %>%
    dplyr::mutate(cluster = as.character(cluster)) %>%
    tibble::column_to_rownames(var = "geneset"),
  annotation_colors = myAnnotColors,
  main = "",
  fontsize = 22,
  fontsize_row = 18,
  fontsize_col = 18,
  fontsize_number = 22)
p <- pheatmap_add_labels(
  p = p,
  annot_labels = adj_path_vt_abbv)
print(p)

Figure 2b-c, Ext. Data Figure 1c-d. Kinetics of the mean FC of module clusters across vaccines.

#Create color/linetype paired list for pathogen/vaccine type combination
#Reorder vaccines to keep consistent with prior versions
pt_order=c(1,3,4,5,2,6,10,7,11,12,13,8,9)
cols=colorRampPalette(brewer.pal(ceiling(length(unique(eset$pt))/3), "Set1"))
pt_line_color_anno=list('color'=setNames(rep(cols(ceiling(length(unique(eset$pt))/3)),3), unique(eset$pt)[pt_order]),
                        'linetype'=setNames(rep(c('solid','dashed','dotted'),ceiling(length(unique(eset$pt))/3)), unique(eset$pt)[pt_order]))
#Create list from BTM clustering in common BTM heatmap
BTM_clust=lapply(unique(cluster_df$cluster_nr), function(x) cluster_df$geneset[which(cluster_df$cluster_nr==x)])
names(BTM_clust)=unique(cluster_df$cluster_nr)
#Get BTM FC data for common timepoints
common_tp=c(1,3,7,14,21)
exp_FC_BTM_commonTP=exp_FC_BTM[tp_FC %in% common_tp]
#Collapse BTM FC data to cluster-level
exp_FC_BTM_cluster=lapply(exp_FC_BTM_commonTP, function(x) {z=do.call(rbind, lapply(BTM_clust, function(y) colMeans(exprs(x[match(y,rownames(x)),])))); ExpressionSet(as.matrix(z), x@phenoData)})
#For each study, average expression across all subjects
matrix_uni_tp=lapply(exp_FC_BTM_cluster,function(x) x$matrix[!duplicated(x$matrix)])
#Store study metadata
matrix_uni_tp_metaData=lapply(exp_FC_BTM_cluster,function(x) pData(x)[!duplicated(x$matrix),] %>%
                                rownames_to_column() %>%
                                dplyr::select(matrix, vaccine, vaccine_type, pathogen, adjuvant, pt) %>%
                                column_to_rownames(var = "matrix"))
#Find sample indices per matrix
matrix_ind=lapply(1:length(exp_FC_BTM_cluster),
                  function(x) lapply(1:length(matrix_uni_tp[[x]]), function(y) which(matrix_uni_tp[[x]][[y]]==exp_FC_BTM_cluster[[x]]$matrix)))
#Study size
exp_FC_BTM_cluster_n=lapply(1:length(exp_FC_BTM_cluster),
                function(x) sapply(1:length(matrix_uni_tp[[x]]), function(y) length(matrix_ind[[x]][[y]])))
#Average FC
exp_FC_BTM_cluster_mean=lapply(1:length(exp_FC_BTM_cluster),
                   function(x) sapply(1:length(matrix_uni_tp[[x]]), function(y) rowMeans(exprs(exp_FC_BTM_cluster[[x]][,matrix_ind[[x]][[y]]]))))

#Find unique pathogen/vaccine types by timepoint
pt_uni_tp=lapply(matrix_uni_tp_metaData,function(x) x$pt[!duplicated(x$pt)])
pt_metaData=lapply(1:length(matrix_uni_tp_metaData),
                   function(x) dplyr::select(
                     matrix_uni_tp_metaData[[x]][!duplicated(matrix_uni_tp_metaData[[x]]$pt),],
                     vaccine_type, pathogen, adjuvant))
#Vaccine size
exp_FC_BTM_cluster_pt_n=lapply(1:length(exp_FC_BTM_cluster),
                   function(x) sapply(1:length(pt_uni_tp[[x]]), function(y) length(which(exp_FC_BTM_cluster[[x]]$pt==pt_uni_tp[[x]][y]))))
#Add weight (1/vaccine sample size) to pData for all samples
n_weight=lapply(1:length(exp_FC_BTM_cluster),
                function(x) sapply(1:ncol(exp_FC_BTM_cluster[[x]]), function(y) exp_FC_BTM_cluster_pt_n[[x]][which(pt_uni_tp[[x]]==exp_FC_BTM_cluster[[x]]$pt[y])]))
for (i in 1:length(exp_FC_BTM_cluster)) {
  exp_FC_BTM_cluster[[i]]$n_weight=1/n_weight[[i]]
}

#Compute mean FC and p values per cluster per vaccine (pt-pathogen/vaccine type)
pt_ind=vector("list",length(exp_FC_BTM_cluster))
pt_FC_mean=vector("list",length(exp_FC_BTM_cluster))
for (i in 1:length(exp_FC_BTM_cluster)) {
  #For each vaccine
  pt_FC_mean[[i]]=matrix(NA, nrow=nrow(exp_FC_BTM_cluster[[i]]), ncol=length(pt_uni_tp[[i]]))
  for (j in 1:length(pt_uni_tp[[i]])) {
    pt_ind[[i]][[j]]=which(pt_uni_tp[[i]][[j]]==matrix_uni_tp_metaData[[i]]$pt)
    #If there is more than 1 study, compute weighted average
    if (length(pt_ind[[i]][[j]])>1) {
      #Average FCs (weighted by n)
      pt_FC_mean[[i]][,j]=rowWeightedMeans(exp_FC_BTM_cluster_mean[[i]][,pt_ind[[i]][[j]]], w=exp_FC_BTM_cluster_n[[i]][pt_ind[[i]][[j]]])
      #Otherwise store single value
    } else {
      pt_FC_mean[[i]][,j]=exp_FC_BTM_cluster_mean[[i]][,pt_ind[[i]][[j]]]
    }
  }
  colnames(pt_FC_mean[[i]])=pt_uni_tp[[i]]
  rownames(pt_FC_mean[[i]])=rownames(exp_FC_BTM_cluster_mean[[i]])
}

#Set clusters of interest, plot kinetics
BTM_clust_int=names(BTM_clust)
for (i in 1:length(BTM_clust_int)) {
  #Lookup BTM
  ind=which(rownames(exp_FC_BTM_cluster_mean[[1]])==BTM_clust_int[i])
  #Build matrix of avg common gene exp over time per vaccine
  BTM_int_FC_mat=array(NA, dim=c(length(unique(eset$pt)),ncol=length(exp_FC_BTM_cluster)+1),
                       dimnames=list(unique(eset$pt),c(0,common_tp)))
  #Set D0 FC to 0
  BTM_int_FC_mat[,1]=0
  #Add vaccine FCs at each timepoine
  for (j in 1:length(exp_FC_BTM_cluster_mean)) {
    ind_pt=match(pt_uni_tp[[j]],unique(eset$pt))
    BTM_int_FC_mat[ind_pt,j+1]=pt_FC_mean[[j]][ind,]
  }
  #Plot line graph of most common DEG avg per vaccine
  #Convert matrix to long format
  BTM_int_FC_mat=data.frame(BTM_int_FC_mat) %>% tibble::rownames_to_column(var = "Vaccine") %>%
    pivot_longer(-Vaccine, names_to='Timepoint', values_to='FC')
  #Convert timepoint back to numeric
  BTM_int_FC_mat$Timepoint=as.numeric(str_replace(BTM_int_FC_mat$Timepoint, 'X', ''))
  #Plot
  p=ggplot(BTM_int_FC_mat[!is.na(BTM_int_FC_mat$FC),], aes(x=Timepoint, y=FC, group=Vaccine)) +
      geom_line(aes(linetype=Vaccine, color=Vaccine), size=1.07)+
      theme_classic()+
      scale_color_manual(values=pt_line_color_anno$color)+
      scale_linetype_manual(values=pt_line_color_anno$linetype)+
      scale_x_continuous(breaks=c(0,1,3,7,14,21))+
      xlab("Day post-last vaccination")+
      ylab("Mean FC (log2)")+
      ggtitle(paste0('Cluster ',rownames(exp_FC_BTM_cluster_mean[[1]])[ind]))+
      theme(plot.title = element_text(hjust = 0.5), text = element_text(size=20))
  print(p)
}

Figure 3: Overlap in transcriptional responses across vaccines.

Figure 3a-c. Circos plots of the overlap in differentially expressed modules (FDR < 0.05) across vaccines on days 1 (a), 3 (b) and 7 (c). Each segment of the circle represents one vaccine, and each point in a segment represents a single module. Heat maps in the outermost ring represent the correlation with day 28 antibody responses, and bars in the middle ring represent the activity score of differentially expressed modules. Lines connect modules with a significant positive score shared between vaccines. Boxes and line colors in the innermost ring represent the functional groups of the modules. ECM, extracellular matrix.

corrMethod='pearson'
tp_int=c(1,3,7)
ab_response_col='MFC'

#Load processed qusage results
qusage_df=tidy_longitudinal_qusage_young_meta_df

#Load BTM groups
load(file.path(extra_data_dir,'BTM_groups.rda'))
#Optional:Remove BTMs without subgroup
BTM_groups=BTM_groups[sapply(BTM_groups$SUBGROUP, function(x) nchar(x)>0),]
#Set BTM group as factor and manually set order
BTM_groups$SUBGROUP=factor(BTM_groups$SUBGROUP,
                           levels=c('ANTIGEN PRESENTATION', 'INFLAMMATORY/TLR/CHEMOKINES','INTERFERON/ANTIVIRAL SENSING',
                                    'MONOCYTES', 'DC ACTIVATION', 'NEUTROPHILS', 'SIGNAL TRANSDUCTION', 'ECM AND MIGRATION',
                                    'ENERGY METABOLISM', 'CELL CYCLE', 'NK CELLS', 'T CELLS', 'B CELLS', 'PLASMA CELLS'))
#Sort BTM list by subgroup
BTM_groups=BTM_groups[order(BTM_groups$SUBGROUP),]
rownames(BTM_groups)=BTM_groups$NAME

#Match qusage BTM names with list
common_BTMs=intersect(sub('.*\\(S','S',sub('.*\\(M','M',BTM_groups$NAME)), sub('.*\\(S','S',sub('.*\\(M','M',unique(qusage_df$geneset))))
#Remove qusage BTMs without group
qusage_df=qusage_df[sub('.*\\(S','S',sub('.*\\(M','M',qusage_df$geneset)) %in% common_BTMs,]
#Rename to match BTM_groups
qusage_df$geneset=BTM_groups$NAME[match(sub('.*\\(S','S',sub('.*\\(M','M',qusage_df$geneset)),sub('.*\\(S','S',sub('.*\\(M','M',BTM_groups$NAME)))]
#Remove BTMs missing from qusage results
BTM_groups=BTM_groups[sub('.*\\(S','S',sub('.*\\(M','M',BTM_groups$NAME)) %in% common_BTMs,]

#Add pathogen_type (vaccine) column
qusage_df$pt=paste(qusage_df$pathogen," (",qusage_df$vaccine_type,")", sep='')

#Subset data to timepoints of interest
qusage_df=qusage_df[qusage_df$time_post_last_vax %in% tp_int,]
pt_uni=unique(qusage_df$pt)
pt_uni_tp=lapply(tp_int, function(x) unique(qusage_df$pt[qusage_df$time_post_last_vax==x])) #Get vaccines with data at each timepoint
#Split by timepoint
qusage_df=lapply(tp_int, function(x) qusage_df[qusage_df$time_post_last_vax==x,])
#Convert to wide format
pt_FC_NES=lapply(qusage_df, function(x) data.frame(spread(x[,c('pt','geneset','activity_score')], pt, activity_score)))
pt_FC_NES=lapply(pt_FC_NES, function(x) {rownames(x)=x$geneset; x[,-1]})
pt_FC_NES=lapply(1:length(pt_FC_NES), function(x) {colnames(pt_FC_NES[[x]])=pt_uni_tp[[x]]; pt_FC_NES[[x]]})
pt_FC_p=lapply(qusage_df, function(x) data.frame(spread(x[,c('pt','geneset','FDR')], pt, FDR)))
pt_FC_p=lapply(pt_FC_p, function(x) {rownames(x)=x$geneset; x[,-1]})
pt_FC_p=lapply(1:length(pt_FC_p), function(x) {colnames(pt_FC_p[[x]])=pt_uni_tp[[x]]; pt_FC_p[[x]]})
#Reorder qusage results to match BTM_groups
pt_FC_NES=lapply(pt_FC_NES, function(x) x[match(BTM_groups$NAME,rownames(x)),])
pt_FC_p=lapply(pt_FC_p, function(x) x[match(BTM_groups$NAME,rownames(x)),])

#Generate Ab response correlation data
#Load GE
eset_wAb=young.noNorm.withResponse
colnames(eset_wAb)=eset_wAb$uid

#Create combined vaccine type/pathogen column
eset_wAb$pt=paste(eset_wAb$pathogen," (",eset_wAb$vaccine_type,")", sep='')
#Label columns by uid
colnames(eset_wAb)=eset_wAb$uid

#Find samples from timepoints of interest
tp_int=c(0,tp_int)
ind=lapply(tp_int, function(x) which(eset_wAb$study_time_collected==x))
#Combine indices of all timepoints of interest
ind_all=Reduce(union,ind)
#Retain only samples from timepoints of interest
eset_wAb=eset_wAb[,ind_all]
#Recompute timepoint indices after removing extraneous timepoints
ind=lapply(tp_int, function(x) which(eset_wAb$study_time_collected==x))
#Remove samples from a single study with fewer than sample_cutoff samples at any timepoint
sample_cutoff=3
matrix_uni_tp=lapply(ind,function(x) unique(eset_wAb[,x]$matrix))
matrix_ind=lapply(1:length(ind),function(x)
  lapply(1:length(matrix_uni_tp[[x]]), function(y) which(matrix_uni_tp[[x]][[y]]==eset_wAb[,ind[[x]]]$matrix)))
ind_cut_all=vector()
for (i in 1:length(matrix_ind)) {
  ind_cut=which(sapply(matrix_ind[[i]],length)<sample_cutoff)
  if (is_empty(ind_cut)==FALSE) {
    for (j in 1:length(ind_cut)){
      ind_cut_all=c(ind_cut_all,ind[[i]][matrix_ind[[i]][[ind_cut[j]]]])
    }
  }
}
if (is_empty(ind_cut_all)==FALSE) {
  eset_wAb=eset_wAb[,-ind_cut_all]
}
#Recompute timepoint indices after removing samples
tp_int=unique(eset_wAb$study_time_collected[which(eset_wAb$study_time_collected>=0)])
ind=lapply(tp_int, function(x) which(eset_wAb$study_time_collected==x))

#Remove genes with NA
eset_wAb=eset_wAb[complete.cases(exprs(eset_wAb)),]

#Create unique list of studies
matrix_uni=unique(eset_wAb$matrix)

#Collapse to BTM level
#Load BTM list
BTM_list=readRDS(file = file.path(data_dir, "btm_list_2020_12_23.rds"))
#match with BTM groups/qusage data
BTM_list=BTM_list[na.omit(match(sub('.*\\(S','S',sub('.*\\(M','M',BTM_groups$NAME)),sub('.*\\(S','S',sub('.*\\(M','M',names(BTM_list)))))]
#Collapse - find arithmetic mean of genes in each BTM for each sample
exp_BTM=do.call(rbind, lapply(BTM_list, function(x) colMeans(exprs(eset_wAb[na.omit(match(x,rownames(eset_wAb))),]),na.rm=TRUE)))
#Create BTM eset_wAb
eset_wAb_BTM=ExpressionSet(exp_BTM, eset_wAb@phenoData)
#Replace eset_wAb with BTM eset_wAb
eset_wAb=eset_wAb_BTM
rm(eset_wAb_BTM)

#Compute D0 normalized FC
ind_D0=which(0==eset_wAb$study_time_collected)
common=lapply(2:length(ind),function(x) intersect(eset_wAb$participant_id[ind[[x]]],eset_wAb$participant_id[ind_D0]))
ia=lapply(2:length(ind),function(x) na.omit(match(common[[x-1]],eset_wAb$participant_id[ind[[x]]])))
ib=lapply(2:length(ind),function(x) na.omit(match(common[[x-1]],eset_wAb$participant_id[ind_D0])))
exp_FC_wAb=lapply(2:length(ind),function(x) eset_wAb[,ind[[x]][ia[[x-1]]]])
exp_FC_wAb=lapply(2:length(ind),function(x) {exprs(exp_FC_wAb[[x-1]])=exprs(exp_FC_wAb[[x-1]])-exprs(eset_wAb[,ind_D0[ib[[x-1]]]]); exp_FC_wAb[[x-1]]})
#For each study, average expression across all subjects
matrix_uni_tp=lapply(exp_FC_wAb,function(x) x$matrix[!duplicated(x$matrix)])
#Store study metadata
matrix_uni_tp_metaData=lapply(exp_FC_wAb,function(x) pData(x)[!duplicated(x$matrix),] %>%
                                rownames_to_column() %>%
                                dplyr::select(matrix, vaccine, vaccine_type, pathogen, adjuvant, pt) %>%
                                column_to_rownames(var = "matrix"))
#Indices for each study
matrix_ind=lapply(1:length(exp_FC_wAb),
                  function(x) lapply(1:length(matrix_uni_tp[[x]]), function(y) which(matrix_uni_tp[[x]][[y]]==exp_FC_wAb[[x]]$matrix)))
#Study size
exp_FC_wAb_n=lapply(1:length(exp_FC_wAb),
                function(x) sapply(1:length(matrix_uni_tp[[x]]), function(y) length(matrix_ind[[x]][[y]])))

#Correlation with Ab response
exp_FC_wAb_corr_stat=
  lapply(1:length(exp_FC_wAb),  #for each timepoint
         function(x) sapply(1:length(matrix_uni_tp[[x]]),  #for each study
                            function(y) apply(exprs(exp_FC_wAb[[x]][,matrix_ind[[x]][[y]]]), 1,  #for each geneset
                                              function(z) cor.test(z, pData(exp_FC_wAb[[x]][,matrix_ind[[x]][[y]]])[[ab_response_col]], method = corrMethod)$statistic)))

#Merge mean FC, correlation/t-test test statistics by vaccine (weighted average by study size)

#Find unique pathogen/vaccine types by timepoint
pt_uni_tp=lapply(matrix_uni_tp_metaData,function(x) x$pt[!duplicated(x$pt)])
pt_metaData=lapply(1:length(matrix_uni_tp_metaData),
                   function(x) dplyr::select(
                     matrix_uni_tp_metaData[[x]][!duplicated(matrix_uni_tp_metaData[[x]]$pt),],
                     vaccine_type, pathogen, adjuvant))
#Sample indices per vaccine
pt_ind=lapply(1:length(exp_FC_wAb),
              function(x) sapply(1:length(pt_uni_tp[[x]]),
                                 function(y) which(pt_uni_tp[[x]][[y]]==matrix_uni_tp_metaData[[x]]$pt)))
pt_corr_stat=
  lapply(1:length(exp_FC_wAb),
         function(x) sapply(1:length(pt_uni_tp[[x]]),
                            function(y) if(length(pt_ind[[x]][[y]])>1) rowWeightedMeans(exp_FC_wAb_corr_stat[[x]][,pt_ind[[x]][[y]]], w=exp_FC_wAb_n[[x]][pt_ind[[x]][[y]]], na.rm=TRUE) else exp_FC_wAb_corr_stat[[x]][,pt_ind[[x]][[y]]]))
pt_corr_stat=lapply(1:length(pt_corr_stat), function(x) {rownames(pt_corr_stat[[x]])=BTM_groups$NAME; colnames(pt_corr_stat[[x]])=pt_uni_tp[[x]]; pt_corr_stat[[x]]})

#Add vaccines with missing data at each timepoint
pt_FC_NES_full=vector("list",length(pt_FC_NES))
pt_FC_p_full=vector("list",length(pt_FC_NES))
pt_corr_stat_full=vector("list",length(pt_corr_stat))
for (i in 1:length(pt_corr_stat)) {
  pt_FC_NES_full[[i]]=matrix(NA, ncol=length(pt_uni), nrow=nrow(pt_FC_NES[[i]]), dimnames = list(rownames(pt_FC_NES[[i]]), pt_uni))
  pt_FC_NES_full[[i]][rownames(pt_FC_NES[[i]]), colnames(pt_FC_NES[[i]])]=unlist(pt_FC_NES[[i]])
  pt_FC_p_full[[i]]=matrix(NA, ncol=length(pt_uni), nrow=nrow(pt_FC_p[[i]]), dimnames = list(rownames(pt_FC_p[[i]]), pt_uni))
  pt_FC_p_full[[i]][rownames(pt_FC_p[[i]]), colnames(pt_FC_p[[i]])]=unlist(pt_FC_p[[i]])
  pt_corr_stat_full[[i]]=matrix(NA, ncol=length(pt_uni), nrow=nrow(pt_corr_stat[[i]]), dimnames = list(rownames(pt_corr_stat[[i]]), pt_uni))
  pt_corr_stat_full[[i]][rownames(pt_corr_stat[[i]]), colnames(pt_corr_stat[[i]])]=unlist(pt_corr_stat[[i]])
}
pt_FC_NES=pt_FC_NES_full
pt_FC_p=pt_FC_p_full
pt_corr_stat=pt_corr_stat_full
rm(pt_corr_stat_full, pt_FC_NES_full, pt_FC_p_full)
#Update vaccine per timepoint variable
pt_uni_tp=lapply(1:length(pt_FC_NES), function(x) pt_uni)


#Compute overlap in sig BTMs
comb=vector("list",length(pt_FC_NES))
pt_BTM_ind_sig_NES=vector("list",length(pt_FC_NES))
pt_BTM_ind_sig_up=vector("list",length(pt_FC_NES))
pt_BTM_ind_sig_down=vector("list",length(pt_FC_NES))
pt_BTM_up_overlap=vector("list",length(pt_FC_NES))
pt_BTM_down_overlap=vector("list",length(pt_FC_NES))
for (i in 1:length(pt_FC_NES)) {
  pt_BTM_ind_sig_NES[[i]]=vector("list",ncol(pt_FC_NES[[i]]))
  names(pt_BTM_ind_sig_NES[[i]])=colnames(pt_FC_NES[[i]])
  for (j in 1:ncol(pt_FC_NES[[i]])) {
    #Find signifiant pathways
    pt_BTM_ind_sig_NES[[i]][[j]]=which(pt_FC_p[[i]][,j]<FDR_THRESHOLD)
    pt_BTM_ind_sig_up[[i]][[j]]=intersect(pt_BTM_ind_sig_NES[[i]][[j]],which(pt_FC_NES[[i]][,j]>0))
    pt_BTM_ind_sig_down[[i]][[j]]=intersect(pt_BTM_ind_sig_NES[[i]][[j]],which(pt_FC_NES[[i]][,j]<0))
  }
  #Pairwise combinations
  comb[[i]]=combn(seq.int(1,length(pt_uni_tp[[i]])),2,simplify = FALSE)
  pt_BTM_up_overlap[[i]]=
    lapply(comb[[i]],function(x) intersect(pt_BTM_ind_sig_up[[i]][[x[1]]] , pt_BTM_ind_sig_up[[i]][[x[2]]]))
  pt_BTM_down_overlap[[i]]=
    lapply(comb[[i]],function(x) intersect(pt_BTM_ind_sig_down[[i]][[x[1]]] , pt_BTM_ind_sig_down[[i]][[x[2]]]))
}


#Circos plot of BTMs
#Set vaccine colors
cols=colorRampPalette(brewer.pal(length(pt_uni), "Set3"))
mycolors_vac=cols(length(pt_uni))
#Create link colors by BTM group
cols=colorRampPalette(brewer.pal(length(unique(BTM_groups$SUBGROUP)), "Set1"))
mycolors_BTM=cols(length(unique(BTM_groups$SUBGROUP)))
#Set barplot limits
# bar_lim=ceiling(max(abs(do.call(cbind,pt_FC_NES)), na.rm=TRUE)*2)/2
# map_lim=ceiling(max(abs(do.call(cbind,pt_corr_stat)), na.rm=TRUE)*2)/2
#Set barplot limits manually (alternate)
bar_lim=1
map_lim=2.6
#Replace nonsignificant values with 0
pt_FC_NES=lapply(1:length(pt_FC_NES), function(x) {pt_FC_NES[[x]][pt_FC_p[[x]]>=FDR_THRESHOLD]=0; pt_FC_NES[[x]]})
#Replace values outside limits with limit
pt_FC_NES=lapply(1:length(pt_FC_NES), function(x) {pt_FC_NES[[x]][pt_FC_NES[[x]]<(-bar_lim)]=-bar_lim; pt_FC_NES[[x]]})
pt_FC_NES=lapply(1:length(pt_FC_NES), function(x) {pt_FC_NES[[x]][pt_FC_NES[[x]]>bar_lim]=bar_lim; pt_FC_NES[[x]]})
pt_corr_stat=lapply(1:length(pt_corr_stat), function(x) {pt_corr_stat[[x]][pt_corr_stat[[x]]<(-map_lim)]=-map_lim; pt_corr_stat[[x]]})
pt_corr_stat=lapply(1:length(pt_corr_stat), function(x) {pt_corr_stat[[x]][pt_corr_stat[[x]]>map_lim]=map_lim; pt_corr_stat[[x]]})
#Set heatmap colors
map_col=colorRamp2(c(-map_lim, 0, map_lim), c("black", "white", "orange"))
for (i in 1:length(pt_FC_NES)) {
  #Create df with sectors
  df_sector_list=lapply(1:length(pt_uni_tp[[i]]),
                        function(y) data.frame(factors=replicate(nrow(pt_FC_NES[[i]]),pt_uni_tp[[i]][[y]]),
                                               x=seq.int(1,nrow(pt_FC_NES[[i]]))))
  #Combine into one df
  df=bind_rows(df_sector_list)
  #Draw sectors
  circos.par(cell.padding = c(0.02, 0.1, 0.02, 0.1), circle.margin=c(0.01,0.4,0.01,0.01), track.margin=c(0,0)) #Margins with legend
  #circos.par(cell.padding = c(0.02, 0.1, 0.02, 0.1), circle.margin=c(0.01,0.01,0.01,0.01), track.margin=c(0,0)) #Margins without legend
  circos.initialize(factors = df$factors, x = df$x)
  circos.track(ylim = c(0, 1),  track.height=0.15, bg.border=NA, panel.fun=function(x, y) {
    #chr=CELL_META$sector.index
    xlim=CELL_META$xlim
    ylim=CELL_META$ylim
    circos.rect(xlim[1], 0, xlim[2], 1, col = mycolors_vac[which(pt_uni_tp[[i]][CELL_META$sector.numeric.index]==pt_uni)])
    #Add labels
    circos.text(nrow(pt_FC_NES[[i]])/2, 0.5, CELL_META$sector.index, CELL_META$sector.index, 1, facing = "bending.outside", niceFacing=TRUE, cex = 1.5)
  })
  #Add Ab correlation heatmap
  circos.track(ylim = c(0.25,0.75), track.height=0.05, track.margin=c(0.01,0), bg.border=NA, panel.fun=function(x, y) {
    m=t(data.matrix(pt_corr_stat[[i]][,CELL_META$sector.numeric.index]))
    col_mat=map_col(m)
    nc=ncol(m)
    circos.rect(1:nc, rep(0, nc), 1:nc, rep(1, nc), border=col_mat, col=col_mat)
  })
  #Add barplots
  circos.track(ylim=c(-bar_lim, bar_lim), track.height=0.1, panel.fun=function(x, y) {
    value=pt_FC_NES[[i]][,CELL_META$sector.numeric.index]
    circos.barplot(value, 1:nrow(pt_FC_NES[[i]]),
                   col = ifelse(value > 0, '#FF0000', '#0000FF'), border = NA)
    #circos.yaxis(side='right')
  })
  #Add BTM group color blocks
  circos.track(ylim = c(0, 1), track.height=0.05, bg.border=NA, panel.fun=function(x, y) {
    for (j in 1:length(unique(BTM_groups$SUBGROUP))) {
      #Get start/ending indices of BTM group
      start=match(unique(BTM_groups$SUBGROUP)[j],BTM_groups$SUBGROUP)
      end=length(BTM_groups$SUBGROUP)-match(unique(BTM_groups$SUBGROUP)[j],rev(BTM_groups$SUBGROUP))+1
      circos.rect(start, 0, end, 1, col=mycolors_BTM[j])
    }
  })
  #Draw common DEG links (up)
  for (j in 1:length(pt_BTM_up_overlap[[i]])) {
    if (length(pt_BTM_up_overlap[[i]][[j]])>0){
      for (k in 1:length(pt_BTM_up_overlap[[i]][[j]])) {
        if (!is.na(pt_BTM_up_overlap[[i]][[j]][k])) {
          circos.link(pt_uni_tp[[i]][[comb[[i]][[j]][1]]], pt_BTM_up_overlap[[i]][[j]][k],
                      pt_uni_tp[[i]][[comb[[i]][[j]][2]]], pt_BTM_up_overlap[[i]][[j]][k],
                      col=mycolors_BTM[[which(BTM_groups$SUBGROUP[pt_BTM_up_overlap[[i]][[j]][k]]==unique(BTM_groups$SUBGROUP))]])
        }
      }
    }
  }
  #Legend
  lgd_points = Legend(at = pt_uni_tp[[i]], type = "points",pch=15,
                      legend_gp = gpar(col = mycolors_vac,fill = mycolors_vac), title_position = "topleft",
                      labels_gp = gpar(fontsize = 12), title = "Vaccine")
  lgd_lines = Legend(at = unique(BTM_groups$SUBGROUP), type = "lines",
                     legend_gp = gpar(col = mycolors_BTM, lwd = 4), title_position = "topleft",
                     labels_gp= gpar(fontsize = 14), title = "BTM", background = NA)
  #lgd_list_vertical = packLegend(lgd_points, lgd_lines)
  lgd_list_vertical = packLegend(lgd_lines)
  pushViewport(viewport(x = unit(40, "cm"), y = unit(40, "cm"),
                        width = widthDetails(lgd_list_vertical), height = heightDetails(lgd_list_vertical), just = c("left", "bottom")))
  grid.draw(lgd_list_vertical)
  upViewport()
  circos.clear()
}

Figure 4: Early adaptive and delayed innate transcriptional signatures of yellow fever vaccine

Figure 4a, Ext. Data Figure 2a-b. Correlation matrices of pairwise Spearman correlations of gene-level fold changes between vaccines at each timepoint.

#Get FC data for common timepoints
common_tp=c(1,3,7,14,21)
exp_FC_commonTP=exp_FC[tp_FC %in% common_tp]
#For each study, average expression across all subjects
matrix_uni_tp=lapply(exp_FC_commonTP,function(x) x$matrix[!duplicated(x$matrix)])
#Store study metadata
matrix_uni_tp_metaData=lapply(exp_FC_commonTP,function(x) pData(x)[!duplicated(x$matrix),] %>%
                                rownames_to_column() %>%
                                dplyr::select(matrix, vaccine, vaccine_type, pathogen, adjuvant, pt) %>%
                                column_to_rownames(var = "matrix"))
#Find sample indices per matrix
matrix_ind=lapply(1:length(exp_FC_commonTP),
                  function(x) lapply(1:length(matrix_uni_tp[[x]]), function(y) which(matrix_uni_tp[[x]][[y]]==exp_FC_commonTP[[x]]$matrix)))
#Study size
exp_FC_commonTP_n=lapply(1:length(exp_FC_commonTP),
                            function(x) sapply(1:length(matrix_uni_tp[[x]]), function(y) length(matrix_ind[[x]][[y]])))
#Average FC
exp_FC_commonTP_mean=lapply(1:length(exp_FC_commonTP),
                               function(x) sapply(1:length(matrix_uni_tp[[x]]), function(y) rowMeans(exprs(exp_FC_commonTP[[x]][,matrix_ind[[x]][[y]]]))))
#Find unique pathogen/vaccine types by timepoint
pt_uni_tp=lapply(matrix_uni_tp_metaData,function(x) x$pt[!duplicated(x$pt)])
pt_metaData=lapply(1:length(matrix_uni_tp_metaData),
                   function(x) dplyr::select(
                     matrix_uni_tp_metaData[[x]][!duplicated(matrix_uni_tp_metaData[[x]]$pt),],
                     vaccine_type, pathogen, adjuvant))
#Vaccine size
exp_FC_commonTP_pt_n=lapply(1:length(exp_FC_commonTP),
                               function(x) sapply(1:length(pt_uni_tp[[x]]), function(y) length(which(exp_FC_commonTP[[x]]$pt==pt_uni_tp[[x]][y]))))
#Add weight (1/vaccine sample size) to pData for all samples
n_weight=lapply(1:length(exp_FC_commonTP),
                function(x) sapply(1:ncol(exp_FC_commonTP[[x]]), function(y) exp_FC_commonTP_pt_n[[x]][which(pt_uni_tp[[x]]==exp_FC_commonTP[[x]]$pt[y])]))
for (i in 1:length(exp_FC_commonTP)) {
  exp_FC_commonTP[[i]]$n_weight=1/n_weight[[i]]
}

#Compute mean FC and p values per cluster per vaccine (pt-pathogen/vaccine type)
pt_ind=vector("list",length(exp_FC_commonTP))
pt_FC_mean=vector("list",length(exp_FC_commonTP))
for (i in 1:length(exp_FC_commonTP)) {
  #For each vaccine
  pt_FC_mean[[i]]=matrix(NA, nrow=nrow(exp_FC_commonTP[[i]]), ncol=length(pt_uni_tp[[i]]))
  for (j in 1:length(pt_uni_tp[[i]])) {
    pt_ind[[i]][[j]]=which(pt_uni_tp[[i]][[j]]==matrix_uni_tp_metaData[[i]]$pt)
    #If there is more than 1 study, compute weighted average
    if (length(pt_ind[[i]][[j]])>1) {
      #Average FCs (weighted by n)
      pt_FC_mean[[i]][,j]=rowWeightedMeans(exp_FC_commonTP_mean[[i]][,pt_ind[[i]][[j]]], w=exp_FC_commonTP_n[[i]][pt_ind[[i]][[j]]])
      #Otherwise store single value
    } else {
      pt_FC_mean[[i]][,j]=exp_FC_commonTP_mean[[i]][,pt_ind[[i]][[j]]]
    }
  }
  colnames(pt_FC_mean[[i]])=pt_uni_tp[[i]]
  rownames(pt_FC_mean[[i]])=rownames(exp_FC_commonTP_mean[[i]])
}

#Compute rank-based correlation matrices at each timepoint using gene FC
pt_FC_cor=lapply(1:length(pt_FC_mean), function(x) cor(pt_FC_mean[[x]], method='spearman'))
#Plot correlation matrices
colorLS=colorRampPalette(colors = c("blue", "white", "red"))(n = 100)
breakLS=seq(from = -1, to = 1, length.out = 100)
tp_plot=c(1,3,7)
for (i in 1:length(tp_plot)) {
  df=pt_FC_cor[[which(common_tp==tp_plot[i])]]
  diag(df)=0
  df_lim=1
  p=corrplot(df, order='hclust', col=colorLS, tl.col='black', cl.lim = c(-df_lim, df_lim), #smaller label version
             is.corr = FALSE, tl.cex=1, cl.cex=1, title=paste0('Day ',tp_plot[i]), mar=c(0,0,2,0))
}

Figure S2c-d. Scatterplots of Day 1 gene FCs between vaccines

#Plot example scatterplots for YF vs Pnemococcus (best negative) and Malaria vs HIV (best positive)
pt_plot_pairs=list(c('Yellow Fever','Pneumococcus'), c('Malaria','HIV'))
for (i in 1:length(pt_plot_pairs)) {
  ind1=grep(pt_plot_pairs[[i]][1],pt_uni_tp[[1]])
  ind2=grep(pt_plot_pairs[[i]][2], pt_uni_tp[[1]])
  df=setNames(data.frame(pt_FC_mean[[1]][,c(ind1,ind2)]), c('X','Y'))
  p=ggscatter(df, x='X', y='Y',
              xlab=paste(pt_uni_tp[[1]][ind1],'Day 1 log2 FC'),
              ylab=paste(pt_uni_tp[[1]][ind2],'Day 1 log2 FC'),
              add = "reg.line",
              add.params = list(color = "blue", fill = "lightgray"))
  # Add correlation coefficient, set scales
  p+stat_cor(method = "pearson")+theme(text = element_text(size=20))
  print(p)
}

Figure 4b-c. Activity score heatmaps of modules differentially expressed in response to YF vaccination on day 1 (b, QuSAGE FDR<0.2) and day 7 (c, QuSAGE FDR<0.05, activity score>0.2)

#Collect processed qusage results
qusage_df=tidy_longitudinal_qusage_young_meta_df
#Subset data to timepoints of interest
tp_plot=c(1,7)
qusage_df=qusage_df[qusage_df$time_post_last_vax %in% tp_plot,]
qusage_df$pt=paste(qusage_df$pathogen," (",qusage_df$vaccine_type,")", sep='')
pt_uni=unique(qusage_df$pt)
pt_uni_tp=lapply(tp_plot, function(x) unique(qusage_df$pt[qusage_df$time_post_last_vax==x]))
#Split by timepoint
qusage_df=lapply(tp_plot, function(x) qusage_df[qusage_df$time_post_last_vax==x,])
#Convert to wide format
pt_FC_NES=lapply(qusage_df, function(x) data.frame(spread(x[,c('pt','geneset','activity_score')], pt, activity_score)))
pt_FC_NES=lapply(pt_FC_NES, function(x) {rownames(x)=x$geneset; x[,-1]})
pt_FC_NES=lapply(1:length(pt_FC_NES), function(x) {colnames(pt_FC_NES[[x]])=pt_uni_tp[[x]]; pt_FC_NES[[x]]})
pt_FC_p=lapply(qusage_df, function(x) data.frame(spread(x[,c('pt','geneset','FDR')], pt, FDR)))
pt_FC_p=lapply(pt_FC_p, function(x) {rownames(x)=x$geneset; x[,-1]})
pt_FC_p=lapply(1:length(pt_FC_p), function(x) {colnames(pt_FC_p[[x]])=pt_uni_tp[[x]]; pt_FC_p[[x]]})
#Reorder qusage results to match BTM_groups
pt_FC_NES=lapply(pt_FC_NES, function(x) x[match(BTM_groups$NAME,rownames(x)),])
pt_FC_p=lapply(pt_FC_p, function(x) x[match(BTM_groups$NAME,rownames(x)),])

#Find significant BTMs
ind_sig=vector("list",length(pt_FC_NES))
for (i in 1:length(pt_FC_NES)) {
  ind_sig[[i]]=vector("list",ncol(pt_FC_NES[[i]]))
  names(ind_sig[[i]])=colnames(pt_FC_NES[[i]])
  for (j in 1:ncol(pt_FC_NES[[i]])) {
    if (tp_plot[i]==1) {
      ind=which(pt_FC_p[[i]][,j]<0.2)
      ind_sig[[i]][[j]]=intersect(ind,which(abs(pt_FC_NES[[i]][,j])>0))
    } else if (tp_plot[i]==7)
    ind=which(pt_FC_p[[i]][,j]<0.05)
    ind_sig[[i]][[j]]=intersect(ind,which(abs(pt_FC_NES[[i]][,j])>0.2))
  }
}
#Find all significant BTMs at each timepoint
ind_sig_pertp=lapply(ind_sig, function(x) Reduce(union,x))
#Use YF only to define sig BTMs
ind_sig_pertp=lapply(ind_sig, function(x) unlist(x['Yellow Fever (LV)']))
#Plot heatmap of significant BTMs
for (i in 1:length(pt_FC_NES)) {
  df=pt_FC_NES[[i]][ind_sig_pertp[[i]],]
  #df_anno=pathogen_type_metaData[[i]][,c('vaccine_type', 'pathogen')]
  ph_lim=ceiling(max(abs(df), na.rm = TRUE))
  #ha=HeatmapAnnotation(df=df_anno, col=path_vt_colors)
  p=Heatmap(df,
            #top_annotation=ha,
            #column_labels=pathogen_type_metaData[[i]][,'adjuvant'],
            column_labels=colnames(df),
            column_names_side="top",
            name='Activity Score', col=colorRamp2(c(-ph_lim, 0, ph_lim), c("blue", "white", "red")),
            row_names_gp = gpar(fontsize = 12),
            column_names_gp = gpar(fontsize = 12))
  print(p)
}

Figure 4D. Kinetics of the mean FC of module M75 across vaccines.

#For each study, average expression across all subjects
matrix_uni_tp=lapply(exp_FC_BTM_commonTP,function(x) x$matrix[!duplicated(x$matrix)])
#Store study metadata
matrix_uni_tp_metaData=lapply(exp_FC_BTM_commonTP,function(x) pData(x)[!duplicated(x$matrix),] %>%
                                 rownames_to_column() %>%
                                 dplyr::select(matrix, vaccine, vaccine_type, pathogen, adjuvant, pt) %>%
                                 column_to_rownames(var = "matrix"))
#Get study indices
matrix_ind=lapply(1:length(exp_FC_BTM_commonTP),
                   function(x) lapply(1:length(matrix_uni_tp[[x]]), function(y) which(matrix_uni_tp[[x]][[y]]==exp_FC_BTM_commonTP[[x]]$matrix)))
#Study size
exp_FC_BTM_commonTP_n=lapply(1:length(exp_FC_BTM_commonTP),
                              function(x) sapply(1:length(matrix_uni_tp[[x]]), function(y) length(matrix_ind[[x]][[y]])))
#Average FC
exp_FC_BTM_commonTP_mean=lapply(1:length(exp_FC_BTM_commonTP),
                                 function(x) sapply(1:length(matrix_uni_tp[[x]]), function(y) rowMeans(exprs(exp_FC_BTM_commonTP[[x]][,matrix_ind[[x]][[y]]]))))
#Find unique pathogen/vaccine types by timepoint
pt_uni_tp=lapply(matrix_uni_tp_metaData,function(x) x$pt[!duplicated(x$pt)])
pt_ind=lapply(1:length(pt_uni_tp), function(x) lapply(1:length(pt_uni_tp[[x]]), function(y) which(pt_uni_tp[[x]][[y]]==matrix_uni_tp_metaData[[x]]$pt)))
#Compute average expression for each vaccine across studies (weighted by sample size)
pt_FC_mean=lapply(1:length(exp_FC_BTM_commonTP), function(x) sapply(setNames(1:length(pt_uni_tp[[x]]), pt_uni_tp[[x]]), function(y) if(length(pt_ind[[x]][[y]])>1)
  rowWeightedMeans(exp_FC_BTM_commonTP_mean[[x]][,pt_ind[[x]][[y]]], w=exp_FC_BTM_commonTP_n[[x]][pt_ind[[x]][[y]]], useNames=T) else exp_FC_BTM_commonTP_mean[[x]][,pt_ind[[x]][[y]]]))
#Set BTM of interest list, plot kinetics
BTM_int=c('M75)')
#Lookup BTM
ind=grep(BTM_int, rownames(exp_FC_BTM_commonTP[[1]]), fixed=TRUE)
#Build matrix of avg common gene exp over time per vaccine
BTM_int_FC_mat=array(NA, dim=c(length(unique(eset$pt)),ncol=length(exp_FC_BTM_commonTP)+1),
                      dimnames=list(unique(eset$pt),c(0,common_tp)))
#Set D0 FC to 0
BTM_int_FC_mat[,1]=0
for (j in 1:length(exp_FC_BTM_commonTP)) {
  ind_pt=match(pt_uni_tp[[j]],unique(eset$pt))
  BTM_int_FC_mat[ind_pt,j+1]=pt_FC_mean[[j]][ind,]
}
#Convert matrix to long format
BTM_int_FC_mat=data.frame(BTM_int_FC_mat) %>% tibble::rownames_to_column(var = "Vaccine") %>%
  pivot_longer(-Vaccine, names_to='Timepoint', values_to='FC')
#Convert timepoint back to numeric
BTM_int_FC_mat$Timepoint=as.numeric(str_replace(BTM_int_FC_mat$Timepoint, 'X', ''))
#Plot
p=ggplot(BTM_int_FC_mat[!is.na(BTM_int_FC_mat$FC),], aes(x=Timepoint, y=FC, group=Vaccine)) +
  geom_line(aes(linetype=Vaccine, color=Vaccine), size=1.07)+
  theme_classic()+
  scale_color_manual(values=pt_line_color_anno$color)+
  scale_linetype_manual(values=pt_line_color_anno$linetype)+
  scale_x_continuous(breaks=c(0,common_tp))+
  xlab("Day post-last vaccination")+
  ylab("Mean FC (log2)")+
  ggtitle(rownames(exp_FC_BTM[[1]][ind,]))+
  theme(plot.title = element_text(hjust = 0.5), text = element_text(size=12))
print(p)

Figure 4e. Heatmaps of the post-vaccination FC of genes in module M75

#Get FC data for common timepoints
common_tp=c(1,3,7,14,21)
exp_FC_commonTP=exp_FC[tp_FC %in% common_tp]
#For each study, average expression across all subjects
matrix_uni_tp=lapply(exp_FC_commonTP,function(x) x$matrix[!duplicated(x$matrix)])
#Store study metadata
matrix_uni_tp_metaData=lapply(exp_FC_commonTP,function(x) pData(x)[!duplicated(x$matrix),] %>%
                                rownames_to_column() %>%
                                dplyr::select(matrix, vaccine, vaccine_type, pathogen, adjuvant, pt) %>%
                                column_to_rownames(var = "matrix"))
#Find sample indices per matrix
matrix_ind=lapply(1:length(exp_FC_commonTP),
                  function(x) lapply(1:length(matrix_uni_tp[[x]]), function(y) which(matrix_uni_tp[[x]][[y]]==exp_FC_commonTP[[x]]$matrix)))
#Study size
exp_FC_commonTP_n=lapply(1:length(exp_FC_commonTP),
                            function(x) sapply(1:length(matrix_uni_tp[[x]]), function(y) length(matrix_ind[[x]][[y]])))
#Average FC
exp_FC_commonTP_mean=lapply(1:length(exp_FC_commonTP),
                               function(x) sapply(1:length(matrix_uni_tp[[x]]), function(y) rowMeans(exprs(exp_FC_commonTP[[x]][,matrix_ind[[x]][[y]]]))))
#Find unique pathogen/vaccine types by timepoint
pt_uni_tp=lapply(matrix_uni_tp_metaData,function(x) x$pt[!duplicated(x$pt)])
pt_metaData=lapply(1:length(matrix_uni_tp_metaData),
                   function(x) dplyr::select(
                     matrix_uni_tp_metaData[[x]][!duplicated(matrix_uni_tp_metaData[[x]]$pt),],
                     vaccine_type, pathogen, adjuvant))
#Vaccine size
exp_FC_commonTP_pt_n=lapply(1:length(exp_FC_commonTP),
                               function(x) sapply(1:length(pt_uni_tp[[x]]), function(y) length(which(exp_FC_commonTP[[x]]$pt==pt_uni_tp[[x]][y]))))

#Compute mean FC and p values per cluster per vaccine (pt-pathogen/vaccine type)
pt_ind=vector("list",length(exp_FC_commonTP))
pt_FC_mean=vector("list",length(exp_FC_commonTP))
for (i in 1:length(exp_FC_commonTP)) {
  #For each vaccine
  pt_FC_mean[[i]]=matrix(NA, nrow=nrow(exp_FC_commonTP[[i]]), ncol=length(pt_uni_tp[[i]]))
  for (j in 1:length(pt_uni_tp[[i]])) {
    pt_ind[[i]][[j]]=which(pt_uni_tp[[i]][[j]]==matrix_uni_tp_metaData[[i]]$pt)
    #If there is more than 1 study, compute weighted average
    if (length(pt_ind[[i]][[j]])>1) {
      #Average FCs (weighted by n)
      pt_FC_mean[[i]][,j]=rowWeightedMeans(exp_FC_commonTP_mean[[i]][,pt_ind[[i]][[j]]], w=exp_FC_commonTP_n[[i]][pt_ind[[i]][[j]]])
      #Otherwise store single value
    } else {
      pt_FC_mean[[i]][,j]=exp_FC_commonTP_mean[[i]][,pt_ind[[i]][[j]]]
    }
  }
  colnames(pt_FC_mean[[i]])=pt_uni_tp[[i]]
  rownames(pt_FC_mean[[i]])=rownames(exp_FC_commonTP_mean[[i]])
  rownames(pt_metaData[[i]])=pt_uni_tp[[i]]
}

#Plot BTM genes
BTM_int='M75'
for (i in 1:length(pt_FC_mean)) {
  ind=with(pt_metaData[[i]], order(vaccine_type, adjuvant, pathogen))
  gene_int=btm_list[[grep(BTM_int,names(btm_list))]]
  #Create data frame and annotation
  df=pt_FC_mean[[i]][na.omit(match(gene_int,rownames(eset))),ind]
  df_anno=pt_metaData[[i]][ind, c('vaccine_type', 'pathogen', 'adjuvant')]
  ph_lim=ceiling(max(abs(df), na.rm = TRUE))
  ph_lim=2
  colorLS=colorRampPalette(colors = c("blue", "white", "red"))(n = 100)
  breakLS=seq(from = -ph_lim, to = ph_lim, length.out = 100)
  #Plotting with pheatmap
  p=pheatmap::pheatmap(mat = df,
                  color = colorLS,
                  breaks = breakLS,
                  cluster_cols = FALSE,
                  cluster_rows = FALSE,
                  clustering_distance_rows = "euclidean",
                  clustering_distance_cols = "euclidean",
                  clustering_method = "ward.D2",
                  show_colnames = TRUE,
                  show_rownames = TRUE,
                  annotation_col = df_anno,
                  annotation_colors = adj_path_vt_colors,
                  fontsize = 10,
                  cellheight = 10,
                  cellwidth = 15,
                  main=paste0('Day ',common_tp[i]))
}

Ext. Data Figure 2e. Boxplot of Day 1 FCs in xCell estimated B cell frequencies across vaccines

common_tp=c(0,1,3,7,14,21)
cellFreq_p_cutoff=0.2 #cutoff for discarding cell frequency estimations below a confidence threshold
#Load xCell data
freqDF=read.delim(file.path(extra_data_dir,'xcell_Vax_noResponse.csv'), sep=',', row.names=1)
freqDF_p=read.delim(file.path(extra_data_dir,'pval_xcell_Vax_noResponse.csv'), sep=',', row.names=1)
#Replace nonsignificant cell scores with NA
freqDF[freqDF_p>=cellFreq_p_cutoff]=NA
#Select only subjects with both pData and est freq, create expressionset
common=intersect(colnames(eset), colnames(freqDF))
eset_freq=ExpressionSet(as.matrix(log2(freqDF[,match(common,colnames(freqDF))])),
                        eset@phenoData[match(common,colnames(eset))])
#Recompute timepoint indices after removing samples
ind=lapply(common_tp, function(x) which(eset_freq$time_post_last_vax==x))

#Compute D0 normalized FC
ind_D0=which(0==eset_freq$time_post_last_vax)
common=lapply(2:length(ind),function(x) intersect(eset_freq$participant_id[ind[[x]]],eset_freq$participant_id[ind_D0]))
ia=lapply(2:length(ind),function(x) na.omit(match(common[[x-1]],eset_freq$participant_id[ind[[x]]])))
ib=lapply(2:length(ind),function(x) na.omit(match(common[[x-1]],eset_freq$participant_id[ind_D0])))
exp_FC_freq=lapply(2:length(ind),function(x) eset_freq[,ind[[x]][ia[[x-1]]]])
exp_FC_freq=lapply(2:length(ind),function(x) {exprs(exp_FC_freq[[x-1]])=exprs(exp_FC_freq[[x-1]])-exprs(eset_freq[,ind_D0[ib[[x-1]]]]); exp_FC_freq[[x-1]]})
#For each study, average expression across all subjects
matrix_uni_tp_freq=lapply(exp_FC_freq,function(x) x$matrix[!duplicated(x$matrix)])
#Store study metadata
matrix_uni_tp_metaData_freq=lapply(exp_FC_freq,function(x) pData(x)[!duplicated(x$matrix),] %>%
                                rownames_to_column() %>%
                                dplyr::select(matrix, vaccine, vaccine_type, pathogen, adjuvant, pt) %>%
                                column_to_rownames(var = "matrix"))
#Compute average expression/t-statistic
matrix_ind_freq=lapply(1:length(exp_FC_freq),
                  function(x) lapply(1:length(matrix_uni_tp_freq[[x]]), function(y) which(matrix_uni_tp_freq[[x]][[y]]==exp_FC_freq[[x]]$matrix)))
#Study size
exp_FC_freq_n=lapply(1:length(exp_FC_freq),
                function(x) sapply(1:length(matrix_uni_tp_freq[[x]]), function(y) length(matrix_ind_freq[[x]][[y]])))
#Average FC
exp_FC_freq_mean=lapply(1:length(exp_FC_freq),
                   function(x) sapply(1:length(matrix_uni_tp_freq[[x]]), function(y) rowMeans(exprs(exp_FC_freq[[x]][,matrix_ind_freq[[x]][[y]]]))))
#t-test/p value
exp_FC_freq_p_less=
  lapply(1:length(exp_FC_freq),
         function(x) sapply(1:length(matrix_uni_tp_freq[[x]]),
                            function(y) apply(exprs(exp_FC_freq[[x]][,matrix_ind_freq[[x]][[y]]]), 1,
                                              function(z) if(length(which(!is.na(z)))>2) t.test(z, alternative='less')$p.value else NA)))
exp_FC_freq_p_greater=
  lapply(1:length(exp_FC_freq),
         function(x) sapply(1:length(matrix_uni_tp_freq[[x]]),
                            function(y) apply(exprs(exp_FC_freq[[x]][,matrix_ind_freq[[x]][[y]]]), 1,
                                              function(z) if(length(which(!is.na(z)))>2) t.test(z, alternative='greater')$p.value else NA)))
#Replace NaN p values with 0.9999
exp_FC_freq_p_less=lapply(exp_FC_freq_p_less, function(x) replace(x,is.nan(x),0.9999))
exp_FC_freq_p_greater=lapply(exp_FC_freq_p_greater, function(x) replace(x,is.nan(x),0.9999))
#Replace 1 p values with 0.9999 (for compatibility with sumz)
exp_FC_freq_p_less=lapply(exp_FC_freq_p_less, function(x) replace(x,x>0.9999,0.9999))
exp_FC_freq_p_greater=lapply(exp_FC_freq_p_greater, function(x) replace(x,x>0.9999,0.9999))

##Compute DE cells by pathogen/vaccine type
#Integrate p values across multiple studies by Stouffer's method

#Find unique pathogen/vaccine types by timepoint
pt_uni_tp_freq=lapply(matrix_uni_tp_metaData_freq,function(x) x$pt[!duplicated(x$pt)])
pt_metaData_freq=lapply(1:length(matrix_uni_tp_metaData_freq),
                   function(x) dplyr::select(
                     matrix_uni_tp_metaData_freq[[x]][!duplicated(matrix_uni_tp_metaData_freq[[x]]$pt),],
                     vaccine_type, pathogen, adjuvant))
#Vaccine size
exp_FC_freq_pt_n=lapply(1:length(exp_FC_freq),
                   function(x) sapply(1:length(pt_uni_tp_freq[[x]]), function(y) length(which(exp_FC_freq[[x]]$pt==pt_uni_tp_freq[[x]][y]))))
#Add weight (1/vaccine sample size) to pData for all samples
n_weight_freq=lapply(1:length(exp_FC_freq),
                function(x) sapply(1:ncol(exp_FC_freq[[x]]), function(y) exp_FC_freq_pt_n[[x]][which(pt_uni_tp_freq[[x]]==exp_FC_freq[[x]]$pt[y])]))
for (i in 1:length(exp_FC_freq)) {
  exp_FC_freq[[i]]$n_weight_freq=1/n_weight_freq[[i]]
}


pt_ind_freq=vector("list",length(exp_FC_freq))
pt_FC_mean_freq=vector("list",length(exp_FC_freq))
pt_p_freq=vector("list",length(exp_FC_freq))
pt_q_freq=vector("list",length(exp_FC_freq))
for (i in 1:length(exp_FC_freq)) {
  #For each pathogen/type combo
  pt_FC_mean_freq[[i]]=matrix(NA, nrow=nrow(eset_freq), ncol=length(pt_uni_tp_freq[[i]]))
  pt_p_freq[[i]]=matrix(NA, nrow=nrow(eset_freq), ncol=length(pt_uni_tp_freq[[i]]))
  pt_q_freq[[i]]=matrix(NA, nrow=nrow(eset_freq), ncol=length(pt_uni_tp_freq[[i]]))
  for (j in 1:length(pt_uni_tp_freq[[i]])) {
    pt_ind_freq[[i]][[j]]=which(pt_uni_tp_freq[[i]][[j]]==matrix_uni_tp_metaData_freq[[i]]$pt)
    #If there is more than 1 study, integrate p values by Stouffer's method
    if (length(pt_ind_freq[[i]][[j]])>1) {
      #Average FCs (weighted by n)
      pt_FC_mean_freq[[i]][,j]=rowWeightedMeans(exp_FC_freq_mean[[i]][,pt_ind_freq[[i]][[j]]], w=exp_FC_freq_n[[i]][pt_ind_freq[[i]][[j]]], na.rm=T)
      #Directional Stouffer's method using sumz function (weighted by sqrt(n))
      pt_p_freq_less=vapply(1:nrow(eset_freq), FUN.VALUE=1, function(x)
        sumz(exp_FC_freq_p_less[[i]][x,pt_ind_freq[[i]][[j]]],sqrt(exp_FC_freq_n[[i]][pt_ind_freq[[i]][[j]]]), na.action=na.omit)$p)
      pt_p_freq_greater=vapply(1:nrow(eset_freq), FUN.VALUE=1, function(x)
        sumz(exp_FC_freq_p_greater[[i]][x,pt_ind_freq[[i]][[j]]],sqrt(exp_FC_freq_n[[i]][pt_ind_freq[[i]][[j]]]), na.action=na.omit)$p)
      pt_p_freq[[i]][,j]=2*pmin(pt_p_freq_less,pt_p_freq_greater)

      #Otherwise store single p values
    } else {
      pt_FC_mean_freq[[i]][,j]=exp_FC_freq_mean[[i]][,pt_ind_freq[[i]][[j]]]
      pt_p_freq[[i]][,j]=2*pmin(exp_FC_freq_p_less[[i]][,pt_ind_freq[[i]][[j]]],exp_FC_freq_p_greater[[i]][,pt_ind_freq[[i]][[j]]])
    }
    #Correct p values >1
    pt_p_freq[[i]][(pt_p_freq[[i]][,j]>1),j]=1
    #Convert p values to q values (FDR correction)
    pt_q_freq[[i]][,j]=p.adjust(pt_p_freq[[i]][,j],method="BH") #BH method
  }
  colnames(pt_FC_mean_freq[[i]])=pt_uni_tp_freq[[i]]
  rownames(pt_FC_mean_freq[[i]])=rownames(fData(eset_freq))
  colnames(pt_p_freq[[i]])=pt_uni_tp_freq[[i]]
  rownames(pt_p_freq[[i]])=rownames(fData(eset_freq))
  colnames(pt_q_freq[[i]])=pt_uni_tp_freq[[i]]
  rownames(pt_q_freq[[i]])=rownames(fData(eset_freq))
  rownames(pt_metaData_freq[[i]])=pt_uni_tp_freq[[i]]
}

#Plot boxplot for total B cells on D1
cells_plot=c('B-cells')
for (i in 1:length(cells_plot)) {
  ind=match(cells_plot[i], rownames(eset_freq))
  df=data.frame(value=exprs(exp_FC_freq[[1]])[ind,], vaccine=exp_FC_freq[[1]]$pt)
  p=ggplot(df, aes(x=vaccine, y=value, fill=vaccine)) +
    geom_boxplot(outlier.shape=NA,width=0.5,notch=FALSE, alpha=0.3)+
    geom_jitter(pch=21, width=0.1)+
    ylab('log2(FC)')+
    xlab('')+
    ggtitle(paste0('Day 1 ',cells_plot[i]))+
    theme_bw()+
    theme(axis.text.x = element_text(angle = 90, vjust = 0.5, hjust=1), plot.title = element_text(hjust = 0.5), legend.position = "none")
  print(p)
}

Ext. Data Figure 2f-g. Kinetics of the mean FC of modules M47.0 and M75 across Yellow Fever vaccine studies.

#Keep only YF studies from FC data
exp_FC_BTM_YF=lapply(exp_FC_BTM, function(x){y=x[,x$pathogen=='Yellow Fever']; y})
#Add vaccine+study column to pData
exp_FC_BTM_YF=lapply(exp_FC_BTM_YF, function(x){x$pt_sdy=paste(x$pt, x$study_accession); x})
#Keep only up to day 14
exp_FC_BTM_YF=exp_FC_BTM_YF[which(tp_FC %in% c(1,3,7,14))]


#For each matrix, average expression across all subjects
matrix_uni_tp_YF=lapply(exp_FC_BTM_YF,function(x) x$matrix[!duplicated(x$matrix)])
#Store matrix metadata
matrix_uni_tp_YF_metaData=lapply(exp_FC_BTM_YF,function(x) pData(x)[!duplicated(x$matrix),] %>%
                                   rownames_to_column() %>%
                                   dplyr::select(matrix, vaccine, vaccine_type, pathogen, adjuvant, pt_sdy) %>%
                                   column_to_rownames(var = "matrix"))
#Find matrix indices
matrix_ind_YF=lapply(1:length(exp_FC_BTM_YF),
                     function(x) lapply(1:length(matrix_uni_tp_YF[[x]]), function(y) which(matrix_uni_tp_YF[[x]][[y]]==exp_FC_BTM_YF[[x]]$matrix)))
#Matrix size
exp_FC_BTM_YF_n=lapply(1:length(exp_FC_BTM_YF),
                       function(x) sapply(1:length(matrix_uni_tp_YF[[x]]), function(y) length(matrix_ind_YF[[x]][[y]])))
#Average FC
exp_FC_BTM_YF_mean=lapply(1:length(exp_FC_BTM_YF),
                          function(x) sapply(1:length(matrix_uni_tp_YF[[x]]), function(y) rowMeans(exprs(exp_FC_BTM_YF[[x]][,matrix_ind_YF[[x]][[y]]]))))

#Find unique pathogen/vaccine types by timepoint
pt_uni_tp_YF=lapply(matrix_uni_tp_YF_metaData,function(x) x$pt_sdy[!duplicated(x$pt_sdy)])
pt_metaData_YF=lapply(1:length(matrix_uni_tp_YF_metaData),
                      function(x) dplyr::select(
                        matrix_uni_tp_YF_metaData[[x]][!duplicated(matrix_uni_tp_YF_metaData[[x]]$pt_sdy),],
                        vaccine_type, pathogen, adjuvant))
#Vaccine size
exp_FC_BTM_YF_pt_n=lapply(1:length(exp_FC_BTM_YF),
                          function(x) sapply(1:length(pt_uni_tp_YF[[x]]), function(y) length(which(exp_FC_BTM_YF[[x]]$pt_sdy==pt_uni_tp_YF[[x]][y]))))
#Add weight (1/vaccine sample size) to pData for all samples
n_weight=lapply(1:length(exp_FC_BTM_YF),
                function(x) sapply(1:ncol(exp_FC_BTM_YF[[x]]), function(y) exp_FC_BTM_YF_pt_n[[x]][which(pt_uni_tp_YF[[x]]==exp_FC_BTM_YF[[x]]$pt_sdy[y])]))
for (i in 1:length(exp_FC_BTM_YF)) {
  exp_FC_BTM_YF[[i]]$n_weight=1/n_weight[[i]]
}


pt_ind_YF=vector("list",length(exp_FC_BTM_YF))
pt_FC_mean_BTM_YF=vector("list",length(exp_FC_BTM_YF))
for (i in 1:length(exp_FC_BTM_YF)) {
  #For each pathogen/type combo
  pt_FC_mean_BTM_YF[[i]]=matrix(NA, nrow=nrow(exp_FC_BTM_YF[[1]]), ncol=length(pt_uni_tp_YF[[i]]))
  for (j in 1:length(pt_uni_tp_YF[[i]])) {
    pt_ind_YF[[i]][[j]]=which(pt_uni_tp_YF[[i]][[j]]==matrix_uni_tp_YF_metaData[[i]]$pt_sdy)
    #If there is more than 1 study
    if (length(pt_ind_YF[[i]][[j]])>1) {
      #Average FCs (weighted by n)
      pt_FC_mean_BTM_YF[[i]][,j]=rowWeightedMeans(exp_FC_BTM_YF_mean[[i]][,pt_ind_YF[[i]][[j]]], w=exp_FC_BTM_YF_n[[i]][pt_ind_YF[[i]][[j]]])
      #Otherwise store single p values
    } else {
      pt_FC_mean_BTM_YF[[i]][,j]=exp_FC_BTM_YF_mean[[i]][,pt_ind_YF[[i]][[j]]]
    }
  }
  colnames(pt_FC_mean_BTM_YF[[i]])=pt_uni_tp_YF[[i]]
  rownames(pt_FC_mean_BTM_YF[[i]])=rownames(rownames(exp_FC_BTM_YF[[1]]))
  rownames(pt_metaData_YF[[i]])=pt_uni_tp_YF[[i]]
}

#Set BTM of interest list, plot kinetics
BTM_int=c('M47.0)', 'M75)')
for (i in 1:length(BTM_int)) {
  #Lookup BTM
  ind=grep(BTM_int[i], names(btm_list), fixed=TRUE)
  #Build matrix of avg common gene exp over time per vaccine
  BTM_int_FC_mat=array(NA, dim=c(length(unique(exp_FC_BTM_YF[[2]]$pt_sdy)),ncol=length(exp_FC_BTM_YF)+1),
                       dimnames=list(unique(exp_FC_BTM_YF[[2]]$pt_sdy),c(0,1,3,7,14)))
  #Set D0 FC to 0
  BTM_int_FC_mat[,1]=0
  #Lookup values for other timepoints
  for (j in 1:length(exp_FC_BTM_YF)) {
    ind_pt=match(pt_uni_tp_YF[[j]],unique(exp_FC_BTM_YF[[2]]$pt_sdy))
    BTM_int_FC_mat[ind_pt,j+1]=pt_FC_mean_BTM_YF[[j]][ind,]
  }
  #Plot line graph
  #Convert matrix to long format
  BTM_int_FC_mat=data.frame(BTM_int_FC_mat) %>% tibble::rownames_to_column(var = "Study") %>%
    pivot_longer(-Study, names_to='Timepoint', values_to='FC')
  #Convert timepoint back to numeric
  BTM_int_FC_mat$Timepoint=as.numeric(str_replace(BTM_int_FC_mat$Timepoint, 'X', ''))
  #Add vaccine column
  BTM_int_FC_mat$Vaccine=gsub(' SDY.*','',BTM_int_FC_mat$Study)
  #Fix study labels
  BTM_int_FC_mat$Study=str_replace_all(BTM_int_FC_mat$Study, 'Yellow Fever \\(Live attenuated\\) ', '')
  #Plot
  p=ggplot(BTM_int_FC_mat, aes(x=Timepoint, y=FC, group=Study)) +
    geom_line(aes(color=Study), size=1.07)+
    theme_classic()+
    scale_x_continuous(breaks=c(0,1,3,7,14))+
    xlab("Day post-last vaccination")+
    ylab("Mean FC (log2)")+
    ggtitle(names(btm_list[ind]))+
    theme(plot.title = element_text(hjust = 0.5), text = element_text(size=20))
  print(p)
}

Extended Data Figure 3: Impact of pre-existing antibody levels on transcriptional responses to influenza vaccination

Prepare data

basetiter_high_cut=0.7 #Baseline titer quantile cutoffs
basetiter_low_cut=1-basetiter_high_cut

#Geometric mean function
gm_mean = function(x, na.rm=TRUE, zero.propagate = FALSE){
  if(any(x < 0, na.rm = TRUE)){
    return(NaN)
  }
  if(zero.propagate){
    if(any(x == 0, na.rm = TRUE)){
      return(0)
    }
    exp(mean(log(x), na.rm = na.rm))
  } else {
    exp(sum(log(x[x > 0]), na.rm=na.rm) / length(x))
  }
}  
#Load BTMs/groups
BTM_list=readRDS(file = file.path(data_dir, "btm_list_2020_12_23.rds"))
#Load BTM groups
load(file.path(extra_data_dir,'BTM_groups.rda'))
#Match names with list
BTM_groups=BTM_groups[match(sub('.*\\(S','S',sub('.*\\(M','M',names(BTM_list))),sub('.*\\(S','S',sub('.*\\(M','M',BTM_groups$NAME))),]
names(BTM_list)=BTM_groups$NAME
#Set BTM group as factor and manually set order
BTM_groups$SUBGROUP=factor(BTM_groups$SUBGROUP,
                           levels=c('ANTIGEN PRESENTATION', 'INFLAMMATORY/TLR/CHEMOKINES','INTERFERON/ANTIVIRAL SENSING',
                                    'MONOCYTES', 'DC ACTIVATION', 'NEUTROPHILS', 'SIGNAL TRANSDUCTION', 'ECM AND MIGRATION',
                                    'ENERGY METABOLISM', 'CELL CYCLE', 'NK CELLS', 'T CELLS', 'B CELLS', 'PLASMA CELLS'))
#Sort BTM list by subgroup
BTM_list=BTM_list[order(BTM_groups$SUBGROUP)]
BTM_groups=BTM_groups[order(BTM_groups$SUBGROUP),]
rownames(BTM_groups)=BTM_groups$NAME
#Set BTM colors
cols=colorRampPalette(brewer.pal(length(levels(BTM_groups$SUBGROUP)), "Set2"))
mycolors_BTM=setNames(cols(length(levels(BTM_groups$SUBGROUP))), levels(BTM_groups$SUBGROUP))

#Load datasets
eset=readRDS(file.path(data_dir, paste0(date_prefix, "young_noNorm_eset.rds")))
eset_norm=readRDS(file.path(data_dir, paste0(date_prefix, "young_norm_eset.rds")))
#Clean data
#Create combined vaccine type/pathogen column
eset$pt=paste(eset$pathogen," (",eset$vaccine_type,")", sep='')
eset_norm$pt=paste(eset_norm$pathogen," (",eset_norm$vaccine_type,")", sep='')
#Subset to flu only
eset=eset[,eset$pt=='Influenza (Inactivated)']
eset_norm=eset_norm[,eset_norm$pt=='Influenza (Inactivated)']
#Set SDY1276 cohorts as separate SDY for processing
eset$study_accession[eset$arm_accession=='ARM4431']='SDY1276_V'
eset$study_accession[eset$arm_accession=='ARM4428']='SDY1276_D'
eset_norm$study_accession[eset_norm$arm_accession=='ARM4431']='SDY1276_V'
eset_norm$study_accession[eset_norm$arm_accession=='ARM4428']='SDY1276_D'
#Set SDY180 groups as the same matrix
eset$matrix[eset$matrix=='SDY180_WholeBlood_Grp2Fluzone_Geo']='SDY180_WholeBlood_Grp1Fluzone_Geo'
eset_norm$matrix[eset_norm$matrix=='SDY180_WholeBlood_Grp2Fluzone_Geo']='SDY180_WholeBlood_Grp1Fluzone_Geo'

#Load titer data
immdata=readRDS(file.path(data_dir, paste0(date_prefix, "immdata_all.rds")))
#Convert day -7 to 0 for easy matching
immdata=lapply(immdata, function(x) {x$study_time_collected[x$study_time_collected==-7]=0; x})
#Set SDY1276 cohorts as separate
immdata=lapply(immdata, function(x) {x$study_accession[x$arm_accession=='ARM4431']='SDY1276_V'; x})
immdata=lapply(immdata, function(x) {x$study_accession[x$arm_accession=='ARM4428']='SDY1276_D'; x})
#Retain only D0 titers
immdata=lapply(immdata, function(x) x[x$study_time_collected==0])
#For each study, ID Ab assay and assign geom mean baseline titer/group for each subject
sdy_uni=unique(eset$study_accession)
ab_d0_gmt_all=data.frame()
for (i in 1:length(sdy_uni)) {
  #Check for hai titers, then nAb, then elisa
  if (sdy_uni[i] %in% immdata$hai$study_accession) {
    assay_mode='hai'
  } else if (sdy_uni[i] %in% immdata$neut_ab_titer$study_accession) {
    assay_mode='neut_ab_titer'
  } else if (sdy_uni[i] %in% immdata$elisa$study_accession) {
    assay_mode='elisa'
  } else {
    assay_mode=NA
  }
  if (is.na(assay_mode)==F) {
    #Get titers
    ab_data=immdata[[assay_mode]][immdata[[assay_mode]]$study_accession==sdy_uni[i]] %>%
      dplyr::select(participant_id, virus, value_preferred)
    #For each subject, get GMT
    ab_d0_gmt=ab_data %>% group_by(participant_id) %>% summarise(d0_gmt=gm_mean(as.numeric(value_preferred)))
    #Define baseline titer groups
    ab_d0_gmt$basetiter_group=NA
    ab_d0_gmt$basetiter_group[ab_d0_gmt$d0_gmt>=quantile(ab_d0_gmt$d0_gmt,basetiter_high_cut)]='High'
    ab_d0_gmt$basetiter_group[ab_d0_gmt$d0_gmt<quantile(ab_d0_gmt$d0_gmt,basetiter_low_cut)]='Low'
    #Add to full list
    ab_d0_gmt_all=rbind(ab_d0_gmt_all,ab_d0_gmt)
  }
}
#Add to pData
pData(eset)=left_join(pData(eset),ab_d0_gmt_all)
pData(eset_norm)=left_join(pData(eset_norm),ab_d0_gmt_all)
colnames(eset)=eset$uid
colnames(eset_norm)=eset_norm$uid
#Remove intermediate subjects/subjects without titer data
eset=eset[,!is.na(eset$basetiter_group)]
eset_norm=eset_norm[,!is.na(eset_norm$basetiter_group)]

#Find samples from timepoints of interest
tp_int=c(0,1,3,7,14)
ind=lapply(tp_int, function(x) which(eset$study_time_collected==x))
#Combine indices of all timepoints of interest
ind_all=Reduce(union,ind)
#Retain only samples from timepoints of interest
eset=eset[,ind_all]
eset_norm=eset_norm[,ind_all]

#Recompute timepoint indices after removing samples
ind=lapply(tp_int, function(x) which(eset$study_time_collected==x))

#Remove samples from a single study with fewer than sc high and low baseline titer subjects at any timepoint
sc=3
matrix_uni_tp=lapply(ind,function(x) unique(eset[,x]$matrix))
matrix_ind=lapply(1:length(ind),function(x)
  lapply(1:length(matrix_uni_tp[[x]]), function(y) which(matrix_uni_tp[[x]][[y]]==eset[,ind[[x]]]$matrix)))
matrix_H_ind=lapply(1:length(ind), function(x)
  lapply(1:length(matrix_uni_tp[[x]]), function(y) which(pData(eset[,ind[[x]][matrix_ind[[x]][[y]]]])['basetiter_group']=='High')))
matrix_L_ind=lapply(1:length(ind), function(x)
  lapply(1:length(matrix_uni_tp[[x]]), function(y) which(pData(eset[,ind[[x]][matrix_ind[[x]][[y]]]])['basetiter_group']=='Low')))
ind_cut_all=vector()
for (i in 1:length(matrix_ind)) {
  ind_cut_H=which(sapply(matrix_H_ind[[i]],length)<sc)
  ind_cut_L=which(sapply(matrix_L_ind[[i]],length)<sc)
  ind_cut=union(ind_cut_H,ind_cut_L)
  if (is_empty(ind_cut)==FALSE) {
    ind_cut_all=c(ind_cut_all,ind[[i]][unlist(matrix_ind[[i]][ind_cut])])
  }
}
if (is_empty(ind_cut_all)==FALSE) {
  eset=eset[,-ind_cut_all]
  eset_norm=eset_norm[,-ind_cut_all]
}
#Recompute timepoint indices after removing samples
tp_int=unique(eset$study_time_collected[which(eset$study_time_collected>=0)])
ind=lapply(tp_int, function(x) which(eset$study_time_collected==x))

#Remove genes with NA
eset=eset[complete.cases(exprs(eset)),]
eset_norm=eset_norm[complete.cases(exprs(eset_norm)),]

#Collapse to BTM level
#Remove BTMs which have no matching genes in dataset
BTM_groups=BTM_groups[!sapply(BTM_list, function(x) is_empty(intersect(x,rownames(eset)))),]
BTM_list=BTM_list[!sapply(BTM_list, function(x) is_empty(intersect(x,rownames(eset))))]
#Collapse - find arithmetic mean of genes in each BTM for each sample
exp_BTM=do.call(rbind, lapply(BTM_list, function(x) colMeans(exprs(eset[na.omit(match(x,rownames(eset))),]),na.rm=TRUE)))
#Create BTM eset
eset=ExpressionSet(exp_BTM, eset@phenoData)
rm(exp_BTM)
exp_BTM=do.call(rbind, lapply(BTM_list, function(x) colMeans(exprs(eset_norm[na.omit(match(x,rownames(eset_norm))),]),na.rm=TRUE)))
eset_norm=ExpressionSet(exp_BTM, eset_norm@phenoData)
rm(exp_BTM)

#Compute D0 normalized FC
ind_D0=which(0==eset$study_time_collected)
common=lapply(2:length(ind),function(x) intersect(eset$participant_id[ind[[x]]],eset$participant_id[ind_D0]))
ia=lapply(2:length(ind),function(x) na.omit(match(common[[x-1]],eset$participant_id[ind[[x]]])))
ib=lapply(2:length(ind),function(x) na.omit(match(common[[x-1]],eset$participant_id[ind_D0])))
exp_FC=lapply(2:length(ind),function(x) eset[,ind[[x]][ia[[x-1]]]])
exp_FC=lapply(2:length(ind),function(x) {exprs(exp_FC[[x-1]])=exprs(exp_FC[[x-1]])-exprs(eset[,ind_D0[ib[[x-1]]]]); exp_FC[[x-1]]})
exp_FC_norm=lapply(2:length(ind),function(x) eset_norm[,ind[[x]][ia[[x-1]]]])
exp_FC_norm=lapply(2:length(ind),function(x) {exprs(exp_FC_norm[[x-1]])=exprs(exp_FC_norm[[x-1]])-exprs(eset_norm[,ind_D0[ib[[x-1]]]]); exp_FC_norm[[x-1]]})

Ext. Data Figure 3a-b. Differentially expressed modules at Day 1 (a) and Day 7 (b) (FDR < 0.05, t-test between mean fold changes) between participants with high and low baseline antibody titers (negative score indicates increased expression in the low baseline antibody group).

#For each study, average expression across all subjects
matrix_uni_tp=lapply(exp_FC,function(x) x$matrix[!duplicated(x$matrix)])
#Store study metadata
matrix_uni_tp_metaData=lapply(exp_FC,function(x) pData(x)[!duplicated(x$matrix),] %>%
                                rownames_to_column() %>%
                                dplyr::select(matrix, vaccine, vaccine_type, pathogen, adjuvant, pt) %>%
                                column_to_rownames(var = "matrix"))
#Indices for each study
matrix_ind=lapply(1:length(exp_FC),
                  function(x) lapply(1:length(matrix_uni_tp[[x]]), function(y) which(matrix_uni_tp[[x]][[y]]==exp_FC[[x]]$matrix)))
#High/low baseline titer group within each study
matrix_H_ind=lapply(1:length(exp_FC),
                    function(x) lapply(1:length(matrix_uni_tp[[x]]),
                                       function(y) which(pData(exp_FC[[x]][,matrix_ind[[x]][[y]]])['basetiter_group']=='High')))
matrix_L_ind=lapply(1:length(exp_FC),
                    function(x) lapply(1:length(matrix_uni_tp[[x]]),
                                       function(y) which(pData(exp_FC[[x]][,matrix_ind[[x]][[y]]])['basetiter_group']=='Low')))
#Study size
exp_FC_n=lapply(1:length(exp_FC),
                function(x) sapply(1:length(matrix_uni_tp[[x]]), function(y) length(matrix_ind[[x]][[y]])))

#T-test and mean FC between high/low baseline titer group
#If there are less than sc high/low titer subjects in a given study, return NA for all t-tests
exp_FC_mean=
  lapply(1:length(exp_FC),
         function(x) sapply(1:length(matrix_uni_tp[[x]]),
                            function(y) if(length(matrix_H_ind[[x]][[y]])>=sc & length(matrix_L_ind[[x]][[y]])>=sc) rowMeans(exprs(exp_FC[[x]][,matrix_ind[[x]][[y]][matrix_H_ind[[x]][[y]]]]))-rowMeans(exprs(exp_FC[[x]][,matrix_ind[[x]][[y]][matrix_L_ind[[x]][[y]]]])) else rep(NA, nrow(exp_FC[[x]]))))
exp_FC_t_stat=
  lapply(1:length(exp_FC),
         function(x) sapply(1:length(matrix_uni_tp[[x]]),
                            function(y) if(length(matrix_H_ind[[x]][[y]])>=sc & length(matrix_L_ind[[x]][[y]])>=sc) apply(exprs(exp_FC[[x]][,matrix_ind[[x]][[y]]]), 1, function(z) t.test(z[matrix_H_ind[[x]][[y]]], z[matrix_L_ind[[x]][[y]]])$statistic) else rep(NA, nrow(exp_FC[[x]]))))


exp_FC_p_less=
  lapply(1:length(exp_FC),
         function(x) sapply(1:length(matrix_uni_tp[[x]]),
                            function(y) if(length(matrix_H_ind[[x]][[y]])>=sc & length(matrix_L_ind[[x]][[y]])>=sc) apply(exprs(exp_FC[[x]][,matrix_ind[[x]][[y]]]), 1,
                                                                                                                          function(z) t.test(z[matrix_H_ind[[x]][[y]]], z[matrix_L_ind[[x]][[y]]], alternative='less')$p.value) else rep(NA, nrow(exp_FC[[x]]))))
exp_FC_p_greater=
  lapply(1:length(exp_FC),
         function(x) sapply(1:length(matrix_uni_tp[[x]]),
                            function(y) if(length(matrix_H_ind[[x]][[y]])>=sc & length(matrix_L_ind[[x]][[y]])>=sc) apply(exprs(exp_FC[[x]][,matrix_ind[[x]][[y]]]), 1,
                                                                                                                          function(z) t.test(z[matrix_H_ind[[x]][[y]]], z[matrix_L_ind[[x]][[y]]], alternative='greater')$p.value) else rep(NA, nrow(exp_FC[[x]]))))
exp_FC_mean=
  lapply(1:length(exp_FC),
         function(x) sapply(1:length(matrix_uni_tp[[x]]),
                            function(y) if(length(matrix_H_ind[[x]][[y]])>=sc & length(matrix_L_ind[[x]][[y]])>=sc) rowMeans(exprs(exp_FC[[x]][,matrix_ind[[x]][[y]][matrix_H_ind[[x]][[y]]]]))-rowMeans(exprs(exp_FC[[x]][,matrix_ind[[x]][[y]][matrix_L_ind[[x]][[y]]]])) else rep(NA, nrow(exp_FC[[x]]))))

#Replace NaN p values with 0.9999
exp_FC_p_less=lapply(exp_FC_p_less, function(x) replace(x,is.nan(x),0.9999))
exp_FC_p_greater=lapply(exp_FC_p_greater, function(x) replace(x,is.nan(x),0.9999))
#Replace 1 p values with 0.9999 (for compatibility with sumz)
exp_FC_p_less=lapply(exp_FC_p_less, function(x) replace(x,x>0.9999,0.9999))
exp_FC_p_greater=lapply(exp_FC_p_greater, function(x) replace(x,x>0.9999,0.9999))

##Compute DE features by pathogen/vaccine type (only inactivated influenza here)
#Integrate p values across multiple studies by Stouffer's method

#Find unique pathogen/vaccine types by timepoint
pt_uni_tp=lapply(matrix_uni_tp_metaData,function(x) x$pt[!duplicated(x$pt)])
pt_metaData=lapply(1:length(matrix_uni_tp_metaData),
                              function(x) dplyr::select(
                                matrix_uni_tp_metaData[[x]][!duplicated(matrix_uni_tp_metaData[[x]]$pt),],
                                vaccine_type, pathogen, adjuvant))

pt_ind=vector("list",length(exp_FC))
pt_FC_mean=vector("list",length(exp_FC))
pt_p=vector("list",length(exp_FC))
pt_q=vector("list",length(exp_FC))
pt_up_down_logic=vector("list",length(exp_FC))
pt_DEG_up=vector("list",length(exp_FC))
pt_DEG_down=vector("list",length(exp_FC))
pt_DEG_total_num=vector("list",length(exp_FC))
pt_DEG_up_overlap=vector("list",length(exp_FC))
pt_DEG_down_overlap=vector("list",length(exp_FC))
for (i in 1:length(exp_FC)) {
  #For each pathogen/type combo (only inactivated influenza here)
  pt_FC_mean[[i]]=matrix(NA, nrow=nrow(eset), ncol=length(pt_uni_tp[[i]]))
  pt_p[[i]]=matrix(NA, nrow=nrow(eset), ncol=length(pt_uni_tp[[i]]))
  pt_q[[i]]=matrix(NA, nrow=nrow(eset), ncol=length(pt_uni_tp[[i]]))
  pt_up_down_logic[[i]]=matrix(NA, nrow=nrow(eset), ncol=length(pt_uni_tp[[i]]))
  pt_ind[[i]]=vector("list",length(pt_uni_tp[[i]]))
  pt_DEG_up[[i]]=vector("list",length(pt_uni_tp[[i]]))
  pt_DEG_down[[i]]=vector("list",length(pt_uni_tp[[i]]))
  pt_DEG_total_num[[i]]=matrix(NA, nrow=1, ncol=length(pt_uni_tp[[i]]))
  for (j in 1:length(pt_uni_tp[[i]])) {
    pt_ind[[i]][[j]]=which(pt_uni_tp[[i]][[j]]==matrix_uni_tp_metaData[[i]]$pt)
    #If there is more than 1 study with values, integrate p values by Stouffer's method
    if (length(pt_ind[[i]][[j]])>1) {
      #Average FCs (weighted by n)
      pt_FC_mean[[i]][,j]=rowWeightedMeans(exp_FC_mean[[i]][,pt_ind[[i]][[j]]], w=exp_FC_n[[i]][pt_ind[[i]][[j]]])
      #Directional Stouffer's method using sumz function (weighted by sqrt(n))
      pt_p_less=vapply(1:nrow(eset), FUN.VALUE=1, function(x)
        sumz(exp_FC_p_less[[i]][x,pt_ind[[i]][[j]]],sqrt(exp_FC_n[[i]][pt_ind[[i]][[j]]]), na.action=na.omit)$p)
      pt_p_greater=vapply(1:nrow(eset), FUN.VALUE=1, function(x)
        sumz(exp_FC_p_greater[[i]][x,pt_ind[[i]][[j]]],sqrt(exp_FC_n[[i]][pt_ind[[i]][[j]]]), na.action=na.omit)$p)
      pt_up_down_logic[[i]][,j]=pt_p_greater<pt_p_less #logical variable, true if upregulation p value is smallest
      pt_p[[i]][,j]=2*pmin(pt_p_less,pt_p_greater)

      #Otherwise store single p values
    } else {
      pt_FC_mean[[i]][,j]=exp_FC_mean[[i]][,pt_ind[[i]][[j]]]
      pt_p[[i]][,j]=2*pmin(exp_FC_p_less[[i]][,pt_ind[[i]][[j]]],exp_FC_p_greater[[i]][,pt_ind[[i]][[j]]])
      pt_up_down_logic[[i]][,j]=exp_FC_p_greater[[i]][,pt_ind[[i]][[j]]]<exp_FC_p_less[[i]][,pt_ind[[i]][[j]]]
    }
    #Correct p values >1
    pt_p[[i]][(pt_p[[i]][,j]>1),j]=1
    #Convert p values to q values (FDR correction)
    pt_q[[i]][,j]=p.adjust(pt_p[[i]][,j],method="BH") #BH method
    #Find DE features
    ind_up=which(pt_up_down_logic[[i]][,j]==TRUE)
    ind_down=which(pt_up_down_logic[[i]][,j]==FALSE)
    ind_sig=which(pt_q[[i]][,j]<FDR_THRESHOLD)
    pt_DEG_up[[i]][[j]]=intersect(ind_up,ind_sig)
    pt_DEG_down[[i]][[j]]=intersect(ind_down,ind_sig)
    pt_DEG_total_num[[i]][j]=length(ind_sig)
  }
  colnames(pt_FC_mean[[i]])=pt_uni_tp[[i]]
  rownames(pt_FC_mean[[i]])=rownames(fData(eset))
  colnames(pt_p[[i]])=pt_uni_tp[[i]]
  colnames(pt_q[[i]])=pt_uni_tp[[i]]
  rownames(pt_metaData[[i]])=pt_uni_tp[[i]]
}

pt_t_stat=
  lapply(1:length(exp_FC),
         function(x) sapply(1:length(pt_uni_tp[[x]]),
                            function(y) if(length(pt_ind[[x]][[y]])>1) rowWeightedMeans(exp_FC_t_stat[[x]][,pt_ind[[x]][[y]]], w=exp_FC_n[[x]][pt_ind[[x]][[y]]], na.rm=TRUE) else exp_FC_t_stat[[x]][,pt_ind[[x]][[y]]]))

#Plot DE BTMs as bar chart
tp_plot=c(1,7)
for (i in 1:length(tp_plot)) {
  tp_ind=which(tp_int==tp_plot[i])-1
  DE_feat=c(c(pt_DEG_up[[tp_ind]][[1]],pt_DEG_down[[tp_ind]][[1]]))
  df=data.frame(pathway=names(BTM_list[DE_feat]), t=pt_t_stat[[tp_ind]][DE_feat,1])
  #Order
  df=df[order(-df$t),]
  #Add subgroup
  df$subgroup=BTM_groups$SUBGROUP[match(df$pathway,BTM_groups$NAME)]
  df$pathway=factor(df$pathway, levels=rev(as.character(df$pathway)))
  p=ggplot(df,aes(x=pathway,y=t, fill=subgroup))+
      geom_bar(stat="identity",width=0.5)+
      scale_fill_manual(values=mycolors_BTM, na.value='grey')+
      coord_flip()+
      xlab("BTM")+
      ylab("t-statistic")+
      theme_minimal()+
      theme(text = element_text(size=12))
  print(p)
}

Ext. Data Figure 3c-d. Boxplots of (c) IFN signature module M75 expression at Day 1 and (d) plasma cell module M156.1 expression at Day 7 between high and low baseline antibody groups at Day 1. Center line, median; box limits, upper and lower quartiles; whiskers, 1.5x interquartile range.

#For BTMs of interest, plot FC boxplots between baseline Ab groups
BTM_plot=c('M75', 'M156.1')
tp_plot=c(1,7)
for (i in 1:length(BTM_plot)) {
  tp_ind=which(tp_int==tp_plot[i])-1
  df=data.frame(Exp=exprs(exp_FC_norm[[tp_ind]])[match(BTM_plot[i],BTM_groups$BTM),],
                Study=exp_FC_norm[[tp_ind]]$study_accession, Group=exp_FC_norm[[tp_ind]]$basetiter_group)
  #Plot
  p=ggplot(df, aes(x=Study, y=Exp, fill=Group)) +
      geom_boxplot(outlier.shape=NA,position=position_dodge(0.7),width=0.5,notch=FALSE)+
      geom_point(pch = 21, position = position_jitterdodge(jitter.width = 0.1))+
      stat_compare_means(method = "t.test", hide.ns=TRUE, label='p.signif')+
      ylab('log2 FC')+
      ggtitle(paste0(BTM_groups$NAME[match(BTM_plot[i],BTM_groups$BTM)],' Day ',tp_plot[i]))+
      theme_classic()+
      theme(plot.title = element_text(hjust = 0.5))
  print(p)
}

Ext. Data Figure 3e-f. Line graphs of (e) M75 and (f) M156.1 expression across time in high and low baseline antibody groups. Error bars represent standard error of the mean.

#Create merged FC eset
exp_FC_all=do.call(Biobase::combine,exp_FC_norm)
#Add 0 FC for all subjects at Day 0
exp_FC_d0=eset_norm[,eset_norm$time_post_last_vax==0]
exprs(exp_FC_d0)=matrix(0,nrow(exp_FC_d0),ncol(exp_FC_d0))
exp_FC_all=Biobase::combine(exp_FC_d0,exp_FC_all)

#Plot BTM FC kinetics between high/low groups
BTM_plot=c('M75', 'M156.1')
for (i in 1:length(BTM_plot)) {
  df=data.frame(Exp=exprs(exp_FC_all)[match(BTM_plot[i],BTM_groups$BTM),],
                Timepoint=exp_FC_all$time_post_last_vax, Group=exp_FC_all$basetiter_group)
  #Plot
  p=ggline(df, x='Timepoint', y='Exp', color='Group', add='mean_se') +
      stat_compare_means(aes(group=Group), method = "t.test", hide.ns=TRUE, label='p.signif', label.y=0.8)+
      ggtitle(BTM_groups$NAME[match(BTM_plot[i],BTM_groups$BTM)])+
      theme(plot.title = element_text(hjust = 0.5))
  print(p)
}

Figure 5: Time-adjusted transcriptional predictors of antibody responses

Load packages/functions

require(Biobase)
require(tidyverse)
require(GSVA)
require(reshape2)
require(data.table)
require(limma)
require(caret)
require(caretEnsemble)
require(WeightedROC)
require(nlme)
require(emmeans)
require(plotROC)

LDAprojection<-function (xs = xs, ys = ys)
{
  M = as.matrix(xs[[1]])
  for (i in 2:length(xs)) {
    M = cbind(M, xs[[i]])
  }
  extractWT <- function(vec, y = ys) {
    x = matrix(vec, nrow = length(y))
    x = as.data.frame(x)
    if (ncol(x) == 2) {
      d1 = diff(c(by(unlist(x[, 1]), y, mean)))
      d2 = diff(c(by(unlist(x[, 2]), y, mean)))
      if (d1 == 0 | d2 == 0) {
        rs = matrix(0.5, nrow = ncol(x))
      }
      else {
        rs = MASS::lda(x, y)$scaling
      }
    }
    else {
      rs = MASS::lda(x, y)$scaling
    }
    return(rs[, 1])
  }
  wt = t(apply(M, 1, extractWT, ys))
  return(wt)
}

multiplyFunction<-function (xlist, par)
{
  tmp = xlist[[1]] * par[, 1]
  for (i in 2:ncol(par)) {
    tmp = tmp + xlist[[i]] * par[, i]
  }
  x.return = matrix(unlist(tmp), nrow = nrow(par))
}

Prepare data for model training/prediction 1. Calculate ssGSEA scores for each BTM, followed by fold change calculation of ssGSEA scores

exp_FC_wAb_path <- file.path(extra_data_dir, "FCH_eset_wAbList.BTM.rda")
eset_ssGSEA_wAb_path <- file.path(extra_data_dir, "eset_wAbList.BTM.rda")

eset_wAb <- readRDS(file.path(data_dir, paste0(date_prefix, "young_norm_withResponse_eset.rds")))

#Merge pre and post-vax samples to same matrix for SDY1529
eset_wAb$matrix[eset_wAb$study_accession=='SDY1529']='SDY1529_WholeBlood_HealthyAdults'
#Abbreviate vaccine types
eset_wAb$vaccine_type_full=eset_wAb$vaccine_type
eset_wAb$vaccine_type=str_replace_all(eset_wAb$vaccine_type,vaccine_abrev)

# Code below adds row-names equal to uid (uid is columnID for the Expression Data)
rownames(pData(eset_wAb))<-pData(eset_wAb)$uid
# ensuring order of pheno-data matchs expression data.
pData(eset_wAb)<-pData(eset_wAb)[colnames(Biobase::exprs(eset_wAb)),]

#Create column with combined vaccine type/pathogen
eset_wAb$pt=paste0(eset_wAb$pathogen,"_",eset_wAb$vaccine_type)

# Time points to calculate
tp_int=c(0, 3, 7, 14, 21)

#Remove subjects with Inf/NA MFC measurements
eset_wAb=eset_wAb[,!is.na(eset_wAb$MFC)]
eset_wAb=eset_wAb[,!(eset_wAb$MFC == Inf)]

#For subjects with D-7 but no D0 measurement, convert to D0 for FC calculation
sub_noD0=setdiff(eset_wAb[,which(eset_wAb$study_time_collected==-7)]$participant_id,eset_wAb[,which(eset_wAb$study_time_collected==0)]$participant_id)
eset_wAb$study_time_collected[intersect(which(eset_wAb$study_time_collected==-7),which(eset_wAb$participant_id %in% sub_noD0))]=0

#Find samples from timepoints of interest
ind=lapply(tp_int, function(x) which(eset_wAb$study_time_collected==x))
#Retain only samples from timepoints of interest
eset_wAb=eset_wAb[,Reduce(union,ind)]
#Recompute timepoint indices after removing extraneous timepoints
ind=lapply(tp_int, function(x) which(eset_wAb$study_time_collected==x))

#Remove samples from a single study with fewer than sample_cutoff samples at any timepoint
matrix_uni_tp=lapply(ind,function(x) unique(eset_wAb[,x]$matrix))
matrix_ind=lapply(1:length(ind),function(x)
  lapply(1:length(matrix_uni_tp[[x]]), function(y) which(matrix_uni_tp[[x]][[y]]==eset_wAb[,ind[[x]]]$matrix)))
ind_cut_all=vector()
for (i in 1:length(matrix_ind)) {
  ind_cut=which(sapply(matrix_ind[[i]],length)<sample_cutoff)
  if (is_empty(ind_cut)==FALSE) {
    for (j in 1:length(ind_cut)) {
      ind_cut_all=c(ind_cut_all,ind[[i]][matrix_ind[[i]][[ind_cut[j]]]])
    }
  }
}
if (is_empty(ind_cut_all)==FALSE) {
  eset_wAb=eset_wAb[,-ind_cut_all]
}

#Recompute timepoint indices after removing samples
tp_int=unique(eset_wAb$study_time_collected[which(eset_wAb$study_time_collected>=0)])
ind=lapply(tp_int, function(x) which(eset_wAb$study_time_collected==x))

PD<-pData(eset_wAb)

eset_wAb=eset_wAb[complete.cases(Biobase::exprs(eset_wAb)),]

#Convert to ssGSEA scores and compute FC, unless cache already exists
if (file.exists(eset_ssGSEA_wAb_path)) {
  load(exp_FC_wAb_path)
  load(eset_ssGSEA_wAb_path)
} else {

  BTM_list <- readRDS(file = file.path(data_dir, "btm_list_2020_12_23.rds"))

  # Gene names
  GeneNames<-featureNames(eset_wAb)

  #Remove BTMs which have no matching genes in dataset
  BTM_list=BTM_list[!sapply(BTM_list, function(x){is_empty(intersect(x, GeneNames))})]

  eset_wAb_BTM<-gsva(expr=eset_wAb, ## Gene-expression data, rows genes, cols samples
                 BTM_list, ## PTW list
                 verbose=TRUE,
                 method="ssgsea")

  #Replace eset_wAb with BTM eset_wAb
  eset_wAb=eset_wAb_BTM
  rm(eset_wAb_BTM)

  #Compute D0 normalized FC - for individual genes/BTMs - exp_FC_wAb has 3 large matrix with D1, D3, D7 timepoints
  ind_D0=which(0==eset_wAb$study_time_collected)
  common=lapply(2:length(ind),function(x) intersect(eset_wAb$participant_id[ind[[x]]],eset_wAb$participant_id[ind_D0]))
  #get participant id with timepoint and D0 data
  ia=lapply(2:length(ind),function(x) na.omit(match(common[[x-1]],eset_wAb$participant_id[ind[[x]]])))
  #included participant IDs match with participant IDs at diff timepoints in eset_wAb
  ib=lapply(2:length(ind),function(x) na.omit(match(common[[x-1]],eset_wAb$participant_id[ind_D0])))
  #participant IDs match with D0 to get indices in ind_D0
  exp_FC_wAb=lapply(2:length(ind),function(x) eset_wAb[,ind[[x]][ia[[x-1]]]])  #get expression data of the 3 timepoints

  exp_FC_wAb=lapply(2:length(ind),
                function(x) {Biobase::exprs(exp_FC_wAb[[x-1]])=Biobase::exprs(exp_FC_wAb[[x-1]])-Biobase::exprs(eset_wAb[,ind_D0[ib[[x-1]]]])
                exp_FC_wAb[[x-1]]
                }) #compute FC

  names(exp_FC_wAb)<-paste0("Day", tp_int[-1])

  save(eset_wAb, file=eset_ssGSEA_wAb_path)
  save(exp_FC_wAb, file=exp_FC_wAb_path)
}
  1. Wrangle fold change data to machine learning format
lapply(names(exp_FC_wAb), function(DAY){

eset_wAb<-exp_FC_wAb[[DAY]]

PD<-pData(eset_wAb)

# remove patients with duplicate time points
PD$SubjTime<-paste0(PD$participant_id,"_",PD$study_time_collected)
tab<-table(PD$SubjTime)

patientsDuplic<-names(tab[tab!=1])

## Keep only patients with 0, 1 and 3 days
numbrero<-as.numeric(gsub("Day", "", DAY))

PD.sub<-subset(PD, study_time_collected%in%c(numbrero))

PD.use<-PD.sub
PD.use<-droplevels(PD.use)

UID<-PD.use$uid

## Prepare expression data
EXP<-Biobase::exprs(eset_wAb)

## Figure out a good filter
SDVec<-apply(EXP, 1, sd)

GENES<-names(SDVec[SDVec>=quantile(SDVec,0.75)])

## I will keep genes with an SD>= the median SD

## Need to wrangle this into ML format: Gene_Time by PTID
EXP.melt<-reshape2::melt(EXP)
colnames(EXP.melt)[2]="uid"

EXP.PD<-inner_join(x=EXP.melt[EXP.melt$uid%in%PD.use$uid,],
                   y=PD.use[,c("uid","participant_id","study_time_collected","time_post_last_vax",
                               "age_imputed",'study_accession',
                               "gender","pathogen","vaccine_type","maxRBA_p30","MFC_p30","MFC","maxRBA")],
                   by="uid")

EXP.PD$Gene_Time<-paste0(EXP.PD$Var1, "_",EXP.PD$study_time_collected)
EXP.PD$pt<-paste0(EXP.PD$pathogen, "_", EXP.PD$vaccine_type)

data.ML<-as.data.frame(dcast.data.table(as.data.table(EXP.PD),
                                        formula=Var1+Gene_Time+study_time_collected~participant_id,
                                        value.var = "value"))

rownames(data.ML)<-gsub(paste0("_",numbrero,"$"),
                        paste0("_D",numbrero,"FCH"),
                        data.ML$Gene_Time)

data.ML.filter<-data.ML[data.ML$Var1%in%GENES,-c(1:3)]

save(data.ML.filter, file=paste0(extra_data_dir,"/ML.Data.", paste0("D",numbrero,"FCH."),"BTM.SDFilter75.rda"))

data.ML<-data.ML[,-c(1:3)]

save(data.ML, file=paste0(extra_data_dir,"/ML.Data.", paste0("D",numbrero,"FCH."),"BTM.NoFilter.rda"))

PatientInfo<-EXP.PD[!duplicated(EXP.PD$participant_id),
                    c('age_imputed','gender','pt','study_accession',
                      'maxRBA_p30','MFC_p30','participant_id','maxRBA','MFC')]

rownames(PatientInfo)<-PatientInfo$participant_id
PatientInfo<-as.data.frame(PatientInfo)

save(PatientInfo, file=paste0(extra_data_dir,
                               "/PD.Master.",
                              paste0("D", numbrero, "FCH."), "BTM.rda"))

})

Ext. Data Figure 4a

#Subset to day 0
eset_wAb_D0=eset_wAb[,eset_wAb$time_post_last_vax==0]
#Subset to within 7 days of day 28 for ab response, then set to day 28
eset_wAb_D0=eset_wAb_D0[,eset_wAb_D0$ImmResp_postVax_timepoint_MFC>=21 & eset_wAb_D0$ImmResp_postVax_timepoint_MFC<=35]
eset_wAb_D0$ImmResp_postVax_timepoint_MFC=28
#Set baseline timepoint to 0
eset_wAb_D0$ImmResp_baseline_timepoint_MFC=0
#For each vaccine, only use one ab type (most common)
pt_uni=unique(eset_wAb_D0$pt)
for (i in 1:length(pt_uni)) {
  most_ab_type=tail(names(sort(table(eset_wAb_D0$assay[eset_wAb_D0$pt==pt_uni[i]]))), 1)
  ind_rem=which(eset_wAb_D0$pt==pt_uni[i] & eset_wAb_D0$assay!=most_ab_type)
  if (is_empty(ind_rem)==FALSE) {
    eset_wAb_D0=eset_wAb_D0[,-ind_rem]
  }
}

#Create dataframe from pdata
df_pre=data.frame(eset_wAb_D0$pt, eset_wAb_D0$ImmResp_baseline_timepoint_MFC, eset_wAb_D0$ImmResp_baseline_value_MFC)
colnames(df_pre)=c('Vaccine','Timepoint', 'Ab_titer')
df_post=data.frame(eset_wAb_D0$pt, eset_wAb_D0$ImmResp_postVax_timepoint_MFC, eset_wAb_D0$ImmResp_postVax_value_MFC)
colnames(df_post)=c('Vaccine','Timepoint', 'Ab_titer')
df=rbind(df_pre,df_post)
df$Timepoint=as.character(df$Timepoint)
#Replace <0 with 0
df$Ab_titer[df$Ab_titer<0]=0

#Plot
p=ggplot(df, aes(x=Timepoint, y=Ab_titer, fill=Timepoint)) +
  geom_boxplot(outlier.shape=NA,width=0.5,notch=FALSE, alpha=0.3)+
  geom_jitter(pch=21, width=0.1)+
  ylab('Antibody Response - log2(Ab Titer)')+
  xlab('Day post-vaccination')+
  theme_bw()+
  theme(text=element_text(size=16),legend.position="none", plot.margin = margin(3,3,3,3, "cm"))
p + facet_wrap(~Vaccine, scales = 'free')

Figure 5a. AUC bar plot of antibody response prediction performance per dataset for the elastic-net classifier trained on inactivated influenza datasets only. Data are presented as mean values ± 95% confidence intervals (CIs). n = 2000 bootstrap replicates.

load(paste0(extra_data_dir,"/ML.Data.D7FCH.BTM.SDFilter75.rda"))
load(paste0(extra_data_dir,"/PD.Master.D7FCH.BTM.rda"))

PatientInfo<-subset(PatientInfo, MFC_p30%in%c("lowResponder","highResponder"))
PatientInfo<-subset(PatientInfo, pt=="Influenza_IN")
PatientInfo<-droplevels(PatientInfo)
PatientInfo$MFC_p30<-factor(PatientInfo$MFC_p30, levels = c("highResponder", "lowResponder"))

MOD.LIST<-lapply(unique(PatientInfo$study_accession), function(SDY){

  PatientInfo.Temp<-filter(PatientInfo, study_accession!=SDY)

  DevID<-rownames(PatientInfo.Temp)
  DevID<-intersect(DevID, colnames(data.ML.filter))
  X.Dev<-data.ML.filter[,DevID]
  rownames(X.Dev)<-strsplit2(rownames(X.Dev), "_")[,1]

  Y.Dev<-PatientInfo[DevID, 'MFC_p30']
  names(Y.Dev)<-DevID

  X.Dev <- t(X.Dev)

  SEED<-1987

  set.seed(SEED)
  ctrl=trainControl(method = "cv",
                    number = 10,
                    #repeats = 1,
                    selectionFunction = 'best',
                    returnResamp = "final",
                    classProbs = TRUE,
                    returnData = F,
                    verboseIter = F,
                    summaryFunction = twoClassSummary,
                    savePredictions = "final")


  set.seed(SEED)
  model_list <- caretList(
    x=X.Dev,
    y=Y.Dev[rownames(X.Dev)],
    trControl=ctrl,
    methodList=c("glmnet"),
    metric="ROC",
    tuneLength=5,
    maximize=T,
    preProc = c("center", "scale"))

  return(model_list)})

names(MOD.LIST)<-unique(PatientInfo$study_accession)

### Get 10-CV preds ###

db.preds.test<-do.call("rbind.data.frame", lapply(names(MOD.LIST),
                                                  function(SDY){

                                                    MOD.TEMP <- MOD.LIST[[SDY]]

                                                    PatientInfo.Temp<-subset(PatientInfo, study_accession==SDY)
                                                    PatientInfo.Temp<-droplevels(PatientInfo.Temp)
                                                    PatientInfo.Temp$MFC_p30<-factor(PatientInfo.Temp$MFC_p30, levels = c("highResponder", "lowResponder"))

                                                    DevID<-rownames(PatientInfo.Temp)
                                                    DevID<-intersect(DevID, colnames(data.ML.filter))

                                                    X.Dev<-data.ML.filter[, DevID]
                                                    rownames(X.Dev)<-strsplit2(rownames(X.Dev), "_")[,1]

                                                    Y.Dev<-PatientInfo.Temp[DevID, 'MFC_p30']
                                                    names(Y.Dev)<-DevID

                                                    model_preds <- lapply(MOD.TEMP, predict, newdata=t(X.Dev), type="prob")
                                                    model_preds <- lapply(model_preds, function(x) x[,"highResponder"])
                                                    model_preds <- data.frame(model_preds)

                                                    Preds.Test<-mutate(model_preds,
                                                                       pred=ifelse(glmnet>=0.5, "highResponder", "lowResponder"),
                                                                       PTID=colnames(X.Dev),
                                                                       highResponder=glmnet,
                                                                       obs=as.character(Y.Dev[colnames(X.Dev)]))

                                                    CONF.MAT<-caret::confusionMatrix(data=factor(Preds.Test$pred, levels = c("highResponder", "lowResponder")),
                                                                                     reference=factor(Preds.Test$obs,
                                                                                                      levels = c("highResponder", "lowResponder")))

                                                    ROC.OBJ<-pROC::roc(response = Preds.Test$obs,
                                                                       levels=c("lowResponder", "highResponder"),
                                                                       direction="<",
                                                                       predictor = Preds.Test$highResponder)

                                                    AUC.CI<-as.vector(pROC::ci.auc(ROC.OBJ,
                                                                                   method="bootstrap", boot.n=2000))
                                                    names(AUC.CI)<-c("AUC.CI.LOW", "AUC", "AUC.CI.HIGH")

                                                    N = length(Preds.Test$PTID)

                                                    VEC=t(as.data.frame(c(CONF.MAT$overall, CONF.MAT$byClass, AUC.CI, n_vac=N))) %>%
                                                      as.data.frame() %>%
                                                      mutate(SDY_OUT=SDY, SET="TEST")
                                                    rownames(VEC)<-NULL
                                                    return(VEC)
                                                  }))

p<-ggplot(db.preds.test,
          aes(x=SDY_OUT,
              y=as.numeric(AUC)))+
  geom_bar(stat='identity', width = 0.5)+
  geom_errorbar(aes(ymin = AUC.CI.LOW, ymax = AUC.CI.HIGH), width = 0.25)+
  labs(y="AUC (CI)", x=NULL)+
  theme_bw()+
  geom_hline(yintercept = 0.5, linetype="dashed")+
  theme(axis.text.x = element_text(angle = 45, hjust = 1, size=8))+
  geom_label(aes(y=0.05, label=n_vac), show.legend = F, color="black", size=2)+
  ylim(0,1)

print(p)

Figure 5b. Heat map of high-versus-low antibody responder difference across vaccines of modules differentially expressed (FDR < 0.05) between high and low antibody responders to inactivated influenza vaccination.

#Load GE
eset=readRDS(file.path(data_dir, paste0(date_prefix, "young_noNorm_withResponse_eset.rds")))
colnames(eset)=eset$uid
#Merge pre and post-vax samples to same matrix for SDY1529
eset$matrix[eset$study_accession=='SDY1529']='SDY1529_WholeBlood_HealthyAdults'
#Abbreviate vaccine type
eset$vaccine_type_full=eset$vaccine_type
eset$vaccine_type=str_replace_all(eset$vaccine_type,vaccine_abrev)

responder_col='MFC_p30'

#Create combined vaccine type/pathogen column
eset$pt=paste(eset$pathogen," (",eset$vaccine_type,")", sep='')
#Label columns by uid
colnames(eset)=eset$uid

#Find samples from timepoints of interest
tp_int=c(0,1,3,7,14,21)
#tp_int=unique(eset$study_time_collected[which(eset$study_time_collected>=0)]) #Alternate: use all timepoints (>=0)
ind=lapply(tp_int, function(x) which(eset$study_time_collected==x))
#Combine indices of all timepoints of interest
ind_all=Reduce(union,ind)
#Retain only samples from timepoints of interest
eset=eset[,ind_all]
#Recompute timepoint indices after removing extraneous timepoints
ind=lapply(tp_int, function(x) which(eset$study_time_collected==x))
#Remove samples from a single study with fewer than sample_cutoff HR and LR at any timepoint
sample_cutoff=2
matrix_uni_tp=lapply(ind,function(x) unique(eset[,x]$matrix))
matrix_ind=lapply(1:length(ind),function(x)
  lapply(1:length(matrix_uni_tp[[x]]), function(y) which(matrix_uni_tp[[x]][[y]]==eset[,ind[[x]]]$matrix)))
matrix_HR_ind=lapply(1:length(ind), function(x)
  lapply(1:length(matrix_uni_tp[[x]]), function(y) which(pData(eset[,ind[[x]][matrix_ind[[x]][[y]]]])[responder_col]=='highResponder')))
matrix_LR_ind=lapply(1:length(ind), function(x)
  lapply(1:length(matrix_uni_tp[[x]]), function(y) which(pData(eset[,ind[[x]][matrix_ind[[x]][[y]]]])[responder_col]=='lowResponder')))
ind_cut_all=vector()
for (i in 1:length(matrix_ind)) {
  ind_cut_HR=which(sapply(matrix_HR_ind[[i]],length)<sample_cutoff)
  ind_cut_LR=which(sapply(matrix_LR_ind[[i]],length)<sample_cutoff)
  ind_cut=union(ind_cut_HR,ind_cut_LR)
  if (is_empty(ind_cut)==FALSE) {
    ind_cut_all=c(ind_cut_all,ind[[i]][unlist(matrix_ind[[i]][ind_cut])])
  }
}
if (is_empty(ind_cut_all)==FALSE) {
  eset=eset[,-ind_cut_all]
}
#Recompute timepoint indices after removing samples
tp_int=unique(eset$study_time_collected[which(eset$study_time_collected>=0)])
ind=lapply(tp_int, function(x) which(eset$study_time_collected==x))

#Remove genes with NA
eset=eset[complete.cases(exprs(eset)),]

#Create combined SDY/pathogen/vaccine type column
eset$SDY_pt=paste(eset$study,eset$pt)
#Create unique list of studies
matrix_uni=unique(eset$matrix)

#Collapse to BTM level
#Remove BTMs which have no matching genes in dataset
btm_list=btm_list[!sapply(btm_list, function(x) is_empty(intersect(x,rownames(eset))))]
#Collapse - find arithmetic mean of genes in each BTM for each sample
exp_BTM=do.call(rbind, lapply(btm_list, function(x) colMeans(exprs(eset[na.omit(match(x,rownames(eset))),]),na.rm=TRUE)))
#Create BTM eset (maybe not proper approach but it works)
eset_BTM=eset[1:nrow(exp_BTM),]
rownames(eset_BTM)=rownames(exp_BTM)
exprs(eset_BTM)=exp_BTM
#Replace eset with BTM eset
eset=eset_BTM
rm(eset_BTM)

#Compute D0 normalized FC
ind_D0=which(0==eset$study_time_collected)
common=lapply(2:length(ind),function(x) intersect(eset$participant_id[ind[[x]]],eset$participant_id[ind_D0]))
ia=lapply(2:length(ind),function(x) na.omit(match(common[[x-1]],eset$participant_id[ind[[x]]])))
ib=lapply(2:length(ind),function(x) na.omit(match(common[[x-1]],eset$participant_id[ind_D0])))
exp_FC=lapply(2:length(ind),function(x) eset[,ind[[x]][ia[[x-1]]]])
exp_FC=lapply(2:length(ind),function(x) {exprs(exp_FC[[x-1]])=exprs(exp_FC[[x-1]])-exprs(eset[,ind_D0[ib[[x-1]]]]); exp_FC[[x-1]]})
#For each study, average expression across all subjects
matrix_uni_tp=lapply(exp_FC,function(x) x$matrix[!duplicated(x$matrix)])
#Store study metadata
matrix_uni_tp_metaData=lapply(exp_FC,function(x) pData(x)[!duplicated(x$matrix),] %>%
                                rownames_to_column() %>%
                                dplyr::select(matrix, vaccine, vaccine_type, pathogen, adjuvant, pt) %>%
                                column_to_rownames(var = "matrix"))
#Indices for each study
matrix_ind=lapply(1:length(exp_FC),
                  function(x) lapply(1:length(matrix_uni_tp[[x]]), function(y) which(matrix_uni_tp[[x]][[y]]==exp_FC[[x]]$matrix)))
#High responder indices within each study
matrix_HR_ind=lapply(1:length(exp_FC),
                     function(x) lapply(1:length(matrix_uni_tp[[x]]),
                                        function(y) which(pData(exp_FC[[x]][,matrix_ind[[x]][[y]]])[responder_col]=='highResponder')))
matrix_LR_ind=lapply(1:length(exp_FC),
                     function(x) lapply(1:length(matrix_uni_tp[[x]]),
                                        function(y) which(pData(exp_FC[[x]][,matrix_ind[[x]][[y]]])[responder_col]=='lowResponder')))
#Study size
exp_FC_n=lapply(1:length(exp_FC),
                function(x) sapply(1:length(matrix_uni_tp[[x]]), function(y) length(matrix_ind[[x]][[y]])))
#Average FC
exp_FC_mean=lapply(1:length(exp_FC),
                   function(x) sapply(1:length(matrix_uni_tp[[x]]), function(y) rowMeans(exprs(exp_FC[[x]][,matrix_ind[[x]][[y]]]))))
#T-test and mean FC between high/low responder
#If there are less than 2 HR/LR in a given study, return NA for all t-tests
exp_FC_p_less=
  lapply(1:length(exp_FC),
         function(x) sapply(1:length(matrix_uni_tp[[x]]),
                            function(y) if(length(matrix_HR_ind[[x]][[y]])>=2 & length(matrix_LR_ind[[x]][[y]])>=2) apply(exprs(exp_FC[[x]][,matrix_ind[[x]][[y]]]), 1,
                                                                                                                          function(z) t.test(z[matrix_HR_ind[[x]][[y]]], z[matrix_LR_ind[[x]][[y]]], alternative='less')$p.value) else rep(NA, nrow(exp_FC[[x]]))))
exp_FC_p_greater=
  lapply(1:length(exp_FC),
         function(x) sapply(1:length(matrix_uni_tp[[x]]),
                            function(y) if(length(matrix_HR_ind[[x]][[y]])>=2 & length(matrix_LR_ind[[x]][[y]])>=2) apply(exprs(exp_FC[[x]][,matrix_ind[[x]][[y]]]), 1,
                                                                                                                          function(z) t.test(z[matrix_HR_ind[[x]][[y]]], z[matrix_LR_ind[[x]][[y]]], alternative='greater')$p.value) else rep(NA, nrow(exp_FC[[x]]))))
exp_FC_mean=
  lapply(1:length(exp_FC),
         function(x) sapply(1:length(matrix_uni_tp[[x]]),
                            function(y) if(length(matrix_HR_ind[[x]][[y]])>=2 & length(matrix_LR_ind[[x]][[y]])>=2) rowMeans(exprs(exp_FC[[x]][,matrix_ind[[x]][[y]][matrix_HR_ind[[x]][[y]]]]))-rowMeans(exprs(exp_FC[[x]][,matrix_ind[[x]][[y]][matrix_LR_ind[[x]][[y]]]])) else rep(NA, nrow(exp_FC[[x]]))))
exp_FC_t_stat=
  lapply(1:length(exp_FC),
         function(x) sapply(1:length(matrix_uni_tp[[x]]),
                            function(y) if(length(matrix_HR_ind[[x]][[y]])>=2 & length(matrix_LR_ind[[x]][[y]])>=2) apply(exprs(exp_FC[[x]][,matrix_ind[[x]][[y]]]), 1, function(z) t.test(z[matrix_HR_ind[[x]][[y]]], z[matrix_LR_ind[[x]][[y]]])$statistic) else rep(NA, nrow(exp_FC[[x]]))))

#Replace NaN p values with 0.9999
exp_FC_p_less=lapply(exp_FC_p_less, function(x) replace(x,is.nan(x),0.9999))
exp_FC_p_greater=lapply(exp_FC_p_greater, function(x) replace(x,is.nan(x),0.9999))
#Replace 1 p values with 0.9999 (for compatibility with sumz)
exp_FC_p_less=lapply(exp_FC_p_less, function(x) replace(x,x>0.9999,0.9999))
exp_FC_p_greater=lapply(exp_FC_p_greater, function(x) replace(x,x>0.9999,0.9999))

#Merge mean FC, correlation/t-test test statistics by vaccine (weighted average by study size)

#Find unique pathogen/vaccine types by timepoint
pt_uni_tp=lapply(matrix_uni_tp_metaData,function(x) x$pt[!duplicated(x$pt)])
pt_metaData=lapply(1:length(matrix_uni_tp_metaData),
                   function(x) dplyr::select(
                     matrix_uni_tp_metaData[[x]][!duplicated(matrix_uni_tp_metaData[[x]]$pt),],
                     vaccine_type, pathogen, adjuvant))
#Sample indices per vaccine
pt_ind=lapply(1:length(exp_FC),
              function(x) sapply(1:length(pt_uni_tp[[x]]),
                                 function(y) which(pt_uni_tp[[x]][[y]]==matrix_uni_tp_metaData[[x]]$pt)))
pt_meanFC=
  lapply(1:length(exp_FC),
         function(x) sapply(1:length(pt_uni_tp[[x]]),
                            function(y) if(length(pt_ind[[x]][[y]])>1) rowWeightedMeans(exp_FC_mean[[x]][,pt_ind[[x]][[y]]], w=exp_FC_n[[x]][pt_ind[[x]][[y]]], na.rm=TRUE) else exp_FC_mean[[x]][,pt_ind[[x]][[y]]]))
pt_t_stat=
  lapply(1:length(exp_FC),
         function(x) sapply(1:length(pt_uni_tp[[x]]),
                            function(y) if(length(pt_ind[[x]][[y]])>1) rowWeightedMeans(exp_FC_t_stat[[x]][,pt_ind[[x]][[y]]], w=exp_FC_n[[x]][pt_ind[[x]][[y]]], na.rm=TRUE) else exp_FC_t_stat[[x]][,pt_ind[[x]][[y]]]))

##Compute DEGs by pathogen/vaccine type
#Integrate p values across multiple studies by Stouffer's method

pt_p=vector("list",length(exp_FC))
pt_q=vector("list",length(exp_FC))
pt_up_down_logic=vector("list",length(exp_FC))
pt_DEG_up=vector("list",length(exp_FC))
pt_DEG_down=vector("list",length(exp_FC))
pt_DEG_total_num=vector("list",length(exp_FC))
for (i in 1:length(exp_FC)) {
  #For each pathogen/type combo
  pt_p[[i]]=matrix(NA, nrow=nrow(eset), ncol=length(pt_uni_tp[[i]]))
  pt_q[[i]]=matrix(NA, nrow=nrow(eset), ncol=length(pt_uni_tp[[i]]))
  pt_up_down_logic[[i]]=matrix(NA, nrow=nrow(eset), ncol=length(pt_uni_tp[[i]]))
  pt_DEG_up[[i]]=vector("list",length(pt_uni_tp[[i]]))
  pt_DEG_down[[i]]=vector("list",length(pt_uni_tp[[i]]))
  pt_DEG_total_num[[i]]=matrix(NA, nrow=1, ncol=length(pt_uni_tp[[i]]))
  for (j in 1:length(pt_uni_tp[[i]])) {
    #If there is more than 1 study with values, integrate p values by Stouffer's method
    if (length(pt_ind[[i]][[j]])>1) {
      #Directional Stouffer's method using sumz function (weighted by sqrt(n))
      #t-test
      pt_p_less=vapply(1:nrow(eset), FUN.VALUE=1, function(x)
        sumz(exp_FC_p_less[[i]][x,pt_ind[[i]][[j]]],sqrt(exp_FC_n[[i]][pt_ind[[i]][[j]]]), na.action=na.omit)$p)
      pt_p_greater=vapply(1:nrow(eset), FUN.VALUE=1, function(x)
        sumz(exp_FC_p_greater[[i]][x,pt_ind[[i]][[j]]],sqrt(exp_FC_n[[i]][pt_ind[[i]][[j]]]), na.action=na.omit)$p)
      pt_up_down_logic[[i]][,j]=pt_p_greater<pt_p_less #logical variable, true if upregulation p value is smallest
      pt_p[[i]][,j]=2*pmin(pt_p_less,pt_p_greater)
      #Otherwise store single p values
    } else {
      #t-test
      pt_p[[i]][,j]=2*pmin(exp_FC_p_less[[i]][,pt_ind[[i]][[j]]],exp_FC_p_greater[[i]][,pt_ind[[i]][[j]]])
      pt_up_down_logic[[i]][,j]=exp_FC_p_greater[[i]][,pt_ind[[i]][[j]]]<exp_FC_p_less[[i]][,pt_ind[[i]][[j]]]
    }
    #Correct p values >1
    pt_p[[i]][(pt_p[[i]][,j]>1),j]=1
    #Convert p values to q values (FDR correction)
    pt_q[[i]][,j]=p.adjust(pt_p[[i]][,j],method="BH") #BH method
    #Find DE BTMs
    #t-test
    ind_up=which(pt_up_down_logic[[i]][,j]==TRUE)
    ind_down=which(pt_up_down_logic[[i]][,j]==FALSE)
    ind_sig=which(pt_q[[i]][,j]<FDR_THRESHOLD)
    pt_DEG_up[[i]][[j]]=intersect(ind_up,ind_sig)
    pt_DEG_down[[i]][[j]]=intersect(ind_down,ind_sig)
    pt_DEG_total_num[[i]][j]=length(ind_sig)
  }
  #Set column/row names
  colnames(pt_p[[i]])=pt_uni_tp[[i]]
  colnames(pt_q[[i]])=pt_uni_tp[[i]]
  rownames(pt_metaData[[i]])=pt_uni_tp[[i]]

  rownames(pt_meanFC[[i]])=names(btm_list)
  rownames(pt_t_stat[[i]])=names(btm_list)
  colnames(pt_meanFC[[i]])=pt_uni_tp[[i]]
  colnames(pt_t_stat[[i]])=pt_uni_tp[[i]]
}

#Plot Day 7 t stat for TIV BTMs
tp_plot=7
BTM_ind=pt_DEG_up[[which(tp_int==tp_plot)-1]][[which(pt_uni_tp[[which(tp_int==tp_plot)-1]]=='Influenza (IN)')]]
df=pt_t_stat[[which(tp_int==tp_plot)-1]][BTM_ind,]
df_anno=pt_metaData[[which(tp_int==tp_plot)-1]][, c('vaccine_type', 'pathogen', 'adjuvant')]
ph_lim=ceiling(max(abs(df), na.rm = TRUE))
#ph_lim=3
colorLS=colorRampPalette(colors = c("blue", "white", "red"))(n = 100)
breakLS=seq(from = -ph_lim, to = ph_lim, length.out = 100)
#Plot
result=pheatmap::pheatmap(mat = df,
                color = colorLS,
                breaks = breakLS,
                cluster_cols = TRUE,
                cluster_rows = TRUE,
                clustering_distance_rows = "euclidean",
                clustering_distance_cols = "euclidean",
                clustering_method = "ward.D2",
                show_colnames = TRUE,
                show_rownames = TRUE,
                fontsize = 10,
                cellheight = 10,
                cellwidth = 10)

Ext. Data Figure 4b-c. b) Barplot of feature importance for the GLM classifier trained on inactivated influenza datasets only. c) AUC barplot of antibody response prediction performance across vaccines for the GLM classifier trained on inactivated influenza datasets only. Data are presented as mean values + /- 95% confidence interval. n = 2000 bootstrap replicates.

load(paste0(extra_data_dir, "/ML.Data.D7FCH.BTM.SDFilter75.rda"))
load(paste0(extra_data_dir ,"/PD.Master.D7FCH.BTM.rda"))

PatientInfo<-subset(PatientInfo, MFC_p30%in%c("lowResponder","highResponder"))
PatientInfo<-subset(PatientInfo, pt=="Influenza_IN")
PatientInfo<-droplevels(PatientInfo)
PatientInfo$MFC_p30<-factor(PatientInfo$MFC_p30, levels = c("highResponder", "lowResponder"))

DevID<-rownames(PatientInfo)
DevID<-intersect(DevID, colnames(data.ML.filter))
X.Dev<-data.ML.filter[,DevID]
rownames(X.Dev)<-strsplit2(rownames(X.Dev), "_")[,1]

Y.Dev<-PatientInfo[DevID, 'MFC_p30']
names(Y.Dev)<-DevID

X.Dev <- t(X.Dev)

SEED<-1987

set.seed(SEED)
ctrl=trainControl(method = "cv",
                  selectionFunction = 'best',
                  returnResamp = "final",
                  classProbs = TRUE,
                  returnData = F,
                  verboseIter = F,
                  summaryFunction = twoClassSummary,
                  savePredictions = "final")

set.seed(SEED)
model_list <- caretList(
  x=X.Dev,
  y=Y.Dev[rownames(X.Dev)],
  trControl=ctrl,
  methodList=c("glmnet"),
  metric="ROC",
  #tuneLength=5,
  maximize=T,
  preProc = c("center", "scale"))

whichTwoPct <- best(model_list$glmnet$results,
                    metric = "ROC",
                    maximize = TRUE)

model_list$glmnet$results[whichTwoPct,]

ROC.OBJ<-pROC::roc(response = model_list$glmnet$pred$obs,
                   levels=c("lowResponder", "highResponder"),
                   direction="<",
                   predictor = model_list$glmnet$pred$highResponder,
                   plot=F,
                   print.auc=F,
                   ci=F,
                   auc.polygon=FALSE,
                   legacy.axes=FALSE,
                   xlab= "Specificity (%) True Neg",
                   ylab= "Sensitivity (%) True Pos",
                   percent=TRUE, col="forestgreen", lwd=3)

AUC.CI<-as.vector(pROC::ci.auc(ROC.OBJ,
                               method="bootstrap", boot.n=2000))
names(AUC.CI)<-c("AUC.CI.LOW", "AUC", "AUC.CI.HIGH")

db.imp<-arrange(varImp(model_list$glmnet)$importance, desc(Overall))
#Plot feature importance
db.imp$BTM=rownames(db.imp)
colnames(db.imp)=c('Importance','BTM')
db.imp=db.imp[db.imp$Importance>0,]
#re-order the levels in the order of appearance in the data.frame
db.imp$BTM<-factor(db.imp$BTM,levels=db.imp$BTM[order(db.imp$Importance)])
#Plot
p=ggplot(db.imp,aes(x=BTM,y=Importance))+
    geom_bar(stat="identity",width=0.7)+
    #scale_fill_manual(values = mycolors_BTM, na.value = "grey50")+
    coord_flip()+
    #labs(title=paste0('Timepoint ',tp[i]))+
    xlab("")+
    ylab("Importance")+
    theme_minimal()+
    theme(text = element_text(size=20), plot.title = element_text(hjust = 0.5))
p

### Test in other vaccines ###

load(paste0(extra_data_dir,"/PD.Master.D7FCH.BTM.rda"))

PatientInfo<-subset(PatientInfo, MFC_p30%in%c("lowResponder","highResponder"))
PatientInfo<-subset(PatientInfo, pt!="Influenza_IN")
PatientInfo<-droplevels(PatientInfo)
PatientInfo$MFC_p30<-factor(PatientInfo$MFC_p30, levels = c("highResponder", "lowResponder"))

### DD ply to get preds ###

db.preds.test<-do.call("rbind.data.frame", lapply(unique(PatientInfo$pt),
                                                  function(Vaccine){

                                                    PatientInfo<-subset(PatientInfo, pt==Vaccine)
                                                    PatientInfo<-droplevels(PatientInfo)
                                                    PatientInfo$MFC_p30<-factor(PatientInfo$MFC_p30, levels = c("highResponder", "lowResponder"))

                                                    DevID<-rownames(PatientInfo)
                                                    DevID<-intersect(DevID, colnames(data.ML.filter))

                                                    X.Dev<-data.ML.filter[, DevID]
                                                    rownames(X.Dev)<-strsplit2(rownames(X.Dev), "_")[,1]

                                                    Y.Dev<-PatientInfo[DevID, 'MFC_p30']
                                                    names(Y.Dev)<-DevID

                                                    model_preds <- lapply(model_list, predict, newdata=t(X.Dev), type="prob")
                                                    model_preds <- lapply(model_preds, function(x) x[,"highResponder"])
                                                    model_preds <- data.frame(model_preds)

                                                    Preds.Test<-mutate(model_preds,
                                                                       pt=Vaccine,
                                                                       #PredBinary=ifelse(rf>=0.5, "highResponder", "lowResponder"),
                                                                       PTID=colnames(X.Dev),
                                                                       Real.Resp=as.character(Y.Dev[colnames(X.Dev)]))

                                                    return(Preds.Test)

                                                  }))

db.preds.test<-melt(db.preds.test, measure.vars = c("glmnet"))

perf.stats.by.vaccine.time<-plyr::ddply(.data=db.preds.test,
                                        .variable=plyr::.(pt, variable),
                                        .fun=function(df){

                                          CONF.MAT<-caret::confusionMatrix(data=factor(ifelse(df$value>=0.5, "highResponder", "lowResponder"),
                                                                                       levels = c("highResponder", "lowResponder")),
                                                                           reference=factor(df$Real.Resp,
                                                                                            levels = c("highResponder", "lowResponder")))

                                          ROC.OBJ<-pROC::roc(response = df$Real.Resp,
                                                             levels=c("lowResponder", "highResponder"),
                                                             direction="<",
                                                             predictor = df$value)

                                          AUC.CI<-as.vector(pROC::ci.auc(ROC.OBJ,
                                                                         method="bootstrap", boot.n=2000))


                                          names(AUC.CI)<-c("AUC.CI.LOW", "AUC", "AUC.CI.HIGH")


                                          N = length(df$pt)

                                          VEC=t(as.data.frame(c(CONF.MAT$overall, CONF.MAT$byClass, AUC.CI, n_vac=N)))
                                          rownames(VEC)<-NULL

                                          return(VEC)})



perf.stats.by.vaccine.time<-arrange(perf.stats.by.vaccine.time, desc(AUC))

#Plot AUC

lapply(unique(perf.stats.by.vaccine.time$variable), function(METH){

  p<-ggplot(subset(perf.stats.by.vaccine.time, variable==METH),
            aes(x=pt,
                y=as.numeric(AUC),
                fill=pt))+
    geom_hline(yintercept = 0.5, linetype="dashed")+
    geom_bar(stat='identity', show.legend = F)+
    scale_fill_brewer(palette = "Set3")+
    geom_errorbar(aes(ymin = AUC.CI.LOW, ymax = AUC.CI.HIGH), width = 0.25)+
    labs(y="AUC (CI)", title = paste0(METH), x=NULL)+
    theme_bw()+
    theme(axis.text.x = element_text(angle = 45, hjust = 1, size=8))+
    geom_label(aes(y=0.05, label=n_vac), show.legend = F, color="black", size=2)+
    ylim(0,1)

  p

})

Ext. Data Figure 4d. AUC barplot of antibody response prediction performance of the leave-one-vaccine-out GLM classifier. Data are presented as mean values + /- 95% confidence interval. n = 2000 bootstrap replicates.

#Load GE
load(eset_ssGSEA_wAb_path)
#Keep only days 0,3,7
eset.LDA.BL.D3.D7=eset_wAb[,eset_wAb$time_post_last_vax %in% c(0,3,7)]
PD<-pData(eset.LDA.BL.D3.D7)

VAL.LIST <- list(Val1="Influenza_LV",
                 Val2="Influenza_IN",
                 Val3=c("Meningococcus_CJ", "Meningococcus_PS"),
                 Val4=c("Tuberculosis_RVV",
                        "Pneumococcus_PS", "Smallpox_LV"),
                 Val5="Varicella Zoster_LV",
                 Val6="Yellow Fever_LV")

### Start the LDA thing
HighLow<-rownames(PD[PD$MFC_p30%in%c("lowResponder","highResponder"),])
eset.HighLow<-eset.LDA.BL.D3.D7[,eset.LDA.BL.D3.D7$uid%in%HighLow]

EXP<-Biobase::exprs(eset.HighLow)
PD<-pData(eset.HighLow)

ALL <- table(PD$participant_id)
KEEP <- names(ALL[ALL==3])

PD <- filter(PD, participant_id %in% KEEP)

Samp2PatID<-PD$participant_id
names(Samp2PatID)<-PD$uid

DATA.LIST <- lapply(VAL.LIST, function(PATH){
  PD.Train <- filter(PD, !(pt %in% PATH))
  PD.Val <- filter(PD, pt %in% PATH)

  X.ls.Train<-lapply(c(0, 3, 7), function(i){
    df<-EXP[,PD.Train[PD.Train$study_time_collected==i,"uid"]]
    colnames(df)<-Samp2PatID[colnames(df)]
    return(df)})

  lapply(X.ls.Train, dim)

  names(X.ls.Train)<-c("X.0", "X.3", "X.7")

  X.ls.Val<-lapply(c(0, 3, 7), function(i){
    df<-EXP[,PD.Val[PD.Val$study_time_collected==i,"uid"]]
    colnames(df)<-Samp2PatID[colnames(df)]
    return(df)})

  names(X.ls.Val)<-c("X.0", "X.3", "X.7")

  Y.Train<-PD.Train[!duplicated(PD.Train$participant_id),c("participant_id", "MFC_p30","pt")]
  rownames(Y.Train)<-Y.Train$participant_id
  Y.Train<-Y.Train[colnames(X.ls.Train$X.0),]
  Y.Train<-droplevels(Y.Train)

  # need sample cols, gene rows
  wt<-LDAprojection(xs=X.ls.Train, ys=Y.Train$MFC_p30)

  colnames(wt)<-names(X.ls.Train)

  # projected matrix
  X.proj.Train<-multiplyFunction(xlist=X.ls.Train, par=wt)
  rownames(X.proj.Train)<-rownames(wt)
  colnames(X.proj.Train)<-colnames(X.ls.Train$X.0)

  X.proj.Val<-multiplyFunction(xlist=X.ls.Val, par=wt)
  rownames(X.proj.Val)<-rownames(wt)
  colnames(X.proj.Val)<-colnames(X.ls.Val$X.0)

  PHENO.TRAIN=filter(PD.Train, participant_id %in% colnames(X.proj.Train), study_time_collected==0)
  PHENO.VAL=filter(PD.Val, participant_id %in% colnames(X.proj.Val), study_time_collected==0)

  DATA.LS <- list(TRAIN=list(DATA.TRAIN=X.proj.Train,
                             PHENO.TRAIN=PHENO.TRAIN),
                  VAL=list(DATA.VAL=X.proj.Val,
                           PHENO.VAL=PHENO.VAL))

  return(DATA.LS)})

names(DATA.LIST) <- names(VAL.LIST)

#save(DATA.LIST, file=paste0(extra_data_dir,"/LOVO_LDA_DataList.rda"))

MOD.LIST <- lapply(DATA.LIST, function(TEMP){

  PatientInfo <- TEMP$TRAIN$PHENO.TRAIN
  rownames(PatientInfo) <- PatientInfo$participant_id
  X.Dev <- TEMP$TRAIN$DATA.TRAIN

  Y.Dev <- PatientInfo[colnames(X.Dev), 'MFC_p30']
  names(Y.Dev) <- colnames(X.Dev)
  Y.Dev <- droplevels(Y.Dev)

  length(Y.Dev)
  dim(X.Dev)

  X.Dev <- t(X.Dev)

  SEED<-1987

  set.seed(SEED)
  ctrl=trainControl(method = "repeatedcv",
                    number = 10,
                    repeats = 5,
                    selectionFunction = 'best',
                    returnResamp = "final",
                    classProbs = TRUE,
                    returnData = F,
                    verboseIter = F,
                    summaryFunction = twoClassSummary,
                    savePredictions = "final")


  set.seed(SEED)
  model_list <- caretList(
    x=X.Dev,
    y=Y.Dev[rownames(X.Dev)],
    trControl=ctrl,
    methodList=c("glmnet"),
    metric="ROC",
    tuneLength=5,
    maximize=T,
    preProc = c("center", "scale"))

  return(model_list)

})

names(MOD.LIST) <- names(DATA.LIST)

## Get 10-CV preds ##

Perf10CV <- do.call("rbind.data.frame", lapply(names(MOD.LIST), function(VAL){

  Preds.CV <- MOD.LIST[[VAL]]$glmnet$pred

  CONF.MAT<-caret::confusionMatrix(data=factor(Preds.CV$pred, levels = c("highResponder", "lowResponder")),
                                   reference=factor(Preds.CV$obs,
                                                    levels = c("highResponder", "lowResponder")))

  ROC.OBJ<-pROC::roc(response = Preds.CV$obs,
                     levels=c("lowResponder", "highResponder"),
                     direction="<",
                     predictor = Preds.CV$highResponder)

  AUC.CI<-as.vector(pROC::ci.auc(ROC.OBJ,
                                 method="bootstrap", boot.n=2000))
  names(AUC.CI)<-c("AUC.CI.LOW", "AUC", "AUC.CI.HIGH")

  N = length(Preds.CV$PTID)

  VEC=t(as.data.frame(c(CONF.MAT$overall, CONF.MAT$byClass, AUC.CI, n_vac=N))) %>%
    as.data.frame() %>%
    mutate(VAL.NUM=VAL,
           SET="10-CV")
  rownames(VEC)<-NULL
  return(VEC)}))

PerfVal <- do.call("rbind.data.frame", lapply(names(MOD.LIST), function(VAL){

  PatientInfo.Val<-DATA.LIST[[VAL]]$VAL$PHENO.VAL
  rownames(PatientInfo.Val) <- PatientInfo.Val$participant_id

  X.Val<-DATA.LIST[[VAL]]$VAL$DATA.VAL

  Y.Val<-PatientInfo.Val[rownames(PatientInfo.Val), 'MFC_p30']
  names(Y.Val)<-rownames(PatientInfo.Val)

  Preds.Test<-mutate(predict(MOD.LIST[[VAL]]$glmnet, newdata=t(X.Val), type="prob"),
                     pred=ifelse(highResponder>=0.5, "highResponder", "lowResponder"),
                     PTID=colnames(X.Val),
                     obs=as.character(droplevels(Y.Val[colnames(X.Val)])))

  CONF.MAT<-caret::confusionMatrix(data=factor(Preds.Test$pred, levels = c("highResponder", "lowResponder")),
                                   reference=factor(Preds.Test$obs,
                                                    levels = c("highResponder", "lowResponder")))

  ROC.OBJ<-pROC::roc(response = Preds.Test$obs,
                     levels=c("lowResponder", "highResponder"),
                     direction="<",
                     predictor = Preds.Test$highResponder)

  AUC.CI<-as.vector(pROC::ci.auc(ROC.OBJ,
                                 method="bootstrap", boot.n=2000))
  names(AUC.CI)<-c("AUC.CI.LOW", "AUC", "AUC.CI.HIGH")

  N = length(Preds.Test$PTID)

  VEC=t(as.data.frame(c(CONF.MAT$overall, CONF.MAT$byClass, AUC.CI, n_vac=N))) %>%
    as.data.frame() %>%
    mutate(VAL.NUM=VAL,
           SET="VAL")
  rownames(VEC)<-NULL
  return(VEC)
}))

db.performance.All <- full_join(x=Perf10CV,
                                y=PerfVal)

pd<-position_dodge(1)

p<-ggplot(db.performance.All,
          aes(x=VAL.NUM,
              y=as.numeric(AUC),
              fill=SET,
              group=interaction(VAL.NUM, SET)))+
  geom_bar(stat='identity', show.legend = T, position = pd)+
  scale_fill_brewer(palette = "Set3")+
  geom_errorbar(aes(ymin = AUC.CI.LOW, ymax = AUC.CI.HIGH),
                width = 0.25, position = pd)+
  labs(y="AUC (CI)", x=NULL)+
  theme_bw()+
  geom_hline(yintercept = 0.5, linetype="dashed")+
  theme(axis.text.x = element_text(angle = 90, vjust = 0.5, hjust=1 ,size=8))+
  scale_x_discrete(labels=c(Val1="Influenza (IN)",
                            Val2="Influenza (LV)",
                            Val3="Meningococcus (CJ)\nMeningococcus (PS)",
                            Val4="Tuberculosis (RVV)\nPneumococcus (PS)\nSmallpox (LV)",
                            Val5="Varicella Zoster (LV)",
                            Val6="Yellow Fever (LV)"))+
  ylim(0,1)

p

Ext. Data Figure 4e. AUC barplot of antibody response prediction performance of the 10-fold cross-validation GLM classifier. Data are presented as mean values + /- 95% confidence interval. n = 2000 bootstrap replicates.

#Generate D3 results
load(paste0(extra_data_dir,"/ML.Data.D3FCH.BTM.NoFilter.rda"))
load(paste0(extra_data_dir,"/PD.Master.D3FCH.BTM.rda"))

PatientInfo<-subset(PatientInfo, MFC_p30%in%c("lowResponder","highResponder"))
PatientInfo<-droplevels(PatientInfo)
PatientInfo$MFC_p30<-factor(PatientInfo$MFC_p30,
                            levels = c("highResponder", "lowResponder"))

DevID<-rownames(PatientInfo)
DevID<-intersect(DevID, colnames(data.ML))
X.Dev<-data.ML[,DevID]
rownames(X.Dev)<-strsplit2(rownames(X.Dev), "_")[,1]

Y.Dev<-PatientInfo[DevID, 'MFC_p30']
names(Y.Dev)<-DevID

X.Dev <- t(X.Dev)

SEED<-1987

set.seed(SEED)
ctrl=trainControl(method = "cv",
                  number = 10,
                  selectionFunction = 'best',
                  returnResamp = "final",
                  classProbs = TRUE,
                  returnData = F,
                  verboseIter = F,
                  summaryFunction = twoClassSummary,
                  savePredictions = "final")

set.seed(SEED)
model_list <- caretList(
  x=X.Dev,
  y=Y.Dev[rownames(X.Dev)],
  trControl=ctrl,
  methodList=c("glmnet"),
  metric="ROC",
  #tuneLength=5,
  maximize=T,
  preProc = c("center", "scale"))

whichTwoPct <- best(model_list$glmnet$results,
                    metric = "ROC",
                    maximize = TRUE)

model_list$glmnet$results[whichTwoPct,]

ROC.OBJ<-pROC::roc(response = model_list$glmnet$pred$obs,
                   levels=c("lowResponder", "highResponder"),
                   direction="<",
                   predictor = model_list$glmnet$pred$highResponder,
                   plot=F,
                   print.auc=F)

AUC.CI<-as.vector(pROC::ci.auc(ROC.OBJ,
                               method="bootstrap",
                               boot.n=2000))

names(AUC.CI)<-c("AUC.CI.LOW", "AUC", "AUC.CI.HIGH")

#save(model_list, file=paste0(extra_data_dir,"/FinalModelD3FCH.rda"))

db.preds.all <- model_list$glmnet$pred %>%
  mutate(participant_id=rownames(X.Dev)[rowIndex]) %>%
  left_join(y=PatientInfo %>%
              dplyr::select(participant_id, pt))

perf.stats.by.vaccine<-plyr::ddply(.data=db.preds.all,
                             .variable=c("pt"),
                             .fun=function(df){

                               if(nrow(df)!=0 & length(unique(df$obs))==2){

                                 ROC.OBJ<-pROC::roc(response = df$obs, levels=c("lowResponder", "highResponder"),
                                                    direction="<",
                                                    predictor = df$highResponder)

                                 AUC.CI<-as.vector(pROC::ci.auc(ROC.OBJ,
                                                                method="bootstrap",
                                                                boot.n=2000))

                                 names(AUC.CI)<-c("AUC.CI.LOW", "AUC", "AUC.CI.HIGH")

                                 DB2 <- data.frame(AUC=ROC.OBJ$auc[1],
                                                   AUC.CI.LOW=AUC.CI['AUC.CI.LOW'],
                                                   AUC.CI.HIGH=AUC.CI['AUC.CI.HIGH'])

                                 return(DB2)}else{return(NULL)}}, .parallel=F)

perf.stats.by.vaccine<-arrange(perf.stats.by.vaccine, desc(AUC))

perf.stats.by.vaccine.master <- perf.stats.by.vaccine %>%
  mutate(TimePoint="Day3-fold-change")

#Generate D7 results
load(paste0(extra_data_dir,"/ML.Data.D7FCH.BTM.NoFilter.rda"))
load(paste0(extra_data_dir,"/PD.Master.D7FCH.BTM.rda"))

PatientInfo<-subset(PatientInfo, MFC_p30%in%c("lowResponder","highResponder"))
PatientInfo<-droplevels(PatientInfo)
PatientInfo$MFC_p30<-factor(PatientInfo$MFC_p30,
                            levels = c("highResponder", "lowResponder"))

DevID<-rownames(PatientInfo)
DevID<-intersect(DevID, colnames(data.ML))
X.Dev<-data.ML[,DevID]
rownames(X.Dev)<-strsplit2(rownames(X.Dev), "_")[,1]

Y.Dev<-PatientInfo[DevID, 'MFC_p30']
names(Y.Dev)<-DevID

X.Dev <- t(X.Dev)

SEED<-1987

set.seed(SEED)
ctrl=trainControl(method = "cv",
                  number = 10,
                  selectionFunction = 'best',
                  returnResamp = "final",
                  classProbs = TRUE,
                  returnData = F,
                  verboseIter = F,
                  summaryFunction = twoClassSummary,
                  savePredictions = "final")

set.seed(SEED)
model_list <- caretList(
  x=X.Dev,
  y=Y.Dev[rownames(X.Dev)],
  trControl=ctrl,
  methodList=c("glmnet"),
  metric="ROC",
  #tuneLength=5,
  maximize=T,
  preProc = c("center", "scale"))

whichTwoPct <- best(model_list$glmnet$results,
                    metric = "ROC",
                    maximize = TRUE)

model_list$glmnet$results[whichTwoPct,]

ROC.OBJ<-pROC::roc(response = model_list$glmnet$pred$obs,
                   levels=c("lowResponder", "highResponder"),
                   direction="<",
                   predictor = model_list$glmnet$pred$highResponder,
                   plot=F,
                   print.auc=F)

AUC.CI<-as.vector(pROC::ci.auc(ROC.OBJ,
                               method="bootstrap",
                               boot.n=2000))

names(AUC.CI)<-c("AUC.CI.LOW", "AUC", "AUC.CI.HIGH")

#save(model_list, file=paste0(extra_data_dir,"/FinalModelD7FCH.rda")

db.preds.all <- model_list$glmnet$pred %>%
  mutate(participant_id=rownames(X.Dev)[rowIndex]) %>%
  left_join(y=PatientInfo %>%
              dplyr::select(participant_id, pt))

perf.stats.by.vaccine<-plyr::ddply(.data=db.preds.all,
                             .variable=c("pt"),
                             .fun=function(df){

                               if(nrow(df)!=0 & length(unique(df$obs))==2){

                                 ROC.OBJ<-pROC::roc(response = df$obs, levels=c("lowResponder", "highResponder"),
                                                    direction="<",
                                                    predictor = df$highResponder)

                                 AUC.CI<-as.vector(pROC::ci.auc(ROC.OBJ,
                                                                method="bootstrap",
                                                                boot.n=2000))

                                 names(AUC.CI)<-c("AUC.CI.LOW", "AUC", "AUC.CI.HIGH")

                                 DB2 <- data.frame(AUC=ROC.OBJ$auc[1],
                                                   AUC.CI.LOW=AUC.CI['AUC.CI.LOW'],
                                                   AUC.CI.HIGH=AUC.CI['AUC.CI.HIGH'])

                                 return(DB2)}else{return(NULL)}}, .parallel=F)

perf.stats.by.vaccine<-arrange(perf.stats.by.vaccine, desc(AUC))

perf.stats.by.vaccine %>%
  mutate(TimePoint="Day7-fold-change") %>%
  full_join(y=perf.stats.by.vaccine.master) -> perf.stats.by.vaccine.master

p<-ggplot(perf.stats.by.vaccine.master,
          aes(x=pt,
              y=AUC,
              fill=pt))+
  geom_bar(stat='identity', show.legend = T, position = pd)+
  #scale_fill_brewer(palette = "Set3")+
  geom_errorbar(aes(ymin = AUC.CI.LOW, ymax = AUC.CI.HIGH), width = 0.25, position = pd)+
  labs(y="Mean AUC (95% CI)", x=NULL)+
  facet_wrap(~TimePoint, nrow=2)+
  theme_bw()+
  theme(axis.text.x = element_text(angle = 60, hjust = 1, size=8))+
  geom_hline(yintercept = 0.5, linetype="dashed", color="red")+
  ylim(0,1)

p

Figure 4c. Kinetics of the predictive power of M156.1 across vaccines. For each vaccine/time point combination, the AUC was computed based on difference in the geometric mean of the fold changes of the genes in the M156.1 between high and low responders

target_genesets <- c("plasma cells, immunoglobulins (M156.1)")

eset <- young_and_old.noNorm.withResponse %>%
  keep_most_recent_prevax_time_point(drop_postvax = FALSE) %>%
  add_response_class_column() %>%
  keep_only_high_and_low_responders(col = "response_class") %>%
  remove_any_NA_rows()

exprs_df <- Biobase::exprs(eset)
pdata_df <- Biobase::pData(eset)

# Make sure order of uids in pdata_df and exprs_df match
stopifnot(all(rownames(pdata_df) == pdata_df$uid))
pdata_df <- pdata_df[colnames(exprs_df), ]

# Match every sample in exprs_df with the corresponding baseline uid
baseline_uids <- pdata_df %>%
  dplyr::group_by(pathogen, vaccine_type, study_accession, participant_id) %>%
  dplyr::mutate(baseline_uid = uid[time_post_last_vax <= 0]) %>%
  dplyr::pull(baseline_uid)

baseline_exprs_df <- exprs_df[, baseline_uids]
postvax_cols <- !(colnames(exprs_df) %in% unique(baseline_uids))

temporal_log2fc_withResponse_exprs <- exprs_df[, postvax_cols] -
  baseline_exprs_df[, postvax_cols]
temporal_log2fc_withResponse_pdata <- Biobase::pData(eset)[
  colnames(temporal_log2fc_withResponse_exprs), ]

temporal_log2fc_withResponse_eset <- new("ExpressionSet",
    exprs = temporal_log2fc_withResponse_exprs,
    phenoData = new('AnnotatedDataFrame', temporal_log2fc_withResponse_pdata))

# Keep only study time points that have both high- and low-responders
temporal_log2fc_withResponse_eset.filtered_pd <-
  Biobase::pData(temporal_log2fc_withResponse_eset) %>%
  dplyr::group_by(age_group, pathogen, vaccine_type, study_accession,
                  time_post_last_vax) %>%
  dplyr::filter(all(c("lowResponder", "highResponder") %in% response_class)) %>%
  dplyr::ungroup() %>%
  tibble::column_to_rownames(var = "uid")
temporal_log2fc_withResponse_eset.filtered_pd$uid <-
  rownames(temporal_log2fc_withResponse_eset.filtered_pd)
temporal_log2fc_withResponse_eset.filtered_exprs <-
  Biobase::exprs(temporal_log2fc_withResponse_eset)[
    , rownames(temporal_log2fc_withResponse_eset.filtered_pd)]
temporal_log2fc_withResponse_eset.filtered <-
  new("ExpressionSet", exprs = temporal_log2fc_withResponse_eset.filtered_exprs,
      phenoData = new('AnnotatedDataFrame',
                      temporal_log2fc_withResponse_eset.filtered_pd))

btm_list.filtered <- keep_symbols(
  genesets.symbol = btm_list,
  symbols2keep = rownames(temporal_log2fc_withResponse_eset.filtered))


calc_sig_df <- function(eset, sig_genes_up, sig_genes_down) {
  sig_scores_up <- rep(0, ncol(eset))
  sig_scores_down <- rep(0, ncol(eset))

  if (length(sig_genes_up) > 0)
    sig_scores_up <- Biobase::esApply(
      X = eset[sig_genes_up, ], MARGIN = 2, FUN = mean)

  if (length(sig_genes_down) > 0)
    sig_scores_down <- Biobase::esApply(
      X = eset[sig_genes_down, ], MARGIN = 2, FUN = mean)

  tibble::tibble(
    uid = colnames(eset),
    sig_score = sig_scores_up - sig_scores_down,
    labels = as.numeric(Biobase::pData(eset)$response_class == "highResponder"))
}

calc_auc_df <- function(x) {
  x %>%
    dplyr::group_by(pathogen, vaccine_type, study_accession,
                    time_post_last_vax) %>%
    tidyr::nest() %>%
    dplyr::mutate(
      auc_data = purrr::map(
        .x = data,
        ~{
          res <- MetaIntegrator::calculateROC(labels = .x$labels,
                                              predictions = .x$sig_score)
          data.frame(
            n_HR = sum(.x$response_class == "highResponder"),
            n_LR = sum(.x$response_class == "lowResponder"),
            n = nrow(.x),
            AUC = res$auc,
            AUC_low = res$auc.CI[1],
            AUC_high = res$auc.CI[2])
        }
      )
    ) %>%
    dplyr::ungroup() %>%
    dplyr::select(-data) %>%
    tidyr::unnest(auc_data)
}

auc_df <- NULL
for (target_geneset in target_genesets) {

  target_genes <- btm_list.filtered[[target_geneset]]

  sig_df <- calc_sig_df(eset = temporal_log2fc_withResponse_eset.filtered,
                        sig_genes_up = target_genes,
                        sig_genes_down = c())

  pd_sig_df <- dplyr::left_join(
    Biobase::pData(temporal_log2fc_withResponse_eset.filtered),
    sig_df, by = "uid")
  auc_df <- rbind(auc_df, calc_auc_df(x = pd_sig_df) %>%
                    dplyr::mutate(GeneSet = target_geneset))
}

plot_df <- auc_df %>%
  dplyr::group_by(pathogen, vaccine_type, GeneSet, time_post_last_vax) %>%
  dplyr::summarise(mean_AUC = sum(n * AUC) / sum(n), n = sum(n)) %>%
  dplyr::mutate(pvt = paste0(pathogen, " (", vaccine_type, ")"))

for (cur_geneset in target_genesets) {
  p <- ggplot(plot_df %>%
                dplyr::filter(n >= 5,
                              time_post_last_vax <= 30,
                              GeneSet == cur_geneset),
              aes(x = time_post_last_vax, y = mean_AUC, color = pvt)) +
    geom_hline(yintercept = .5, color = "gray", linetype = "dashed") +
    geom_vline(xintercept = 7, color = "gray", linetype = "dashed") +
    geom_line() +
    geom_point() +
    scale_color_ptol() +
    theme_bw() +
    facet_wrap(vars(GeneSet), ncol = 1)

  print(p)
}

Figure 4d-e. Weighted ROC curves and per-vaccine AUC bar plot for a logistic regression classifier using M156.1 expression either at day 7 in all vaccines (day 7) or at the vaccine-specific peak expression time point. FPR, false positive rate; TPR, true positive rate. Data are presented as mean values ± 95% CIs. n = 2,000 bootstrap replicates.

#Set peak timepoints
Peak.List <- list(
  D7FCH=c("Meningococcus_CJ",
          "Hepatitis A/B_IN/RP",
          "Influenza_LV",
          "Influenza_IN",
          "Tuberculosis_RVV",
          "Meningococcus_PS",
          "Pneumococcus_PS",
          "Varicella Zoster_LV"),
  D14FCH=c("Yellow Fever_LV"),
  D21FCH=c("Smallpox_LV"))

data.ML.final <- do.call("cbind.data.frame", lapply(names(Peak.List), function(TIME){

  load(paste0(extra_data_dir,"/",paste0("ML.Data.",TIME,".BTM.NoFilter.rda")))
  load(paste0(extra_data_dir,"/",paste0("PD.Master.",TIME,".BTM.rda")))

  keep.id<-subset(PatientInfo, MFC_p30%in%c("lowResponder","highResponder") &
                    pt%in%Peak.List[[TIME]])$participant_id

  data.ML.temp<-data.ML[,keep.id]

  return(data.ML.temp)

}))


PatientInfo.Final <- do.call("rbind.data.frame", lapply(names(Peak.List), function(TIME){

  load(paste0(extra_data_dir,"/",paste0("ML.Data.",TIME,".BTM.NoFilter.rda")))
  load(paste0(extra_data_dir,"/",paste0("PD.Master.",TIME,".BTM.rda")))

  keep.id<-subset(PatientInfo, MFC_p30%in%c("lowResponder","highResponder") &
                    pt%in%Peak.List[[TIME]])$participant_id

  data.ML.temp<-PatientInfo[keep.id,]

  return(data.ML.temp)

}))

PatientInfo.Final<-droplevels(PatientInfo.Final)
PatientInfo.Final$MFC_p30<-factor(PatientInfo.Final$MFC_p30,
                                  levels = c("highResponder", "lowResponder"))

save(PatientInfo.Final, data.ML.final,
     file=paste0(extra_data_dir,"/ML.Data.D7FCH.BTM.SELECT.PEAK.NoFilter.rda"))

TIMES<-c("D7FCH.BTM", "D7FCH.BTM.SELECT")
VERSION<-c(".PEAK")

db.run<-expand.grid(TIMES, VERSION)

MODEL.LIST<-lapply(1:nrow(db.run), function(i){

Time=db.run[i, "Var1"] # eg BL.GENES, BL.BTM

VERSION<-db.run[i, "Var2"]

#SEED=1987
SEED=42

# Load Data

### Get ID with Both ### Added from prior ImmuneSpace version
ID.UNIFORM<-get(load(paste0(extra_data_dir,"/ML.Data.D7FCH.BTM.NoFilter.rda")))

load(paste0(extra_data_dir,"/ML.Data.D7FCH.BTM.SELECT.PEAK.NoFilter.rda"))

ID.SELECT <- data.ML.final

ID.KEEP<-intersect(colnames(ID.UNIFORM), colnames(ID.SELECT))

if(grepl("SELECT",Time)==FALSE){
  load(paste0(extra_data_dir,"/ML.Data.",Time,".NoFilter.rda"))
  load(paste0(extra_data_dir,"/PD.Master.",Time,".rda"))
  data.ML.filter=data.ML}

if(grepl("SELECT",Time)==TRUE){
  Time2<-paste0(Time, VERSION)
  load(paste0(extra_data_dir,"/ML.Data.",Time2,".NoFilter.rda"))
  data.ML.filter=data.ML.final
  PatientInfo=PatientInfo.Final}

PatientInfo<-subset(PatientInfo, MFC_p30%in%c("lowResponder","highResponder"))
PatientInfo<-droplevels(PatientInfo)
PatientInfo$MFC_p30<-factor(PatientInfo$MFC_p30, levels = c("highResponder", "lowResponder"))

DevID<-rownames(PatientInfo)

DevID<-intersect(DevID, colnames(data.ML.filter))
X.Dev<-data.ML.filter[,DevID]
rownames(X.Dev)<-strsplit2(rownames(X.Dev), "_")[,1]

Y.Dev<-PatientInfo[DevID, 'MFC_p30']
names(Y.Dev)<-DevID

keep.genes<-grep("M156.1", rownames(X.Dev), value = T)

X.Train.t1<-t(X.Dev[keep.genes,])

Y.Train1<-PatientInfo[rownames(X.Train.t1), 'MFC_p30']
names(Y.Train1)<-rownames(X.Train.t1)

X.Train.t1<-X.Train.t1[intersect(ID.KEEP, rownames(X.Train.t1)),,drop=F]

set.seed(SEED)
ctrl=trainControl(method = "LGOCV",
                  number = 200,
                  p=0.8,
                  selectionFunction = 'best',
                  search="grid",
                  returnResamp = "final",
                  classProbs = TRUE,
                  returnData = F,
                  verboseIter = F,
                  summaryFunction = twoClassSummary,
                  sampling=NULL,
                  savePredictions = "final")


set.seed(SEED)
model_list <- train(
  x=X.Train.t1,
  y=Y.Train1[rownames(X.Train.t1)],
  trControl=ctrl,
  method=c("glm"),
  metric="ROC",
  maximize=TRUE,
  preProc = c("center", "scale"))

for(i in 1:length(model_list)){

    model_list$pred$participant_id<-rownames(X.Train.t1)[model_list$pred$rowIndex]
    model_list$pred$pt=PatientInfo[model_list$pred$participant_id,"pt"]
}

return(model_list)})

names(MODEL.LIST)<-paste0(db.run$Var1, db.run$Var2)

# save(MODEL.LIST,
#      file=paste0(extra_data_dir,"/PeakPredictions_Master_ModelList.rda"))

# load(paste0(extra_data_dir,"/PeakPredictions_Master_ModelList.rda"))

db.preds.D7FCH <- MODEL.LIST$D7FCH.BTM.PEAK$pred %>%
  mutate(TIME="D7FCH")

db.preds.PEAK <- MODEL.LIST$D7FCH.BTM.SELECT.PEAK$pred %>%
  mutate(TIME="PEAK.SELECT")

db.preds.all <- full_join(db.preds.D7FCH, db.preds.PEAK)

db.ROCw<-plyr::ddply(db.preds.all,
               c("Resample", "TIME"),
               function(df){

                 tab1<-table(df$pt)
                 tab1<-1/tab1
                 df$WT<-tab1[df$pt]

                 ROCw<-WeightedROC(guess=df$highResponder,
                                   label=ifelse(df$obs=="highResponder", 1,0),
                                   weight = df$WT)

                 db.res <-data.frame(WeightedAUC=WeightedAUC(ROCw))

               })

Fig.D7FCH <- lme(WeightedAUC~1, random = ~1|Resample, data=filter(db.ROCw, TIME=="D7FCH"))

MEAN.CI.D7FCH <- as.data.frame(emmeans::emmeans(Fig.D7FCH, ~1)) %>%
  mutate(TIME="D7FCH")

Fig.PEAK.SELECT <- lme(WeightedAUC~1, random = ~1|Resample, data=filter(db.ROCw, TIME=="PEAK.SELECT"))

MEAN.CI.PEAK.SELECT <- as.data.frame(emmeans::emmeans(Fig.PEAK.SELECT, ~1))  %>%
  mutate(TIME="PEAK.SELECT")

MEAN.CI <- full_join(MEAN.CI.D7FCH, MEAN.CI.PEAK.SELECT)

### Weighted ROC ###
tab1<-table(db.preds.D7FCH$pt)
tab1<-1/tab1
db.preds.D7FCH$WT<-tab1[db.preds.D7FCH$pt]

ROC.UNIFORM<-WeightedROC(guess=db.preds.D7FCH$highResponder,
                         label=ifelse(db.preds.D7FCH$obs=="highResponder", 1,0),
                         weight = db.preds.D7FCH$WT)
ROC.UNIFORM$TYPE="Day7"
ROC.UNIFORM$WeightedAUC=WeightedAUC(ROC.UNIFORM)

tab1<-table(db.preds.PEAK$pt)
tab1<-1/tab1
db.preds.PEAK$WT<-tab1[db.preds.PEAK$pt]

ROC.PEAK<-WeightedROC(guess=db.preds.PEAK$highResponder,
                      label=ifelse(db.preds.PEAK$obs=="highResponder", 1,0),
                      weight = db.preds.PEAK$WT)
ROC.PEAK$TYPE="Peak"
ROC.PEAK$WeightedAUC=WeightedAUC(ROC.PEAK)

ROC.db<-rbind.data.frame(ROC.PEAK, ROC.UNIFORM)

p=ggplot(ROC.db, aes(x=FPR, y=TPR, color=TYPE))+
    geom_path()+
    scale_color_manual(values=c(Day7="gray60", Peak="black"))+
    coord_equal() +
    theme_classic()+
    theme(legend.position = "bottom")
p

### ROC per Vaccine ###

perf.stats.by.vaccine<-plyr::ddply(.data=db.preds.all,
                             .variable=c("pt", "TIME", "Resample"),
                             .fun=function(df){

                               if(nrow(df)!=0 & length(unique(df$obs))==2){

                                 ROC.OBJ<-pROC::roc(response = df$obs, levels=c("lowResponder", "highResponder"),
                                                    direction="<",
                                                    predictor = df$highResponder)

                                 DB2 <- data.frame(AUC=ROC.OBJ$auc)

                                 return(DB2)}else{return(NULL)}}, .parallel=F)

perf.stats.by.vaccine<-arrange(perf.stats.by.vaccine, desc(AUC))

plyr::ddply(perf.stats.by.vaccine,
      c("TIME", "pt"),
      function(df){

        print(range(df$AUC))

        if(min(df$AUC)!=1){
        FIT<-lme(AUC~1, random = ~1|Resample, data=df)
        MEAN.CI <- as.data.frame(emmeans::emmeans(FIT, ~1))
        }

        if(min(df$AUC)==1){MEAN.CI<-data.frame(`1`="overall",
                                               emmean=1,
                                               SE=NA,
                                               df=NA,
                                               lower.CL=1,
                                               upper.CL=1)}

        return(MEAN.CI)

      }) %>% arrange(pt) -> db.plot.ByVac

pd<-position_dodge(0.9)

p<-ggplot(db.plot.ByVac,
          aes(x=reorder(pt, -emmean),
              y=emmean,
              fill=TIME))+
  geom_bar(stat='identity', show.legend = T, position = pd)+
  scale_fill_brewer(palette = "Set3")+
  geom_errorbar(aes(ymin = lower.CL, ymax = upper.CL), width = 0.25, position = pd)+
  labs(y="AUC (CI 95%)", x=NULL)+
  theme_bw()+
  theme(axis.text.x = element_text(angle = 60, hjust = 1, size=8))+
  ylim(0,1)

p

Figure 6: Impact of aging on transcriptional responses to vaccination

Figure 6a. Box plots of day 30 antibody responses to vaccination in young (≤50 years) and older (≥60 years) participants across vaccines. The center line represents the median, box limits represent upper and lower quartiles, and whiskers indicate 1.5 times the interquartile range.

eset=young_and_old.noNorm.withResponse
#Create combined vaccine type/pathogen column
eset$pt=paste(eset$pathogen," (",eset$vaccine_type,")", sep='')

#Subset to day 0
eset=eset[,eset$time_post_last_vax==0]
#Subset to within 7 days of day 28 for ab response, then set to day 28
eset=eset[,eset$ImmResp_postVax_timepoint_MFC>=21 & eset$ImmResp_postVax_timepoint_MFC<=35]
eset$ImmResp_postVax_timepoint_MFC=28
#Set baseline timepoint to 0
eset$ImmResp_baseline_timepoint_MFC=0
#For each vaccine, only use one ab type (most common)
pt_uni=unique(eset$pt)
for (i in 1:length(pt_uni)) {
  most_ab_type=tail(names(sort(table(eset$assay[eset$pt==pt_uni[i]]))), 1)
  ind_rem=which(eset$pt==pt_uni[i] & eset$assay!=most_ab_type)
  if (is_empty(ind_rem)==FALSE) {
    eset=eset[,-ind_rem]
  }
}

#Keep only studies with young and elderly
sdy_y_e=names(which(sapply(unique(eset$study_accession), function(x) length(unique(eset$age_group[eset$study_accession==x]))>1)))
eset=eset[,eset$study_accession %in% sdy_y_e]

#Create dataframe from pdata
df=data.frame(Group=eset$age_group, Vaccine=eset$pt, MFC=eset$MFC)
df$Group=factor(df$Group, levels=unique(df$Group))

#Plot
p=ggplot(df, aes(x=Vaccine, y=MFC, fill=Group)) +
  geom_boxplot(position = position_dodge(0.6), outlier.shape=NA, width=0.5, notch=FALSE, alpha=1)+
  geom_point(position=position_jitterdodge(jitter.width=0.2, dodge.width=0.6), pch=21)+
  stat_compare_means(aes(group=Group), hide.ns=TRUE, label='p.signif')+
  ylab('log2(Ab FC)')+
  xlab('')+
  theme_bw()+
  theme(text=element_text(size=16), axis.text.x=element_text(angle=45, hjust=1))
p

Figure 6b. Modules differentially expressed between young and older participants in response to inactivated influenza vaccination (QuSAGE FDR < 0.05). (Figure 6c created in Cytoscape)

#Generate young vs old QuSAGE results

longitudinal_qusage_young_vs_old_df_path <- file.path(extra_data_dir, "longitudinal_qusage_young_vs_old_df.rds")
if (file.exists(longitudinal_qusage_young_vs_old_df_path)) {
  longitudinal_qusage_young_vs_old_df <- readRDS(longitudinal_qusage_young_vs_old_df_path)
} else {
  # Filter eset and gene sets ----------------------------------------------------

  eset <- young_and_old.noNorm %>%
    keep_most_recent_prevax_time_point(drop_postvax = FALSE) %>%
    remove_any_NA_rows()

  btm_list.filtered <- keep_symbols(genesets.symbol = btm_list,
                                    symbols2keep = rownames(eset),
                                    min_size = 2)

  # Longitudinal GSEA of young vs old --------------------------------------------
  analysis_df <- Biobase::pData(eset) %>%
    dplyr::filter(time_post_last_vax > 0) %>%
    dplyr::select(pathogen, vaccine_type, time_post_last_vax,
                  study_accession) %>%
    dplyr::arrange(pathogen, vaccine_type, time_post_last_vax,
                   study_accession) %>%
    dplyr::distinct() %>%
    dplyr::mutate(qusage_contrast = NA,
                  qusage_output = list(NULL),
                  error = list(NULL)) %>%
    dplyr::as_tibble()

  safe_qusage <- purrr::safely(.f = qusage::qusage,
                               otherwise = NULL, quiet = TRUE)

  for (row_nr in 1:nrow(analysis_df)) {
    row <- analysis_df[row_nr,, drop = FALSE]
    cat(paste0("Row ", row_nr, "/", nrow(analysis_df),
               ": ", row$pathogen, " ",
               row$vaccine_type, " ", row$time_post_last_vax, " ",
               row$study_accession, "\n"))
    cur_eset <- eset[, eset$pathogen == row$pathogen &
                       eset$vaccine_type == row$vaccine_type &
                       eset$study_accession == row$study_accession &
                       (eset$time_post_last_vax <= 0 |
                          eset$time_post_last_vax == row$time_post_last_vax)]

    prevax_string <- "baseline"
    postvax_string <- paste0("Day", row$time_post_last_vax)

    qusage.labels <- paste0(
      cur_eset$age_group, ".",
      ifelse(test = cur_eset$time_post_last_vax == row$time_post_last_vax,
             yes = postvax_string,
             no = prevax_string))
    qusage.contrast <- paste0(
      "(old.", postvax_string, "-old.", prevax_string, ") - ",
      "(young.", postvax_string, "-young.", prevax_string, ")")

    result <- safe_qusage(eset = cur_eset,
                          labels = qusage.labels,
                          contrast = qusage.contrast,
                          geneSets = btm_list.filtered,
                          pairVector = cur_eset$participant_id)
    if (!is.null(result$result))
      analysis_df$qusage_output[[row_nr]] <- result$result
    analysis_df$qusage_contrast[row_nr] <- qusage.contrast
    if (!is.null(result$error))
      analysis_df$error[[row_nr]] <- result$error
  }
  saveRDS(analysis_df, longitudinal_qusage_young_vs_old_df_path)

  longitudinal_qusage_young_vs_old_df=analysis_df
}

#Combine results across vaccines
longitudinal_qusage_young_vs_old_meta_df_path <- file.path(extra_data_dir, "longitudinal_qusage_young_vs_old_meta_df.rds")
if (file.exists(longitudinal_qusage_young_vs_old_meta_df_path)) {
  longitudinal_qusage_young_vs_old_meta_df <- readRDS(longitudinal_qusage_young_vs_old_meta_df_path)
} else {
  longitudinal_qusage_young_vs_old_meta_df <-
                longitudinal_qusage_young_vs_old_df %>%
                meta_qusage(group_cols = c("pathogen",
                                           "vaccine_type",
                                           "time_post_last_vax"))
  saveRDS(longitudinal_qusage_young_vs_old_meta_df, longitudinal_qusage_young_vs_old_meta_df_path)
}

#Tidy results
tidy_longitudinal_qusage_young_vs_old_meta_df <-
  longitudinal_qusage_young_vs_old_meta_df %>%
  tidy_qusage_output(col = "meta_qusage_output") %>%
  dplyr::mutate(FDR = p.adjust(p.value, method = "BH"),
                FDR_signif_label = create_signif_label(FDR))

#Plot
plot_df <- tidy_longitudinal_qusage_young_vs_old_meta_df %>%
  dplyr::filter(FDR < FDR_THRESHOLD) %>%
  dplyr::arrange(activity_score) %>%
  dplyr::mutate(geneset = factor(geneset, levels = unique(geneset)),
                pvt = paste0(pathogen, " (", vaccine_type, ")"))

pvts <- plot_df %>%
  dplyr::pull(pvt) %>%
  unique()

for (cur_pvt in pvts) {
  p <- ggplot(plot_df %>%
                dplyr::filter(pvt == cur_pvt),
              aes(x = geneset, y = activity_score, fill = activity_score > 0)) +
    labs(x = "BTM", y = "Activity score") +
    geom_bar(stat = "identity") +
    geom_text(aes(label = FDR_signif_label),
              size = 6,
              y = 0.5) +
    ylim(c(NA, 0.6)) +
    coord_flip() +
    scale_fill_manual(values = c("TRUE" = "firebrick", "FALSE" = "navy")) +
    theme_bw() +
    theme(legend.position = "none") +
    facet_grid(.~time_post_last_vax) +
    ggtitle(cur_pvt)
  print(p)
}

Figure 6d. Bar plot of the day 7 AUC of module M156.1 across vaccines in young and older participants. Data are mean values ± 95% CIs. n = 2,000 bootstrap replicates.

exp_FC_old_path <- file.path(extra_data_dir, "Old_FCH_esetList.BTM.rda")

eset <- readRDS(file.path(data_dir, paste0(date_prefix, "old_norm_batchCorrectedFromYoung_withResponse_eset.rds")))

#Abbreviate vaccine types
eset$vaccine_type_full=eset$vaccine_type
eset$vaccine_type=str_replace_all(eset$vaccine_type,vaccine_abrev)

#Merge pre and post-vax samples to same matrix for SDY1529
eset$matrix[eset$study_accession=='SDY1529']='SDY1529_WholeBlood_HealthyAdults'

# Code below adds row-names equal to uid (uid is columnID for the Expression Data)
rownames(pData(eset))<-pData(eset)$uid
# ensuring order of pheno-data matchs expression data. 
pData(eset)<-pData(eset)[colnames(Biobase::exprs(eset)),]

#Create column with combined vaccine type/pathogen
eset$pt=paste0(eset$pathogen,"_",eset$vaccine_type)

# Time points to calculate
tp_int=c(0, 7)

#Create combined vaccine type/pathogen column
eset$pt=paste0(eset$pathogen,"_",eset$vaccine_type)

#Remove subjects with Inf/NA MFC measurements
eset=eset[,!is.na(eset$MFC)]
eset=eset[,!(eset$MFC == Inf)]  

#For subjects with D-7 but no D0 measurement, convert to D0 for FC calculation
sub_noD0=setdiff(eset[,which(eset$study_time_collected==-7)]$participant_id,eset[,which(eset$study_time_collected==0)]$participant_id)
eset$study_time_collected[intersect(which(eset$study_time_collected==-7),which(eset$participant_id %in% sub_noD0))]=0

#Find samples from timepoints of interest
ind=lapply(tp_int, function(x) which(eset$study_time_collected==x))
#Retain only samples from timepoints of interest
eset=eset[,Reduce(union,ind)]
#Recompute timepoint indices after removing extraneous timepoints
ind=lapply(tp_int, function(x) which(eset$study_time_collected==x))

#Remove samples from a single study with fewer than sample_cutoff samples at any timepoint
matrix_uni_tp=lapply(ind,function(x) unique(eset[,x]$matrix))
matrix_ind=lapply(1:length(ind),function(x)
  lapply(1:length(matrix_uni_tp[[x]]), function(y) which(matrix_uni_tp[[x]][[y]]==eset[,ind[[x]]]$matrix)))
ind_cut_all=vector()
for (i in 1:length(matrix_ind)) {
  ind_cut=which(sapply(matrix_ind[[i]],length)<sample_cutoff)
  if (is_empty(ind_cut)==FALSE) {
    for (j in 1:length(ind_cut)) {
      ind_cut_all=c(ind_cut_all,ind[[i]][matrix_ind[[i]][[ind_cut[j]]]])
    }
  }
}
if (is_empty(ind_cut_all)==FALSE) {
  eset=eset[,-ind_cut_all]
}

#Recompute timepoint indices after removing samples
tp_int=unique(eset$study_time_collected[which(eset$study_time_collected>=0)])
ind=lapply(tp_int, function(x) which(eset$study_time_collected==x))

PD<-pData(eset)
unique(subset(PD,participant_id%in%participant_id[ind[[2]]])$study_time_collected)

eset=eset[complete.cases(Biobase::exprs(eset)),]

#Convert to ssGSEA scores and compute FC, unless cache alr
if (file.exists(exp_FC_old_path)) {
  load(exp_FC_old_path)
} else {

  BTM_list <- readRDS(file = file.path(data_dir, "btm_list_2020_12_23.rds"))

  # Gene names
  GeneNames<-featureNames(eset)

  #Remove BTMs which have no matching genes in dataset
  BTM_list=BTM_list[!sapply(BTM_list, function(x){is_empty(intersect(x, GeneNames))})]

  eset_BTM<-gsva(expr=eset, ## Gene-expression data, rows genes, cols samples
                 BTM_list, ## PTW list
                 verbose=TRUE,
                 method="ssgsea")

  #Replace eset with BTM eset
  eset=eset_BTM
  rm(eset_BTM)

  #Compute D0 normalized FC - for individual genes/BTMs - exp_FC has 3 large matrix with D1, D3, D7 timepoints
  ind_D0=which(0==eset$study_time_collected)
  common=lapply(2:length(ind),function(x) intersect(eset$participant_id[ind[[x]]],eset$participant_id[ind_D0]))
  #get participant id with timepoint and D0 data
  ia=lapply(2:length(ind),function(x) na.omit(match(common[[x-1]],eset$participant_id[ind[[x]]])))
  #included participant IDs match with participant IDs at diff timepoints in eset
  ib=lapply(2:length(ind),function(x) na.omit(match(common[[x-1]],eset$participant_id[ind_D0])))
  #participant IDs match with D0 to get indices in ind_D0
  exp_FC=lapply(2:length(ind),function(x) eset[,ind[[x]][ia[[x-1]]]])  #get expression data of the 3 timepoints

  exp_FC=lapply(2:length(ind),
                function(x) {Biobase::exprs(exp_FC[[x-1]])=Biobase::exprs(exp_FC[[x-1]])-Biobase::exprs(eset[,ind_D0[ib[[x-1]]]])
                exp_FC[[x-1]]
                }) #compute FC

  names(exp_FC)<-paste0("Day", tp_int[-1])

  exp_FC_old <- exp_FC

  save(exp_FC_old, file=exp_Fold_path)
}

#Wrangle data to ML format

lapply(names(exp_FC_old), function(DAY){

eset<-exp_FC_old[[DAY]]

PD<-pData(eset)

# remove patients with duplicate time points
PD$SubjTime<-paste0(PD$participant_id,"_",PD$study_time_collected)
tab<-table(PD$SubjTime)

patientsDuplic<-names(tab[tab!=1])

## Keep only patients with 0, 1 and 3 days
numbrero<-as.numeric(gsub("Day", "", DAY))

PD.sub<-subset(PD, study_time_collected%in%c(numbrero))

PD.use<-PD.sub
PD.use<-droplevels(PD.use)

UID<-PD.use$uid

## Prepare expression data
EXP<-Biobase::exprs(eset)

## Figure out a good filter
SDVec<-apply(EXP, 1, sd)

GENES<-names(SDVec[SDVec>=quantile(SDVec,0.75)])

## I will keep genes with an SD>= the median SD

## Need to wrangle this into ML format: Gene_Time by PTID
EXP.melt<-reshape2::melt(EXP)
colnames(EXP.melt)[2]="uid"

EXP.PD<-inner_join(x=EXP.melt[EXP.melt$uid%in%PD.use$uid,],
                   y=PD.use[,c("uid","participant_id","study_time_collected","time_post_last_vax",
                               "age_imputed",'study_accession',
                               "gender","pathogen","vaccine_type","maxRBA_p30","MFC_p30","MFC","maxRBA")],
                   by="uid")

EXP.PD$Gene_Time<-paste0(EXP.PD$Var1, "_",EXP.PD$study_time_collected)
EXP.PD$pt<-paste0(EXP.PD$pathogen, "_", EXP.PD$vaccine_type)

data.ML<-as.data.frame(dcast.data.table(as.data.table(EXP.PD),
                                        formula=Var1+Gene_Time+study_time_collected~participant_id,
                                        value.var = "value"))

rownames(data.ML)<-gsub(paste0("_",numbrero,"$"), 
                        paste0("_D",numbrero,"FCH"), 
                        data.ML$Gene_Time)

data.ML.filter<-data.ML[data.ML$Var1%in%GENES,-c(1:3)]

OLD_data.ML.filter <- data.ML.filter

save(OLD_data.ML.filter, file=paste0(extra_data_dir,"/OLD_ML.Data.", paste0("D",numbrero,"FCH."),"BTM.SDFilter75.rda"))

data.ML<-data.ML[,-c(1:3)]

OLD_data.ML <- data.ML

save(OLD_data.ML, file=paste0(extra_data_dir,"/OLD_ML.Data.", paste0("D",numbrero,"FCH."),"BTM.NoFilter.rda"))

PatientInfo<-EXP.PD[!duplicated(EXP.PD$participant_id),
                    c('age_imputed','gender','pt','study_accession',
                      'maxRBA_p30','MFC_p30','participant_id','maxRBA','MFC')]

rownames(PatientInfo)<-PatientInfo$participant_id
PatientInfo<-as.data.frame(PatientInfo)

OLD_PatientInfo <- PatientInfo

save(OLD_PatientInfo, file=paste0(extra_data_dir,"/OLD_PD.Master.",
                              paste0("D", numbrero, "FCH."),"BTM.rda"))

})

#Prior to testing in elderly, need to pull out young data for comparison (does not get stored above as it is within a loop)
i=1
Time=db.run[i, "Var1"] # eg BL.GENES, BL.BTM

VERSION<-db.run[i, "Var2"]

#SEED=1987
SEED=42

#Load data

### Get ID with Both ### Added from prior ImmuneSpace version
ID.UNIFORM<-get(load(paste0(extra_data_dir,"/ML.Data.D7FCH.BTM.NoFilter.rda")))

load(paste0(extra_data_dir,"/ML.Data.D7FCH.BTM.SELECT.PEAK.NoFilter.rda"))

ID.SELECT <- data.ML.final

ID.KEEP<-intersect(colnames(ID.UNIFORM), colnames(ID.SELECT))

if(grepl("SELECT",Time)==FALSE){
  load(paste0(extra_data_dir,"/ML.Data.",Time,".NoFilter.rda"))
  load(paste0(extra_data_dir,"/PD.Master.",Time,".rda"))
  data.ML.filter=data.ML}

if(grepl("SELECT",Time)==TRUE){
  Time2<-paste0(Time, VERSION)
  load(paste0(extra_data_dir,"/ML.Data.",Time2,".NoFilter.rda"))
  data.ML.filter=data.ML.final
  PatientInfo=PatientInfo.Final}

PatientInfo<-subset(PatientInfo, MFC_p30%in%c("lowResponder","highResponder"))
PatientInfo<-droplevels(PatientInfo)
PatientInfo$MFC_p30<-factor(PatientInfo$MFC_p30, levels = c("highResponder", "lowResponder"))

DevID<-rownames(PatientInfo)

DevID<-intersect(DevID, colnames(data.ML.filter))
X.Dev<-data.ML.filter[,DevID]
rownames(X.Dev)<-strsplit2(rownames(X.Dev), "_")[,1]

Y.Dev<-PatientInfo[DevID, 'MFC_p30']
names(Y.Dev)<-DevID

keep.genes<-grep("M156.1", rownames(X.Dev), value = T)

X.Train.t1<-t(X.Dev[keep.genes,])

Y.Train1<-PatientInfo[rownames(X.Train.t1), 'MFC_p30']
names(Y.Train1)<-rownames(X.Train.t1)

X.Train.t1<-X.Train.t1[intersect(ID.KEEP, rownames(X.Train.t1)),,drop=F]

#Test predictions in elderly
load(paste0(extra_data_dir,"/OLD_ML.Data.D7FCH.BTM.NoFilter.rda"))
load(paste0(extra_data_dir,"/OLD_PD.Master.D7FCH.BTM.rda"))

PTID_HIGH_LOW <- rownames(OLD_PatientInfo[OLD_PatientInfo$MFC_p30 %in% c("lowResponder", "highResponder"),])

MODEL_TO_USE <- MODEL.LIST$D7FCH.BTM.SELECT.PEAK

DATA <- t(OLD_data.ML[,PTID_HIGH_LOW])
colnames(DATA) <- gsub("_D7FCH", "", colnames(DATA))
DATA <- DATA[,"plasma cells, immunoglobulins (M156.1)", drop=F]

PREDS_OLD <- predict(MODEL_TO_USE, DATA, type = "prob") %>% 
  rownames_to_column('participant_id') %>%
  full_join(y=dplyr::select(OLD_PatientInfo[PTID_HIGH_LOW,], participant_id, pt, MFC_p30)) %>%
  mutate(Age_Group="Older")

PREDS_YOUNG <- predict(MODEL_TO_USE, X.Train.t1, type = "prob") %>% 
  rownames_to_column('participant_id') %>%
  full_join(y=dplyr::select(PatientInfo[rownames(X.Train.t1),], participant_id, pt, MFC_p30)) %>%
  filter(pt %in% PREDS_OLD$pt) %>%
  mutate(Age_Group="Young")

PREDS_ALL <- full_join(PREDS_OLD, PREDS_YOUNG)

perf.stats.by.vaccine.OLD<-plyr::ddply(.data=PREDS_ALL,
                             .variable=c("pt", "Age_Group"),
                             .fun=function(df){

                               if(nrow(df)!=0 & length(unique(df$MFC_p30))==2){

                                 ROC.OBJ<-pROC::roc(response = df$MFC_p30, levels=c("lowResponder", "highResponder"),
                                                    direction="<", 
                                                    predictor = df$highResponder)

                                 CI <- pROC::ci.auc(ROC.OBJ)

                                 DB2 <- data.frame(AUC=ROC.OBJ$auc,
                                                   CI_UP=CI[3],
                                                   CI_DOWN=CI[1])

                                 return(DB2)}else{return(NULL)}}, .parallel=F)

perf.stats.by.vaccine.OLD<-arrange(perf.stats.by.vaccine.OLD, desc(AUC))

pd<-position_dodge(0.9)

perf.stats.by.vaccine.OLD$AUC=as.numeric(perf.stats.by.vaccine.OLD$AUC)

p<-ggplot(perf.stats.by.vaccine.OLD %>%
            mutate(Age_Group=factor(Age_Group, levels = c("Young", "Older"))),
          aes(x=reorder(pt, -AUC), 
              y=AUC, 
              fill=Age_Group))+
  geom_bar(stat='identity', show.legend = T, position = pd)+
  scale_fill_brewer(palette = "Set3")+
  geom_errorbar(aes(ymin = CI_DOWN, ymax = CI_UP), width = 0.25, position = pd)+
  labs(y="AUC (CI 95%)", x=NULL, fill="Age Group")+
  theme_bw()+
  theme(axis.text.x = element_text(angle = 60, hjust = 1, size=8))+
  ylim(0,1)

print(p)

Ext. Data Figure 5a. Scatterplots of module activity scores in each vaccine among young (x-axis) and older participants (y-axis) of the most commonly expressed modules (Fig. 2a) on days 1–7. Correlation coefficient and p value determined via Pearson correlation.

#Create data frame with gene set enrichment scores for postvax vs prevax time points in older adult cohort.
longitudinal_qusage_old_df_path <- file.path(extra_data_dir, "longitudinal_qusage_old_df.rds")
if (file.exists(longitudinal_qusage_old_df_path)) {
  longitudinal_qusage_old_df <- readRDS(longitudinal_qusage_old_df_path)
} else {
  old.noNorm.noResponse.filtered <- old.noNorm.noResponse %>%
    keep_most_recent_prevax_time_point(drop_postvax = FALSE) %>%
    remove_any_NA_rows()

  # Check that there are no repeated measures at any time point
  stopifnot(Biobase::pData(old.noNorm.noResponse.filtered) %>%
              dplyr::group_by(participant_id, time_post_last_vax) %>%
              dplyr::filter(n() > 1) %>%
              nrow() == 0)

  # Check that every participant has a baseline time point
  stopifnot(Biobase::pData(old.noNorm.noResponse.filtered) %>%
              dplyr::group_by(participant_id) %>%
              dplyr::filter(!any(time_post_last_vax <= 0)) %>%
              nrow() == 0)

  btm_list.filtered <- keep_symbols(
    genesets.symbol = btm_list,
    symbols2keep = rownames(old.noNorm.noResponse.filtered))

  # Gene set enrichment analysis: postvax vs prevax time points ------------------

  eset <- old.noNorm.noResponse.filtered

  analysis_df <- Biobase::pData(eset) %>%
    dplyr::filter(time_post_last_vax > 0) %>%
    dplyr::select(pathogen,
                  vaccine_type,
                  time_post_last_vax,
                  study_accession) %>%
    dplyr::arrange(pathogen, vaccine_type, time_post_last_vax,
                   study_accession) %>%
    dplyr::distinct() %>%
    dplyr::mutate(n_participants = NA,
                  qusage_contrast = NA,
                  qusage_output = list(NULL),
                  error = list(NULL)) %>%
    dplyr::as_tibble()

  safe_qusage <- purrr::safely(.f = qusage::qusage,
                               otherwise = NULL, quiet = TRUE)

  for (row_nr in 1:nrow(analysis_df)) {
    row <- analysis_df[row_nr,, drop = FALSE]
    cat(paste0("Row ", row_nr, "/", nrow(analysis_df),
               ": ", row$pathogen, " ",
               row$vaccine_type, " ", row$time_post_last_vax, " ",
               row$study_accession, "\n"))
    cur_eset <- eset[, eset$pathogen == row$pathogen &
                       eset$vaccine_type == row$vaccine_type &
                       eset$study_accession == row$study_accession &
                       (eset$time_post_last_vax <= 0 |
                          eset$time_post_last_vax == row$time_post_last_vax)]

    prevax_string <- "baseline"
    postvax_string <- paste0("Day", row$time_post_last_vax)

    qusage.contrast <- paste0(postvax_string, "-", prevax_string)

    qusage.labels <- ifelse(
      test = cur_eset$time_post_last_vax == row$time_post_last_vax,
      yes = postvax_string,
      no = prevax_string)
    result <- safe_qusage(eset = cur_eset,
                          labels = qusage.labels,
                          contrast = qusage.contrast,
                          geneSets = btm_list.filtered,
                          pairVector = cur_eset$participant_id)
    analysis_df$n_participants[row_nr] <-
      length(unique(cur_eset$participant_id))
    analysis_df$qusage_contrast[row_nr] <- qusage.contrast
    if (!is.null(result$result))
      analysis_df$qusage_output[[row_nr]] <- result$result

    if (!is.null(result$error))
      analysis_df$error[[row_nr]] <- result$error
  }

  saveRDS(analysis_df, longitudinal_qusage_old_df_path)
  longitudinal_qusage_old_df <- analysis_df
}

#Create data frame with meta analysis results of gene set enrichment scores from postvax vs prevax time point of young adults.

longitudinal_qusage_old_meta_df_path <- file.path(extra_data_dir, "longitudinal_qusage_old_meta_df.rds")
if (file.exists(longitudinal_qusage_old_meta_df_path)) {
  longitudinal_qusage_old_meta_df <- readRDS(longitudinal_qusage_old_meta_df_path)
} else {
  longitudinal_qusage_old_meta_df <- longitudinal_qusage_old_df %>%
    meta_qusage(group_cols = c("pathogen",
                               "vaccine_type",
                               "time_post_last_vax"))
  saveRDS(longitudinal_qusage_old_meta_df, longitudinal_qusage_old_meta_df_path)
}
tidy_longitudinal_qusage_old_meta_df <- longitudinal_qusage_old_meta_df %>%
  tidy_qusage_output(col = "meta_qusage_output") %>%
  dplyr::mutate(FDR = p.adjust(p.value, method = "BH"),
                FDR_signif_label = create_signif_label(FDR))
#Plot
plot_df <- dplyr::inner_join(
  tidy_longitudinal_qusage_young_meta_df %>%
    dplyr::select(pathogen, vaccine_type, time_post_last_vax, geneset,
                  activity_score),
  tidy_longitudinal_qusage_old_meta_df %>%
    dplyr::select(pathogen, vaccine_type, time_post_last_vax, geneset,
                  activity_score),
  by = c("pathogen", "vaccine_type", "time_post_last_vax", "geneset"),
  suffix = c(".young", ".old")) %>%
  dplyr::filter(time_post_last_vax %in% c(1, 3, 7))
lm_df <- plot_df %>%
  dplyr::group_by(pathogen, vaccine_type, time_post_last_vax) %>%
  tidyr::nest() %>%
  dplyr::ungroup() %>%
  dplyr::mutate(
    lm = purrr::map(
      .x = data,
      ~lm(activity_score.old ~ activity_score.young, data = .x)),
    r2 = purrr::map(
      .x = lm,
      ~tibble(
        adj.r.squared = summary(.x)$adj.r.squared,
        r.squared = summary(.x)$r.squared,
        intercept = coef(.x)[1],
        slope = coef(.x)[2],
        p.value = broom::glance(.x)$p.value)
    )
  ) %>%
  tidyr::unnest(r2) %>%
  dplyr::mutate(signif_label = cut(p.value, c(-Inf, .001, .01, .05, Inf),
                                   labels = c("***", "**", "*", "ns")),
                r2_label = sapply(adj.r.squared, function(x)
                  substitute(
                    expr = italic(R)^2~"="~r2,
                    env = list(r2 = format(x, digits = 2)))))

p <- ggplot(plot_df, aes(x = activity_score.young, y = activity_score.old,
                         color = pathogen)) +
  geom_point() +
  geom_vline(xintercept= 0, color = "gray") +
  geom_hline(yintercept = 0, color = "gray") +
  geom_abline(intercept = 0, slope = 1, color = "lightgray") +
  geom_abline(data = lm_df, aes(intercept = intercept, slope = slope),
              color = "lightgray", linetype = "dashed") +
  geom_text(data = lm_df,
            x = -0.2, y = 0.9,
            aes(label = r2_label),
            color = "black",
            parse = TRUE,
            show.legend = FALSE) +
  geom_text(data = lm_df, x = -0.2, y = 0.6,
            aes(label = paste0("p = ",
                               formatC(p.value, digits = 2, format = "e"),
                               " (", signif_label, ")")),
            color = "black") +
  coord_equal(xlim = c(-1, 1), ylim = c(-1, 1)) +
  theme_bw(base_size = 24) +
  theme(axis.text.x = element_text(angle = 90, vjust = 0.5),
        legend.position = "bottom",
        legend.title = element_blank()) +
  facet_grid(pathogen + vaccine_type ~ paste0("Day ", time_post_last_vax)) +
  ggtitle("Common response module activity")
print(p)

Session Info

sessionInfo()


RGLab/ImmuneSignatures2 documentation built on Dec. 9, 2022, 10:51 a.m.