In RGLab/ImmuneSignatures2: Pre-Processing for HIPC ImmuneSignatures2 Analysis

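Description: Script used to generate the cohort-description figures: the sex and race distribution of unique participants per pathogen / vaccine type, and the distribution of `raptor_age` per pathogen.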

Load packages

library(stats) #prcomp
library(RAPToR)
library(Biobase) #pData
library(tidyverse)
library(reshape2)
library(RColorBrewer)
library(cowplot) #to plot side-by-side

# Fix the RNG seed up front so repeated runs give the same results
set.seed(seed = 123)

# Locations of the virtual-study ExpressionSets. The normalized, noResponse
# versions are used for now because they include more studies.
path_to_old_eset <-
  "~/Documents/Coding/R/HIPC/IOF/2020_08_10_extendedOld_norm_noResponse_ageImputation_eset.rds"
path_to_young_eset <-
  "~/Documents/Coding/R/HIPC/IOF/2020_08_10_young_norm_noResponse_ageImputation_eset.rds"

Load / Separate Data

# Load the young and old virtual-study ExpressionSets from disk
IS2_eset_noResponse_norm_young <- readRDS(file = path_to_young_eset)
IS2_eset_noResponse_norm_old <- readRDS(file = path_to_old_eset)

# Coerce an age column to numeric.
# Factors are converted via as.character() first, because as.numeric() on a
# factor returns the integer level codes rather than the values the labels
# spell out. Non-factor input goes straight through as.numeric(), exactly as
# before.
age_to_numeric <- function(x) {
  if (is.factor(x)) {
    x <- as.character(x)
  }
  as.numeric(x)
}

# Extract the phenotype table of an ExpressionSet as a tibble.
#   eset: object whose @phenoData slot is readable by Biobase::pData()
# Returns a tibble with `uid` moved to the first column and `age_reported` /
# `age_imputed` coerced to numeric.
extract_pdata <- function(eset) {
  as_tibble(pData(eset@phenoData)) %>%
    dplyr::select(uid, everything()) %>%
    mutate(
      age_reported = age_to_numeric(age_reported),
      age_imputed = age_to_numeric(age_imputed)
    )
}

# Young cohort: phenotype table and expression matrix
pdata_df_young <- extract_pdata(IS2_eset_noResponse_norm_young)
young_exprs <- exprs(IS2_eset_noResponse_norm_young)

# Old cohort: phenotype table and expression matrix
pdata_df_old <- extract_pdata(IS2_eset_noResponse_norm_old)
old_exprs <- exprs(IS2_eset_noResponse_norm_old)

# Combine the phenotype tables. rbind() requires both tables to have the same
# columns, so a mismatch between the two esets fails loudly here.
pdata_df <- rbind(pdata_df_young, pdata_df_old)
#exprs_mat <- cbind(young_exprs, old_exprs)

Distribution of sex and race by pathogen / vaccine type:

# Demographic summary figure: for every pathogen / vaccine-type panel, the
# share of unique participants falling in each sex and race category.

# Drop study SDY1293 (the malaria study, going by the variable name), then keep
# one row per participant / sex / race / pathogen / vaccine-type combination.
pdata_no_malaria <- filter(pdata_df, study_accession != "SDY1293")
pdata_df_uniqueSubjects <- pdata_no_malaria %>%
  distinct(participant_id, gender_imputed, race, pathogen, vaccine_type) %>%
  mutate(pathogen_vacc = paste0(pathogen, "\n(", vaccine_type, ")"))

# Per-panel frequency of each level of one demographic column.
#   subjects:     table with a `pathogen_vacc` column and the column below
#   category_col: unquoted name of the demographic column to summarise
#   prefix:       string prepended to each level (e.g. "SEX_") so levels from
#                 different columns stay distinct once the tables are stacked
# Returns a tibble with exactly three columns:
#   category      - prefixed level
#   pathogen_vacc - panel label with "\nn = <panel total>" appended
#   freq          - count divided by the panel total
# Rows are sorted by ascending count before relabelling; the colour-name
# indices further down rely on that order, so it must not change.
summarise_category_freq <- function(subjects, category_col, prefix) {
  subjects %>%
    count({{ category_col }}, pathogen_vacc) %>%
    arrange(n) %>%
    group_by(pathogen_vacc) %>%
    mutate(pathogen_total = sum(n)) %>%
    ungroup() %>%
    # transmute() keeps only the three named columns. The original renamed a
    # 4-column table with 3 names, leaving `pathogen_total` without a valid
    # column name.
    transmute(
      category = paste0(prefix, {{ category_col }}),
      pathogen_vacc = paste0(pathogen_vacc, "\nn = ", pathogen_total),
      freq = n / pathogen_total
    )
}

sex_counts_df <- summarise_category_freq(
  pdata_df_uniqueSubjects, gender_imputed, "SEX_"
)
race_counts_df <- summarise_category_freq(
  pdata_df_uniqueSubjects, race, "RACE_"
)

total_counts <- rbind(sex_counts_df, race_counts_df)

# Choose colors to keep consistent color scheme (scheme made in other figure).
# The names are picked by position from unique(total_counts$pathogen_vacc), so
# this mapping silently changes if the row order of total_counts changes.
# "Malaria (Recombinant protein)" only holds its slot in the palette: every
# label in this plot ends in "\nn = <total>", so no panel can match it.
mycolors <- c(
  "#8DD3C7", "#F9F9B6", "#C8B0C8", "#DC8C8A", "#A9B2AD", "#DEC564", "#D7D5A6",
  "#E7D3DE", "#C59DC6", "#439688", "#B96915", "#7D6EA4", "#E7298A"
)
names(mycolors) <- c(
  unique(total_counts$pathogen_vacc)[c(2, 11, 3, 12, 8)],
  "Malaria (Recombinant protein)",
  unique(total_counts$pathogen_vacc)[c(7, 6, 5, 1, 4, 9, 10)]
)

# Axis order, top to bottom: sex categories first, then race categories.
# Levels absent from this vector (e.g. an NA race, "RACE_NA") are not drawn.
category_order <- c(
  "SEX_Male", "SEX_Female",
  "RACE_White", "RACE_Black or African American", "RACE_Asian",
  "RACE_American Indian or Alaska Native", "RACE_Other",
  "RACE_Not Specified", "RACE_Unknown"
)
# Display labels are the category names without the SEX_/RACE_ prefix
category_labels <- sub("^(SEX|RACE)_", "", category_order)

p <- ggplot(total_counts) +
  geom_col(
    aes(x = freq, y = category, fill = pathogen_vacc),
    color = "black", lwd = .2
  ) +
  # With 9 reversed levels, 7.5 falls between the two sex rows (positions 8-9)
  # and the seven race rows (positions 1-7)
  geom_hline(yintercept = 7.5, color = "grey", lwd = .4) +
  facet_wrap(vars(pathogen_vacc), nrow = 2) +
  scale_y_discrete(
    limits = rev(category_order),
    labels = rev(category_labels)
  ) +
  # freq is a proportion; label the axis in percent
  scale_x_continuous(
    breaks = c(0, .2, .4, .6, .8, 1),
    labels = c(0, 20, 40, 60, 80, 100)
  ) +
  scale_fill_manual(values = mycolors) +
  # "none" replaces the deprecated `FALSE` for hiding a legend
  guides(fill = "none") +
  xlab("Distribution (%)") +
  ylab(NULL) +
  theme_bw() +
  theme(axis.text.x = element_text(angle = 90))

Distribution of ages:

# One row per participant / pathogen / raptor_age / study combination
pdata_df_uniqueSubjects <- distinct(
  pdata_df, participant_id, pathogen, raptor_age, study_accession
)

# Count of subjects at each distinct raptor_age value, one panel per pathogen.
# The original call was geom_histogram(stat = "count", binwidth = 3): the count
# stat has no `binwidth` parameter, so ggplot2 ignored it with a warning and
# drew one bar per distinct value. geom_bar() draws that same plot without the
# warning.
# NOTE(review): if 3-unit age bins were intended (and raptor_age is numeric),
# use geom_histogram(aes(...), binwidth = 3) instead - confirm with the author.
p <- ggplot(pdata_df_uniqueSubjects) +
  geom_bar(aes(x = raptor_age, fill = pathogen)) +
  facet_wrap(vars(pathogen), nrow = 1) +
  # "none" replaces the deprecated `FALSE` for hiding a legend
  guides(fill = "none") +
  theme_bw()