In RGLab/ImmuneSignatures2: Pre-Processing for HIPC ImmuneSignatures2 Analysis

Description: This file takes two separate eset files (currently 2021_02_11) and implements the RAPToR package to impute age for studies without age_reported. All possible feature combinations are tried, and the highest R^2 values (calculated for studies with age_reported against their RAPToR imputed ages) are returned.

Load packages

library(stats)
library(RAPToR)
library(Biobase)
library(tidyverse)

set.seed(123)

dataCacheDir <- params$dataCacheDir
outputDir <- file.path(params$dataCacheDir, "age_imputed")
timestamp <- params$timestamp
if (!dir.exists(dataCacheDir)) stop(dataCacheDir, " does not exist")
if (!dir.exists(outputDir)) dir.create(outputDir)
# Paths to virtual studies: Currently using the normalized, noResponse esets (to include more studies)
# path_to_young_eset <- "~/Documents/Coding/Data/HIPC/2020_08_10_young_norm_noResponse_eset.rds"
# path_to_old_eset <- "~/Documents/Coding/Data/HIPC/2020_08_10_extendedOld_norm_noResponse_eset.rds"
path_to_young_eset <- file.path(here::here(dataCacheDir), paste0(timestamp, "young_norm_eset.rds"))
path_to_old_eset <- file.path(here::here(dataCacheDir), paste0(timestamp, "extendedOld_norm_batchCorrectedFromYoung_eset.rds"))

# Best functions: (found from IOF_RAPToR_age_imputation_all_functions.rmd)
functions_young <- readRDS(file.path(outputDir, paste0(timestamp, "raptor_young_functs.rds")))
functions_old <- readRDS(file.path(outputDir, paste0(timestamp, "raptor_extendedOld_functs.rds")))

functions_young <- arrange(functions_young, -r.squared)
functions_old <- arrange(functions_old, -r.squared)

best_young_funct <- functions_young$Funct[1]
best_old_funct <- functions_old$Funct[1]
print(paste0("YOUNG: ", best_young_funct))
print(paste0("OLD: ", best_old_funct))

Imputing via young best function:

# Load data
IS2_eset_noResponse_norm_young <- readRDS(file = path_to_young_eset)

# Subset data: 
pdata_df <- as_tibble(pData(IS2_eset_noResponse_norm_young@phenoData))
pdata_df <- pdata_df %>% dplyr::select(uid, everything())
pdata_df$age_reported <- as.numeric(pdata_df$age_reported)
pdata_df$age_imputed <- as.numeric(pdata_df$age_imputed)
pdata_df_young <- pdata_df
# exprs_mat -> rows = genes, cols = samples
exprs_mat <- exprs(IS2_eset_noResponse_norm_young)
IS2_eset <- IS2_eset_noResponse_norm_young

Make Reference Dataset

# Reference Dataset: 
# X - gene expression data [genes X samples]
# p - phenotypic data [samples X pheno_feat]

# pheno data (e.g time, batch)
p <- pdata_df

# gene expression data
X <- exprs_mat

# only use pre-vaccination for reference dataset:
pre_vacc_ind <- which(p$time_post_last_vax == 0)
p <- p[pre_vacc_ind,]
X <- X[,pre_vacc_ind]

# Choose which samples to use for reference dataset:
studies_without_ages <- unique(p$study_accession)[unique(p$study_accession) %in% c("SDY1260", "SDY1264","SDY1293","SDY1294","SDY1364","SDY1370","SDY1373","SDY984")]
test_sample_indices <- which(p$study_accession %in% studies_without_ages)
ref_sample_indices <- which(!p$study_accession %in% studies_without_ages)

# Make test sets:
p_test <- p[test_sample_indices,]
p <- p[ref_sample_indices,]

# Remove missing gene values
missing_indices <- sapply(1:nrow(X), function(j){
    return(any(is.na(X[j,])))
  })
X <- X[!missing_indices,]
X_test <- X[,test_sample_indices]
X <- X[,ref_sample_indices]

# Number of components
pca <- stats::prcomp(X, rank = 20)
nc <- sum(summary(pca)$importance[3, ] < .999) + 1 # Number of significant components

# Build Model:
funct <- best_young_funct
m <- ge_im(X = X, p = p, formula = funct, nc = nc)

  # Build interpolation data
  n.inter = 500
  ndat <- data.frame(
    age_reported = seq(min(p$age_reported),
              max(p$age_reported), l = n.inter),
    arm_accession = sample(unique(p$arm_accession), n.inter, replace = TRUE), 
    y_chrom_present = sample(c(TRUE, FALSE), n.inter, replace = TRUE),
    matrix = sample(unique(p$matrix),n.inter,replace = TRUE),
    study_accession = sample(unique(p$study_accession),n.inter,replace = TRUE),
    race = sample(unique(p$race), n.inter, replace = TRUE),
    ethnicity = sample(unique(p$ethnicity), n.inter, replace = TRUE),
    cohort = sample(unique(p$cohort),n.inter,replace = TRUE)
  )

# get interpolated GE matrix, as a reference
r_X <- list(interpGE = stats::predict(m, ndat), time.series = ndat$age_reported)

# Impute all ages
all_ae_X_young <- ae(exprs_mat[,pre_vacc_ind], r_X$interpGE, r_X$time.series)

Results for young

# test
ae_X_young <- ae(X, r_X$interpGE, r_X$time.series)

# Get regression:
results_young <- tibble(actual = p$age_reported, estimates = ae_X_young$age.estimates[,1])
lm_fit_young <- lm(actual ~ estimates, data=results_young)
# Plot:
p <- ggplot(results_young) +
  #geom_histogram(aes(x = estimates)) +
  geom_point(aes(x = actual, y = estimates)) +
  geom_smooth(aes(x = actual, y = actual), method='lm') +
  xlab("age_reported") +
  ylab("estimated age (from RAPToR)") +
  xlim(18, 50) +
  ylim(18, 50) +
  ggtitle(paste0("RAPToR Prediction vs. Actual Age: R^2 = ", round(summary(lm_fit_young)$r.squared, 3)))
ggsave(file.path(outputDir, "young_results.pdf"), plot = p)

Finding the extendedOld dataset's best function:

Load / Separate Data

# Load data
IS2_eset_noResponse_norm_old <- readRDS(file = path_to_old_eset)

# Subset data: 
pdata_df <- as_tibble(pData(IS2_eset_noResponse_norm_old@phenoData))
pdata_df <- pdata_df %>% dplyr::select(uid, everything())
pdata_df$age_reported <- as.numeric(pdata_df$age_reported)
pdata_df$age_imputed <- as.numeric(pdata_df$age_imputed)
pdata_df_old <- pdata_df
# exprs_mat -> rows = genes, cols = samples
exprs_mat <- exprs(IS2_eset_noResponse_norm_old)
IS2_eset <- IS2_eset_noResponse_norm_old

Make Reference Dataset

# Reference Dataset: 
# X - gene expression data [genes X samples]
# p - phenotypic data [samples X pheno_feat]

# pheno data (e.g time, batch)
p <- pdata_df

# gene expression data
X <- exprs_mat

# only use pre-vaccination for reference dataset:
pre_vacc_ind <- which(p$time_post_last_vax == 0)
p <- p[pre_vacc_ind,]
X <- X[,pre_vacc_ind]

# Choose which samples to use for reference dataset:
studies_without_ages <- unique(p$study_accession)[unique(p$study_accession) %in% c("SDY1260", "SDY1264","SDY1293","SDY1294","SDY1364","SDY1370","SDY1373","SDY984")]
test_sample_indices <- which(p$study_accession %in% studies_without_ages)
ref_sample_indices <- which(!p$study_accession %in% studies_without_ages)

# Make test sets:
p_test <- p[test_sample_indices,]
p <- p[ref_sample_indices,]

# Remove missing gene values
missing_indices <- sapply(1:nrow(X), function(j){
    return(any(is.na(X[j,])))
  })
X <- X[!missing_indices,]
X_test <- X[,test_sample_indices]
X <- X[,ref_sample_indices]

# Number of components
pca <- stats::prcomp(X, rank = 20)
nc <- sum(summary(pca)$importance[3, ] < .999) + 1 # Number of significant components

# Build Model:
funct <- best_old_funct
m <- ge_im(X = X, p = p, formula = funct, nc = nc)

# Build interpolation data
n.inter = 500
ndat <- data.frame(
  age_reported = seq(min(p$age_reported),
                     max(p$age_reported), l = n.inter),
  arm_accession = sample(unique(p$arm_accession), n.inter, replace = TRUE), 
  y_chrom_present = sample(c(TRUE, FALSE), n.inter, replace = TRUE),
  matrix = sample(unique(p$matrix),n.inter,replace = TRUE),
  study_accession = sample(unique(p$study_accession),n.inter,replace = TRUE),
  race = sample(unique(p$race), n.inter, replace = TRUE),
  ethnicity = sample(unique(p$ethnicity), n.inter, replace = TRUE),
  cohort = sample(unique(p$cohort),n.inter,replace = TRUE)
)

# get interpolated GE matrix, as a reference
r_X <- list(interpGE = stats::predict(m, ndat), time.series = ndat$age_reported)

# Impute all ages
all_ae_X_old <- ae(exprs_mat[,pre_vacc_ind], r_X$interpGE, r_X$time.series)

Results for old

# test
ae_X_old <- ae(X, r_X$interpGE, r_X$time.series)

# Get regression:
results_old <- tibble(actual = p$age_reported, estimates = ae_X_old$age.estimates[,1])
lm_fit_old <- lm(actual ~ estimates, data=results_old)
# Plot:
p <- ggplot(results_old) +
  #geom_histogram(aes(x = estimates)) +
  geom_point(aes(x = actual, y = estimates)) +
  geom_smooth(aes(x = actual, y = actual), method='lm') +
  xlab("age_reported") +
  ylab("estimated age (from RAPToR)") +
  xlim(50, 100) +
  ylim(50, 100) +
  ggtitle(paste0("RAPToR Prediction vs. Actual Age: R^2 = ", round(summary(lm_fit_old)$r.squared, 3)))
ggsave(file.path(outputDir, "extendedOld_results.pdf"), plot = p)

Last step: Saving a list (dictionary) with each participant and their Raptor_age:

# Save all ages:
young_dict <- tibble(uid = names(all_ae_X_young$age.estimates[,1]), raptor_age = unname(all_ae_X_young$age.estimates[,1]))
young_dict$participant_id <- sapply(1:nrow(young_dict), function(i){
  return(pdata_df_young$participant_id[which(young_dict$uid[i] == pdata_df_young$uid)])
})
saveRDS(young_dict, file = file.path(outputDir, paste0(timestamp, "young_RAPToR_age_dictionary.rds")))


old_dict <- tibble(uid = names(all_ae_X_old$age.estimates[,1]), raptor_age = unname(all_ae_X_old$age.estimates[,1]))
old_dict$participant_id <- sapply(1:nrow(old_dict), function(i){
  return(pdata_df_old$participant_id[which(old_dict$uid[i] == pdata_df_old$uid)])
})
saveRDS(old_dict, file = file.path(outputDir, paste0(timestamp, "oldExtended_RAPToR_age_dictionary.rds")))

Add to virtual study:

# Young:
RAPToR_age <- c()
for(i in 1:nrow(IS2_eset_noResponse_norm_young@phenoData@data))
{
  # If study doesn't have age_reported, use imputed age:
  if(IS2_eset_noResponse_norm_young@phenoData@data$study_accession[i] %in% c("SDY1260", "SDY1264","SDY1293", "SDY1294","SDY1364","SDY1370","SDY1373","SDY984")){
    ages <- young_dict$raptor_age[young_dict$participant_id == IS2_eset_noResponse_norm_young@phenoData@data$participant_id[i]]
    if(length(ages) > 1){
      RAPToR_age <- c(RAPToR_age, ages[1])
    }
    else{
      RAPToR_age <- c(RAPToR_age, ages)
    }
  } 
  else {
      RAPToR_age <- c(RAPToR_age, IS2_eset_noResponse_norm_young@phenoData@data$age_reported[i])
  }
}
IS2_eset_noResponse_norm_young@phenoData@data$raptor_age <- RAPToR_age
saveRDS(IS2_eset_noResponse_norm_young, 
        file.path(outputDir, paste0(timestamp, "young_norm_noResponse_ageImputation_eset.rds")))

# Old:
RAPToR_age <- c()
for(i in 1:nrow(IS2_eset_noResponse_norm_old@phenoData@data))
{
  # If study doesn't have age_reported, use imputed age:
  if(IS2_eset_noResponse_norm_old@phenoData@data$study_accession[i] %in% c("SDY1260", "SDY1264","SDY1293","SDY1294","SDY1364","SDY1370","SDY1373","SDY984")){
    ages <- old_dict$raptor_age[old_dict$participant_id == IS2_eset_noResponse_norm_old@phenoData@data$participant_id[i]]
    if(length(ages) > 1){
      RAPToR_age <- c(RAPToR_age, ages[1])
    }
    else{
      RAPToR_age <- c(RAPToR_age, ages)
    }
  } else {
      RAPToR_age <- c(RAPToR_age, IS2_eset_noResponse_norm_old@phenoData@data$age_reported[i])
    }
}
IS2_eset_noResponse_norm_old@phenoData@data$raptor_age <- RAPToR_age
saveRDS(IS2_eset_noResponse_norm_old, file.path(outputDir, paste0(timestamp, "extendedOld_norm_noResponse_ageImputation_eset.rds")))