Load packages
library(stats) library(RAPToR) library(Biobase) library(tidyverse) set.seed(123) dataCacheDir <- params$dataCacheDir outputDir <- file.path(params$dataCacheDir, "age_imputed") timestamp <- params$timestamp if (!dir.exists(dataCacheDir)) stop(dataCacheDir, " does not exist") if (!dir.exists(outputDir)) dir.create(outputDir) # Paths to virtual studies: Currently using the normalized, noResponse esets (to include more studies) # path_to_young_eset <- "~/Documents/Coding/Data/HIPC/2020_08_10_young_norm_noResponse_eset.rds" # path_to_old_eset <- "~/Documents/Coding/Data/HIPC/2020_08_10_extendedOld_norm_noResponse_eset.rds" path_to_young_eset <- file.path(here::here(dataCacheDir), paste0(timestamp, "young_norm_eset.rds")) path_to_old_eset <- file.path(here::here(dataCacheDir), paste0(timestamp, "extendedOld_norm_batchCorrectedFromYoung_eset.rds")) # Best functions: (found from IOF_RAPToR_age_imputation_all_functions.rmd) functions_young <- readRDS(file.path(outputDir, paste0(timestamp, "raptor_young_functs.rds"))) functions_old <- readRDS(file.path(outputDir, paste0(timestamp, "raptor_extendedOld_functs.rds"))) functions_young <- arrange(functions_young, -r.squared) functions_old <- arrange(functions_old, -r.squared) best_young_funct <- functions_young$Funct[1] best_old_funct <- functions_old$Funct[1] print(paste0("YOUNG: ", best_young_funct)) print(paste0("OLD: ", best_old_funct))
# Load data IS2_eset_noResponse_norm_young <- readRDS(file = path_to_young_eset) # Subset data: pdata_df <- as_tibble(pData(IS2_eset_noResponse_norm_young@phenoData)) pdata_df <- pdata_df %>% dplyr::select(uid, everything()) pdata_df$age_reported <- as.numeric(pdata_df$age_reported) pdata_df$age_imputed <- as.numeric(pdata_df$age_imputed) pdata_df_young <- pdata_df # exprs_mat -> rows = genes, cols = samples exprs_mat <- exprs(IS2_eset_noResponse_norm_young) IS2_eset <- IS2_eset_noResponse_norm_young
Make Reference Dataset
# Reference Dataset: # X - gene expression data [genes X samples] # p - phenotypic data [samples X pheno_feat] # pheno data (e.g time, batch) p <- pdata_df # gene expression data X <- exprs_mat # only use pre-vaccination for reference dataset: pre_vacc_ind <- which(p$time_post_last_vax == 0) p <- p[pre_vacc_ind,] X <- X[,pre_vacc_ind] # Choose which samples to use for reference dataset: studies_without_ages <- unique(p$study_accession)[unique(p$study_accession) %in% c("SDY1260", "SDY1264","SDY1293","SDY1294","SDY1364","SDY1370","SDY1373","SDY984")] test_sample_indices <- which(p$study_accession %in% studies_without_ages) ref_sample_indices <- which(!p$study_accession %in% studies_without_ages) # Make test sets: p_test <- p[test_sample_indices,] p <- p[ref_sample_indices,] # Remove missing gene values missing_indices <- sapply(1:nrow(X), function(j){ return(any(is.na(X[j,]))) }) X <- X[!missing_indices,] X_test <- X[,test_sample_indices] X <- X[,ref_sample_indices]
# Number of components pca <- stats::prcomp(X, rank = 20) nc <- sum(summary(pca)$importance[3, ] < .999) + 1 # Number of significant components # Build Model: funct <- best_young_funct m <- ge_im(X = X, p = p, formula = funct, nc = nc) # Build interpolation data n.inter = 500 ndat <- data.frame( age_reported = seq(min(p$age_reported), max(p$age_reported), l = n.inter), arm_accession = sample(unique(p$arm_accession), n.inter, replace = TRUE), y_chrom_present = sample(c(TRUE, FALSE), n.inter, replace = TRUE), matrix = sample(unique(p$matrix),n.inter,replace = TRUE), study_accession = sample(unique(p$study_accession),n.inter,replace = TRUE), race = sample(unique(p$race), n.inter, replace = TRUE), ethnicity = sample(unique(p$ethnicity), n.inter, replace = TRUE), cohort = sample(unique(p$cohort),n.inter,replace = TRUE) ) # get interpolated GE matrix, as a reference r_X <- list(interpGE = stats::predict(m, ndat), time.series = ndat$age_reported) # Impute all ages all_ae_X_young <- ae(exprs_mat[,pre_vacc_ind], r_X$interpGE, r_X$time.series)
# test ae_X_young <- ae(X, r_X$interpGE, r_X$time.series) # Get regression: results_young <- tibble(actual = p$age_reported, estimates = ae_X_young$age.estimates[,1]) lm_fit_young <- lm(actual ~ estimates, data=results_young) # Plot: p <- ggplot(results_young) + #geom_histogram(aes(x = estimates)) + geom_point(aes(x = actual, y = estimates)) + geom_smooth(aes(x = actual, y = actual), method='lm') + xlab("age_reported") + ylab("estimated age (from RAPToR)") + xlim(18, 50) + ylim(18, 50) + ggtitle(paste0("RAPToR Prediction vs. Actual Age: R^2 = ", round(summary(lm_fit_young)$r.squared, 3))) ggsave(file.path(outputDir, "young_results.pdf"), plot = p)
Load / Separate Data
# Load data IS2_eset_noResponse_norm_old <- readRDS(file = path_to_old_eset) # Subset data: pdata_df <- as_tibble(pData(IS2_eset_noResponse_norm_old@phenoData)) pdata_df <- pdata_df %>% dplyr::select(uid, everything()) pdata_df$age_reported <- as.numeric(pdata_df$age_reported) pdata_df$age_imputed <- as.numeric(pdata_df$age_imputed) pdata_df_old <- pdata_df # exprs_mat -> rows = genes, cols = samples exprs_mat <- exprs(IS2_eset_noResponse_norm_old) IS2_eset <- IS2_eset_noResponse_norm_old
Make Reference Dataset
# Reference Dataset: # X - gene expression data [genes X samples] # p - phenotypic data [samples X pheno_feat] # pheno data (e.g time, batch) p <- pdata_df # gene expression data X <- exprs_mat # only use pre-vaccination for reference dataset: pre_vacc_ind <- which(p$time_post_last_vax == 0) p <- p[pre_vacc_ind,] X <- X[,pre_vacc_ind] # Choose which samples to use for reference dataset: studies_without_ages <- unique(p$study_accession)[unique(p$study_accession) %in% c("SDY1260", "SDY1264","SDY1293","SDY1294","SDY1364","SDY1370","SDY1373","SDY984")] test_sample_indices <- which(p$study_accession %in% studies_without_ages) ref_sample_indices <- which(!p$study_accession %in% studies_without_ages) # Make test sets: p_test <- p[test_sample_indices,] p <- p[ref_sample_indices,] # Remove missing gene values missing_indices <- sapply(1:nrow(X), function(j){ return(any(is.na(X[j,]))) }) X <- X[!missing_indices,] X_test <- X[,test_sample_indices] X <- X[,ref_sample_indices]
# Number of components pca <- stats::prcomp(X, rank = 20) nc <- sum(summary(pca)$importance[3, ] < .999) + 1 # Number of significant components # Build Model: funct <- best_old_funct m <- ge_im(X = X, p = p, formula = funct, nc = nc) # Build interpolation data n.inter = 500 ndat <- data.frame( age_reported = seq(min(p$age_reported), max(p$age_reported), l = n.inter), arm_accession = sample(unique(p$arm_accession), n.inter, replace = TRUE), y_chrom_present = sample(c(TRUE, FALSE), n.inter, replace = TRUE), matrix = sample(unique(p$matrix),n.inter,replace = TRUE), study_accession = sample(unique(p$study_accession),n.inter,replace = TRUE), race = sample(unique(p$race), n.inter, replace = TRUE), ethnicity = sample(unique(p$ethnicity), n.inter, replace = TRUE), cohort = sample(unique(p$cohort),n.inter,replace = TRUE) ) # get interpolated GE matrix, as a reference r_X <- list(interpGE = stats::predict(m, ndat), time.series = ndat$age_reported) # Impute all ages all_ae_X_old <- ae(exprs_mat[,pre_vacc_ind], r_X$interpGE, r_X$time.series)
# test ae_X_old <- ae(X, r_X$interpGE, r_X$time.series) # Get regression: results_old <- tibble(actual = p$age_reported, estimates = ae_X_old$age.estimates[,1]) lm_fit_old <- lm(actual ~ estimates, data=results_old) # Plot: p <- ggplot(results_old) + #geom_histogram(aes(x = estimates)) + geom_point(aes(x = actual, y = estimates)) + geom_smooth(aes(x = actual, y = actual), method='lm') + xlab("age_reported") + ylab("estimated age (from RAPToR)") + xlim(50, 100) + ylim(50, 100) + ggtitle(paste0("RAPToR Prediction vs. Actual Age: R^2 = ", round(summary(lm_fit_old)$r.squared, 3))) ggsave(file.path(outputDir, "extendedOld_results.pdf"), plot = p)
# Save all ages: young_dict <- tibble(uid = names(all_ae_X_young$age.estimates[,1]), raptor_age = unname(all_ae_X_young$age.estimates[,1])) young_dict$participant_id <- sapply(1:nrow(young_dict), function(i){ return(pdata_df_young$participant_id[which(young_dict$uid[i] == pdata_df_young$uid)]) }) saveRDS(young_dict, file = file.path(outputDir, paste0(timestamp, "young_RAPToR_age_dictionary.rds"))) old_dict <- tibble(uid = names(all_ae_X_old$age.estimates[,1]), raptor_age = unname(all_ae_X_old$age.estimates[,1])) old_dict$participant_id <- sapply(1:nrow(old_dict), function(i){ return(pdata_df_old$participant_id[which(old_dict$uid[i] == pdata_df_old$uid)]) }) saveRDS(old_dict, file = file.path(outputDir, paste0(timestamp, "oldExtended_RAPToR_age_dictionary.rds")))
Add to virtual study:
# Young: RAPToR_age <- c() for(i in 1:nrow(IS2_eset_noResponse_norm_young@phenoData@data)) { # If study doesn't have age_reported, use imputed age: if(IS2_eset_noResponse_norm_young@phenoData@data$study_accession[i] %in% c("SDY1260", "SDY1264","SDY1293", "SDY1294","SDY1364","SDY1370","SDY1373","SDY984")){ ages <- young_dict$raptor_age[young_dict$participant_id == IS2_eset_noResponse_norm_young@phenoData@data$participant_id[i]] if(length(ages) > 1){ RAPToR_age <- c(RAPToR_age, ages[1]) } else{ RAPToR_age <- c(RAPToR_age, ages) } } else { RAPToR_age <- c(RAPToR_age, IS2_eset_noResponse_norm_young@phenoData@data$age_reported[i]) } } IS2_eset_noResponse_norm_young@phenoData@data$raptor_age <- RAPToR_age saveRDS(IS2_eset_noResponse_norm_young, file.path(outputDir, paste0(timestamp, "young_norm_noResponse_ageImputation_eset.rds"))) # Old: RAPToR_age <- c() for(i in 1:nrow(IS2_eset_noResponse_norm_old@phenoData@data)) { # If study doesn't have age_reported, use imputed age: if(IS2_eset_noResponse_norm_old@phenoData@data$study_accession[i] %in% c("SDY1260", "SDY1264","SDY1293","SDY1294","SDY1364","SDY1370","SDY1373","SDY984")){ ages <- old_dict$raptor_age[old_dict$participant_id == IS2_eset_noResponse_norm_old@phenoData@data$participant_id[i]] if(length(ages) > 1){ RAPToR_age <- c(RAPToR_age, ages[1]) } else{ RAPToR_age <- c(RAPToR_age, ages) } } else { RAPToR_age <- c(RAPToR_age, IS2_eset_noResponse_norm_old@phenoData@data$age_reported[i]) } } IS2_eset_noResponse_norm_old@phenoData@data$raptor_age <- RAPToR_age saveRDS(IS2_eset_noResponse_norm_old, file.path(outputDir, paste0(timestamp, "extendedOld_norm_noResponse_ageImputation_eset.rds")))
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.