# Chunk defaults for the knitted report: do not echo code and do not show
# messages or warnings in the output.
knitr::opts_chunk$set(
  echo = FALSE,
  message = FALSE,
  warning = FALSE
)
# libraries
suppressPackageStartupMessages({
  library(ImmuneSignatures2)
  library(Biobase)
  library(data.table)
  library(Rlabkey)
})
# global variables
# Directory the cached expression sets and consort numbers are read from.
dataCacheDir <- here::here("data_cache", "virtualStudy_prod")
datePrefix <- ""

# File names of the cached expression sets. The dot is escaped and the pattern
# anchored so only names ending in "_eset.rds" match. dataCacheDir is already
# built by here::here(), so it is passed as-is, matching the file.path() call
# used to read the files below.
esets_all <- list.files(dataCacheDir, pattern = "_eset\\.rds$")

# Study -> program lookup table (columns "study" and "program").
# NOTE(review): this queries the test host while the cache directory is named
# "virtualStudy_prod" - confirm the intended server.
dimstudy <- Rlabkey::labkey.selectRows("https://test.immunespace.org/",
                              folderPath = "HIPC/IS2",
                              schemaName = "immport",
                              queryName = "dimstudy",
                              colSelect = c("study", "program"),
                              colNameOpt = "rname")

# Load every cached expression set, keyed by its file name.
datasets <- lapply(file.path(dataCacheDir, esets_all), readRDS)
names(datasets) <- esets_all
# Count the distinct studies, cohorts, subjects and samples in one
# expression set.
#
# eset: object whose phenotype table is obtained with pData(); the columns
#   study_accession, arm_accession, cohort, participant_id and
#   biosample_accession are read from it.
# Returns a one-row data.table with columns studies, cohorts, subjects and
#   samples.
summarizeEset <- function(eset) {
  pheno <- pData(eset)
  n_distinct <- function(values) length(unique(values))

  # The cohort figure is the larger of the distinct arm count and the
  # distinct cohort-label count.
  n_cohorts <- max(
    n_distinct(pheno$arm_accession),
    n_distinct(pheno$cohort)
  )

  data.table(
    studies = n_distinct(pheno$study_accession),
    cohorts = n_cohorts,
    # participant_id values are counted verbatim; no suffix is stripped.
    subjects = n_distinct(pheno$participant_id),
    samples = n_distinct(pheno$biosample_accession)
  )
}

# Count the distinct studies, cohorts, subjects and samples across several
# expression sets pooled together.
#
# esetList: list of objects whose phenotype tables are obtained with pData().
# Returns a one-row data.table with columns studies, cohorts, subjects and
#   samples.
#
# NOTE(review): cohorts here is the distinct arm_accession count only, whereas
# summarizeEset() takes the larger of the arm and cohort-label counts -
# confirm the difference is intended.
summarizeEsetList <- function(esetList) {
  pdList <- lapply(esetList, pData)
  # Bind by column name rather than by position: the default only warns when
  # names differ and then stacks positionally, which would mix unrelated
  # columns if the phenotype tables are not laid out identically. A column
  # absent from one table is padded with NA, which unique() counts as a value.
  d <- rbindlist(pdList, use.names = TRUE, fill = TRUE)
  study_count <- length(unique(d$study_accession))
  cohort_count <- length(unique(d$arm_accession))
  # participant_id values are counted verbatim; no suffix is stripped.
  subject_count <- length(unique(d$participant_id))
  sample_count <- length(unique(d$biosample_accession))
  data.table(
    studies = study_count,
    cohorts = cohort_count,
    subjects = subject_count,
    samples = sample_count
  )
}

# Subject summaries by dataset ----

# For all versions of data

# Build a one-row summary for every loaded expression set, then stack the rows
# and label each with the file it came from.
datasetSummaries <- lapply(datasets, function(dataset) {
  n_distinct <- function(values) length(unique(values))

  # Phenotype rows joined to their study's program. merge() is called without
  # all/all.x, so rows whose study has no match in dimstudy are dropped.
  pheno <- merge(
    data.table(pData(dataset)),
    dimstudy,
    by.x = "study_accession",
    by.y = "study"
  )

  # One row per study/program pair, flagged by whether the program name
  # contains "HIPC".
  study_programs <- unique(pheno[, .(study_accession, program)])
  is_hipc <- grepl("HIPC", study_programs$program)

  data.table(
    studies = n_distinct(pheno$study_accession),
    cohorts = n_distinct(pheno$arm_accession),
    # participant_id values are counted verbatim; no suffix is stripped.
    subjects = n_distinct(pheno$participant_id),
    samples = n_distinct(pheno$biosample_accession),
    vaccines = n_distinct(pheno$vaccine),
    vaccine_types = n_distinct(pheno$vaccine_type),
    pathogens = n_distinct(pheno$pathogen),
    hipc_studies = sum(is_hipc),
    not_hipc_studies = sum(!is_hipc)
  )
})

datasetSummary <- rbindlist(datasetSummaries)
datasetSummary$dataset <- esets_all

# Fill in the consort table rows that depend on the normalized datasets.
consort_numbers <- readRDS(here::here(dataCacheDir, "consort_numbers.rds"))

# Counts over the young and older-adult normalized datasets pooled together.
all_norm_summary <- summarizeEsetList(
  c(
    datasets[["young_norm_noResponse_eset.rds"]],
    datasets[["extendedOld_norm_noResponse_eset.rds"]]
  )
)

# Counts remaining after the "remove subjects without baseline" step; the
# normalization row records how far the pooled counts fall below them.
baseline_counts <- consort_numbers[step == "remove subjects without baseline"]
norm_loss <- list(
  studies = baseline_counts$studies_remaining - all_norm_summary$studies,
  cohorts = baseline_counts$cohorts_remaining - all_norm_summary$cohorts,
  subjects = baseline_counts$subjects_remaining - all_norm_summary$subjects,
  samples = baseline_counts$samples_remaining - all_norm_summary$samples
)

# The *_affected and *_dropped columns receive the same difference.
consort_numbers[
  step == "Normalize: remove studies without young cohort",
  `:=`(
    studies_remaining = all_norm_summary$studies,
    studies_affected = norm_loss$studies,
    studies_dropped = norm_loss$studies,
    cohorts_remaining = all_norm_summary$cohorts,
    cohorts_affected = norm_loss$cohorts,
    cohorts_dropped = norm_loss$cohorts,
    subjects_remaining = all_norm_summary$subjects,
    subjects_affected = norm_loss$subjects,
    subjects_dropped = norm_loss$subjects,
    samples_remaining = all_norm_summary$samples,
    samples_affected = norm_loss$samples,
    samples_dropped = norm_loss$samples
  )
]

# Final young-adult dataset: only the *_remaining columns are set.
young_norm_summary <- summarizeEset(
  datasets[["young_norm_noResponse_eset.rds"]]
)
consort_numbers[
  step == "Young adult dataset",
  `:=`(
    studies_remaining = young_norm_summary$studies,
    cohorts_remaining = young_norm_summary$cohorts,
    subjects_remaining = young_norm_summary$subjects,
    samples_remaining = young_norm_summary$samples
  )
]

# Final older-adult dataset: only the *_remaining columns are set.
extendedOld_norm_summary <- summarizeEset(
  datasets[["extendedOld_norm_noResponse_eset.rds"]]
)
consort_numbers[
  step == "Older adult dataset",
  `:=`(
    studies_remaining = extendedOld_norm_summary$studies,
    cohorts_remaining = extendedOld_norm_summary$cohorts,
    subjects_remaining = extendedOld_norm_summary$subjects,
    samples_remaining = extendedOld_norm_summary$samples
  )
]

# Render both summary tables in the knitted report.
knitr::kable(datasetSummary)
knitr::kable(consort_numbers)

# Write the same two tables as CSV files into the data cache directory.
fwrite(datasetSummary, file = file.path(dataCacheDir, "dataset_summary.csv"))
fwrite(consort_numbers, file = file.path(dataCacheDir, "consort_numbers.csv"))


# RGLab/ImmuneSignatures2 documentation built on Dec. 9, 2022, 10:51 a.m.