# Utilities for retrieving and preparing files for analysis
#' Retrieve input files
#'
#' Identify input file location and download and prepare
#' input files as needed, into a format that is standard per platform
#'
#' @param study Study accession
#' @param gef gene expression files
#' @param meta_data List of study-specific metadata
#' @param analysis_dir directory to save intermediate files.
#' @param verbose print verbose logging statements?
#' @param reload re-download files if already found?
#'
#' @export
retrieve_input_files <- function(study,
gef,
meta_data,
analysis_dir,
verbose = FALSE,
reload = FALSE) {
# Identify correct input_files
if (meta_data$file_location == "immport") {
if (verbose) log_message("Using immport files in ", analysis_dir)
input_files <- gef$file_info_name[grep("cel$|txt$|tsv$|csv$",
gef$file_info_name,
ignore.case = TRUE
)]
input_files <- file.path(analysis_dir, input_files)
input_files <- unique(input_files[file.exists(input_files)])
} else
if (meta_data$file_location == "custom") {
raw_dir <- file.path(analysis_dir, meta_data$custom_file_info$directory)
if (verbose) log_message("Using custom files in ", raw_dir)
input_files <- list.files(raw_dir, full.names = TRUE)
input_files <- input_files[grep(meta_data$custom_file_info$file_identifier_regex, input_files)]
} else {
input_files <- NA
gef <- gef[!is.na(gef$geo_accession), ]
}
# Get raw files from GEO or prepare ImmPort flat files
# At end of this step, there should be a single "cohort_type_raw_expression.txt"
# file for non-affymetrix studies
if (meta_data$file_location %in% c("gsm_soft", "gsm_supp_files", "gse_supp_files")) {
input_files <- .prep_geo_files(
study,
gef,
meta_data,
input_files,
analysis_dir,
verbose,
reload
)
} else {
input_files <- .prep_immport_files(
study,
gef,
meta_data$platform,
input_files,
analysis_dir,
verbose
)
}
input_files
}
## Utils for downloading supplemental files from GEO ####
# Download supp files from geo accession and return full paths to files
#' Get GEO supp files
#'
#' Download supplementary files from GEO accession (GSE or GSM)
#' and return the paths to the files.
#'
#' @param geo_accession GSM or GSE accesssion
#' @param supp_files_dir path to directory to save supplementary files.
#' @param reload Force re-download of files if already found in \code{supp_files_dir}?
#' On ImmuneSpace servers, an appropriate directory can be found with
#' \code{.get_supp_files_dir()}
.get_geo_supp_files <- function(geo_accession,
supp_files_dir,
reload = FALSE) {
if (!reload) {
file_info <- GEOquery::getGEOSuppFiles(geo_accession,
makeDirectory = FALSE,
baseDir = supp_files_dir,
fetch_files = FALSE
)
file_path <- file.path(supp_files_dir, gsub("\\.gz", "", file_info$fname))
if (all(file.exists(file_path) | file.exists(file.path(supp_files_dir, file_info$fname)))) {
log_message("Using saved supp files for ", geo_accession)
return(file.path(supp_files_dir, file_info$fname))
}
}
info <- GEOquery::getGEOSuppFiles(geo_accession,
makeDirectory = FALSE,
baseDir = supp_files_dir
)
rownames(info)
}
#' Select input files
#'
#' Select the correct input files from supplemental directory and unzip if needed.
#'
#' @param supp_files_dir Path to directory for storing supplementary files.
.select_input_files <- function(supp_files_dir) {
target_file_terms <- "non-normalized|non_normalized|corrected|raw|cel|pbmc|count"
supp_files <- list.files(supp_files_dir)
raw_files <- supp_files[grep(target_file_terms, supp_files, ignore.case = TRUE)]
# unzip any files if needed (overwrite = TRUE in case of processing fail)
gz_files <- raw_files[grep("gz", raw_files)]
for (file in gz_files) {
GEOquery::gunzip(file.path(supp_files_dir, file),
overwrite = TRUE,
remove = TRUE
)
}
# find correct unzipped files with full paths
# Do not include attempted intermediate file that may have been created if
# run failed at a later point
supp_files <- list.files(supp_files_dir)
input_files <- supp_files[grepl(target_file_terms, supp_files, ignore.case = TRUE) &
# exclude compressed files
!grepl("gz|tar|RData", supp_files) &
# exclude intermediate file generated by run
!grepl("SDY\\d+_raw_expression", supp_files)]
file.path(supp_files_dir, input_files)
}
#' getGEO (custom)
#'
#' Helper to download GEO soft files to supp_files_dir
#'
#' @param geo_accession GSM or GSE accesssion
#' @param supp_files_dir path to directory to save supplementary files.
.getGEO_custom <- function(geo_accession, supp_files_dir) {
stopifnot(dir.exists(supp_files_dir))
soft_files_dir <- file.path(supp_files_dir, "geo_soft_files")
if (!dir.exists(soft_files_dir)) dir.create(soft_files_dir)
geo <- GEOquery::getGEO(geo_accession,
destdir = soft_files_dir
)
geo
}
##
#' Make id to GSM map
#'
#' Create map of GSM accessions to ID's provided in GSE. If needed, appropriate
#' info for the matrix will be returned by \code{get_meta_data}.
#'
#' @param gsms Input vector of gsm accession
#' @param study_id_term Term from getGEO(gsm) header to use for extracting
#' sample id. "description" or "title"
#' @param gsm_map_index Index returning id in GSM header
#' @param id_regex_map list specifying regex terms to map GSE id's using gsub
#' of the form \code{list(old = "", new = "")}
#' @param supp_files_dir path to directory to save supplementary files.
.make_id_to_gsm_map <- function(gsms,
study_id_term,
gsm_map_index = 1,
id_regex_map = NULL,
supp_files_dir = NULL) {
if (!is.null(id_regex_map)) {
stopifnot(is.list(id_regex_map))
stopifnot(names(id_regex_map) == c("old", "new"))
}
stopifnot(!is.null(supp_files_dir))
stopifnot(dir.exists(supp_files_dir))
id_to_gsm_map <- vapply(gsms,
function(gsm) {
res <- .getGEO_custom(gsm, supp_files_dir)
res@header[[study_id_term]][[gsm_map_index]]
},
FUN.VALUE = "id",
USE.NAMES = TRUE
)
if (!is.null(id_regex_map)) {
id_to_gsm_map <- gsub(
id_regex_map$old,
id_regex_map$new,
id_to_gsm_map
)
}
id_to_gsm_map
}
#' Prep GEO files
#'
#' Generate flat files that are ready for processing from GEO "raw" data.
#' Warning! Raw data is highly variable for gse supplementary files
#'
#' @param study study accession eg \code{SDY269}
#' @param gef result of ISCon$getDataset("gene_expression_files") for one run.
#' @param meta_data list of study-specific meta data
#' @param input_files input file names
#' @param analysis_dir path to analysis directory
#' @param verbose print verbose logging statements?
#' @param reload re-download files if already found in analysis_dir?
#'
#' @return path to raw, prepped input files. tsv for everyone except affy, which
#' will be the path to CEL files.
#'
#' @export
.prep_geo_files <- function(study,
gef,
meta_data,
input_files,
analysis_dir,
verbose = FALSE,
reload = FALSE) {
supp_files_dir <- .get_supp_files_dir(
analysis_dir,
gef
)
# In GSM SOFT File (currently only SDY1289 which is Illumina)
if (meta_data$file_location == "gsm_soft") {
if (verbose) log_message("Downloading GSM soft files to ", supp_files_dir)
ge_df_list <- lapply(gef$geo_accession, function(gsm) {
res <- .getGEO_custom(gsm, supp_files_dir)
ge_df <- res@dataTable@table
ge_df <- ge_df[, colnames(ge_df) %in% c("ID_REF", meta_data$gsm_table_var_name)]
colnames(ge_df)[[2]] <- gsm
return(ge_df)
})
input_files <- .ge_list_to_flat_file(ge_df_list, supp_files_dir, study)
} else
# In GSM Supp file (most common )
if (meta_data$file_location == "gsm_supp_files") {
if (verbose) log_message("Downloading GSM supp files to ", supp_files_dir)
if (meta_data$platform == "Illumina") {
ge_list <- lapply(gef$geo_accession, function(gsm) {
if (verbose) log_message("----- ", gsm, " -----")
file_gz <- .get_geo_supp_files(gsm,
supp_files_dir,
reload = reload
)
file_path <- file.path(gsub("\\.gz", "", file_gz))
if (!file.exists(file_path)) {
GEOquery::gunzip(file_gz, file_path, overwrite = TRUE)
}
if (!is.null(meta_data$illumina_manifest_file)) {
manifest_file <- file.path(analysis_dir, meta_data$illumina_manifest_file)
if (!file.exists(manifest_path)) {
manifest_src <- file.path(
system.file("CreateMatrixAssets/IlluminaManifests", package = "HIPCMatrix"),
meta_data$illumina_manifest_file
)
file.copy(manifest_src, manifest_file)
}
res <- limma::read.idat(
idatfiles = file_path,
bgxfile = manifest_file
)
raw_illumina_dt <- res$E
pvals <- limma::detectionPValues(res)
raw_illumina_dt <- data.table(
gsm = raw_illumina_dt[, 1],
pvals = pvals[, 1],
ID_REF = res$genes$Probe_Id
)
# dups b/c single probe assigned to multiple array_ids
raw_illumina_dt <- raw_illumina_dt[!duplicated(raw_illumina_dt$ID_REF)]
setnames(raw_illumina_dt, "gsm", paste0(gsm, ".AVG_Signal"))
setnames(raw_illumina_dt, "pvals", paste0(gsm, ".Detection Pval"))
} else {
raw_illumina_dt <- fread(file_path)
raw_illumina_dt <- .subset_raw_illumina_dt(raw_illumina_dt)
raw_illumina_dt <- .prep_illumina_headers(raw_illumina_dt)
sampleid_formats <- "\\d{10}_[A-Z]"
sampleid <- regmatches(
colnames(raw_illumina_dt)[[2]],
regexpr(
sampleid_formats,
colnames(raw_illumina_dt)[[2]]
)
)
colnames(raw_illumina_dt) <- gsub(sampleid, gsm, colnames(raw_illumina_dt))
}
raw_illumina_dt
})
} else
if (meta_data$platform == "Affymetrix") {
lapply(gef$geo_accession,
.get_geo_supp_files,
supp_files_dir = supp_files_dir,
reload = reload
)
input_files <- .select_input_files(supp_files_dir)
# Stanford custom HEEBO
} else
if (grepl("Stanford", meta_data$platform)) {
ge_list <- lapply(gef$geo_accession, function(gsm) {
path <- .get_geo_supp_files(gsm, supp_files_dir, reload = reload)
# Because of two colors, do background correction and processing here
# to generate single expression value per probe
ge_df <- .process_two_color_array(path)
setnames(ge_df, "gsm", gsm)
return(ge_df)
})
} else
if (meta_data$platform == "NA") {
ge_list <- lapply(gef$geo_accession, function(gsm) {
path <- .get_geo_supp_files(gsm, supp_files_dir, reload = reload)
ge_dt <- fread(path)
setnames(ge_dt, "V2", gsm) # ensemblId as 'V1'
return(ge_dt)
})
}
# InputFiles for Affy will just be a list of CEL files. All others
# should be written to one "raw" matrix (counts or probe intensities)
if (meta_data$platform != "Affymetrix") {
input_files <- .ge_list_to_flat_file(ge_list, supp_files_dir, study)
}
} else
if (meta_data$file_location == "gse_supp_files") {
if (verbose) log_message("Downloading GSE supp files to ", supp_files_dir)
gse_accessions <- unique(unlist(lapply(gef$geo_accession, function(x) {
gsm <- .getGEO_custom(x, supp_files_dir)
gsm@header$series_id
})))
gse_supp_files <- sapply(gse_accessions,
.get_geo_supp_files,
supp_files_dir = supp_files_dir,
reload = reload
)
input_files <- .select_input_files(supp_files_dir)
ge_list <- lapply(input_files, fread)
ge_list <- .fix_headers(ge_list, study)
id_mapping_info <- meta_data$id_to_gse_mapping_info
if (!is.null(id_mapping_info)) {
id_map <- .make_id_to_gsm_map(gef$geo_accession,
study_id_term = id_mapping_info$study_id_term,
gsm_map_index = id_mapping_info$gsm_map_index,
id_regex_map = id_mapping_info$id_regex_map,
supp_files_dir = supp_files_dir
)
}
# Illumina raw data in gse supp files
# Because multiple raw files need to be combined, must
# address header issues prior to combination otherwise
# untreated "Detection Pval" cols will cause dup error
# during merge. Note: SDY400 handled in fixHeaders
if (meta_data$platform == "Illumina") {
ge_list <- lapply(ge_list, function(raw_illumina_dt) {
raw_illumina_dt <- .subset_raw_illumina_dt(raw_illumina_dt)
raw_illumina_dt <- .prep_illumina_headers(raw_illumina_dt)
if (!is.null(id_mapping_info)) {
# Replace ids with GSM accession
for (gsm in names(id_map)) {
# Fixed b/c some ids have escape char (e.g. ".")
# Paste0 with "^" b/c some ids are numeric and confusable (e.g. "2.1" and "12.1")
# lookahead to ensure full id before sep and not partial (e.g "PBMC_1" and "PBMC_12")
# perl = TRUE for lookahead
fixed_id_regex <- paste0(
"^",
gsub(".", "\\.", id_map[gsm], fixed = T),
"(?=(\\.|_|$))"
)
colnames(raw_illumina_dt) <- gsub(fixed_id_regex,
gsm,
colnames(raw_illumina_dt),
perl = TRUE
)
}
}
raw_illumina_dt <- raw_illumina_dt[,
grep(
"GSM|ID_REF",
colnames(raw_illumina_dt)
),
with = FALSE
]
})
}
ge_df <- Reduce(f = function(x, y) {
merge(x, y)
}, ge_list)
# Case 7: RNAseq in gse supp files
# Header mapping assumes that names are in getGEO(gsm) object.
# Need to check on a per study basis and tweak if need be.
if (meta_data$platform == "NA") {
ge_df <- ge_df[, colnames(ge_df) %in% c("GENES", "V1", id_map), with = FALSE]
ids <- grep("GENES|V1", colnames(ge_df), invert = TRUE, value = TRUE)
gsms <- names(id_map)[match(ids, id_map)]
setnames(ge_df, ids, gsms)
}
input_files <- .write_raw_expression(ge_df, supp_files_dir, study)
} else {
stop("Could not determine location of raw files. ")
}
input_files
}
#' Prep ImmPort Files
#'
#' @param study study accession eg \code{SDY269}
#' @param gef result of ISCon$getDataset("gene_expression_files") for one run.
#' @param platform Sequencing platform: "Illumina", "Affymetrix", "Stanford",
#' or "NA" for RNA-seq.
#' @param input_files input file names
#' @param analysis_dir path to analysis directory
#' @param verbose print verbose logging statements?
#'
#' @return path to raw, prepped input files
#'
.prep_immport_files <- function(study,
gef,
platform,
input_files,
analysis_dir,
verbose = FALSE) {
supp_files_dir <- .get_supp_files_dir(
analysis_dir,
gef
)
if (verbose) log_message("Preparing immport files from ", analysis_dir)
if (platform == "Illumina") {
ge_list <- lapply(input_files, function(path) {
raw_illumina_dt <- fread(path)
raw_illumina_dt <- .subset_raw_illumina_dt(raw_illumina_dt)
raw_illumina_dt <- .prep_illumina_headers(raw_illumina_dt)
})
input_files <- .ge_list_to_flat_file(ge_list, supp_files_dir, study)
} else
if (platform == "NA") {
ge_list <- lapply(input_files, fread)
ge_list <- .fix_headers(ge_list, study)
input_files <- .ge_list_to_flat_file(ge_list, supp_files_dir, study)
} else
if (platform == "Affymetrix") {
if (!all(grepl("\\.cel", tolower(input_files)))) {
stop("Affymetrix should only have CEL input files")
}
}
# Affy should already be CEL files so nothing to do
return(input_files)
}
#' Map experiment-sample or geo accessions to biosample accessions
#'
#' @param exprs_dt data.table of gene expression with one column per sample,
#' one row per feature.
#' @param gef result of ISCon$getDataset("gene_expression_files") for one run.
#'
.map_sampleid_to_biosample_accession <- function(exprs_dt, gef) {
if (any(grepl("^(ES|GSM)\\d{6,7}$", colnames(exprs_dt)))) {
col_to_use <- ifelse(any(grepl("ES", colnames(exprs_dt))),
"expsample_accession",
"geo_accession"
)
} else {
col_to_use <- "file_info_name"
}
sampleids <- grep("feature_id", colnames(exprs_dt), value = TRUE, invert = TRUE)
biosample_accessions <- gef$biosample_accession[match(sampleids, gef[[col_to_use]])]
# remove samples without matching biosample accession
sampleids <- sampleids[!is.na(biosample_accessions)]
biosample_accessions <- biosample_accessions[!is.na(biosample_accessions)]
setnames(exprs_dt, sampleids, biosample_accessions)
return(exprs_dt)
}
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.