#' Download gene expression from GEO using study specific id and process it
#'
#' @param geo_code character string indicating name of GEO dataset
#' @param file_directory character string indicating path for downloading raw
#' GEO data
#' @param cleanup logical value to remove intermediate files
#' @param collapse_fun function to collapse probe(s) or select a probe,
#' e.g. mean, median, or function that picks a probe with high variance
#' @param ... additional arguments
#' @noRd
#' @keywords internal
generate_gex_geo <- function( # Unique GEO identifier
geo_code = c(
"GSE18655", # Barwick et al.
"GSE6919", # Chandran et al., Yu et al. from three platforms combined
"GSE134051", # Friedrich et al.
"GSE2109", # IGC
"GSE119616", # Kim et al.
"GSE14206", # Kunderfranco et al.
"GSE25136", # Sun et al.
"GSE21032", # Taylor et al. Alternative more specific accession code "GSE21034" for GEX
"GSE5132", # True et al.
"GSE6956", # Wallace et al.
"GSE8218", # Wang et al.
"GSE157548" # Weiner et al.
),
# Indicate whether the 'oligo' or the 'affy' package should be the primary means for processing the CEL-data
pckg = "oligo",
# Working directory
file_directory,
# Clean up (i.e. delete) files post-download/-processing
cleanup = TRUE,
# Regex filter for GEOquery download
filter_regex,
# Function for collapsing rows
collapse_fun = function(z) {
apply(z, MARGIN = 2, FUN = stats::median)
},
...) {
if (!missing(file_directory)) here::set_here(file_directory)
# Supplementary files include the raw CEL files, possibility to use regular expression filter
if (missing(filter_regex)) {
supfiles <- GEOquery::getGEOSuppFiles(geo_code)
} else {
supfiles <- GEOquery::getGEOSuppFiles(geo_code, filter_regex = filter_regex)
}
if (!pckg %in% c("oligo", "affy", "limma", "other")) stop(paste0("Invalid processing method parameter pckg (should be either 'oligo', 'affy', 'limma', or 'other'):", pckg))
# Barwick et al.
if (geo_code == "GSE18655") {
# Following lumi-package variance stabilizing transformation normalization
# https://www.bioconductor.org/packages//2.13/bioc/vignettes/lumi/inst/doc/lumi.pdf
# Custom DASL, just .gz (gunzipped) raw file
GEOquery::gunzip(rownames(supfiles), overwrite = TRUE)
tmp <- read.table("./GSE18655/GSE18655_HCP_Toronto_raw.txt", header = TRUE, row.names = 1, skip = 4)
# Contains
# X1_rep1 X1_rep2 X10 X100_rep1 X100_rep2
# GI_10092618-S-3 30329.56 28241.03 28005.69 30165.83 28913.85
# GI_10092618-S-1 25222.37 22141.40 25871.17 26419.28 23814.04
# GI_10092618-S-2 30202.68 30705.60 32448.47 30405.46 29000.44
# No probe level variance information available; log-transforming after quantile normalization
gex <- log2(preprocessCore::normalize.quantiles(as.matrix(tmp)))
dimnames(gex) <- dimnames(tmp)
tmp <- gex
# Download GPL annotations for the custom platform
gpl <- GEOquery::getGEO(geo_code, GSEMatrix = FALSE, getGPL = TRUE)
map_barwick <- gpl@gpls[[1]]@dataTable@table
# Contains
# ID SequenceSource GB_ACC
# 1 GI_10092618-S RefSeq NM_020529.1
# 2 GI_10337586-S RefSeq NM_020996.1
# 3 GI_10834981-S RefSeq NM_000599.1
# Drop the third '-' split suffix from tmp rownames
map_tmp <- unlist(lapply(rownames(tmp), FUN = function(x) {
paste(strsplit(x, "-")[[1]][1:2], collapse = "-")
}))
# Collapse over multiple hits to same RefSeq within a sample
gex <- do.call("rbind", by(tmp, INDICES = map_barwick[match(map_tmp, map_barwick$ID), "GB_ACC"], FUN = collapse_fun))
# Omit RefSeq versions from mapping to gene symbols
rownames(gex) <- unlist(lapply(rownames(gex), FUN = function(x) {
strsplit(x, ".", fixed = TRUE)[[1]][1]
}))
# Hugo symbols
symbols <- curatedPCaData_genes[match(rownames(gex), curatedPCaData_genes[, "refseq_mrna"]), "hgnc_symbol"]
gex <- gex[!is.na(symbols), ]
symbols <- symbols[!is.na(symbols)]
rownames(gex) <- symbols
# Collapse multiple instances of same gene symbol
gex <- do.call("rbind", by(as.matrix(gex), INDICES = rownames(gex), FUN = function(x) {
collapse_fun(x)
}))
}
# Chandran et al.
else if (geo_code == "GSE6919") {
# Open the tarball(s)
utils::untar(tarfile = rownames(supfiles))
gz_files <- list.files()
gz_files <- gz_files[grep(".gz", gz_files)]
if (pckg == "oligo") {
# Download the GSE that contains indicators which samples were run on the most modern chip type Av2
gse <- GEOquery::getGEO(geo_code, GSEMatrix = TRUE)
gpl_av2 <- rownames(Biobase::pData(gse[[1]]))
gex <- oligo::read.celfiles(paste0(gpl_av2, ".CEL.gz"))
# Normalize background convolution of noise and signal using RMA (median-polish)
gex <- oligo::rma(gex)
# Extract expression matrix with probe ids
gex <- oligo::exprs(gex)
# Sanitize column names
colnames(gex) <- gsub(".CEL.gz", "", colnames(gex))
# > Biobase::featureData(gex) <- oligo::getNetAffx(gex, "transcript")
# Error in oligo::getNetAffx(gex, "transcript") :
# NetAffx Annotation not available in 'pd.hg.u95av2'. Consider using 'biomaRt'.
# Using hgu95av2.db directly:
map <- AnnotationDbi::select(hgu95av2.db::hgu95av2.db, rownames(gex), c("SYMBOL"))
map <- map[match(rownames(gex), map$PROBEID), ]
# Collapsing gene expression for a given symbol:
gex <- do.call("rbind", by(gex, INDICES = map$SYMBOL, FUN = collapse_fun))
} else {
stop("Package 'oligo' used for 'GSE6919' (Chandran et al.)")
}
}
# Friedrich et al.
else if (geo_code == "GSE134051") {
# Open the tarball(s)
utils::untar(tarfile = rownames(supfiles))
gz_files <- list.files()
gz_files <- gz_files[grep(".gz", gz_files)]
for (x in gz_files) {
GEOquery::gunzip(x)
}
files <- gsub(".gz", "", gz_files)
# Single color Agilent array
if (pckg == "limma") {
# Download GPL for probe identifier annotations
gpl <- GEOquery::getGEO(geo_code, GSEMatrix = FALSE, getGPL = TRUE)
# Extract annotations in ENSG####
gpl <- GEOquery::Table(slot(gpl, "gpls")[[1]])
# Process using limma pipeline for Agilent single color custom arrays
# Read in raw data
gex <- limma::read.maimages(dir(".", "txt"), "agilent.median", green.only = TRUE)
# Perform background correction (and apparently log-transformation of intensities)
# 'method="normexp" a convolution of normal and exponential distributions is fitted to the foreground intensities using the
# background intensities as a covariate, and the expected signal given the observed foreground becomes the corrected intensity.'
gex <- limma::backgroundCorrect(gex, method = "normexp", offset = 1)
# ^ normexp.method="rma" is a possibility
# Normalize between arrays using quantile normalization
gex <- limma::normalizeBetweenArrays(gex, method = "quantile")
# Extract probe average expressions and use Agilent probe identifiers
gex <- limma::avereps(gex, ID = gex$genes$ProbeName)
# Transform into a matrix and truncate column names
gex <- as.matrix(gex)
colnames(gex) <- lapply(colnames(gex), FUN = function(x) {
strsplit(x, "_")[[1]][1]
})
# Map rownames to ENSG####
ensg <- gpl[match(rownames(gex), gpl$ID), "SPOT_ID"]
ensg[which(ensg == "NoEntry")] <- NA
genes <- curatedPCaData_genes$hgnc_symbol[match(ensg, curatedPCaData_genes$ensembl_gene_id)]
genes[which(genes == "")] <- NA
# Collapse using hugo gene symbols
gex <- do.call("rbind", by(gex, INDICES = genes, FUN = collapse_fun))
# Include non-redundant names (non-NAs)
gex <- gex[!rownames(gex) %in% c(""), ]
} else {
stop("Package 'limma' used for 'GSE134051' (Friedrich et al.)")
}
}
# Kim et al.
else if (geo_code == "GSE119616") {
# Open the tarball(s)
supfiles <- supfiles[-2, ]
utils::untar(tarfile = rownames(supfiles))
# Use editcelfile package to convert decipher annotations
gz_files <- list.files()
gz_files <- gz_files[grep(".gz", gz_files)]
for (x in gz_files) {
GEOquery::gunzip(x)
}
celfiles <- list.files()
celfiles <- celfiles[grep(".CEL", celfiles)]
# Include custom function that was originally small part of editcelfile (private package available upon request only), to avoid unnecessary non-publicly available dependencies
# Original code by Jonathan Lehrer & Seagle Liu (R package presented without license, modified code credited here)
celfileHeaderToHuex <- function(celFilePath, header = "HuEx-1_0-st-v2", verbose = TRUE) {
# NEVER change this.
writeBytePos <- 430
overwriteBytes <- 28 # 2 x 14 characters in "HuEx-1_0-st-v2"
if (verbose) {
print("ORIGINAL:")
print(affyio::read.celfile.header(celFilePath)[1])
}
filename <- base::file(celFilePath, "r+b")
base::seek(filename, writeBytePos, "start", "write")
replicate(overwriteBytes, base::writeBin(as.raw(0), filename, useBytes = TRUE)) # clear the old name (14 characters)
base::seek(filename, writeBytePos, "start", "write")
encoded <- stringi::stri_enc_toutf32(header)[[1]]
base::writeBin(encoded, filename, size = 2)
base::close(filename)
cfh <- affyio::read.celfile.header(celFilePath)
if (verbose) {
print("REVISED:")
print(cfh[1])
}
return(invisible(cfh$cdfName == header))
}
sapply(celfiles, celfileHeaderToHuex, verbose = TRUE)
# Read in the CEL files
CELs <- oligo::read.celfiles(celfiles)
# Perform RMA normalization
RMAs <- oligo::rma(CELs)
# Obtain gene and sample information
Biobase::featureData(RMAs) <- oligo::getNetAffx(RMAs, "transcript")
# GSM######-type names from GEO
nam0 <- unlist(lapply(strsplit(affy::list.celfiles(), "_"),
FUN = function(z) z[[1]]
))
# Two naming conventions for the files; picking the PCA###-style
nam1 <- unlist(lapply(strsplit(affy::list.celfiles(), "_"),
FUN = function(z) z[[3]]
))
nam2 <- gsub(".CEL.gz", "", unlist(lapply(strsplit(affy::list.celfiles(), "_"),
FUN = function(z) z[[4]]
)))
# Some samples were suffixed with HuEx, while others had Exonl prefix
nam <- paste(nam0, "_", ifelse(nam1 == "Exon1", nam2, nam1), sep = "")
# Extract gene names
genenames <- unlist(lapply(Biobase::fData(RMAs)[, "geneassignment"],
FUN = function(z) {
strsplit(z, " // ")[[1]][2]
}
))
# Transform into a matrix and remove empty gene names
gex <- as.matrix(Biobase::exprs(RMAs))
gex <- gex[-which(is.na(genenames)), ]
rownames(gex) <- genenames[-which(is.na(genenames))]
# Map the coventionally used Taylor sample names instead of GEO codes
# Compatible with e.g. cBioPortal sample names
# Give unique names with GSM#####.{PCA,PAN}##### combination for uniqueness
## colnames(gex) <- nam
# Use the unique GSM###-names
colnames(gex) <- unlist(lapply(nam, FUN = function(z) {
strsplit(z, "_")[[1]][1]
}))
}
# Kunderfranco et al.
# GPL887 Agilent-012097 Human 1A Microarray (V2) G4110B (Feature Number version)
else if (geo_code == "GSE14206") {
# Two colour Agilent array
utils::untar(tarfile = rownames(supfiles))
gz_files <- list.files()
gz_files <- gz_files[grep(".gz", gz_files)]
for (x in gz_files) {
GEOquery::gunzip(x)
}
files <- gsub(".gz", "", gz_files)
# Files include 'GPL887_old_annotation.txt', grep this away
files <- files[-grep("GPL887", files)]
if (pckg == "limma") {
# New code RAW data processed in limma
# Download GPL for probe identifier annotations
gpl <- GEOquery::getGEO(geo_code, GSEMatrix = FALSE, getGPL = TRUE)
# Extract annotations in ENSG####
gpl <- GEOquery::Table(slot(gpl, "gpls")[[1]])
# Read in raw data
gex <- limma::read.maimages(files, "agilent", green.only = FALSE)
# Background correction
gex <- limma::backgroundCorrect(gex, method = "normexp", offset = 1)
# Normalize with global loess
gex <- limma::normalizeWithinArrays(gex, method = "loess")
# Average expression
gex <- limma::avereps(gex, ID = gex$genes$ProbeName)
# Extract sample names (GSM#####)
samples <- unlist(lapply(strsplit(colnames(gex$M)[seq(from = 1, to = ncol(gex$M), by = 2)], "_"), FUN = function(x) {
x[[1]][1]
}))
# For diagnostics (mean should be at M=0): > limma::plotMA(gex, array=1)
# Checking dye-swapped correlations (flipped log ration)
#> quantile(unlist(lapply(seq(from=1, to=ncol(gex$M), by=2), FUN=function(x) { cor(gex$M[,x], -gex$M[,x+1], method="spearman") })), probs=seq(0,1,by=.1))
# 0% 10% 20% 30% 40% 50% 60% 70% 80% 90% 100%
#-0.2760865 0.2179309 0.2766869 0.3413689 0.3967651 0.4219280 0.4615115 0.5176102 0.5468144 0.5776107 0.6099629
# ...
# Visual diagnostics
#
# For now take the average of the normal and flipped dye swapped log ratios:
gex$M <- do.call("cbind", lapply(seq(from = 1, to = ncol(gex$M), by = 2), FUN = function(x) {
apply(cbind(-gex$M[, x], gex$M[, x + 1]), MARGIN = 1, FUN = mean)
}))
rownames(gex$M) <- gex$genes$ProbeName
colnames(gex$M) <- samples
gex <- do.call("rbind", by(gex$M, INDICES = gpl[match(rownames(gex$M), gpl$SPOT_ID), "GENE_SYMBOL"], FUN = collapse_fun))
# Keep only non-NA & non-"" rows
gex <- gex[!(rownames(gex) == "" | is.na(rownames(gex))), ]
} else {
stop("Package 'limma' used for 'GSE14206' (Kunderfranco et al.)")
}
}
# IGC
else if (geo_code == "GSE2109") {
## Note: IGC has >2k samples, many of which are not prostate cancer
# Open the tarball(s)
utils::untar(tarfile = rownames(supfiles))
# Make sure to function in a working directory where the are no other tarballs present
gz_files <- list.files()
gz_files <- gz_files[grep(".gz", gz_files)]
# Need phenodata as samples are from mixed cancers
gset <- GEOquery::getGEO(geo_code, GSEMatrix = TRUE)
# Subset to just prostate cancer samples
pca_gsm <- rownames(Biobase::pData(gset[[1]]))[grep("Prostate", Biobase::pData(gset[[1]])[, "title"])]
gz_files_pca <- grep(paste0(pca_gsm, collapse = "|"), gz_files, value = TRUE)
if (pckg == "oligo") {
## Subset to just prostate cancer as IGC covers in this GEO submission other cancer types also
# Read CEL
gex <- oligo::read.celfiles(gz_files_pca)
# Normalize background convolution of noise and signal using RMA (median-polish)
gex <- oligo::rma(gex)
# Extract expression matrix with probe ids
gex <- oligo::exprs(gex)
# Sanitize column names
colnames(gex) <- gsub(".CEL.gz", "", colnames(gex))
# Map the probes to gene symbols stored in curatedPCaData_genes for hgu133a
genes <- curatedPCaData_genes_affy_hg_u133a_2[match(rownames(gex), curatedPCaData_genes_affy_hg_u133a_2$affy_hg_u133a_2), "hgnc_symbol"]
# Collapse probes that target the same gene
gex <- do.call("rbind", by(gex, INDICES = genes, FUN = collapse_fun))
} else if (pckg == "affy") {
# Open the tarball(s)
utils::untar(tarfile = rownames(supfiles[17, ]))
clinical <- rio::import("data-raw/clinical_igc.RData")
# Make sure to function in a working directory where the are no other tarballs present
rownames(clinical) <- paste0(rownames(clinical), ".CEL.gz")
gz_files <- rownames(clinical)
gz_files <- gz_files[grep(".gz", gz_files)]
gz_files <- as.data.frame(gz_files)
# Read Affymetrix MA
igc <- affy::ReadAffy(filenames = gz_files$gz_files)
colnames(affy::exprs(igc)) <- gsub(".gz|.CEL", "", colnames(igc))
# Careful not to mask 'rma' from 'affy' by the 'rma' from 'oligo'
gex <- affy::rma(igc)
# Extracting .CEL and packaging names from the GEO-compatible sample names
colnames(gex) <- gsub(".CEL.gz", "", colnames(affy::exprs(gex)))
# Find gene annotations
keys <- AnnotationDbi::mappedkeys(hgu133a.db::hgu133aGENENAME)
nam <- names(as.character(hgu133a.db::hgu133aALIAS2PROBE)[match(
rownames(gex),
as.character(hgu133a.db::hgu133aALIAS2PROBE)
)])
nam[is.na(nam)] <- "NA"
# Collapse probes
gex <- do.call("rbind", by(as.matrix(affy::exprs(gex)),
INDICES = nam,
FUN = collapse_fun
))
# Sun et al does not use Hugo names so the below code makes the names for sun
# et al comparable with those in tcga/taylor
compare_names <- data.frame(
original = row.names(gex),
current = limma::alias2SymbolTable(row.names(gex),
species = "Hs"
)
)
duplicated_hugo_symbols <- compare_names[duplicated(compare_names$current), ]$current
compare_names <- compare_names %>%
dplyr::mutate(new_names = dplyr::case_when(
current %in% duplicated_hugo_symbols ~ original,
TRUE ~ current
))
row.names(gex) <- compare_names$new_names
}
}
# Sun et al.
else if (geo_code == "GSE25136") {
# Open the tarball(s)
utils::untar(tarfile = rownames(supfiles))
# Make sure to function in a working directory where the are no other tarballs present
gz_files <- list.files()
gz_files <- gz_files[grep(".gz", gz_files)]
if (pckg == "oligo") {
# Read CEL
gex <- oligo::read.celfiles(gz_files)
# Normalize background convolution of noise and signal using RMA (median-polish)
gex <- oligo::rma(gex)
# Extract expression matrix with probe ids
gex <- oligo::exprs(gex)
# Sanitize column names
colnames(gex) <- gsub(".CEL.gz", "", colnames(gex))
# Map the probes to gene symbols stored in curatedPCaData_genes for hgu133a
genes <- curatedPCaData_genes_affy_hg_u133a[match(rownames(gex), curatedPCaData_genes_affy_hg_u133a$affy_hg_u133a), "hgnc_symbol"]
# Collapse probes that target the same gene
gex <- do.call("rbind", by(gex, INDICES = genes, FUN = collapse_fun))
} else {
stop("Package 'oligo' used for 'GSE25136' (Sun et al.)")
}
}
# Taylor et al.
# [HuEx-1_0-st] Affymetrix Human Exon 1.0 ST Array [probe set (exon) version]
# Affymetrix Human Exon 1.0 ST Array [CDF: HuEx_1_0_st_v2_main_A20071112_EP.cdf]
else if (geo_code == "GSE21032") { # TODO: Alternative more specific accession code "GSE21034"
# Open the tarball(s)
utils::untar(tarfile = rownames(supfiles))
# Make sure to function in a working directory where the are no other tarballs present
gz_files <- list.files()
gz_files <- gz_files[grep(".gz", gz_files)]
# Subset to prostate cancer or normal patient samples
gz_files <- grep("PCA|PAN", gz_files, value = TRUE)
if (pckg == "oligo") {
# Read CEL
gex <- oligo::read.celfiles(gz_files)
# Normalize background convolution of noise and signal using RMA (median-polish)
gex <- oligo::rma(gex)
# Extract annotated gene information
Biobase::featureData(gex) <- oligo::getNetAffx(gex, "transcript")
genes <- unlist(lapply(Biobase::fData(gex)[, "geneassignment"], FUN = function(z) {
strsplit(z, " // ")[[1]][2]
}))
# Extract expression matrix with probe ids
gex <- oligo::exprs(gex)
# Sanitize column names
colnames(gex) <- unlist(lapply(colnames(gex), FUN = function(x) {
grep("PCA|PAN", gsub(".CEL.gz", "", strsplit(x, "_")[[1]]), value = TRUE)
}))
# Collapse the mapped probes under the same gene name
gex <- do.call("rbind", by(gex, INDICES = genes, FUN = collapse_fun))
# Omit duplicated entries that are replicated columns
gex <- gex[, !duplicated(colnames(gex))]
} else {
stop("Package 'oligo' used for 'GSE21032' (Taylor et al.)")
}
}
# True et al.
# GPL3834 FHCRC Human Prostate PEDB cDNA Array
# GPL3836 FHCRC Human Prostate PEDB cDNA Array v3 (-> single sample only! 11th, GSM115769)
else if (geo_code == "GSE5132") {
# Open the tarball(s)
utils::untar(tarfile = rownames(supfiles))
# Make sure to function in a working directory where the are no other tarballs present
gz_files <- list.files()
gz_files <- gz_files[grep(".gz", gz_files)]
if (pckg == "limma") {
# Extract latest GPL annotations from GEO
gpl <- GEOquery::getGEO(geo_code, GSEMatrix = FALSE, getGPL = TRUE)
# Add meta Block column for mapping between GPL and genes read in with limma::read.maimages; GEO's GPL-files do not share identifiers with raw .gpr files
gpl@gpls[[1]]@dataTable@table$Block <- rep(1:32, each = 484)
gpl@gpls[[2]]@dataTable@table$Block <- rep(1:32, each = 420)
# Construct ID with xx_yy_zz where xx = Block, yy = Column, zz = Row (includes leading zeroes)
gpl@gpls[[1]]@dataTable@table$ID <- paste(sprintf("%02d", gpl@gpls[[1]]@dataTable@table$Block), sprintf("%02d", gpl@gpls[[1]]@dataTable@table$Column), sprintf("%02d", gpl@gpls[[1]]@dataTable@table$Row), sep = "_")
gpl@gpls[[2]]@dataTable@table$ID <- paste(sprintf("%02d", gpl@gpls[[2]]@dataTable@table$Block), sprintf("%02d", gpl@gpls[[2]]@dataTable@table$Column), sprintf("%02d", gpl@gpls[[2]]@dataTable@table$Row), sep = "_")
# Gunzip .gpr files and read them via limma
lapply(gz_files, FUN = GEOquery::gunzip)
# FHCRC Human Prostate PEDB cDNA Array v3 vs v4
gpl_1 <- grep(".gpr", list.files(), value = TRUE)
gpl_1 <- gpl_1[!gpl_1 == "GSM115769.gpr"]
gpl_2 <- "GSM115769.gpr"
gex1 <- limma::read.maimages(gpl_1, source = "genepix")
gex2 <- limma::read.maimages(gpl_2, source = "genepix")
# Platform 2 has IDs in different format
# Assign gene names to the metadata
gex1$genes$genename <- gpl@gpls[[1]]@dataTable@table[, "Related Gene Symbol"]
gex2$genes$genename <- gpl@gpls[[2]]@dataTable@table[, "Hugo"]
# Background correction separately for both platforms
gex1 <- limma::backgroundCorrect(gex1, method = "normexp", offset = 1)
gex2 <- limma::backgroundCorrect(gex2, method = "normexp", offset = 1)
# Normalize with global loess
gex1 <- limma::normalizeWithinArrays(gex1, method = "loess")
gex2 <- limma::normalizeWithinArrays(gex2, method = "loess")
# Average expression
gex1 <- limma::avereps(gex1)
gex2 <- limma::avereps(gex2)
# Collapse for gene names
gex1 <- do.call("rbind", by(as.matrix(gex1), INDICES = gex1$genes$genename, FUN = collapse_fun))
# gex2 <- do.call("rbind", by(as.matrix(gex2), INDICES=gex2$genes$genename, FUN=collapse_fun))
name <- colnames(gex2) # Store single sample name which would be dropped otherwise
gex2 <- as.matrix(by(as.matrix(gex2), INDICES = gex2$genes$genename, FUN = collapse_fun))
colnames(gex2) <- name
# Omit unannotated collapsed probes
gex1 <- gex1[which(!rownames(gex1) %in% c("", NA)), , drop = FALSE]
gex2 <- gex2[which(!rownames(gex2) %in% c("", NA)), , drop = FALSE]
# Median impute the genes not measured in v3 in order to retain full dimension of 3615 of v4 array instead of downtoning to 2727 present in v3
# This imputation happens only for a single paired sample:
gex2 <- gex2[match(rownames(gex1), rownames(gex2)), , drop = FALSE]
rownames(gex2) <- rownames(gex1)
for (r in which(is.na(gex2))) {
gex2[r, ] <- stats::median(gex1[r, ])
}
# Merge data based on intersecting genes
# 11th sample is a special case for v3 array and must be placed separately
gex <- cbind(gex1[, 1:10], GSM115769 = gex2, gex1[, 11:31])
# Channels were swapped in samples #3, 4, 7, 8, 12, 14, 15, 17, 18, 20, 22, 23, 24, 26, 30, 31
# In these tumor vs normal log ratio is inverted
for (i in c(3, 4, 7, 8, 12, 14, 15, 17, 18, 20, 22, 23, 24, 26, 30, 31)) {
gex[, i] <- -gex[, i]
}
# 11th sample (GSM115769) was processed with v3 array, while samples 1-10, 12-32 were processed with v4 array
} else {
stop("Package 'limma' used for 'GSE5132' (True et al.)")
}
}
# Wallace et al.
else if (geo_code == "GSE6956") {
unmatched_healty_tissue <- c("GSM160418", "GSM160419", "GSM160420", "GSM160421", "GSM160422", "GSM160430")
# Open the tarball(s)
utils::untar(tarfile = rownames(supfiles))
# Make sure to function in a working directory where the are no other tarballs present
gz_files <- list.files()
gz_files <- gz_files[grep(".gz", gz_files)]
if (pckg == "oligo") {
# Read CEL
gex <- oligo::read.celfiles(gz_files)
# Normalize background convolution of noise and signal using RMA (median-polish)
gex <- oligo::rma(gex)
# Extract expression matrix with probe ids
gex <- oligo::exprs(gex)
# Sanitize column names
colnames(gex) <- gsub(".CEL.gz", "", colnames(gex))
# Map the probes to gene symbols stored in curatedPCaData_genes for hgu133a
genes <- curatedPCaData_genes_affy_hg_u133a_2[match(rownames(gex), curatedPCaData_genes_affy_hg_u133a_2$affy_hg_u133a_2), "hgnc_symbol"]
# Collapse probes that target the same gene
gex <- do.call("rbind", by(gex, INDICES = genes, FUN = collapse_fun))
} else {
stop("Package 'oligo' used for 'GSE6956' (Wallace et al.)")
}
}
# Wang et al.
else if (geo_code == "GSE8218") {
# Open the tarball(s)
utils::untar(tarfile = rownames(supfiles))
# Make sure to function in a working directory where the are no other tarballs present
gz_files <- list.files()
gz_files <- gz_files[grep(".gz", gz_files)]
if (pckg == "oligo") {
# Read CEL
gex <- oligo::read.celfiles(gz_files)
# Normalize background convolution of noise and signal using RMA (median-polish)
gex <- oligo::rma(gex)
# Extract expression matrix with probe ids
gex <- oligo::exprs(gex)
# Sanitize column names
colnames(gex) <- gsub(".CEL.gz", "", colnames(gex))
# Map the probes to gene symbols stored in curatedPCaData_genes for hgu133a
genes <- curatedPCaData_genes_affy_hg_u133a[match(rownames(gex), curatedPCaData_genes_affy_hg_u133a$affy_hg_u133a), "hgnc_symbol"]
# Collapse probes that target the same gene
gex <- do.call("rbind", by(gex, INDICES = genes, FUN = collapse_fun))
} else {
stop("Package 'oligo' used for 'GSE8218' (Wang et al.)")
}
}
# Weiner et al.
else if (geo_code == "GSE157548") {
# Open the tarball(s)
utils::untar(tarfile = rownames(supfiles))
# Make sure to function in a working directory where the are no other tarballs present
gz_files <- list.files()
gz_files <- gz_files[grep(".gz", gz_files)]
if (pckg == "oligo") {
# Read CEL
gex <- oligo::read.celfiles(gz_files)
# Normalize background convolution of noise and signal using RMA (median-polish)
gex <- oligo::rma(gex)
# Extract annotated gene information
Biobase::featureData(gex) <- oligo::getNetAffx(gex, "transcript")
genes <- unlist(lapply(Biobase::fData(gex)[, "geneassignment"], FUN = function(z) {
strsplit(z, " // ")[[1]][2]
}))
# Extract expression matrix with probe ids
gex <- oligo::exprs(gex)
# Sanitize column names
colnames(gex) <- gsub(".CEL.gz", "", colnames(gex))
# Collapse the mapped probes under the same gene name
gex <- do.call("rbind", by(gex, INDICES = genes, FUN = collapse_fun))
# Sanitize sample names to only contain GSM####
colnames(gex) <- unlist(lapply(colnames(gex), FUN = function(x) {
strsplit(x, "_")[[1]][1]
}))
# Store intensities up to 6th decimal point to conserve space
gex <- round(gex, 6)
} else {
stop("Package 'oligo' used for 'GSE157548' (Weiner et al.)")
}
}
# Unknown GEO id (throw an error) -----
else {
stop("Unknown GEO id, see allowed parameter values for geo_code")
}
## TODO: Move cleanup inside each GEO as file types and custom files are very cohort specific
# Remove downloaded files
if (cleanup) {
# First GEO download
file.remove(rownames(supfiles))
# Tarballs
# file.remove(gz_files)
# Remove empty folder
file.remove(paste0(here::here(), "/", geo_code))
}
# Remove rows with no names or redundancy
if (any(is.na(rownames(gex))) | any(rownames(gex) %in% c(""))) {
gex <- gex[-which(is.na(rownames(gex)) | rownames(gex) %in% c("")), ]
}
# Sort genes to alphabetic order for consistency
gex <- gex[order(rownames(gex)), ]
# Cast to matrix type, remove empty rows/columns and return gene expression matrix
gex <- as.matrix(gex)
gex <- gex |> janitor::remove_empty(which = c("rows", "cols"))
gex
}
#' Download copy number variant data from GEO using study specific id and process it
#'
#' @param geo_code character string indicating name of GEO dataset. Default is "GSE21035"
#' (Taylor et al)
#' @param file_directory character string indicating path for downloading raw
#' GEO data
#' @param cleanup logical value to remove intermediate files
#' @param ... additional arguments
#' @noRd
#' @keywords internal
generate_cna_geo <- function(geo_code = c(
"GSE54691", # Hieronymus et al.
"GSE21035" # Taylor et al. (aCGH only GSE-subset)
),
file_directory,
filter_regex,
cleanup = TRUE,
...) {
if (!missing(file_directory)) here::set_here(file_directory)
# Supplementary files include the raw CEL files, possibility to use regular expression filter
if (missing(filter_regex)) {
supfiles <- GEOquery::getGEOSuppFiles(geo_code)
} else {
supfiles <- GEOquery::getGEOSuppFiles(geo_code, filter_regex = filter_regex)
}
# Open the tarball(s)
utils::untar(tarfile = rownames(supfiles))
##
# same rCGH pipeline applied to datasets:
# Taylor et al. : GPL4091 Agilent-014693 Human Genome CGH Microarray 244A (Feature number version)
# Hieronymus et al. : GPL8737 Agilent-021529 Human CGH Whole Genome Microarray 1x1M (G4447A) (Probe Name version)
##
if (geo_code %in% c(
"GSE21035", # Taylor
"GSE54691" # Hieronymus
)) {
# Extract sample names in Taylor et al. and format to PCA####, for this need to access GSM-level metadata
if (geo_code == "GSE21035") {
# Access metadata based on GSM####
samplenames <- lapply(gsub(".txt.gz", "", grep("GSM", list.files(), value = TRUE)), FUN = function(x) {
GEOquery::getGEO(x, GSEMatrix = FALSE, getGPL = FALSE)@header$title
})
# Parse to just PCA#### from the title, splitting on whitespace character ' '
samplenames <- unlist(lapply(samplenames, FUN = function(x) {
strsplit(x, " ")[[1]][3]
}))
# File names as they are
filenames <- grep("GSM", list.files(), value = TRUE)
# Omit cell lines
filenames <- filenames[grep("PCA", samplenames)]
samplenames <- samplenames[grep("PCA", samplenames)]
}
# Hieronymus et al names based on GSM####
else {
# File names end in suffix .txt.gz, sample names have this string replaced
samplenames <- gsub(".txt.gz", "", grep("GSM", list.files(), value = TRUE))
filenames <- grep("GSM", list.files(), value = TRUE)
}
# For now, the package 'rCGH' has to be available in the workspace,
requireNamespace("rCGH")
# otherwise below functions will fail on e.g. rCGH::adjustSignal and when trying to find 'hg18'
# Read in Agilent 2-color data
cna <- lapply(1:length(filenames), FUN = function(i) {
try({
cat("\nProcessing: ", filenames[i], "\n")
rCGH::readAgilent(filenames[i], genome = "hg38", sampleName = samplenames[i])
})
})
#> list.files()[which(unlist(lapply(cna, FUN=class))=="try-error")]
# [1] "GSM525755.txt" "GSM525763.txt"
# Some files appear broken in Taylor et al; missing columns?
# Not all files can always be successfully processed
tryerr <- which(lapply(cna, FUN = class) == "try-error")
if (length(tryerr) > 0) {
# Warn of try-errors
warning(paste(
"Error while processing files: ",
# Collapse file names
paste(list.files()[which(unlist(lapply(cna, FUN = class)) == "try-error")], collapse = ", ")
))
# Omit data that could not be succcessfully read
cna <- cna[-tryerr]
}
# Signal adjustments
cna <- lapply(cna, FUN = function(z) {
try({
rCGH::adjustSignal(z)
})
})
# Segmentation
cna <- lapply(cna, FUN = function(z) {
try({
rCGH::segmentCGH(z)
})
})
# EM-algorithm normalization
cna <- lapply(cna, FUN = function(z) {
try({
rCGH::EMnormalize(z)
})
})
## Samplenames picked prior to this
# Remove additional suffixes from sample names
# cna <- lapply(cna, FUN = function(z){
# try({
# if(!"rCGH-Agilent" %in% class(z)){
# stop("This function is intended for Agilent aCGH analyzed with rCGH R Package (class \'rCGH-Agilent\')")
# }
# # e.g. transform "GSM525575.txt|.gz" -> "GSM525575"
# z@info["sampleName"] <- gsub(pattern = ".gz|.txt", replacement = "", z@info["fileName"])
# z
# })
# })
# Save sample names separately (of 'length(cna)')
# samplenames <- unlist(lapply(cna, FUN = function(z) { z@info["sampleName"] }))
# Reformat samplenames in Hieronymus
if (geo_code == "GSE54691") {
# Pick GSM-part in GSM###_PCA###
samplenames <- unlist(lapply(samplenames, FUN = function(z) {
strsplit(z, "_")[[1]][[1]]
}))
}
# Get segmentation table
cna <- lapply(cna, FUN = function(z) {
try({
rCGH::getSegTable(z)
})
})
# Get per-gene table
cna <- lapply(cna, FUN = function(z) {
try({
rCGH::byGeneTable(z)
})
})
# Extract all unique gene symbols present over all samples
genenames <- unique(unlist(lapply(cna, FUN = function(z) {
z$symbol
})))
# Bind genes to rows, name samples afterwards
cna <- do.call("cbind", lapply(cna, FUN = function(z) {
# Return CNAs as Log2Ratios
z[match(genenames, z$symbol), "Log2Ratio"]
}))
# Name rows and columns to genes and sample names, respectively
rownames(cna) <- genenames
colnames(cna) <- samplenames
# CNA matrix is ready
cna <- as.matrix(cna)
cna <- cna[!is.na(rownames(cna)), ]
}
# Other (placeholder) - should probably place this in the beginning so it
# doesnt try to run through all the beginning steps
else if (geo_code == "") {
stop("Must supply a GEO id")
}
# Unknown
else {
stop("Unknown GEO id, see allowed parameter values for geo_code")
}
# Remove downloaded files
# TODO: Make sure appropriate permissions exist for removing files
if (cleanup) {
# First GEO download
file.remove(rownames(supfiles))
# TODO: Tarballs
# file.remove(gz_files)
# Remove empty folder
file.remove(paste0(here::here(), "/", geo_code))
}
# Sort genes to alphabetic order for consistency
cna <- cna[order(rownames(cna)), ]
# Return numeric matrix
cna <- as.matrix(cna)
cna <- cna %>% janitor::remove_empty(which = c("rows", "cols"))
cna
}
#' Download copy number variant data from GEO using study specific id and process it to GISTIC compatible input format
#'
#' @param geo_code character string indicating name of GEO dataset. Default is "GSE21035"
#' (Taylor et al)
#' @param file_directory character string indicating path for downloading raw
#' GEO data
#' @param cleanup logical value to remove intermediate files
#' @param ... additional arguments
#' @noRd
#' @keywords internal
generate_gistic_geo <- function(geo_code = c(
"GSE54691", # Hieronymus et al.
"GSE21035" # Taylor et al. (aCGH only GSE-subset)
),
file_directory,
filter_regex,
cleanup = TRUE,
...) {
if (!missing(file_directory)) here::set_here(file_directory)
# Supplementary files include the raw CEL files, possibility to use regular expression filter
if (missing(filter_regex)) {
supfiles <- GEOquery::getGEOSuppFiles(geo_code)
} else {
supfiles <- GEOquery::getGEOSuppFiles(geo_code, filter_regex = filter_regex)
}
# Open the tarball(s)
utils::untar(tarfile = rownames(supfiles))
##
# same rCGH pipeline applied to datasets:
# Taylor et al. : GPL4091 Agilent-014693 Human Genome CGH Microarray 244A (Feature number version)
# Hieronymus et al. : GPL8737 Agilent-021529 Human CGH Whole Genome Microarray 1x1M (G4447A) (Probe Name version)
##
if (geo_code %in% c(
"GSE21035", # Taylor
"GSE54691" # Hieronymus
)) {
# Extract sample names in Taylor et al. and format to PCA####, for this need to access GSM-level metadata
if (geo_code == "GSE21035") {
# Access metadata based on GSM####
samplenames <- lapply(gsub(".txt.gz", "", grep("GSM", list.files(), value = TRUE)), FUN = function(x) {
GEOquery::getGEO(x, GSEMatrix = FALSE, getGPL = FALSE)@header$title
})
# Parse to just PCA#### from the title, splitting on whitespace character ' '
samplenames <- unlist(lapply(samplenames, FUN = function(x) {
strsplit(x, " ")[[1]][3]
}))
# File names as they are
filenames <- grep("GSM", list.files(), value = TRUE)
# Omit cell lines
filenames <- filenames[grep("PCA", samplenames)]
samplenames <- samplenames[grep("PCA", samplenames)]
}
# Hieronymus et al names based on GSM####
else {
# File names end in suffix .txt.gz, sample names have this string replaced
samplenames <- gsub(".txt.gz", "", grep("GSM", list.files(), value = TRUE))
filenames <- grep("GSM", list.files(), value = TRUE)
}
# For now, the package 'rCGH' has to be available in the workspace,
requireNamespace("rCGH")
# otherwise below functions will fail on e.g. rCGH::adjustSignal and when trying to find 'hg18'
# Read in Agilent 2-color data
cna <- lapply(1:length(filenames), FUN = function(i) {
try({
cat("\nProcessing: ", filenames[i], "\n")
rCGH::readAgilent(filenames[i], genome = "hg38", sampleName = samplenames[i])
})
})
#> list.files()[which(unlist(lapply(cna, FUN=class))=="try-error")]
# [1] "GSM525755.txt" "GSM525763.txt"
# Some files appear broken in Taylor et al; missing columns?
# Not all files can always be successfully processed
tryerr <- which(lapply(cna, FUN = class) == "try-error")
if (length(tryerr) > 0) {
# Warn of try-errors
warning(paste(
"Error while processing files: ",
# Collapse file names
paste(list.files()[which(unlist(lapply(cna, FUN = class)) == "try-error")], collapse = ", ")
))
# Omit data that could not be succcessfully read
cna <- cna[-tryerr]
}
# Signal adjustments
cna <- lapply(cna, FUN = function(z) {
try({
rCGH::adjustSignal(z)
})
})
# Segmentation
cna <- lapply(cna, FUN = function(z) {
try({
rCGH::segmentCGH(z)
})
})
# EM-algorithm normalization
cna <- lapply(cna, FUN = function(z) {
try({
rCGH::EMnormalize(z)
})
})
## Samplenames picked prior to this
# Remove additional suffixes from sample names
# cna <- lapply(cna, FUN = function(z){
# try({
# if(!"rCGH-Agilent" %in% class(z)){
# stop("This function is intended for Agilent aCGH analyzed with rCGH R Package (class \'rCGH-Agilent\')")
# }
# # e.g. transform "GSM525575.txt|.gz" -> "GSM525575"
# z@info["sampleName"] <- gsub(pattern = ".gz|.txt", replacement = "", z@info["fileName"])
# z
# })
# })
# Save sample names separately (of 'length(cna)')
# samplenames <- unlist(lapply(cna, FUN = function(z) { z@info["sampleName"] }))
# Reformat samplenames in Hieronymus
if (geo_code == "GSE54691") {
# Pick GSM-part in GSM###_PCA###
samplenames <- unlist(lapply(samplenames, FUN = function(z) {
strsplit(z, "_")[[1]][[1]]
}))
}
# Get segmentation table
cna <- lapply(cna, FUN = function(z) {
try({
rCGH::getSegTable(z)
})
})
# Get per-gene table
cna <- lapply(cna, FUN = function(z) {
try({
rCGH::byGeneTable(z)
})
})
# Extract all unique gene symbols present over all samples
genenames <- unique(unlist(lapply(cna, FUN = function(z) {
z$symbol
})))
# Bind genes to rows, name samples afterwards
cna <- do.call("cbind", lapply(cna, FUN = function(z) {
# Return CNAs as Log2Ratios
z[match(genenames, z$symbol), "Log2Ratio"]
}))
# Name rows and columns to genes and sample names, respectively
rownames(cna) <- genenames
colnames(cna) <- samplenames
# CNA matrix is ready
cna <- as.matrix(cna)
cna <- cna[!is.na(rownames(cna)), ]
}
# Other (placeholder) - should probably place this in the beginning so it
# doesnt try to run through all the beginning steps
else if (geo_code == "") {
stop("Must supply a GEO id")
}
# Unknown
else {
stop("Unknown GEO id, see allowed parameter values for geo_code")
}
# Remove downloaded files
# TODO: Make sure appropriate permissions exist for removing files
if (cleanup) {
# First GEO download
file.remove(rownames(supfiles))
# TODO: Tarballs
# file.remove(gz_files)
# Remove empty folder
file.remove(paste0(here::here(), "/", geo_code))
}
# Sort genes to alphabetic order for consistency
cna <- cna[order(rownames(cna)), ]
# Return numeric matrix
cna <- as.matrix(cna)
cna <- cna %>% janitor::remove_empty(which = c("rows", "cols"))
cna
}
#' Download generic 'omics data from cBioPortal using dataset specific query
#'
#' @param genes character vector of genes to query
#' @param geneticProfiles charatcer string of cbioportal genetic profiles
#' @param caseList charcter string of patient IDs for that genetic profile
#' @param delay numberic value for delay time between querying gene sets
#' @param splitsize number of genes in each query
#' @param verb logical value for displaying progress bar
#' @noRd
#' @keywords internal
generate_cbioportal <- function(
genes = sort(unique(curatedPCaData_genes$hgnc_symbol)),
geneticProfiles = c(
"prad_tcga_pub_rna_seq_v2_mrna", # TCGA GEX
"prad_tcga_pub_gistic", # TCGA CNA (GISTIC)
"prad_tcga_pub_linear_CNA", # TCGA CNA (Capped relative linear copy-number values)
"prad_tcga_pub_mutations", # TCGA mutation data
# prad_tcga_pub_fusion, # TCGA fusions
"prad_mskcc_mrna_median_Zscores", # Taylor et al. GEX (z-score normalized)
"prad_mskcc_cna", # Taylor et al. CNA
"prad_broad_mrna", # PRAD Broad GEX
"prad_broad_cna", # PRAD Broad CNA
"prad_eururol_2017_rna_seq_mrna", # PRAD Eururol GEX
"prad_eururol_2017_cna", # PRAD Eururol CNA
"prad_su2c_2019_mrna_seq_fpkm_capture_all_sample_Zscores", # Abida et al. FPKM values capture assay
"prad_su2c_2019_mrna_seq_fpkm_polya_all_sample_Zscores", # Abida et al. FPKM values PolyA
"prad_su2c_2019_gistic", # Abida et al. CNA
"prad_broad_2013_cna" # BACA
), # for cgdsr calls, platform and dataset specific string
caseList = c(
"prad_tcga_pub_sequenced", # TCGA
"prad_mskcc_sequenced", # Taylor et al.
"prad_broad_sequenced", # Barbieri et al.
"prad_eururol_2017_sequenced", # Ren et al.
"prad_su2c_2019_sequenced", # Abida et al.
"prad_broad_2013_sequenced" # Baca et al.
), # for cgdsr calls, platform and dataset specific string
delay = 0.05,
splitsize = 100,
verb = TRUE) {
# If given genes is a list (with slots for various annotation types), try to extract hugo gene symbols
if (is(genes, "list")) {
genes <- genes$hgnc_symbol
}
# Establisigh connection to cBioPortal
mycgds <- cgdsr::CGDS("http://www.cbioportal.org/")
# Split gene name vector into suitable lengths
genesplit <- rep(1:ceiling(length(genes) / splitsize),
each = splitsize
)[1:length(genes)]
splitgenes <- split(genes, f = genesplit)
# Fetch split gene name lists as separate calls
pb <- progress::progress_bar$new(total = length(splitgenes))
# Bind the API calls as per columns
ret <- as.matrix(do.call("cbind", lapply(1:length(splitgenes), FUN = function(z) {
if (verb == TRUE) pb$tick()
# Sleep if necessary to avoid API call overflow
Sys.sleep(delay)
# Fetch each split gene name list from the URL-based API, essentially a wrapper for cgdsr's own function
cgdsr::getProfileData(mycgds,
genes = splitgenes[[z]],
geneticProfiles = geneticProfiles, caseList = caseList
)
})))
ret <- t(ret)
# Remove fully empty rows/columns (redundancy)
ret <- ret %>% janitor::remove_empty(which = c("rows", "cols"))
}
#' Download mutation data from cBioPortal and format them into an oncoprint-friendly matrix
#'
#' @param study_id cBioPortal identifier
#' @param genes Hugo gene symbols
#' @param delay Delay between fetches
#' @param splitsize Number of genes per fetch
#' @param verb Level of verbosity
#' @param oncoprintify Whether an oncoprint ought to be output
#' @examples
#' oncop_tcga <- generate_cbioportal_oncoprint(study_id = "tcga", oncoprintify = TRUE)
#' oncop_taylor <- generate_cbioportal_oncoprint(study_id = "taylor", oncoprintify = TRUE)
#' oncop_barbieri <- generate_cbioportal_oncoprint(study_id = "barbieri", oncoprintify = TRUE)
#' oncop_ren <- generate_cbioportal_oncoprint(study_id = "ren", oncoprintify = TRUE)
#'
#' @noRd
#' @keywords internal
generate_cbioportal_oncoprint <- function(study_id, # tcga, taylor, barbieri, ren, or abida
genes = sort(unique(curatedPCaData_genes$hgnc_symbol)),
delay = 0.05,
splitsize = 100,
verb = TRUE,
oncoprintify = TRUE) {
study_id <- tolower(study_id)
# Remember cBioPortal genetic profile names and query according to study id
# Mutations
geneticProfileMutations <- c(
tcga = "prad_tcga_pub_mutations", # TCGA mutations
taylor = "prad_mskcc_mutations", # Taylor et al. mutations
barbieri = "prad_broad_mutations", # Barbieri et al. mutations
ren = "prad_eururol_2017_mutations", # Ren et al. mutations
abida = "prad_su2c_2019_mutations" # Abida et al. mutations
)
# CNA listings for GISTIC
geneticProfileCNA <- c( # Gistic-level mutation info, {-2,-1,0,1,2}
tcga = "prad_tcga_pub_gistic", # TCGA (GISTIC instead of capped relative linear copy numbers)
taylor = "prad_mskcc_cna", # Taylor et al., GISTIC
barbieri = "prad_broad_cna", # Barbieri et al., GISTIC
ren = "prad_eururol_2017_cna", # Ren et al. (GISTIC instead of capped relative linear copy numbers)
abida = "prad_su2c_2019_gistic" # Abida et al., GISTIC
)
# Sample ID list
caseList <- c(
tcga = "prad_tcga_pub_sequenced", # TCGA samples
taylor = "prad_mskcc_sequenced", # Taylor et al. samples
barbieri = "prad_broad_sequenced", # Barbieri et al. samples
ren = "prad_eururol_2017_sequenced", # Ren et al. samples
abida = "prad_su2c_2019_sequenced" # Abida et al. samples
)
# If given genes is a list (with slots for various annotation types), try to extract hugo gene symbols
if (is(genes, "list")) {
genes <- genes$hgnc_symbol
}
# Establisigh connection to cBioPortal
mycgds <- cgdsr::CGDS("http://www.cbioportal.org/")
# Split gene name vector into suitable lengths
genesplit <- rep(1:ceiling(length(genes) / splitsize), each = splitsize)[1:length(genes)]
splitgenes <- split(genes, f = genesplit)
# Fetch split gene name lists as separate calls
pb <- progress::progress_bar$new(total = length(splitgenes))
# Bind the API calls as per columns
if (verb) print("Downloading mutation data...")
mut <- do.call("rbind", lapply(1:length(splitgenes), FUN = function(z) {
# Sleep if necessary to avoid API call overflow
Sys.sleep(delay)
# Fetch each split gene name list from the URL-based API, essentially a wrapper for cgdsr's own function
cgdsr::getMutationData(mycgds,
genes = splitgenes[[z]], #
geneticProfile = geneticProfileMutations[study_id],
caseList = caseList[study_id]
)[, c("case_id", "gene_symbol", "mutation_type")]
}))
if (verb) print("Downloading copy number alteration (GISTIC) data...")
cna <- as.matrix(do.call("cbind", lapply(1:length(splitgenes), FUN = function(z) {
# Sleep if necessary to avoid API call overflow
Sys.sleep(delay)
# Fetch each split gene name list from the URL-based API, essentially a wrapper for cgdsr's own function
cgdsr::getProfileData(mycgds,
genes = splitgenes[[z]],
geneticProfile = geneticProfileCNA[study_id],
caseList = caseList[study_id]
)
})))
cna <- t(cna)
cna[which(cna == "NaN")] <- NA
if (!oncoprintify) {
# Return raw mutation and cna calls
list(mut, cna)
} else {
# Return an oncoprint-friendly, textified matrix with ;-separators
# Create base text matrix from GISTIC CNAs
if (verb) print("Appending CNAs to the oncoprint")
oncop <- t(apply(cna, MARGIN = 1, FUN = function(z) {
c("HOMDEL", "HETLOSS", "", "LOW_GAIN", "HIGH_AMP")[z + 3]
}))
dimnames(oncop) <- dimnames(cna)
# Add samples from mutation list that may have been lacking from CNA matrix
if (!all(mut$case_id %in% colnames(oncop))) {
samps <- matrix(NA, nrow = nrow(oncop), ncol = sum(!unique(mut$case_id) %in% colnames(oncop)))
colnames(samps) <- unique(mut$case_id)[!unique(mut$case_id) %in% colnames(oncop)]
oncop <- cbind(oncop, samps)
}
if (verb) print("Appending mutations to the oncoprint")
# Add gene names from mutation list that may have been lacking from CNA matrix
if (!all(mut$gene_symbol %in% rownames(oncop))) {
genes <- matrix(NA, nrow = sum(!unique(mut$gene_symbol) %in% rownames(oncop)), ncol = ncol(oncop))
rownames(genes) <- unique(mut$gene_symbol)[!unique(mut$gene_symbol) %in% rownames(oncop)]
oncop <- rbind(oncop, genes)
}
# Sanitize '-' symbols to '.' in mutations for genes and sample names similar to how CNA matrix has them
mut[, "gene_symbol"] <- gsub("-", ".", mut[, "gene_symbol"])
mut[, "case_id"] <- gsub("-", ".", mut[, "case_id"])
# Append mutations
for (i in 1:nrow(mut)) {
if (is.na(oncop[mut[i, "gene_symbol"], mut[i, "case_id"]]) | oncop[mut[i, "gene_symbol"], mut[i, "case_id"]] == "") {
oncop[mut[i, "gene_symbol"], mut[i, "case_id"]] <- mut[i, "mutation_type"]
} else {
oncop[mut[i, "gene_symbol"], mut[i, "case_id"]] <- paste(oncop[mut[i, "gene_symbol"], mut[i, "case_id"]], mut[i, "mutation_type"], sep = ";")
}
}
if (verb) print("Subsetting samples to match those present in MAE")
# Sample names from the corresponding MAE object
cols <- switch(study_id,
"tcga" = MultiAssayExperiment::colData(curatedPCaData::mae_tcga)$sample_name,
"taylor" = MultiAssayExperiment::colData(curatedPCaData::mae_taylor)$patient_id,
"barbieri" = MultiAssayExperiment::colData(curatedPCaData::mae_barbieri)$sample_name,
"ren" = MultiAssayExperiment::colData(curatedPCaData::mae_ren)$sample_name,
"abida" = MultiAssayExperiment::colData(curatedPCaData::mae_abida)$sample_name
)
# oncop <- oncop %>% janitor::remove_empty(which = c("rows", "cols"))
# NA columns if samples are not present
oncop <- oncop[, match(cols, colnames(oncop))]
colnames(oncop) <- cols
oncop
}
}
#' Download mutation data from cBioPortal using cgsdr package
#'
#' @param study_id cBioPortal identifier
#' @param genes Hugo gene symbols
#' @param delay Delay between fetches
#' @param splitsize Number of genes per fetch
#' @param verb Level of verbosity
#' @examples
#' ren_mut <- generate_cgdsr_mut("ren", genes = curatedPCaData_genes$hgnc_symbol)
#' barbieri_mut <- generate_cgdsr_mut("barbieri", genes = curatedPCaData_genes$hgnc_symbol)
#' @noRd
#' @keywords internal
generate_cgdsr_mut <- function(
study_id, # tcga, taylor, barbieri, ren, or abida
genes = sort(unique(curatedPCaData_genes$hgnc_symbol)),
delay = 0.05,
splitsize = 100,
verb = TRUE) {
study_id <- tolower(study_id)
# Remember cBioPortal genetic profile names and query according to study id
# Mutations
geneticProfileMutations <- c(
tcga = "prad_tcga_pub_mutations", # TCGA mutations
taylor = "prad_mskcc_mutations", # Taylor et al. mutations
barbieri = "prad_broad_mutations", # Barbieri et al. mutations
ren = "prad_eururol_2017_mutations", # Ren et al. mutations
abida = "prad_su2c_2019_mutations" # Abida et al. mutations
)
# CNA listings for GISTIC
geneticProfileCNA <- c( # Gistic-level mutation info, {-2,-1,0,1,2}
tcga = "prad_tcga_pub_gistic", # TCGA (GISTIC instead of capped relative linear copy numbers)
taylor = "prad_mskcc_cna", # Taylor et al., GISTIC
barbieri = "prad_broad_cna", # Barbieri et al., GISTIC
ren = "prad_eururol_2017_cna", # Ren et al. (GISTIC instead of capped relative linear copy numbers)
abida = "prad_su2c_2019_gistic" # Abida et al., GISTIC
)
# Sample ID list
caseList <- c(
tcga = "prad_tcga_pub_sequenced", # TCGA samples
taylor = "prad_mskcc_sequenced", # Taylor et al. samples
barbieri = "prad_broad_sequenced", # Barbieri et al. samples
ren = "prad_eururol_2017_sequenced", # Ren et al. samples
abida = "prad_su2c_2019_sequenced" # Abida et al. samples
)
# If given genes is a list (with slots for various annotation types), try to extract hugo gene symbols
if (is(genes, "list")) {
genes <- genes$hgnc_symbol
}
# Establisigh connection to cBioPortal
mycgds <- cgdsr::CGDS("http://www.cbioportal.org/")
# Split gene name vector into suitable lengths
genesplit <- rep(1:ceiling(length(genes) / splitsize), each = splitsize)[1:length(genes)]
splitgenes <- split(genes, f = genesplit)
# Fetch split gene name lists as separate calls
pb <- progress::progress_bar$new(total = length(splitgenes))
# Bind the API calls as per columns
if (verb) print("Downloading mutation data...")
mut <- do.call("rbind", lapply(1:length(splitgenes), FUN = function(z) {
# Sleep if necessary to avoid API call overflow
Sys.sleep(delay)
# Fetch each split gene name list from the URL-based API, essentially a wrapper for cgdsr's own function
cgdsr::getMutationData(mycgds,
genes = splitgenes[[z]], #
geneticProfile = geneticProfileMutations[study_id],
caseList = caseList[study_id]
)
}))
list(mut)
# }
}
#' Download generic 'omics data from cBioPortal using the cbioportaldata package
#'
#' @param profile character string of variable
#' @param caseList character string of patient IDs for that genetic profile
#' @examples
#' cna.gistic_abida <- generate_cbioportaldata("prad_su2c_2019", "cna")
#' gex.relz_abida <- generate_cbioportaldata("prad_su2c_2019", "gex")
#' @noRd
#' @keywords internal
generate_cbioportaldata <- function(caselist, profile) {
harmonize_matrix <- function(matrix) {
# if the gene names dont match the hgnc_symbols column, create a seperate matrix called no_match with those rownames
no_match <- matrix[is.na(match(rownames(matrix), curatedPCaData_genes$hgnc_symbol)), ]
no_match <- as.data.frame(no_match)
# if the gene names match the hgnc_symbols column, create a seperate matrix called match with those rownames
match <- matrix[!is.na(match(rownames(matrix), curatedPCaData_genes$hgnc_symbol)), ]
match <- as.data.frame(match)
# Replace any "." in gene names with "-" since that is how the curatedpcadata_genes dictionary has them
rownames(match) <- gsub("\\.", "-", rownames(match))
rownames(no_match) <- gsub("\\.", "-", rownames(no_match))
vector <- rownames(no_match)
symbols <- vector()
# For those genes with no match try matching it to the aliase column and pull the hgnc_symbol associated with it.
original_gene <- vector()
# curatedPCaData_genes<-curatedPCaData_genes
for (i in 1:length(vector)) {
# match_name genes to the aliases column in the curatedpcadata dictionary
match_name <- curatedPCaData_genes[grep(paste0("(?<![^;])", vector[i], "(?![^;])"), curatedPCaData_genes$Aliases, value = FALSE, perl = TRUE), "hgnc_symbol"]
# Assign NAs to the ones that had no match_name
match_name[length(match_name) == 0] <- NA
# Store data with duplicates
orig2 <- replicate(length(match_name), vector[i])
original_gene <- c(original_gene, orig2)
symbols <- c(symbols, match_name)
}
# create a dictionary/df with the aliases and the associated hgnc_symbol
no_match_dict <- data.frame(original_gene = original_gene, mapped_gene = symbols)
# Remove those aliases that did not map to any hgnc_symbol
no_match_dict2 <- no_match_dict[!is.na(no_match_dict$mapped_gene), ]
# Remove duplicates in the mapped hgnc symbols
no_match_dict3 <- no_match_dict2[!duplicated(no_match_dict2$mapped_gene), ]
# check which aliase is duplicated(ie maps to multiple hgnc symbols)
dup <- no_match_dict3[duplicated(no_match_dict3$original_gene), ]
dup_vector <- dup[!duplicated(dup$original_gene), ]$original_gene
# Keep only those aliases that have a one to one mapping
remove_dup <- no_match_dict3[!(no_match_dict3$original_gene %in% dup_vector), ]
# create a vector by matching aliases to the new df
final_map <- vector()
for (i in 1:length(vector)) {
if (vector[i] %in% remove_dup$original_gene) {
map <- remove_dup$mapped_gene[which(vector[i] == remove_dup$original_gene)]
final_map <- c(final_map, map)
} else {
map <- NA
final_map <- c(final_map, map)
}
}
no_match <- no_match[!is.na(final_map), ]
final_map <- final_map[!is.na(final_map)]
rownames(no_match) <- final_map
# For those that match just pull hgnc_symbols directly
symbols2 <- curatedPCaData_genes[match(rownames(match), curatedPCaData_genes$hgnc_symbol), "hgnc_symbol"]
match <- match[!is.na(symbols2), ]
symbols2 <- symbols2[!is.na(symbols2)]
rownames(match) <- symbols2
# Combine the match and no_match matrices
final_matrix <- rbind(match, no_match)
# Replace "-" in colnames with "."
colnames(final_matrix) <- gsub("-", ".", colnames(final_matrix))
# Remove rows with all NAs
final_matrix <- final_matrix[rowSums(is.na(final_matrix)) != ncol(final_matrix), ]
# Return sorted based on alphabetic rownames
return(final_matrix[order(rownames(final_matrix)), ])
}
harmonize_raggedexp <- function(ragexp2) {
no_match <- ragexp2[is.na(match(rownames(ragexp2), curatedPCaData_genes$hgnc_symbol)), ]
match <- ragexp2[!is.na(match(rownames(ragexp2), curatedPCaData_genes$hgnc_symbol)), ]
rownames(match) <- gsub("\\.", "-", rownames(match))
rownames(no_match) <- gsub("\\.", "-", rownames(no_match))
vector <- rownames(no_match)
symbols <- vector()
# no_match_dict=data.frame(matrix(ncol=2))
# curatedPCaData_genes<-curatedPCaData_genes
# For those genes with no match try matching it to the aliase column and pull the hgnc_symbol associated with it.
# Match just the first gene and store it in a vector
# Do the aliase match for the rest of the genes and append it to the vector called symbols
original_gene <- vector()
for (i in 1:length(vector)) {
# match_name genes to the aliases column in the curatedpcadata dictionary
match_name <- curatedPCaData_genes[grep(paste0("(?<![^;])", vector[i], "(?![^;])"), curatedPCaData_genes$Aliases, value = FALSE, perl = TRUE), "hgnc_symbol"]
# Assign NAs to the ones that had no match_name
match_name[length(match_name) == 0] <- NA
# Store data with duplicates
orig2 <- replicate(length(match_name), vector[i])
original_gene <- c(original_gene, orig2)
symbols <- c(symbols, match_name)
}
# create a dictionary/df with the aliases and the associated hgnc_symbol
no_match_dict <- data.frame(original_gene = original_gene, mapped_gene = symbols)
# Remove those aliases that did not map to any hgnc_symbol
no_match_dict2 <- no_match_dict[!is.na(no_match_dict$mapped_gene), ]
# Remove duplicates in the mapped hgnc symbols
no_match_dict3 <- no_match_dict2[!duplicated(no_match_dict2$mapped_gene), ]
# check which aliase is duplicated(ie maps to multiple hgnc symbols)
dup <- no_match_dict3[duplicated(no_match_dict3$original_gene), ]
dup_vector <- dup[!duplicated(dup$original_gene), ]$original_gene
# Keep only those aliases that have a one to one mapping
remove_dup <- no_match_dict3[!(no_match_dict3$original_gene %in% dup_vector), ]
# create a vector by matching aliases to the new df
final_map <- vector()
for (i in 1:length(vector)) {
if (vector[i] %in% remove_dup$original_gene) {
map <- remove_dup$mapped_gene[which(vector[i] == remove_dup$original_gene)]
final_map <- c(final_map, map)
} else {
map <- NA
final_map <- c(final_map, map)
}
}
no_match <- no_match[!is.na(final_map), ]
final_map <- final_map[!is.na(final_map)]
rownames(no_match) <- final_map
# For those that match just pull hgnc_symbols directly
symbols2 <- curatedPCaData_genes[match(rownames(match), curatedPCaData_genes$hgnc_symbol), "hgnc_symbol"]
match <- match[!is.na(symbols2), ]
symbols2 <- symbols2[!is.na(symbols2)]
rownames(match) <- symbols2
# Combine the match and no_match matrices
match_df <- match@assays
match_df <- unlist(match_df)
match_df <- data.frame(match_df, names = names(match_df))
# match_df<-match_df[,c(1:3,46,4:45)]
no_match_df <- no_match@assays
no_match_df <- unlist(no_match_df)
no_match_df <- data.frame(no_match_df, names = names(no_match_df))
# no_match_df<-no_match_df[,c(1:3,46,4:45)]
final <- rbind(match_df, no_match_df)
final$NCBI_Build <- "GRCh38"
final$sample <- sub("^(.*)[.].*", "\\1", final$names)
final$gene <- sub(".*\\.", "", final$names)
final <- final[, -which(names(final) %in% "names")]
GRL <- GenomicRanges::makeGRangesListFromDataFrame(final,
split.field = "sample",
names.field = "gene", keep.extra.columns = TRUE
)
GenomeInfoDb::genome(GRL) <- "GRCh38"
ragexp_final <- RaggedExperiment::RaggedExperiment(GRL)
colnames(ragexp_final) <- gsub("-", ".", colnames(ragexp_final))
# final_matrix= c(rowRanges(match),rowRanges(no_match))
return(ragexp_final)
}
prof <- cBioPortalData::cBioDataPack(caselist, ask = FALSE)
if (profile == "cna") {
if (caselist == "prad_broad") {
# Get the copy number matrix as Summarizedexperiment object
res <- prof[["cna"]]
# Get gene names from Summarizedexperiment and store it in a variable
a <- RaggedExperiment::rowData(res)[match(rownames(res), rownames(RaggedExperiment::rowData(res))), "Hugo_Symbol"]
# Create a matrix with cna and gene names making sure that there are no NAs in gene names
res2 <- RaggedExperiment::assay(res)
res2 <- res2[!is.na(a), ]
a <- a[!is.na(a)]
rownames(res2) <- a
# Harmonize matrix
final_matrix_cna <- harmonize_matrix(res2)
return(as.matrix(final_matrix_cna))
} else if (caselist == "prad_mskcc_2014") {
# Get the copy number matrix as Summarizedexperiment object
res <- prof[["CNA"]]
# Get gene names from Summarizedexperiment and store it in a variable
a <- RaggedExperiment::rowData(res)[match(rownames(res), rownames(RaggedExperiment::rowData(res))), "Hugo_Symbol"]
# Create a matrix with cna and gene names making sure that there are no NAs in gene names
res2 <- RaggedExperiment::assay(res)
res2 <- res2[!is.na(a), ]
a <- a[!is.na(a)]
rownames(res2) <- a
res2 <- as.data.frame(res2)
# Harmonize matrix
final_matrix_cna <- harmonize_matrix(res2)
return(as.matrix(final_matrix_cna))
} else if (caselist == "prad_eururol_2017") {
res <- prof[["cna"]]
# Create a matrix with cna and gene names
res2 <- RaggedExperiment::assay(res)
# Harmonize matrix
final_matrix_cna <- harmonize_matrix(res2)
return(as.matrix(final_matrix_cna))
} else if (caselist == "prad_su2c_2019") {
# Get cna from metadata of the MAE
res <- metadata(prof)$cna
res <- as.data.frame(res)
# Remove duplicate gene symbols
res2 <- res[!(duplicated(res$Hugo_Symbol) | duplicated(res$Hugo_Symbol, fromLast = TRUE)), , drop = FALSE]
# Set gene symbols as rownames and remove the extra column
rownames(res2) <- res2$Hugo_Symbol
res2 <- res2[, -1]
# Harmonize matrix
final_matrix_cna <- harmonize_matrix(res2)
return(as.matrix(final_matrix_cna))
} else if (caselist == "prad_mskcc") {
# Download study and store cna matrix in res2
study <- cBioPortalData::downloadStudy("prad_mskcc")
ut <- cBioPortalData::untarStudy(study[[1]])
res2 <- rio::import(paste0(ut, "/prad_mskcc/", "data_cna.txt"))
res2 <- res2[, -2]
# Remove duplicate gene symbols
res2 <- res2[!duplicated(res2$Hugo_Symbol), ]
# Set gene symbols as rownames and remove the extra column
rownames(res2) <- res2$Hugo_Symbol
res2 <- res2[, -1]
# Exclude cell line samples
same_barcode <- colnames(res2)[grepl("PCA", colnames(res2))]
ind <- which(colnames(res2) %in% same_barcode == "TRUE")
res2 <- res2[, c(ind)]
# Harmonize matrix
final_matrix_cna <- harmonize_matrix(res2)
return(as.matrix(final_matrix_cna))
} else if (caselist == "prad_broad_2013") {
res <- prof[["cna"]]
res2 <- RaggedExperiment::assay(res)
# Harmonize matrix
final_matrix_cna <- harmonize_matrix(res2)
return(as.matrix(final_matrix_cna))
}
}
if (profile == "mut") {
ragexp <- prof[["mutations"]]
# Download chain file into data raw directory and unzip it
# download.file("https://hgdownload.soe.ucsc.edu/gbdb/hg19/liftOver/hg19ToHg38.over.chain.gz", "./data-raw/hg19ToHg38.over.chain.gz")
# system("gzip -d ./data-raw/hg19ToHg38.over.chain.gz")
ch <- rtracklayer::import.chain("./data-raw/hg19ToHg38.over.chain")
if (caselist == "prad_eururol_2017") {
# Liftover using chain file
names(ch) <- gsub("chr", "", names(ch))
ranges <- rtracklayer::liftOver(rowRanges(ragexp), ch)
ragexp2 <- ragexp[as.logical(lengths(ranges))]
ranges <- unlist(ranges)
GenomeInfoDb::genome(ranges) <- "GRCh38"
rowRanges(ragexp2) <- ranges
# Run harmonize function to harmonize gene names
final_ragexp <- harmonize_raggedexp(ragexp2)
# Add tumor barcode to metadata of the Grangelist
final_ragexp_df <- final_ragexp@assays
final_ragexp_df <- unlist(final_ragexp_df)
final_ragexp_df <- data.frame(final_ragexp_df, names = names(final_ragexp_df))
final_ragexp_df$sample <- sub("^(.*)[.].*", "\\1", final_ragexp_df$names)
final_ragexp_df$gene <- sub(".*\\.", "", final_ragexp_df$names)
final_ragexp_df <- final_ragexp_df[, -which(names(final_ragexp_df) %in% "names")]
final_ragexp_df$Tumor_sample_barcode <- final_ragexp_df$sample
GRL <- GenomicRanges::makeGRangesListFromDataFrame(final_ragexp_df,
split.field = "sample",
names.field = "gene", keep.extra.columns = TRUE
)
ragexp_final2 <- RaggedExperiment::RaggedExperiment(GRL)
return(ragexp_final2)
} else if (caselist == "prad_broad") {
# Liftover using chain file
names(ch) <- gsub("chr", "", names(ch))
ranges <- rtracklayer::liftOver(rowRanges(ragexp), ch)
ragexp2 <- ragexp[as.logical(lengths(ranges))]
ranges <- unlist(ranges)
GenomeInfoDb::genome(ranges) <- "GRCh38"
rowRanges(ragexp2) <- ranges
# Run harmonize function to harmonize gene names
final_ragexp <- harmonize_raggedexp(ragexp2)
# Add tumor barcode to metadata of the Grangelist
final_ragexp_df <- final_ragexp@assays
final_ragexp_df <- unlist(final_ragexp_df)
final_ragexp_df <- data.frame(final_ragexp_df, names = names(final_ragexp_df))
final_ragexp_df$sample <- sub("^(.*)[.].*", "\\1", final_ragexp_df$names)
final_ragexp_df$gene <- sub(".*\\.", "", final_ragexp_df$names)
final_ragexp_df <- final_ragexp_df[, -which(names(final_ragexp_df) %in% "names")]
final_ragexp_df$Tumor_sample_barcode <- final_ragexp_df$sample
GRL <- GenomicRanges::makeGRangesListFromDataFrame(final_ragexp_df,
split.field = "sample",
names.field = "gene", keep.extra.columns = TRUE
)
ragexp_final2 <- RaggedExperiment::RaggedExperiment(GRL)
return(ragexp_final2)
} else if (caselist == "prad_su2c_2019") {
# Convert ragexp to Grangelist and store it in ragexp_grangelist
ragexp_grangelist <- ragexp@assays
# Add gene names to metadata of the Grangelist
ragexp_grangelist@unlistData@elementMetadata@listData$gene <- ragexp_grangelist@unlistData@ranges@NAMES
# Liftover using chain file
names(ch) <- gsub("chr", "", names(ch))
ranges2 <- rtracklayer::liftOver(ragexp_grangelist, ch)
GenomeInfoDb::genome(ranges2) <- "GRCh38"
# Convert grangelist to raggedexperiment
ragexp2 <- RaggedExperiment::RaggedExperiment(ranges2)
# Assign gene names in the metadata to the rownames of the raggedexperiment object
rownames(ragexp2) <- ragexp2@assays@unlistData@elementMetadata@listData[["gene"]]
# Run harmonize function to harmonize gene names
final_ragexp <- harmonize_raggedexp(ragexp2)
# Add tumor barcode to metadata of the Grangelist
final_ragexp_df <- final_ragexp@assays
final_ragexp_df <- unlist(final_ragexp_df)
final_ragexp_df <- data.frame(final_ragexp_df, names = names(final_ragexp_df))
final_ragexp_df$sample <- sub("^(.*)[.].*", "\\1", final_ragexp_df$names)
final_ragexp_df$gene <- sub(".*\\.", "", final_ragexp_df$names)
final_ragexp_df <- final_ragexp_df[, -which(names(final_ragexp_df) %in% "names")]
final_ragexp_df$Tumor_sample_barcode <- final_ragexp_df$sample
sift <- rio::import("./data-raw/mut_abida_edited.txt.hg38_multianno.txt")
add_sift <- cbind(final_ragexp_df, sift)
add_sift$Matched_Norm_Sample_Barcode <- gsub("-", ".", add_sift$Matched_Norm_Sample_Barcode)
add_sift <- add_sift[, -which(names(add_sift) %in% c("Chr", "Start", "End", "Ref", "Alt"))]
GRL <- GenomicRanges::makeGRangesListFromDataFrame(add_sift,
split.field = "sample",
names.field = "gene", keep.extra.columns = TRUE
)
ragexp_final2 <- RaggedExperiment::RaggedExperiment(GRL)
return(ragexp_final2)
} else if (caselist == "prad_mskcc") {
# Liftover using chain file
names(ch) <- gsub("chr", "", names(ch))
ranges <- rtracklayer::liftOver(rowRanges(ragexp), ch)
ragexp2 <- ragexp[as.logical(lengths(ranges))]
ranges <- unlist(ranges)
GenomeInfoDb::genome(ranges) <- "GRCh38"
rowRanges(ragexp2) <- ranges
# Run harmonize function to harmonize gene names
final_ragexp <- harmonize_raggedexp(ragexp2)
# Exclude cell lines
final_ragexp_df <- final_ragexp@assays
final_ragexp_df <- unlist(final_ragexp_df)
final_ragexp_df <- data.frame(final_ragexp_df, names = names(final_ragexp_df))
final_ragexp_df$sample <- sub("^(.*)[.].*", "\\1", final_ragexp_df$names)
final_ragexp_df$gene <- sub(".*\\.", "", final_ragexp_df$names)
final_ragexp_df <- final_ragexp_df[, -which(names(final_ragexp_df) %in% "names")]
# final_ragexp_df<-final_ragexp_df[final_ragexp_df$sample %like% "PCA", ]
final_ragexp_df <- final_ragexp_df[data.table::`%like%`(final_ragexp_df$sample, "PCA"), ]
final_ragexp_df$Tumor_sample_barcode <- final_ragexp_df$sample
GRL <- GenomicRanges::makeGRangesListFromDataFrame(final_ragexp_df,
split.field = "sample",
names.field = "gene", keep.extra.columns = TRUE
)
ragexp_final2 <- RaggedExperiment::RaggedExperiment(GRL)
return(ragexp_final2)
} else if (caselist == "prad_broad_2013") {
# Convert ragexp to Grangelist and store it in ragexp_grangelist
ragexp_grangelist <- ragexp@assays
# Add gene names to metadata of the Grangelist
ragexp_grangelist@unlistData@elementMetadata@listData$gene <- ragexp_grangelist@unlistData@ranges@NAMES
# Liftover using chain file
names(ch) <- gsub("chr", "", names(ch))
ranges2 <- rtracklayer::liftOver(ragexp_grangelist, ch)
GenomeInfoDb::genome(ranges2) <- "GRCh38"
# Convert grangelist to raggedexperiment
ragexp2 <- RaggedExperiment::RaggedExperiment(ranges2)
# Assign gene names in the metadata to the rownames of the raggedexperiment object
rownames(ragexp2) <- ragexp2@assays@unlistData@elementMetadata@listData[["gene"]]
# Run harmonize function to harmonize gene names
final_ragexp <- harmonize_raggedexp(ragexp2)
# Add tumor barcode to metadata of the Grangelist
final_ragexp_df <- final_ragexp@assays
final_ragexp_df <- unlist(final_ragexp_df)
final_ragexp_df <- data.frame(final_ragexp_df, names = names(final_ragexp_df))
final_ragexp_df$sample <- sub("^(.*)[.].*", "\\1", final_ragexp_df$names)
final_ragexp_df$gene <- sub(".*\\.", "", final_ragexp_df$names)
final_ragexp_df <- final_ragexp_df[, -which(names(final_ragexp_df) %in% "names")]
final_ragexp_df$Tumor_sample_barcode <- final_ragexp_df$sample
GRL <- GenomicRanges::makeGRangesListFromDataFrame(final_ragexp_df,
split.field = "sample",
names.field = "gene", keep.extra.columns = TRUE
)
ragexp_final2 <- RaggedExperiment::RaggedExperiment(GRL)
return(ragexp_final2)
}
}
if (profile == "gex") {
if (caselist == "prad_eururol_2017") {
gex <- prof[["mrna_seq_rpkm_zscores_ref_all_samples"]]
gex2 <- RaggedExperiment::assay(gex)
# Harmonize matrix
final_matrix_gex <- harmonize_matrix(gex2)
return(as.matrix(final_matrix_gex))
} else if (caselist == "prad_broad") {
gex <- prof[["mrna_agilent_microarray_zscores_ref_all_samples"]]
gex2 <- RaggedExperiment::assay(gex)
# Harmonize matrix
final_matrix_gex <- harmonize_matrix(gex2)
return(as.matrix(final_matrix_gex))
} else if (caselist == "prad_su2c_2019") {
gex <- metadata(prof)$mrna_seq_fpkm_polya_zscores_ref_all_samples
gex <- as.data.frame(gex)
# Remove duplicate gene symbols
gex2 <- gex[!(duplicated(gex$Hugo_Symbol) | duplicated(gex$Hugo_Symbol, fromLast = TRUE)), , drop = FALSE]
rownames(gex2) <- gex2$Hugo_Symbol
gex2 <- gex2[, -1]
# Harmonize matrix
final_matrix_gex <- harmonize_matrix(gex2)
return(as.matrix(final_matrix_gex))
}
}
}
#' Download and generate omics from the ICGC
#'
#' @param icgc_id character string indicating name of ICGC dataset
#' @param file_directory character string indicating path for downloading the ICGC files
#' @param omic character string indicating which omic to download and generate for the specified icgc_id
#'
#' ICGC Publication Policy for embargoes etc: http://www.icgc.org/icgc/goals-structure-policies-guidelines/e3-publication-policy
#' ICGC Publication guidelines: http://docs.icgc.org/portal/publication/#current-moratorium-status-for-icgc-projects .
#' (e.g. 'All data shall become free of a publication moratorium when either the data is published by the ICGC member project or
#' one year after a specified quantity of data (e.g. genome dataset from 100 tumours per project) has been released via the ICGC
#' database or other public databases. In all cases data shall be free of a publication moratorium two years after its initial release.')
#' "PRAD-CA: No Embargo. Data available without limitations"
#' "PRAD-UK: No Embargo. Data available without limitations"
#' "PRAD-FR: No Embargo. Data available without limitations"
#' "PRAD-CN: Publication limitations in place until 2020-04-30" (expired) (Only simple mutations, excluded)
#' "EOPC-DE: No Embargo. Data available without limitations" (Need to verify biological applicability)
#'
#' NOTE: Sometimes the downloads seem to fail randomly; perhaps a fixed amount of retries ought to be allowed?
#' @noRd
#' @keywords internal
generate_icgc <- function(icgc_id = "PRAD_CA", # Study which ought to be downloaded; Canadian Prostate Adenocarcima study as default; note ICGC uses format 'PRAD-CA' but '_' is used for R-friendliness
set = "gex", # Which dataset (patient or sample data / omics platform) to try to extract from the data; valid values: 'clinical', 'gex', 'cna', ...
file_directory, # Temporary download location
collapseFUN = mean, # Function to use to average/median etc over multiple genes/probes mapped to same hugo symbol
verb = 0 # Level of verbosity; 0 = minimal, 1 = informative, 2 = debugging
) {
# Use upper case similar to ICGC's naming conventions, also map '-' into '_' as it is not a mathematical operator and is safer
icgc_id <- base::gsub("-", "_", base::toupper(icgc_id))
# Currently the studies from Canada, UK and France have enough samples & omics to fit to the package
if (!icgc_id %in% c("PRAD_CA", "PRAD_FR", "PRAD_UK")) {
stop("The queried ICGC study ought to be one of: 'PRAD_CA', 'PRAD_UK', or 'PRAD_FR'")
}
# If provided, set to a custom temporary download directory
if (!missing(file_directory)) here::set_here(file_directory)
# At the time of writing, the latest public release is 28
# The downloadable files depend on study and fixed URLs are listed here-in
icgc_sets <- list(
# Canada (Release 28 fixed reference, instead of 'current', for reproducibility)
"PRAD_CA" = c(
"https://dcc.icgc.org/api/v1/download?fn=/release_28/Projects/PRAD-CA/copy_number_somatic_mutation.PRAD-CA.tsv.gz",
"https://dcc.icgc.org/api/v1/download?fn=/release_28/Projects/PRAD-CA/donor.PRAD-CA.tsv.gz",
"https://dcc.icgc.org/api/v1/download?fn=/release_28/Projects/PRAD-CA/donor_exposure.PRAD-CA.tsv.gz",
"https://dcc.icgc.org/api/v1/download?fn=/release_28/Projects/PRAD-CA/donor_family.PRAD-CA.tsv.gz",
"https://dcc.icgc.org/api/v1/download?fn=/release_28/Projects/PRAD-CA/donor_therapy.PRAD-CA.tsv.gz",
"https://dcc.icgc.org/api/v1/download?fn=/release_28/Projects/PRAD-CA/exp_array.PRAD-CA.tsv.gz",
"https://dcc.icgc.org/api/v1/download?fn=/release_28/Projects/PRAD-CA/meth_array.PRAD-CA.tsv.gz",
"https://dcc.icgc.org/api/v1/download?fn=/release_28/Projects/PRAD-CA/sample.PRAD-CA.tsv.gz",
"https://dcc.icgc.org/api/v1/download?fn=/release_28/Projects/PRAD-CA/simple_somatic_mutation.open.PRAD-CA.tsv.gz",
"https://dcc.icgc.org/api/v1/download?fn=/release_28/Projects/PRAD-CA/specimen.PRAD-CA.tsv.gz",
"https://dcc.icgc.org/api/v1/download?fn=/release_28/Projects/PRAD-CA/structural_somatic_mutation.PRAD-CA.tsv.gz"
),
# France (Release 28 fixed reference, instead of 'current', for reproducibility)
"PRAD_FR" = c(
"https://dcc.icgc.org/api/v1/download?fn=/release_28/Projects/PRAD-FR/copy_number_somatic_mutation.PRAD-FR.tsv.gz",
"https://dcc.icgc.org/api/v1/download?fn=/release_28/Projects/PRAD-FR/donor.PRAD-FR.tsv.gz",
"https://dcc.icgc.org/api/v1/download?fn=/release_28/Projects/PRAD-FR/donor_family.PRAD-FR.tsv.gz",
"https://dcc.icgc.org/api/v1/download?fn=/release_28/Projects/PRAD-FR/donor_surgery.PRAD-FR.tsv.gz",
"https://dcc.icgc.org/api/v1/download?fn=/release_28/Projects/PRAD-FR/exp_array.PRAD-FR.tsv.gz",
"https://dcc.icgc.org/api/v1/download?fn=/release_28/Projects/PRAD-FR/exp_seq.PRAD-FR.tsv.gz",
"https://dcc.icgc.org/api/v1/download?fn=/release_28/Projects/PRAD-FR/sample.PRAD-FR.tsv.gz",
"https://dcc.icgc.org/api/v1/download?fn=/release_28/Projects/PRAD-FR/simple_somatic_mutation.open.PRAD-FR.tsv.gz",
"https://dcc.icgc.org/api/v1/download?fn=/release_28/Projects/PRAD-FR/specimen.PRAD-FR.tsv.gz",
"https://dcc.icgc.org/api/v1/download?fn=/release_28/Projects/PRAD-FR/structural_somatic_mutation.PRAD-FR.tsv.gz"
),
# United Kingdom (Release 28 fixed, instead of 'current', for reproducibility)
"PRAD_UK" = c(
"https://dcc.icgc.org/api/v1/download?fn=/release_28/Projects/PRAD-UK/copy_number_somatic_mutation.PRAD-UK.tsv.gz",
"https://dcc.icgc.org/api/v1/download?fn=/release_28/Projects/PRAD-UK/donor.PRAD-UK.tsv.gz",
"https://dcc.icgc.org/api/v1/download?fn=/release_28/Projects/PRAD-UK/donor_exposure.PRAD-UK.tsv.gz",
"https://dcc.icgc.org/api/v1/download?fn=/release_28/Projects/PRAD-UK/donor_family.PRAD-UK.tsv.gz",
"https://dcc.icgc.org/api/v1/download?fn=/release_28/Projects/PRAD-UK/donor_therapy.PRAD-UK.tsv.gz",
"https://dcc.icgc.org/api/v1/download?fn=/release_28/Projects/PRAD-UK/meth_array.PRAD-UK.tsv.gz",
"https://dcc.icgc.org/api/v1/download?fn=/release_28/Projects/PRAD-UK/sample.PRAD-UK.tsv.gz",
"https://dcc.icgc.org/api/v1/download?fn=/release_28/Projects/PRAD-UK/simple_somatic_mutation.open.PRAD-UK.tsv.gz",
"https://dcc.icgc.org/api/v1/download?fn=/release_28/Projects/PRAD-UK/specimen.PRAD-UK.tsv.gz",
"https://dcc.icgc.org/api/v1/download?fn=/release_28/Projects/PRAD-UK/structural_somatic_mutation.PRAD-UK.tsv.gz"
)
# Possibly? (Early Onset Prostate Cancer, Germany)
# https://dcc.icgc.org/releases/release_28/Projects/EOPC-DE
)
# Small internal function to assist with the downloads
.icgcDownload <- function(url) {
# Pick the filename from the end of the URL
filename <- strsplit(url, "/")
filename <- filename[[1]][[length(filename[[1]])]]
# Download file into parsed *.tsv.gz
utils::download.file(url = url, destfile = filename)
# gunzip the files open
GEOquery::gunzip(filename, overwrite = TRUE)
filename
}
# Loop over the list of urls, download and gunzip them
files <- icgc_sets[[icgc_id]]
# Select subset of files to download
if (set == "clinical") { # Clinical data
files <- grep("sample|donor|specimen", files, value = TRUE)
} else if (set == "gex") { # Gene expression (array or sequencing)
files <- grep("exp_array|exp_seq", files, value = TRUE)
} else if (set == "cna") { # Copy number alterations
files <- grep("copy_number_somatic_mutation", files, value = TRUE)
} else if (set == "mut") { # Point mutations
files <- grep("simple_somatic_mutation", files, value = TRUE)
} else if (set == "met") { # Methylation
files <- grep("meth_array", files, value = TRUE)
} else if (set == "str") { # Larger structural aberrations
files <- grep("structural_somatic_mutation", files, value = TRUE)
} else {
stop("Invalid parameter 'set'; should be one of: 'clinical, 'gex', 'cna', 'mut', 'met', or 'str'")
}
# list apply the .icgcDownload-function to all eligible files
files <- unlist(lapply(files, FUN = .icgcDownload))
# Verbosity
if (verb >= 1) print(paste("Files downloaded & extracted:", paste(files, collapse = ", ")))
# Gunzip has extracted the compressed files, removing the suffix accordingly
files <- gsub(".gz", "", files)
# Pick an 'omics based on the requested 'omic' and process into a suitable data.frame or matrix
if (set == "clinical") { # Clinical data
# Main id file, should contain only simple file with prefix 'sample.*'
ret <- read.table(grep("sample", files, value = TRUE), sep = "\t", header = TRUE, stringsAsFactors = FALSE)
# Other files to append:
# donor.* contains multiple useful clinical features
# donor_family.* possibly interesting family history related to cancer
# Append useful donor.* -file contained information to the clinical data
don <- read.table(grep("donor.", files, value = TRUE, fixed = TRUE), sep = "\t", header = TRUE, stringsAsFactors = FALSE)
ret <- cbind(ret, don[match(ret$icgc_donor_id, don$icgc_donor_id), ])
# Append useful specimen.* -file containing information e.g. for gleason grade
spe <- read.table(grep("specimen.", files, value = TRUE, fixed = TRUE), sep = "\t", header = TRUE, stringsAsFactors = FALSE)
ret <- cbind(ret, spe[match(ret$icgc_donor_id, spe$icgc_donor_id), ])
} else if (set == "gex") { # Gene expression (array or sequencing)
# In RNA-seq, use raw read counts and normalize them
if (any(grepl("exp_seq", files))) {
ret <- read.table(grep("exp_seq", files, value = TRUE), sep = "\t", header = TRUE, stringsAsFactors = FALSE)[, c("icgc_sample_id", "gene_id", "raw_read_count")]
# In microarrays, need to use the prenormalized expression values
} else if (any(grepl("exp_array", files))) {
ret <- read.table(grep("exp_array", files, value = TRUE), sep = "\t", header = TRUE, stringsAsFactors = FALSE)[, c("icgc_sample_id", "gene_id", "normalized_expression_value")]
} else {
stop("Unknown gene expression file type")
}
# Mapping of genes to hugo symbols is dependent on the dataset
if (icgc_id == "PRAD_CA") {
# Normalized expression values based on refseq gene ids, add hugo symbols
ret$hgnc_symbol <- curatedPCaData_genes[match(ret$gene_id, curatedPCaData_genes$refseq_mrna), "hgnc_symbol"]
# Remove entries for which gene_id or hgnc_symbol is missing
ret <- ret[-which(is.na(ret$gene_id) | is.na(ret$hgnc_symbol)), ]
# Create a gex matrix of samples as columns and genes as rows
genes <- sort(unique(ret$hgnc_symbol))
genes <- genes[-which(genes == "")]
samples <- unique(ret$icgc_sample_id)
# Loop over samples, inner loop for mapping genes and then collapsing if multiple measurements exist for same hgnc symbol
ret <- do.call("cbind", by(ret,
INDICES = ret$icgc_sample_id,
FUN = function(x) {
unlist(lapply(genes, FUN = function(z) {
collapseFUN(x[match(z, x$hgnc_symbol), "normalized_expression_value"])
}))
}
))
rownames(ret) <- genes
} else if (icgc_id == "PRAD_FR") {
# Normalizing raw counts
}
} else if (set == "cna") { # Copy number alterations
ret <- read.table(grep("copy_number_somatic", files, value = TRUE), sep = "\t", header = TRUE, stringsAsFactors = FALSE)[, c("icgc_sample_id", "gene_affected", "mutation_type", "copy_number", "chromosome", "chromosome_start", "chromosome_end", "assembly_version")]
} else if (set == "mut") { # Point mutations
ret <- read.table(grep("simple_somatic_mutation", files, value = TRUE), sep = "\t", header = TRUE, stringsAsFactors = FALSE)[, c("icgc_sample_id", "gene_affected", "mutation_type", "chromosome", "chromosome_start", "chromosome_end", "assembly_version", "chromosome_strand", "reference_genome_allele", "mutated_from_allele", "mutated_to_allele", "quality_score", "consequence_type", "aa_mutation", "cds_mutation", "transcript_affected")]
} else if (set == "met") { # Methylation
ret <- read.table(grep("meth_array", files, , value = TRUE), sep = "\t", header = TRUE, stringsAsFactors = FALSE)[, c("icgc_sample_id")]
} else if (set == "str") { # Larger structural aberrations
ret <- read.table(grep("structural_somatic_mutation", files, value = TRUE), sep = "\t", header = TRUE, stringsAsFactors = FALSE)[, c("icgc_sample_id", "variant_type", "sv_id", "placement", "annotation", "interpreted_annotation", "chr_from", "chr_from_strand", "chr_from_range", "chr_from_flanking_seq", "chr_to", "chr_to_bkpt", "chr_to_strand", "chr_to_range", "chr_to_flanking_seq", "assembly_version", "gene_affected_by_bkpt_to", "gene_affected_by_bkpt_to", "transcript_affected_by_bkpt_from", "transcript_affected_by_bkpt_to", "bkpt_from_context", "bkpt_to_context", "gene_build_version", "platform")]
}
ret
}
#' Roxygen topic for generate_xenabrowser
#'
#' @noRd
#' @keywords internal
generate_xenabrowser <- function(
id = "TCGA-PRAD", # Study ID (by expectation TCGA's Prostate Adenocarcinoma
type = c("gex", "cna", "mut", "clinical"), # First instance of vector is used to determine what is extracted
# Function for collapsing rows for identical gene symbols; separate for gene expression (GEX) or copy number alteration (CNA) data, as latter is rounded to integers in case of median giving out means between two mid-most samples
collapse_fun_gex = function(z) {
apply(z, MARGIN = 2, FUN = stats::median)
},
collapse_fun_cna = function(z) {
apply(z, MARGIN = 2, FUN = function(x) {
round(stats::median(x), 0)
})
},
# If Sample IDs should be truncated down to Patient ID level (leave out last segment of the '-' or '.' separators)
truncate = TRUE,
# Number of digits to store for the data object; for large matrices this may be required to stay beneath 100 MB, or to get rid of insignificant digits
digits,
# If intermediate files ought to be removed
cleanup = TRUE,
...) {
# Small internal function to assist with the downloads from xenabrowser.net
.xenabrowserDownload <- function(url, gz = TRUE) {
# Pick the filename from the end of the URL
filename <- strsplit(url, "/")
filename <- filename[[1]][[length(filename[[1]])]]
# Download file into parsed *.tsv.gz
utils::download.file(url = url, destfile = filename)
# if .gz, gunzip the files open
if (gz) GEOquery::gunzip(filename, overwrite = TRUE)
gsub(".gz", "", filename)
}
harmonize_matrix <- function(matrix) {
# if the gene names dont match the hgnc_symbols column, create a seperate matrix called no_match with those rownames
no_match <- matrix[is.na(match(rownames(matrix), curatedPCaData_genes$hgnc_symbol)), ]
no_match <- as.data.frame(no_match)
# if the gene names match the hgnc_symbols column, create a seperate matrix called match with those rownames
match <- matrix[!is.na(match(rownames(matrix), curatedPCaData_genes$hgnc_symbol)), ]
match <- as.data.frame(match)
# Replace any "." in gene names with "-" since that is how the curatedpcadata_genes dictionary has them
rownames(match) <- gsub("\\.", "-", rownames(match))
rownames(no_match) <- gsub("\\.", "-", rownames(no_match))
vector <- rownames(no_match)
vector <- gsub("\\?.*", NA, vector)
vector <- vector[!is.na(vector)]
symbols <- vector()
no_match <- no_match[vector, ]
# For those genes with no match try matching it to the aliase column and pull the hgnc_symbol associated with it.
original_gene <- vector()
# curatedPCaData_genes<-curatedPCaData_genes
for (i in 1:length(vector)) {
# match_name genes to the aliases column in the curatedpcadata dictionary
match_name <- curatedPCaData_genes[grep(paste0("(?<![^;])", vector[i], "(?![^;])"), curatedPCaData_genes$Aliases, value = FALSE, perl = TRUE), "hgnc_symbol"]
# Assign NAs to the ones that had no match_name
match_name[length(match_name) == 0] <- NA
# Store data with duplicates
orig2 <- replicate(length(match_name), vector[i])
original_gene <- c(original_gene, orig2)
symbols <- c(symbols, match_name)
}
# create a dictionary/df with the aliases and the associated hgnc_symbol
no_match_dict <- data.frame(original_gene = original_gene, mapped_gene = symbols)
# Remove those aliases that did not map to any hgnc_symbol
no_match_dict2 <- no_match_dict[!is.na(no_match_dict$mapped_gene), ]
# Remove duplicates in the mapped hgnc symbols
no_match_dict3 <- no_match_dict2[!duplicated(no_match_dict2$mapped_gene), ]
# check which aliase is duplicated(ie maps to multiple hgnc symbols)
dup <- no_match_dict3[duplicated(no_match_dict3$original_gene), ]
dup_vector <- dup[!duplicated(dup$original_gene), ]$original_gene
# Keep only those aliases that have a one to one mapping
remove_dup <- no_match_dict3[!(no_match_dict3$original_gene %in% dup_vector), ]
# create a vector by matching aliases to the new df
final_map <- vector()
for (i in 1:length(vector)) {
if (vector[i] %in% remove_dup$original_gene) {
map <- remove_dup$mapped_gene[which(vector[i] == remove_dup$original_gene)]
final_map <- c(final_map, map)
} else {
map <- NA
final_map <- c(final_map, map)
}
}
no_match <- no_match[!is.na(final_map), ]
final_map <- final_map[!is.na(final_map)]
rownames(no_match) <- final_map
# For those that match just pull hgnc_symbols directly
symbols2 <- curatedPCaData_genes[match(rownames(match), curatedPCaData_genes$hgnc_symbol), "hgnc_symbol"]
match <- match[!is.na(symbols2), ]
symbols2 <- symbols2[!is.na(symbols2)]
rownames(match) <- symbols2
# Combine the match and no_match matrices
final_matrix <- rbind(match, no_match)
# Replace "-" in colnames with "."
colnames(final_matrix) <- gsub("-", ".", colnames(final_matrix))
# Remove rows with all NAs
final_matrix <- final_matrix[rowSums(is.na(final_matrix)) != ncol(final_matrix), ]
return(final_matrix)
}
harmonize_raggedexp <- function(ragexp2) {
no_match <- ragexp2[is.na(match(rownames(ragexp2), curatedPCaData_genes$hgnc_symbol)), ]
match <- ragexp2[!is.na(match(rownames(ragexp2), curatedPCaData_genes$hgnc_symbol)), ]
rownames(match) <- gsub("\\.", "-", rownames(match))
rownames(no_match) <- gsub("\\.", "-", rownames(no_match))
vector <- rownames(no_match)
symbols <- vector()
# curatedPCaData_genes<-curatedPCaData_genes
# For those genes with no match try matching it to the aliase column and pull the hgnc_symbol associated with it.
# Match just the first gene and store it in a vector
# Do the aliase match for the rest of the genes and append it to the vector called symbols
original_gene <- vector()
for (i in 1:length(vector)) {
# match_name genes to the aliases column in the curatedpcadata dictionary
match_name <- curatedPCaData_genes[grep(paste0("(?<![^;])", vector[i], "(?![^;])"), curatedPCaData_genes$Aliases, value = FALSE, perl = TRUE), "hgnc_symbol"]
# Assign NAs to the ones that had no match_name
match_name[length(match_name) == 0] <- NA
# Store data with duplicates
orig2 <- replicate(length(match_name), vector[i])
original_gene <- c(original_gene, orig2)
symbols <- c(symbols, match_name)
}
# create a dictionary/df with the aliases and the associated hgnc_symbol
no_match_dict <- data.frame(original_gene = original_gene, mapped_gene = symbols)
# Remove those aliases that did not map to any hgnc_symbol
no_match_dict2 <- no_match_dict[!is.na(no_match_dict$mapped_gene), ]
# Remove duplicates in the mapped hgnc symbols
no_match_dict3 <- no_match_dict2[!duplicated(no_match_dict2$mapped_gene), ]
# check which aliase is duplicated(ie maps to multiple hgnc symbols)
dup <- no_match_dict3[duplicated(no_match_dict3$original_gene), ]
dup_vector <- dup[!duplicated(dup$original_gene), ]$original_gene
# Keep only those aliases that have a one to one mapping
remove_dup <- no_match_dict3[!(no_match_dict3$original_gene %in% dup_vector), ]
# create a vector by matching aliases to the new df
final_map <- vector()
for (i in 1:length(vector)) {
if (vector[i] %in% remove_dup$original_gene) {
map <- remove_dup$mapped_gene[which(vector[i] == remove_dup$original_gene)]
final_map <- c(final_map, map)
} else {
map <- NA
final_map <- c(final_map, map)
}
}
no_match <- no_match[!is.na(final_map), ]
final_map <- final_map[!is.na(final_map)]
rownames(no_match) <- final_map
# For those that match just pull hgnc_symbols directly
symbols2 <- curatedPCaData_genes[match(rownames(match), curatedPCaData_genes$hgnc_symbol), "hgnc_symbol"]
match <- match[!is.na(symbols2), ]
symbols2 <- symbols2[!is.na(symbols2)]
rownames(match) <- symbols2
# Combine the match and no_match matrices
match_df <- match@assays
match_df <- unlist(match_df)
match_df <- data.frame(match_df, names = names(match_df))
# match_df<-match_df[,c(1:3,46,4:45)]
no_match_df <- no_match@assays
no_match_df <- unlist(no_match_df)
no_match_df <- data.frame(no_match_df, names = names(no_match_df))
# no_match_df<-no_match_df[,c(1:3,46,4:45)]
final <- rbind(match_df, no_match_df)
final$NCBI_Build <- "GRCh38"
final$sample <- sub("^(.*)[.].*", "\\1", final$names)
final$gene <- sub(".*\\.", "", final$names)
final <- final[, -which(names(final) %in% "names")]
GRL <- GenomicRanges::makeGRangesListFromDataFrame(final,
split.field = "sample",
names.field = "gene", keep.extra.columns = TRUE
)
GenomeInfoDb::genome(GRL) <- "GRCh38"
ragexp_final <- RaggedExperiment::RaggedExperiment(GRL)
# final_matrix= c(rowRanges(match),rowRanges(no_match))
return(ragexp_final)
}
# Cases (typically TCGA-PRAD)
if (id == "TCGA-PRAD") {
# Release Mid 2019ish
urls <- list(
"rsem.log" = "https://tcga-xena-hub.s3.us-east-1.amazonaws.com/download/TCGA.PRAD.sampleMap%2FHiSeqV2.gz",
"gistic" = "https://tcga-xena-hub.s3.us-east-1.amazonaws.com/download/TCGA.PRAD.sampleMap%2FGistic2_CopyNumber_Gistic2_all_thresholded.by_genes.gz",
"mc3" = "https://tcga-xena-hub.s3.us-east-1.amazonaws.com/download/mc3%2FPRAD_mc3.txt.gz",
"phenotype" = "https://tcga-xena-hub.s3.us-east-1.amazonaws.com/download/TCGA.PRAD.sampleMap%2FPRAD_clinicalMatrix",
"os" = "https://tcga-xena-hub.s3.us-east-1.amazonaws.com/download/survival%2FPRAD_survival.txt"
)
# Gene expression
if (type == "gex") {
# Download the FPKM-UQ values processed via HTSeq
file <- .xenabrowserDownload(urls[["rsem.log"]])
dat <- read.table(file, sep = "\t", header = TRUE, row.names = 1)
dat <- harmonize_matrix(dat)
# If column names should truncate last segment (sample -> patient id level truncation)
if (truncate >= 1) {
print("Truncating to 3 dot-separated names...")
# Remove fourth element separated by '.'
colnames(dat) <- unlist(lapply(colnames(dat), FUN = function(x) {
paste(strsplit(x, ".", fixed = TRUE)[[1]][1:3], collapse = ".")
}))
# Lesser truncation with suffix '.01A' -> '.01' as in cBio
} else if (truncate >= 0) {
print("Substituting '.01A|.01B' with '.01' ...")
# Sub '.01A" with the default ".01"
colnames(dat) <- gsub(".01A|.01B", ".01", colnames(dat))
}
# Low quality RNA samples omitted
# Utilize additional information of low quality samples from pathology review and RNA degradation
# Samples excluded in pathology review
PathReview <- c(
"TCGA-G9-6332",
"TCGA-HC-7738",
"TCGA-HC-7745",
"TCGA-HC-7819",
"TCGA-HC-8259",
"TCGA-EJ-A46B",
"TCGA-EJ-A46E",
"TCGA-EJ-A46H",
"TCGA-H9-A6BX",
"TCGA-HC-7817",
"TCGA-KK-A8IM",
"TCGA-V1-A8MK",
"TCGA-EJ-A8FO",
"TCGA-EJ-A8FP",
"TCGA-J4-A83J",
"TCGA-KK-A8I7"
)
# Samples indicated to suffer from RNA degradation
RNAdegrade <- c(
"TCGA-J4-A67O-01A-11R-A30B-07",
"TCGA-QU-A6IM-01A-11R-A31N-07",
"TCGA-QU-A6IL-01A-11R-A31N-07",
"TCGA-KK-A7AW-01A-11R-A32O-07",
"TCGA-G9-6347-01A-11R-A31N-07",
"TCGA-EJ-A46F-01A-31R-A250-07",
"TCGA-QU-A6IO-01A-11R-A31N-07",
"TCGA-J4-A67Q-01A-21R-A30B-07",
"TCGA-QU-A6IN-01A-11R-A31N-07",
"TCGA-G9-6379-01A-11R-A31N-07",
"TCGA-KK-A59V-01A-11R-A29R-07",
"TCGA-EJ-7325-01B-11R-A32O-07",
"TCGA-HC-A6HX-01A-11R-A31N-07",
"TCGA-G9-6362-01A-11R-1789-07",
"TCGA-J4-A67N-01A-11R-A30B-07",
"TCGA-FC-A6HD-01A-11R-A31N-07",
"TCGA-KK-A7AY-01A-11R-A33R-07",
"TCGA-G9-6498-01A-12R-A311-07",
"TCGA-KK-A7B0-01A-11R-A32O-07",
"TCGA-V1-A9OT-01A-11R-A41O-07",
"TCGA-HC-A6AQ-01A-11R-A30B-07",
"TCGA-EJ-A65D-01A-11R-A30B-07",
"TCGA-J4-A67R-01A-21R-A30B-07",
"TCGA-J4-A67M-01A-11R-A30B-07",
"TCGA-EJ-A65M-01A-11R-A29R-07",
"TCGA-MG-AAMC-01A-11R-A41O-07",
"TCGA-M7-A723-01A-12R-A32O-07",
"TCGA-EJ-A65B-01A-12R-A30B-07",
"TCGA-KK-A6E3-01A-21R-A30B-07",
"TCGA-H9-A6BY-01A-11R-A30B-07",
"TCGA-EJ-AB20-01A-12R-A41O-07",
"TCGA-J4-A67S-01A-11R-A30B-07",
"TCGA-HC-A6AP-01A-11R-A30B-07",
"TCGA-X4-A8KQ-01A-12R-A36G-07",
"TCGA-HC-7752-01A-11R-2118-07",
"TCGA-G9-6496-01A-11R-1789-07",
"TCGA-HC-A6AN-01A-11R-A30B-07",
"TCGA-FC-A66V-01A-21R-A30B-07",
"TCGA-HC-A6AS-01A-11R-A30B-07",
"TCGA-HC-A6AO-01A-11R-A30B-07",
"TCGA-HI-7169-01A-11R-2118-07",
"TCGA-J4-A67L-01A-11R-A30B-07",
"TCGA-HC-A6HY-01A-11R-A31N-07",
"TCGA-G9-6354-01A-11R-A311-07",
"TCGA-J4-A67K-01A-21R-A30B-07",
"TCGA-G9-6369-01A-21R-1965-07",
"TCGA-KK-A6E4-01A-11R-A30B-07",
"TCGA-KK-A6E7-01A-11R-A31N-07",
"TCGA-G9-6339-01A-12R-A311-07",
"TCGA-G9-6373-01A-11R-1789-07",
"TCGA-KK-A7AQ-01A-11R-A33R-07",
"TCGA-HC-A6AL-01A-11R-A30B-07",
"TCGA-ZG-A9L9-01A-11R-A41O-07",
"TCGA-KK-A6E8-01A-11R-A31N-07",
"TCGA-2A-A8VX-01A-11R-A37L-07",
"TCGA-G9-6343-01A-21R-1965-07",
"TCGA-KC-A7F5-01A-11R-A33R-07",
"TCGA-J9-A52E-01A-11R-A26U-07",
"TCGA-J9-A52C-01A-11R-A26U-07",
"TCGA-XJ-A9DX-01A-11R-A37L-07",
"TCGA-G9-6338-01A-12R-1965-07",
"TCGA-M7-A724-01A-12R-A32O-07",
"TCGA-KK-A7B2-01A-12R-A32O-07",
"TCGA-ZG-A9LB-01A-11R-A41O-07",
"TCGA-V1-A8MM-01A-11R-A37L-07",
"TCGA-M7-A71Z-01A-12R-A32O-07",
"TCGA-XK-AAK1-01A-11R-A41O-07",
"TCGA-VN-A88M-01A-11R-A352-07",
"TCGA-KK-A7AZ-01A-12R-A32O-07",
"TCGA-ZG-A9LM-01A-11R-A41O-07",
"TCGA-XJ-A83F-01A-11R-A352-07",
"TCGA-XJ-A9DQ-01A-11R-A37L-07",
"TCGA-ZG-A9LU-01A-11R-A41O-07",
"TCGA-G9-7525-01A-31R-2263-07",
"TCGA-G9-6496-11A-01R-1789-07", # Normal sample
"TCGA-G9-6362-11A-01R-1789-07", # Normal sample
"TCGA-G9-6348-11A-01R-1789-07" # Normal sample
)
# Substituting '-' with '.' due to R's variable name conventions
PathReview <- paste(gsub("-", ".", PathReview), ".01", sep = "") # All pathology review samples with low quality ended in .01
RNAdegrade <- gsub("-", ".", RNAdegrade)
RNAdegrade <- gsub(".01A", ".01", substr(RNAdegrade, start = 0, stop = 16))
# Exclusion of low quality samples (3 normals, after the larger data was subset to provisional tumor samples)
dat <- dat[, which(!colnames(dat) %in% c(PathReview, RNAdegrade))]
# Alphabetic ordering
dat <- dat[order(rownames(dat)), ]
# Copy number alterations (discretized by GISTIC)
} else if (type == "cna") {
# Download the copy number alteration values processed via GISTIC
file <- .xenabrowserDownload(urls[["gistic"]])
dat <- read.table(file, sep = "\t", header = TRUE, row.names = 1)
dat <- harmonize_matrix(dat)
# If column names should truncate last segment (sample -> patient id level truncation)
if (truncate >= 1) {
print("Truncating to 3 dot-separated names...")
# Remove fourth element separated by '.'
colnames(dat) <- unlist(lapply(colnames(dat), FUN = function(x) {
paste(strsplit(x, ".", fixed = TRUE)[[1]][1:3], collapse = ".")
}))
# Lesser truncation with suffix '.01A' -> '.01' as in cBio
} else if (truncate >= 0) {
print("Substituting '.01A|.01B' with '.01' ...")
# Sub '.01A" with the default ".01"
colnames(dat) <- gsub(".01A|.01B", ".01", colnames(dat))
}
# Alphabetic ordering
dat <- dat[order(rownames(dat)), ]
# Small mutations (SNV / INDELs called by Mutect2)
} else if (type == "mut") {
# Download the MuTect2 somatic mutation calls
file <- .xenabrowserDownload(urls[["mc3"]])
# Suitable for RaggedExperiment style data storage
dat <- read.table(file, sep = "\t", header = TRUE, quote = "<")
# UCSCXenaTools::XenaGenerate(subset = XenaHostNames=="tcgaHub") %>%
# XenaFilter(filterDatasets = "PRAD")%>% XenaFilter(filterDatasets = "PRAD_mc3.txt") -> df_todo
# XenaQuery(df_todo) %>%
# XenaDownload() -> xe_download
# dat<-XenaPrepare(xe_download)
# if(cleanup) file.remove(file)
tcga_mut <- dat[, c(2:4, 1, 5:12)]
colnames(tcga_mut)[1:3] <- c("seqnames", "start", "end")
tcga_mut$sample <- gsub("-", ".", tcga_mut$sample)
tcga_mut$sample <- gsub("01A", "01", tcga_mut$sample)
names(tcga_mut)[names(tcga_mut) == "effect"] <- "Variant_Classification"
# Add tumor and normal sample barcodes
prof <- cBioPortalData::cBioDataPack("prad_tcga", ask = FALSE)
ragexp <- prof[["mutations"]]
ragexp_df <- ragexp@assays
ragexp_df <- unlist(ragexp_df)
ragexp_df <- data.frame(ragexp_df, names = names(ragexp_df))
ragexp_df$sample <- sub("\\..*", "", ragexp_df$names)
ragexp_df$gene <- gsub("^.*?\\.", "", ragexp_df$names)
ragexp_df$Tumor_sample_barcode <- ragexp_df$sample
ragexp_df$sample <- gsub("-", ".", ragexp_df$sample)
ragexp_df <- ragexp_df[, c("sample", "Tumor_sample_barcode", "Matched_Norm_Sample_Barcode")]
ragexp_df <- ragexp_df[!duplicated(ragexp_df$sample), ]
merged <- merge(tcga_mut, ragexp_df, by = "sample")
# a=subset(tcga_mut, Sample_ID %in% colnames(mae_tcga[["gex.fpkm"]]))
GRL <- GenomicRanges::makeGRangesListFromDataFrame(merged,
split.field = "sample",
names.field = "gene", keep.extra.columns = TRUE
)
ragexp_tcga <- RaggedExperiment::RaggedExperiment(GRL)
# Liftover from hg19 to hg38
ch <- rtracklayer::import.chain("./data-raw/hg19ToHg38.over.chain")
names(ch) <- gsub("chr", "", names(ch))
ranges <- rtracklayer::liftOver(rowRanges(ragexp_tcga), ch)
ragexp2 <- ragexp_tcga[as.logical(lengths(ranges))]
ranges <- unlist(ranges)
GenomeInfoDb::genome(ranges) <- "GRCh38"
rowRanges(ragexp2) <- ranges
# Harmonize gene names
final_ragexp <- harmonize_raggedexp(ragexp2)
return(final_ragexp)
# Clinical data matrix construction
} else if (type == "clinical") {
# Generic phenotype information
file <- .xenabrowserDownload(urls[["phenotype"]], gz = FALSE)
phenotype <- read.table(file, sep = "\t", header = TRUE, row.names = 1, quote = "#")
if (cleanup) file.remove(file)
# Overall Survival
file <- .xenabrowserDownload(urls[["os"]], gz = FALSE)
os <- read.table(file, sep = "\t", header = TRUE, row.names = 1, quote = "#")
if (cleanup) file.remove(file)
# Combine the two
dat <- cbind(phenotype, os[match(rownames(phenotype), rownames(os)), ])
return(dat)
# .11A are healthy samples (GEX)
# rownames(dat) <- gsub(".01A|.11A|01B", "", gsub("-", ".", rownames(dat)))
# Unknown data type
} else {
stop(paste("Invalid query type for xenabrowser:", type))
}
}
# Round to certain digits if requested
if (!missing(digits)) dat <- round(dat, digits)
# Return the processed dat
return(as.matrix(dat))
}
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.