knitr::opts_chunk$set(echo = TRUE, collapse = TRUE)
suppressPackageStartupMessages({ library(dplyr) })
We used human RNA sequencing datasets from the RNA-seq Sample Compendia in refine.bio, which hosts uniformly processed gene expression data from EBI's ArrayExpress and NCBI's GEO and SRA. Data were downloaded on April 10th, 2020, and we selected studies based on the following criteria:

1. Exclude studies with more than 1,000 samples because they are more likely to be single-cell RNA sequencing datasets.
2. Exclude studies assigned the Medical Subject Headings (MeSH) term "Single-Cell Analysis".
3. Exclude studies with 50 or fewer successfully downloaded and imported samples.

After filtering, the complete compendium includes 536 studies (each defined as a single SRA study) comprising 44,890 samples.
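For orientation, the condensed sketch below shows how these three criteria translate into a single filter. It is not the code used to build the model; it assumes the studyMeta table (columns downloaded, metadata, imported) and the sc_studies vector of single-cell study accessions, both of which are constructed step by step later in this vignette.

## Condensed sketch of the selection criteria (assumes `studyMeta` and
## `sc_studies`, which are built further down in this vignette)
selected <- studyMeta %>%
  dplyr::filter(metadata < 1000,              # 1) drop likely single-cell studies (> 1,000 samples)
                !studyName %in% sc_studies,   # 2) drop studies tagged with the "Single-Cell Analysis" MeSH term
                downloaded > 50,              # 3) keep studies with > 50 downloaded samples
                imported)                     #    that were also successfully imported
nrow(selected)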
The details of the training data are summarized in the studyMeta.tsv table in the GenomicSuperSignature package. This vignette explains how this summary table is built.
extdata <- system.file("extdata", package = "GenomicSuperSignature")
studyMeta <- read.table(file.path(extdata, "studyMeta.tsv.gz"))
head(studyMeta)
aggregated_metadata.json is downloaded along with the training dataset. We collect the number of samples per study based on this metadata.
data.dir <- system.file("extdata", package = "GenomicSuperSignaturePaper")
meta_json <- jsonlite::fromJSON(file.path(data.dir, "aggregated_metadata.json"))
experiments <- meta_json[[2]]

studies <- list()
for (i in seq_along(experiments)) {
  studies <- append(studies, list(experiments[[i]]$sample_accession_codes))
  names(studies)[i] <- names(experiments)[i]
}
sampleNum <- sapply(studies, length)
I summarized the number of samples based on aggregated_metadata.json under the metadata column, and the number of actually downloaded _quant.sf sample files under the downloaded column of the studyMeta table.
## Directory where refine.bio human samples were downloaded
download.dir <- "~/Data/refinebio_download/rna_seq"
# download.dir <- "path/to/download/directory"

dirNames <- list.files(download.dir)
ind_rm <- which(dirNames %in% c("aggregated_metadata.json", "README.md", "LICENSE.TXT"))
if (length(ind_rm) != 0) {dirNames <- dirNames[-ind_rm]}
length(dirNames)   # 6457 studies were downloaded

studyMeta <- as.data.frame(matrix(NA, nrow = length(dirNames), ncol = 3))
colnames(studyMeta) <- c("studyName", "downloaded", "metadata")
studyMeta$studyName <- dirNames

for (i in seq_along(dirNames)) {
  dir_path <- file.path(download.dir, dirNames[i])
  ## Actually downloaded samples
  ind1 <- which(studyMeta$studyName == dirNames[i])
  studyMeta$downloaded[ind1] <- grep("_quant.sf", list.files(dir_path)) %>% length
  ## Available samples based on metadata
  ind2 <- which(names(sampleNum) == dirNames[i])
  studyMeta$metadata[ind2] <- sampleNum[ind2]
}
1399 studies with more than 20 samples were successfully imported by tximport::tximport.
download.dir2 <- "~/Data/refinebio_processed/rna_seq_v2"
studyNames <- studyMeta$studyName
importedFiles <- c()
# download.dir2 <- "path/to/imported/directory"

for (i in seq_along(studyNames)) {
  out.path <- file.path(download.dir2, studyNames[i])
  fname <- file.path(out.path, paste0(studyNames[i], ".rds"))
  if (file.exists(fname)) {
    importedFiles <- c(importedFiles, studyNames[i])
  }
}
imported_ind <- which(studyMeta$studyName %in% importedFiles)
studyMeta$imported <- FALSE
studyMeta$imported[imported_ind] <- TRUE
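The import step itself is not shown in this vignette (it is documented under Methods/model_building). The sketch below only illustrates roughly what importing one study's quant.sf files with tximport might look like; the tx2gene transcript-to-gene mapping and the exact file layout are assumptions, not the code used for model building.

## Rough, hypothetical sketch of importing one study with tximport
## (refine.bio quant.sf files are Salmon outputs; `tx2gene` is assumed to exist)
library(tximport)
study_dir <- file.path(download.dir, studyNames[1])
quant_files <- list.files(study_dir, pattern = "_quant.sf$", full.names = TRUE)
names(quant_files) <- gsub("_quant.sf$", "", basename(quant_files))
txi <- tximport(quant_files, type = "salmon", tx2gene = tx2gene)   # gene-level counts
dim(txi$counts)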
The study title is also added.
for (i in seq_along(experiments)) {
  title <- experiments[[i]]$title
  studyMeta$title[i] <- title
}
Example study, SRP152576:

- 13,127 samples are available based on aggregated_metadata.json.
- No sample is available based on the refine.bio webpage.
- 100 samples were actually downloaded.
- 3,927 samples are available based on metadata_SRP152576.tsv.
ind <- which.max(studyMeta$metadata)
studyMeta[ind,]
file_dir <- file.path(download.dir, studyMeta$studyName[ind])
fname <- paste0("metadata_", studyMeta$studyName[ind], ".tsv")
sampleMeta <- read.table(file.path(file_dir, fname), header = TRUE, sep = "\t")
dim(sampleMeta)
head(sampleMeta, 3)
The metadata bar (light blue) shows the number of studies within the given ranges of study sizes based on the metadata. The downloaded bar (pink) represents the number of studies within the given ranges of study sizes that were successfully downloaded and imported through tximport. Based on metadata, there were studies with more than 100 samples, but at the time of the snapshot only up to 100 samples per study were available, so the plot shows only cases with up to 100 samples/study. Because certain samples were unavailable, more studies fall into the 0-5 samples/study bracket than the metadata suggests.
## Colors
c1 <- rgb(173, 216, 230, max = 255, alpha = 95, names = "lt.blue")
c2 <- rgb(255, 192, 203, max = 255, alpha = 95, names = "lt.pink")

ax <- seq(0, 100, 5)
meta <- studyMeta$metadata[which(studyMeta$metadata < 100)]
dl <- studyMeta$downloaded[which(studyMeta$downloaded < 100)]

hg_meta <- hist(meta, breaks = ax, plot = FALSE)
hg_download <- hist(dl, breaks = ax, plot = FALSE)

plot(hg_meta, col = c1, ylim = c(1, 2000), main = "Data Availability",
     xlab = "# of samples/study", ylab = "# of studies")
plot(hg_download, col = c2, ylim = c(1, 2000), add = TRUE)
legend("topright", c("metadata", "downloaded"), col = c(c1, c2), lwd = 10)
## Cases where all the available samples (based on metadata) were downloaded
sum(studyMeta$downloaded == studyMeta$metadata)

## The number of studies with more than 100 samples:
## Actually downloaded vs. expected by metadata
sum(studyMeta$downloaded > 100)
sum(studyMeta$metadata > 100)
data.dir <- system.file("extdata", package = "GenomicSuperSignaturePaper")
fname <- file.path(data.dir, "source_name_annotated.tsv")
source_name_annotated <- read.csv(fname, sep = "\t")
source_cancer <- source_name_annotated$cancer
source_cancer <- tidyr::replace_na(source_cancer, 2)

slices <- table(source_cancer)
lbls <- c("Not Cancer", "Cancer", "NA")
pct <- round(slices/sum(slices)*100)
lbls <- paste(lbls, pct)            # add percents to labels
lbls <- paste(lbls, "%", sep = "")  # add % to labels
pie(slices, labels = lbls, main = "Cancer Studies in Training Data")
source_blood <- source_name_annotated$blood
source_blood <- tidyr::replace_na(source_blood, 2)

slices <- table(source_blood)
lbls <- c("Not Blood", "Blood", "NA")
pct <- round(slices/sum(slices)*100)
lbls <- paste(lbls, pct)            # add percents to labels
lbls <- paste(lbls, "%", sep = "")  # add % to labels
pie(slices, labels = lbls, main = "Blood Studies in Training Data")
source_cellLine <- source_name_annotated$cell_line
source_cellLine <- tidyr::replace_na(source_cellLine, 2)

slices <- table(source_cellLine)
lbls <- c("Not Cell Line", "Cell Line", "NA")
pct <- round(slices/sum(slices)*100)
lbls <- paste(lbls, pct)            # add percents to labels
lbls <- paste(lbls, "%", sep = "")  # add % to labels
pie(slices, labels = lbls, main = "Cell Line in Training Data")
The studies_with_error.csv file contains the sample names that failed to be downloaded/imported when I tried to use studies with more than 50 samples based on the metadata. You can find more documentation on this under Methods/model_building.
gsig.dir <- "~/Data/refinebio_processed"
lib.dir <- "~/GSS/GenomicSuperSignatureLibrary/refinebioRseq/canonicalPathways"
## Names of studies for which tximport failed due to malformed quant.sf
# gsig.dir <- "path/to/tximport/directory"
errors <- read.table(file.path(gsig.dir, "studies_with_error.csv"))
studies.errors <- gsub("ERROR with study ", "", errors$V1) %>%
    stringr::str_split(., " :") %>%
    sapply(., function(x) unlist(x)[1])

## Studies with more than 20 downloaded samples
# lib.dir <- "path/to/GenomicSuperSignatureLibrary"
dat <- readRDS(file.path(lib.dir, "allZ.rds"))
final <- gsub(".PC.*$", "", colnames(dat)) %>% unique   # 677 studies

ind_677 <- which(studyMeta$studyName %in% final)
studyMeta$RAVmodel_677 <- FALSE
studyMeta$RAVmodel_677[ind_677] <- TRUE
These are all the studies that have more than 20 successfully imported samples, so this column is practically the same as the imported column.
studyMeta$RAVmodel_1399 <- studyMeta$imported
These 536 studies meet the criteria of more than 50 (downloaded) and fewer than 1,000 (metadata) samples per study. In addition, any study with the "Single-Cell Analysis" MeSH term was removed.
load(file.path(data.dir, "MeSH_terms_1399refinebio.rda"))   # load `mesh_table`

sc <- which(mesh_table$name %in% "Single-Cell Analysis")
sc_studies <- mesh_table$identifier[sc]   # single-cell associated studies to exclude
sc_ind <- which(studyMeta$studyName %in% sc_studies)

sn_ind <- which(studyMeta$downloaded > 50 &
                studyMeta$metadata < 1000 &
                studyMeta$imported == "TRUE")

ind_536 <- setdiff(sn_ind, sc_ind)
studyMeta$RAVmodel_536 <- FALSE
studyMeta$RAVmodel_536[ind_536] <- TRUE
## Table with sample and run information
dir <- system.file("extdata", package = "GenomicSuperSignature")
studyMeta <- read.table(file.path(dir, "studyMeta.tsv.gz"), header = TRUE)
ind <- which(studyMeta$RAVmodel_536 == "TRUE")

studies <- vector(mode = "list", length = 536)
names(studies) <- studyMeta$studyName[ind]

## Samples actually downloaded and used for the current model building
download.dir2 <- "~/Data/refinebio_processed/rna_seq_v2"
for (i in seq_along(studies)) {
  study_name <- names(studies)[i]
  dir_path <- file.path(download.dir2, study_name, paste0(study_name, "_count.csv"))
  sample_name <- read.csv(dir_path) %>% colnames(.) %>%
      stringr::str_split(., "\\.") %>% unlist
  studies[[study_name]] <- sample_name
}

## `res` data.frame with 9029 rows + 3 columns
res <- stack(studies)
res$source <- NA
colnames(res) <- c("run_accession", "study_accession", "source")

## RNAseq datasets downloaded from refine.bio
json_file <- "~/Data/refinebio_download/rna_seq/aggregated_metadata.json"
refinebio_meta <- rjson::fromJSON(paste(readLines(json_file), collapse = ""))
sample_meta <- refinebio_meta$samples

for (i in 1:nrow(res)) {
  run_accession <- res$run_accession[i]
  ind <- which(names(sample_meta) == run_accession)
  # res$sample[i] <- sample_meta[[ind]]$refinebio_title
  res$source[i] <- sample_meta[[ind]]$refinebio_source_database
}

## All accessions from `GenomicSuperSignaturePaper/Results_temp/Training_Datasets.Rmd`
x <- readRDS("~/Data/GenomicSuperSignaturePaper/Results_temp/trainingData_accessions.rds")
res_all <- dplyr::inner_join(res[, c("run_accession", "source")], x, by = "run_accession")
## Table with study and run accessions
saveRDS(res_all, "accessions.rds")
## Training data used for GenomicSuperSignature
res_all <- readRDS("accessions.rds")

## Training data used for multiPLIER
recount2 <- read.table("https://raw.githubusercontent.com/greenelab/multi-plier/master/data/sample_info/recount2_srr_srs_srp.tsv",
                       header = TRUE)

head(res_all, 3)
head(recount2, 3)
## multiPLIER training data
a <- length(unique(recount2$run))
b <- length(unique(recount2$sample))
c <- length(unique(recount2$study))

## GenomicSuperSignature training data
d <- length(unique(res_all$run_accession))
e <- length(unique(res_all$sample_accession))
f <- length(unique(res_all$study))

## Overlapping training data
g <- length(intersect(unique(recount2$run), unique(res_all$run_accession)))
h <- length(intersect(unique(recount2$sample), unique(res_all$sample_accession)))
i <- length(intersect(unique(recount2$study), unique(res_all$study)))
The multiPLIER training data consists of `r a` unique runs from `r b` samples in `r c` studies. The GenomicSuperSignature training data consists of `r d` unique runs from `r e` samples in `r f` studies. `r g` overlapping runs from `r h` samples in `r i` studies were used for both models.

sessionInfo()
## Save summary table
write.table(studyMeta, "~/data2/GenomicSuperSignature/inst/extdata/studyMeta.tsv.gz")

## Save histogram comparing metadata vs. downloaded data
## `available_samples.png` is moved to `Results/others`
png("available_samples.png")
plot(hg_meta, col = c1, ylim = c(1, 2000), main = "Data Availability",
     xlab = "# of samples/study", ylab = "# of studies")
plot(hg_download, col = c2, ylim = c(1, 2000), add = TRUE)
legend("topright", c("metadata", "downloaded"), col = c(c1, c2), lwd = 10)
dev.off()