knitr::opts_chunk$set(echo = TRUE, collapse = TRUE)
suppressPackageStartupMessages({
  library(dplyr)
})

Source

We used human RNA sequencing datasets from RNA-seq Sample Compendia in refine.bio, which hosts uniformly processed gene expression data from EBI’s ArrayExpress, NCBI’s GEO and SRA. Data were downloaded on April 10th, 2020, and we selected studies based on the following criteria: 1) Exclude studies with more than 1,000 samples because they are more likely to be single-cell RNA sequencing datasets. 2) Exclude studies assigned with a Medical Subject Headings (MeSH) term, “Single-Cell Analysis”. 3) Exclude studies with less than or equal to 50 successfully downloaded and imported samples. After filtering, the complete compendium includes 536 studies (defined as a single SRA study) comprising 44,890 samples.

The detail of training data is summarized in studyMeta.tsv table in GenomicSuperSignature package. This vignette explains how this summary table is built.

extdata <- system.file("extdata", package = "GenomicSuperSignature")
studyMeta <- read.table(file.path(extdata, "studyMeta.tsv.gz"))
head(studyMeta)

Availability: metadata

aggregated_metadata.json is downloaded along with the training dataset. We collect the number of samples per study based on this metadata.

data.dir <- system.file("extdata", package = "GenomicSuperSignaturePaper")
meta_json <- jsonlite::fromJSON(file.path(data.dir, 
                                          "aggregated_metadata.json"))
experiments <- meta_json[[2]] 

studies <- list()
for (i in seq_along(experiments)) {
  studies <- append(studies, list(experiments[[i]]$sample_accession_codes))
  names(studies)[i] <- names(experiments)[i]
}

sampleNum <- sapply(studies, function(x) length(x))

Availability: download/import

I summarized the number of samples based on aggregated_metadata.json under metadata column and the number of actually downloaded _quant.sf sample files under downloaded column in the studyMeta table.

## Directory where refine.bio human samples were downloaded
download.dir <- "~/Data/refinebio_download/rna_seq"
# download.dir <- "path/to/download/directory"
dirNames <- list.files(download.dir)

ind_rm <- which(dirNames %in% 
                  c("aggregated_metadata.json", "README.md", "LICENSE.TXT"))
if (length(ind_rm) != 0) {dirNames <- dirNames[-ind_rm]}
length(dirNames) # 6457 studies were downloaded

studyMeta <- as.data.frame(matrix(NA, nrow = length(dirNames), ncol = 3))
colnames(studyMeta) <- c("studyName", "downloaded", "metadata") 
studyMeta$studyName <- dirNames

for (i in seq_along(dirNames)) {
  dir_path <- file.path(download.dir, dirNames[i])

  ## actually downloaded samples
  ind1 <- which(studyMeta$studyName == dirNames[i]) 
  studyMeta$downloaded[ind1] <- grep("_quant.sf", 
                                     list.files(dir_path)) %>% length

  ## available samples based on metadata
  ind2 <- which(names(sampleNum) == dirNames[i])  
  studyMeta$metadata[ind2] <- sampleNum[ind2]
}

Imported samples

1399 studies with more than 20 samples were successfully imported by tximport::tximport

download.dir2 <- "~/Data/refinebio_processed/rna_seq_v2"
studyNames <- studyMeta$studyName
importedFiles <- c()

# download.dir2 <- "path/to/imported/directory"
for (i in seq_along(studyNames)) {
  out.path <- file.path(download.dir2, studyNames[i])
  fname <- file.path(out.path, paste0(studyNames[i], ".rds"))
  if (file.exists(fname)) {
    importedFiles <- c(importedFiles, studyNames[i])
  }
}
imported_ind <- which(studyMeta$studyName %in% importedFiles)
studyMeta$imported <- FALSE
studyMeta$imported[imported_ind] <- TRUE 

Study title is also added.

for (i in seq_along(experiments)) {
  title <- experiments[[i]]$title
  studyMeta$title[i] <- title
}

Example

Example study, SRP152576:
- 13,127 samples are available based on aggregated_metadata.json.
- No sample is available based on refine.bio webpage.
- 100 samples are actually downloaded.
- 3,927 samples are available based on metadata_SRP152576.csv.

ind <- which.max(studyMeta$metadata)  
studyMeta[ind,]
file_dir <- file.path(download.dir, studyMeta$studyName[ind])
fname <- paste0("metadata_", studyMeta$studyName[ind], ".tsv")
sampleMeta <- read.table(file.path(file_dir, fname), header = TRUE, sep = "\t")
dim(sampleMeta)

head(sampleMeta, 3)

Summary plot

Availability

Metadata bar (light blue) shows the number of studies with the given ranges of study sizes based on the metadata. Downloaded bar (pink) represents the number of studies with the given ranges of study sizes that were successfully downloaded and imported through tximport. Based on metadata, there were studies with more than 100 samples, but at the time of snapshot, only up to 100 samples were available. Thus, the plot displays only up to 100 samples/study cases. Due to the unavailability of certain samples, more studies belong to 0-5 samples/study bracket than metadata suggests.

## Colors
c1 <- rgb(173,216,230, max = 255, alpha = 95, names = "lt.blue")
c2 <- rgb(255,192,203, max = 255, alpha = 95, names = "lt.pink")
ax <- seq(0, 100, 5)

meta <- studyMeta$metadata[which(studyMeta$metadata < 100)]
dl <- studyMeta$downloaded[which(studyMeta$downloaded < 100)]

hg_meta <- hist(meta, breaks = ax, plot = FALSE) 
hg_download <- hist(dl, breaks = ax, plot = FALSE) 

plot(hg_meta, col = c1, ylim = c(1, 2000), main = "Data Availability", 
     xlab = "# of samples/study", ylab = "# of studies") 
plot(hg_download, col = c2, ylim = c(1, 2000), add = TRUE) 
legend("topright", c("metadata", "downloaded"), col = c(c1, c2), lwd = 10)
## Cases where all the available samples (based on metadata) were downloaded
sum(studyMeta$downloaded == studyMeta$metadata)

## The number of studies with more than 100 samples: 
## Actually downloaded vs. expected by metadata
sum(studyMeta$downloaded > 100)
sum(studyMeta$metadata > 100)

Types of data

data.dir <- system.file("extdata", package = "GenomicSuperSignaturePaper")
fname <- file.path(data.dir, "source_name_annotated.tsv")
source_name_annotated <- read.csv(fname, sep = "\t")

Cancer

source_cancer <- source_name_annotated$cancer
source_cancer <- tidyr::replace_na(source_cancer, 2)

slices <- table(source_cancer)
lbls <- c("Not Cancer", "Cancer", "NA")
pct <- round(slices/sum(slices)*100)
lbls <- paste(lbls, pct)   # add percents to labels
lbls <- paste(lbls, "%", sep="")   # ad % to labels

pie(slices, labels = lbls, main = "Cancer Studies in Training Data")

Blood

source_blood <- source_name_annotated$blood
source_blood <- tidyr::replace_na(source_blood, 2)

slices <- table(source_blood)
lbls <- c("Not Blood", "Blood", "NA")
pct <- round(slices/sum(slices)*100)
lbls <- paste(lbls, pct)   # add percents to labels
lbls <- paste(lbls, "%", sep="")   # ad % to labels

pie(slices, labels = lbls, main = "Blood Studies in Training Data")

Cell Line

source_cellLine <- source_name_annotated$cell_line
source_cellLine <- tidyr::replace_na(source_cellLine, 2)

slices <- table(source_cellLine)
lbls <- c("Not Cell Line", "Cell Line", "NA")
pct <- round(slices/sum(slices)*100)
lbls <- paste(lbls, pct)   # add percents to labels
lbls <- paste(lbls, "%", sep="")   # ad % to labels

pie(slices, labels = lbls, main = "Cell Line in Training Data")

Studies used for model buildling

RAVmodel_677

studies_with_error.csv file contains the sample names that were failed to be downloaded/imported when I tried to use studies > 50 samples based on the metadata. You can find more documentation on this under Methods/model_building.

gsig.dir <- "~/Data/refinebio_processed"
lib.dir <- "~/GSS/GenomicSuperSignatureLibrary/refinebioRseq/canonicalPathways"
## Name of studies that tximport failed due to malformed quant.sf
# gsig.dir <- "path/to/tximport/directory"
errors <- read.table(file.path(gsig.dir, "studies_with_error.csv"))
studies.errors <- gsub("ERROR with study ", "", errors$V1) %>%
  stringr::str_split(., " :") %>%
  sapply(., function(x) unlist(x)[1])

## Studies with more than 20 downloaded samples 
# lib.dir <- "path/to/GenomicSuperSignatureLibrary"
dat <- readRDS(file.path(lib.dir, "allZ.rds"))
final <- gsub(".PC.*$", "", colnames(dat)) %>% unique   # 677 studies
ind_677 <- which(studyMeta$studyName %in% final)

studyMeta$RAVmodel_677 <- FALSE
studyMeta$RAVmodel_677[ind_677] <- TRUE 

RAVmodel_1399

All the studies that have more than 20 successfully imported samples. So, it's practically same as imported column.

studyMeta$RAVmodel_1399 <- studyMeta$imported

RAVmodel_536

These 536 studies meet the criteria, > 50 (downloaded) and <1000 (metadata) samples per study. Also, any study with “Single-Cell Analysis” MeSH term was removed.

load(file.path(data.dir, "MeSH_terms_1399refinebio.rda")) # load `mesh_table`
sc <- which(mesh_table$name %in% "Single-Cell Analysis")
sc_studies <- mesh_table$identifier[sc]   # single-cell associated studies to exclude

sc_ind <- which(studyMeta$studyName %in% sc_studies)
sn_ind <- which(studyMeta$downloaded > 50 & 
                  studyMeta$metadata < 1000 & 
                  studyMeta$imported == "TRUE")
ind_536 <- setdiff(sn_ind, sc_ind)

studyMeta$RAVmodel_536 <- FALSE
studyMeta$RAVmodel_536[ind_536] <- TRUE 

MultiPLIER vs. RAVmodel

## Table with sample and run information
dir <- system.file("extdata", package = "GenomicSuperSignature")
studyMeta <- read.table(file.path(dir, "studyMeta.tsv.gz"), header = TRUE)

ind <- which(studyMeta$RAVmodel_536 == "TRUE")
studies <- vector(mode = "list", length = 536)
names(studies) <- studyMeta$studyName[ind]

## Samples actually downloaded and used for the current model building
download.dir2 <- "~/Data/refinebio_processed/rna_seq_v2"
for (i in seq_along(studies)) {
  study_name <- names(studies)[i]
  dir_path <- file.path(download.dir2, study_name, 
                        paste0(study_name, "_count.csv"))
  sample_name <- read.csv(dir_path) %>%
    colnames(.) %>%
    stringr::str_split(., "\\.")  %>% 
    unlist

  studies[[study_name]] <- sample_name
}

## `res` data.frame with 9029 rows + 3 columns
res <- stack(studies)
res$source <- NA
colnames(res) <- c("run_accession", "study_accession", "source")

## RNAseq datasets downloaded from refine.bio
json_file <- "~/Data/refinebio_download/rna_seq/aggregated_metadata.json"
refinebio_meta <- rjson::fromJSON(paste(readLines(json_file), collapse=""))
sample_meta <- refinebio_meta$samples

for (i in 1:nrow(res)) {
  run_accession <- res$run[i]
  ind <- which(names(sample_meta) == run_accession)
  # res$sample[i] <- sample_meta[[ind]]$refinebio_title
  res$source[i] <- sample_meta[[ind]]$refinebio_source_database
}

## All accessions from `GenomicSuperSignaturePaper/Results_temp/Training_Datasets.Rmd`
x <- readRDS("~/Data/GenomicSuperSignaturePaper/Results_temp/trainingData_accessions.rds")
res_all <- dplyr::inner_join(res[,c("run_accession", "source")],
                              x, by = "run_accession")
## Table with study and run accessions
saveRDS(res_all, "accessions.rds")  
## Training data used for GenomicSuperSignature
res_all <- readRDS("accessions.rds")

## Training data used for multiPLIER
recount2 <- read.table("https://raw.githubusercontent.com/greenelab/multi-plier/master/data/sample_info/recount2_srr_srs_srp.tsv", header = TRUE)

head(res_all, 3)
head(recount2, 3)
## multiPLIER training data
a <- length(unique(recount2$run))
b <- length(unique(recount2$sample))
c <- length(unique(recount2$study))

## GenomicSupersignature training data
d <- length(unique(res_all$run_accession))
e <- length(unique(res_all$sample_accession))
f <- length(unique(res_all$study))

## Overlapping training data
g <- length(intersect(unique(recount2$run), unique(res_all$run_accession)))
h <- length(intersect(unique(recount2$sample), unique(res_all$sample_accession)))
i <- length(intersect(unique(recount2$study), unique(res_all$study)))
  1. multiPLIER used r a unique runs from r b samples in r c studies.
  2. GenomicSuperSignature used r d unique runs from r e samples in r f studies.
  3. There are r g overlapping runs from r h samples in r i studies used for both models.

Session Info

sessionInfo()

## Save summary table
write.table(studyMeta, 
            "~/data2/GenomicSuperSignature/inst/extdata/studyMeta.tsv.gz")

## Save histogram comparing metadata vs. downloaded data
## `available_samples.png` is moved to `Results/others` 
png("available_samples.png")
plot(hg_meta, col = c1, ylim = c(1, 2000), main = "Data Availability", 
     xlab = "# of samples/study", ylab = "# of studies") 
plot(hg_download, col = c2, ylim = c(1, 2000), add = TRUE) 
legend("topright", c("metadata", "downloaded"), col = c(c1, c2), lwd = 10)
dev.off()


shbrief/GenomicSuperSignaturePaper documentation built on Aug. 2, 2022, 2:04 p.m.