library(knitr) knitr::opts_chunk$set( fig.path="figures/", echo=FALSE, eval=TRUE, cache=FALSE, message=FALSE, warning=FALSE) library(recount) library(SummarizedExperiment) library(S4Vectors)
min.samples.per.class <- 10 recount.subsets <- c("sra", "gtex", "tcga") ## Supported values for recount subset attribute recount.subset <- "sra" ## For the time being we work with SRA (Short Read Archive database) recount.metadata.sra <- all_metadata(subset='sra') recount.metadata.gtex <- all_metadata(subset='gtex') recount.metadata.tgca <- all_metadata(subset='tcga') recount.metadata <- recount.metadata.sra # View(recount.metadata) # View(as.data.frame(recount.metadata)) # names(recount.metadata) dim(recount.metadata) summary.per.col <- data.frame() for (column in names(recount.metadata)) { col.stats <- data.frame( "column" = column, "unique.values" = length(unique(recount.metadata[, column])) ) summary.per.col <- rbind(summary.per.col, col.stats) } summary.per.col kable(summary.per.col)
Each Recount entry (corresponding to a sequencing run for a given sample) can be described by one or more charcateristics (e.g. tissue, conditions, genotype, treatment, ...). This creates a problem when converting the metadata to a data frame with one column per field, since multi-value charcateristics cannot be automatically convereted to a value to be placed in a single column.
characteristics.per.run <- as.data.frame.table(table(unlist(lapply(recount.metadata$characteristics, length)))) names(characteristics.per.run) <- c("Characteristics", "Runs") barplot(as.numeric(unlist(characteristics.per.run[, "Runs"])), main = "Number of characteristics per sample ", names.arg = unlist(characteristics.per.run[, "Characteristics"]), col = "#DDBBFF", ylab = "Number of runs", xlab = "Number of characteristics", horiz = FALSE, las=2, cex.names = 0.8) # kable(characteristics.per.run, caption = "Number of characteristics per run. ")
For the sake of summarization, we will concatenate the characteristics values in a single string per run, in order to view them in the metadata table (where there is a single row per run, and a single column for the field "characteristics").
# ## Concatenate characteristics in a single string per run # characteristics.string <- unlist(lapply(recount.metadata$characteristics, paste, collapse="; ")) # # ## Back-conversion of "NA" strings (obtained from NA values) to NA values # characteristics.string[characteristics.string == "NA"] <- NA # # ## Count the number of NA or non-NA values for characteristics # charact.defined <- as.data.frame.table(table(!is.na(characteristics.string))) # names(charact.defined) <- c("Characteristics defined", "Number of runs") # kable(charact.defined, caption = "Number of runs with defined or undefined characteristics in Recount2 metadata. ") # #table(is.na(unlist(recount.metadata$characteristics))) # # ## Cast metadata to a data frame with a single row per run # ## Use the concatenated chacteristics to fill the column "characteristics" # recount.metadata.frame <- as.data.frame(recount.metadata) # recount.metadata.frame$characteristics <- characteristics.string # # table(is.na(recount.metadata.frame$characteristics)) # # ## An example of dataset with multi-valued characteristics # ## Return all projects havin a field named "disease status" # query.charact <- "disease status" ## Query for a field to be found in "characteristics" # matching.runs <- grep(recount.metadata.frame$characteristics, pattern = query.charact) # matching.runs.project.id <- recount.metadata.frame[matching.runs, "project"] # matching.project.ids1 <- unique(sort(matching.runs.project.id)) # # # ## Count the number of runs per matching project # matching.projects.Nruns <- as.data.frame.table(sort(table(matching.runs.project.id), decreasing=TRUE), row.names = 1) # names(matching.projects.Nruns) <- "Nruns" # # if (run.kable) { # kable(matching.projects.Nruns, caption=paste("Recount projects having", query.charact, "as field name for the characteristics. ")) # } # # if (run.barplot) { # par.ori <- par(no.readonly = TRUE) # par(mar=c(4,10,5,1)) # barplot(as.vector(as.matrix(matching.projects.Nruns)), # names.arg = rownames(matching.projects.Nruns), # main=query.charact, # horiz = TRUE, las = 1, cex.names = 0.7, xlab="Runs per project", col="#88FFDD") # par(par.ori) # } projects.disease.status <- selectProjectsByCharacteristics(recount.metadata, query.charact = "disease status", run.kable = FALSE, run.barplot = TRUE) projects.time <- selectProjectsByCharacteristics(recount.metadata, query.charact = "time", run.kable = FALSE, run.barplot = TRUE) ## A lot of projects have "tissue" in the characteristcs projects.tissue <- selectProjectsByCharacteristics(recount.metadata, query.charact = "tissue", run.kable = TRUE, run.barplot = FALSE) ## select datasets with time in the characteristics (supposedly because the corresonding experiments are time series) # time.series <- as.data.frame.table(sort(table(recount.metadata.frame[grep(characteristics.string, pattern = "time: "), "project"]), decreasing = TRUE)) kable(projects.time$matching.projects.Nruns, caption="Recount projects corresponding to time series (the characteristics field contains 'time:'")
We selected all the projects having "disease status" field in the characteristics of the samples^[more precisely, the characteristics is associated to a run, but we can assume that, when there are several runs for a same sample, they all bear the same characteristics since they are technical replicates]. In total there are r length(matching.project.ids)
projects characterized by a disease status.
project.disease.summary <- data.frame() for ( nb in 1:length(matching.project.ids)) { sample.stat.of.disease.project <- data.frame( "project name" = matching.project.ids[nb], "samples of disease status" = (as.vector(unlist(subset(recount.metadata.frame, project == matching.project.ids[nb], select = "sample")))), "sample sums" = length((as.vector(unlist(subset(recount.metadata.frame, project == matching.project.ids[nb], select = "sample")))) ) ) # end of data frame project.disease.summary <- rbind(project.disease.summary, sample.stat.of.disease.project) } unique.project.disease.summary <- lapply(project.disease.summary, unique) as.data.frame( table(unlist(unique.project.disease.summary))) project.disease.summary$sample.sums
recountID <- "SRP057196" studyPath <- paste0("~/test_recount/",recountID) dir.create(studyPath, recursive = TRUE, showWarnings = FALSE) download_study(recountID, outdir = studyPath) rse <- file.path(studyPath,"rse_gene.Rdata") load(rse) countTable <- assay(rse_gene) runPheno <- colData(rse_gene) length( row.names(runPheno)) length(unique(row.names(runPheno))) # View(as.data.frame(runPheno)) # Get the GEO accession ides geoids <- sapply(runPheno$run[1:2], find_geo) # Get the data from GEO geodata <- do.call(rbind, sapply(geoids, geo_info)) # Get characteristics from pheno table geo_characteristics(runPheno) (unique( geo_characteristics(runPheno)[,1])) unique(geo_characteristics(runPheno)) # dim(countTable) names(runPheno) # dim(runPheno) row.names(runPheno) <- NULL ######################################################## ### How to build data.frame from GEO's characteristics for a given sample ## Load required library library('SummarizedExperiment') ## Get the GEO accession ids geoids < sapply(recount.metadata$run[1:2], find_geo) ## Get the data from GEO geodata <- do.call(rbind, sapply(geoids, geo_info)) ## Add characteristics in a way that we can access easily later on geodata <- cbind(geodata, geo_characteristics(geodata)) # View(as.data.frame(geodata)) ## Explore the original characteristics and the result from ## geo_characteristics() geodata[, c('characteristics', 'cells', 'treatment')]
## Select the subset of metadata for which the field "characteristics" is defined metadata.nona <- recount.metadata[!is.na(recount.metadata.frame$characteristics), ] dim(metadata.nona) message("Number of entries in recount.medatata is :", nrow(recount.metadata.frame)) message("Number of entries in metadata.nona is :", nrow(metadata.nona)) ## count the number of projects with no NA value nona.projects <- unique(metadata.nona$project) message("Recount contains ", length (unique(recount.metadata$project)), " among which ", length(nona.projects), " projects have defined characteristics. These are the ones we can work with ") ## Count th number of distinct classes (characteristics) for each project length(nona.projects) # View((nona.projects)) project <- "SRP008145" x <- metadata.nona[project == nona.projects, ] # dim(x) # View(as.data.frame(x)) new.selected.project <- data.frame() metadata.names <-names(metadata.nona) if (is.null(nona.projects) || length(nona.projects) < 1) { message("There are no meta data for such project") } else { pro <- 1 for (pro in 1:length(nona.projects)) { selected.project <- metadata.nona$project[pro] new.project <- subset(metadata.nona, project == selected.project) new.project$sample.nb <- length(unique(new.project$sample)) new.project$geo.acc.nb <- length(unique(na.omit(new.project$geo_accession))) new.project$characteristic.nb <- length(unique(new.project$characteristics)) if( pro ==1 ){ new.selected.project <- data.frame(new.project) } else { new.selected.project <- rbind(new.selected.project, new.project ) } # end else } # end for } new.selected.project<- asS4(new.selected.project) ## Select projects with at least 3 classes x <- subset(new.selected.project, new.selected.project$characteristic.nb > 3) ## Select projects with precisely 2 classes ## Select projects where there are at least 2 classes with the user-specified min number of samples (parameter previously defined) # View(as.data.frame(metadata.nona))
Recount contains r length (unique(recount.metadata$project))
among which r length(nona.projects)
projects have defined characteristics.
library("IRanges") ## In the future, we wold like to understand why some GSE do work andother ones not # abstract_search(recountID) project_info <- abstract_search('GSE32465') ## This one works # project_info <- abstract_search('GSE58135') ## Supposedly breast cancer recountID <- project_info$project ## Select a Recount ID recountID <- "SRP003611" studyPath <- paste0("~/test_recount/",recountID) dir.create(studyPath, recursive = TRUE, showWarnings = FALSE) # dir.exists(studyPath) # list.files(studyPath) ## TO DO for Mustaa: adapt the 2 lines bore in order to ensure that you get the right GSE for the recountID selectedGSMs <- as.vector(unlist(subset(recount.metadata, project == recountID, select="geo_accession"))) ## project_info <- abstract_search(selectedGSMs[1]) ## Does not work with GSM instead of GSE ## Download data from Recount url <- download_study(recountID, outdir = studyPath) ## Load data rseFile <- file.path(studyPath, "rse_gene.Rdata") load(rseFile) colData(rse_gene) ## hereafter are just the classes for each project # View(as.data.frame(rse_gene@colData@listData$characteristics@unlistData)) length(rse_gene@colData@listData$characteristics@unlistData) row.names(unique(rse_gene@colData)) colnames(rse_gene) # View(as.data.frame(rse_gene)) # to retrieve the raw count Table # View(assay(rse_gene)) # explore the pheno Table rse_gene[, rse_gene$characteristics@elementMetadata] rse_gene$characteristics@metadata rse_gene$characteristics@partitioning rse_gene$characteristics@unlistData rse_gene$characteristics@elementType names(colData(rse_gene)) find_geo() x <-sapply(colData(rse_gene)$run[1:2], find_geo) # View(as.data.frame(x)) length(unique( colData(rse_gene)$project)) length(unique(colData(rse_gene)$experiment)) length( unique(colData(rse_gene)$run)) table( unique( colData(rse_gene)$characteristics)) Na.characteristics <- sum( is.na(colData(rse_gene)$characteristics)) # To explor which regions we are interest rowData(rse_gene) rowRanges(rse_gene) # Experiment-wide metadata metadata(rse_gene) # How to subsetting the SE rse_gene[1:5 , 1:4] rse_gene[, rse_gene$characteristics != "NA"] ## put the code to extract the pheno table from this dataset, and explore it!Are there several interesting columns tere ? ## Count samples for this project selected.project.metadata <- subset(recount.metadata, project == recountID) dim(selected.project.metadata) sample.nb <- length(unique(selected.project.metadata$sample)) geo.acc.nb <- length(unique(na.omit(selected.project.metadata$geo_accession))) characteristic.nb <- length(unique(selected.project.metadata$characteristics)) message(sample.nb, " samples, ", geo.acc.nb, " non-NA geo accession numbers, ", "classes Number:", characteristic.nb )
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.