library(knitr)
knitr::opts_chunk$set(
  fig.path="figures/",
  echo=FALSE, 
  eval=TRUE, 
  cache=FALSE, 
  message=FALSE, 
  warning=FALSE)
library(recount)
library(SummarizedExperiment)
library(S4Vectors)

General exploration of the metadata table

min.samples.per.class <- 10


recount.subsets <- c("sra", "gtex", "tcga") ## Supported values for recount subset attribute
recount.subset <- "sra" ## For the time being we work with SRA (Short Read Archive database)

recount.metadata.sra <- all_metadata(subset='sra')
recount.metadata.gtex <- all_metadata(subset='gtex')
recount.metadata.tgca <- all_metadata(subset='tcga')

recount.metadata <- recount.metadata.sra
# View(recount.metadata)
# View(as.data.frame(recount.metadata))
# names(recount.metadata)
dim(recount.metadata)

summary.per.col <- data.frame()
for (column in names(recount.metadata)) {

  col.stats <- data.frame(
    "column" = column,
    "unique.values" = length(unique(recount.metadata[, column]))
    )

    summary.per.col <- 
    rbind(summary.per.col, col.stats)

}
summary.per.col

kable(summary.per.col)

Exploration of characteristics

Each Recount entry (corresponding to a sequencing run for a given sample) can be described by one or more charcateristics (e.g. tissue, conditions, genotype, treatment, ...). This creates a problem when converting the metadata to a data frame with one column per field, since multi-value charcateristics cannot be automatically convereted to a value to be placed in a single column.

characteristics.per.run <- as.data.frame.table(table(unlist(lapply(recount.metadata$characteristics, length))))
names(characteristics.per.run) <- c("Characteristics", "Runs")

barplot(as.numeric(unlist(characteristics.per.run[, "Runs"])), 
        main = "Number of characteristics per sample ", 
        names.arg = unlist(characteristics.per.run[, "Characteristics"]), 
        col = "#DDBBFF", 
        ylab = "Number of runs", 
        xlab = "Number of characteristics",
        horiz = FALSE, las=2, cex.names = 0.8)

# kable(characteristics.per.run, caption = "Number of characteristics per run. ")

For the sake of summarization, we will concatenate the characteristics values in a single string per run, in order to view them in the metadata table (where there is a single row per run, and a single column for the field "characteristics").

# ## Concatenate characteristics in a single string per run
# characteristics.string <- unlist(lapply(recount.metadata$characteristics, paste, collapse="; "))
# 
# ## Back-conversion of "NA" strings (obtained from NA values) to NA values
# characteristics.string[characteristics.string == "NA"] <- NA
# 
# ## Count the number of NA or non-NA values for characteristics
# charact.defined <- as.data.frame.table(table(!is.na(characteristics.string)))
# names(charact.defined) <- c("Characteristics defined", "Number of runs")
# kable(charact.defined, caption = "Number of runs with defined or undefined characteristics in Recount2 metadata. ")
# #table(is.na(unlist(recount.metadata$characteristics)))
# 
# ## Cast metadata to a data frame with a single row per run
# ## Use the concatenated chacteristics to fill the column "characteristics"
# recount.metadata.frame <- as.data.frame(recount.metadata)
# recount.metadata.frame$characteristics <- characteristics.string
# # table(is.na(recount.metadata.frame$characteristics))
# 
# ## An example of dataset with multi-valued characteristics


# ## Return all projects havin a field named "disease status"
# query.charact <- "disease status" ## Query for a field to be found in "characteristics"
# matching.runs <- grep(recount.metadata.frame$characteristics, pattern = query.charact)
# matching.runs.project.id <- recount.metadata.frame[matching.runs, "project"]
# matching.project.ids1 <- unique(sort(matching.runs.project.id))
# 
# 
# ## Count the number of runs per matching project
# matching.projects.Nruns <- as.data.frame.table(sort(table(matching.runs.project.id), decreasing=TRUE), row.names = 1)
# names(matching.projects.Nruns) <- "Nruns"
# 
# if (run.kable) {
#   kable(matching.projects.Nruns, caption=paste("Recount projects having", query.charact, "as field name for the characteristics. "))
# }
# 
# if (run.barplot) {
#   par.ori <- par(no.readonly = TRUE)
#   par(mar=c(4,10,5,1))
#   barplot(as.vector(as.matrix(matching.projects.Nruns)), 
#           names.arg = rownames(matching.projects.Nruns), 
#           main=query.charact,
#           horiz = TRUE, las = 1, cex.names = 0.7, xlab="Runs per project", col="#88FFDD")
#   par(par.ori)
# } 

projects.disease.status <- selectProjectsByCharacteristics(recount.metadata, query.charact = "disease status", run.kable = FALSE, run.barplot = TRUE)

projects.time <- selectProjectsByCharacteristics(recount.metadata, query.charact = "time", run.kable = FALSE, run.barplot = TRUE)

## A lot of projects have "tissue" in the characteristcs
projects.tissue <- selectProjectsByCharacteristics(recount.metadata, query.charact = "tissue", run.kable = TRUE, run.barplot = FALSE)


## select datasets with time in the characteristics (supposedly because the corresonding experiments are time series)
# time.series <- as.data.frame.table(sort(table(recount.metadata.frame[grep(characteristics.string, pattern = "time: "), "project"]), decreasing = TRUE))

kable(projects.time$matching.projects.Nruns, caption="Recount projects corresponding to time series (the characteristics field contains 'time:'")

We selected all the projects having "disease status" field in the characteristics of the samples^[more precisely, the characteristics is associated to a run, but we can assume that, when there are several runs for a same sample, they all bear the same characteristics since they are technical replicates]. In total there are r length(matching.project.ids) projects characterized by a disease status.

project.disease.summary <- data.frame()
for ( nb in 1:length(matching.project.ids)) {
sample.stat.of.disease.project <- data.frame(
  "project name" = matching.project.ids[nb],
  "samples of disease status" = (as.vector(unlist(subset(recount.metadata.frame, project == matching.project.ids[nb], select = "sample")))),
  "sample sums" = length((as.vector(unlist(subset(recount.metadata.frame, project == matching.project.ids[nb], select = "sample")))) )

) # end of data frame
project.disease.summary <- rbind(project.disease.summary, sample.stat.of.disease.project)
  }
unique.project.disease.summary <- lapply(project.disease.summary, unique) 
as.data.frame( table(unlist(unique.project.disease.summary)))
project.disease.summary$sample.sums
recountID <- "SRP057196"
studyPath <- paste0("~/test_recount/",recountID)
dir.create(studyPath, recursive = TRUE, showWarnings = FALSE)
download_study(recountID, outdir = studyPath)
rse <- file.path(studyPath,"rse_gene.Rdata")
load(rse)
countTable <- assay(rse_gene)
runPheno <- colData(rse_gene)
length( row.names(runPheno))
length(unique(row.names(runPheno)))

# View(as.data.frame(runPheno))

# Get the GEO accession ides
geoids <- sapply(runPheno$run[1:2], find_geo) 
# Get the data from GEO
geodata <- do.call(rbind, sapply(geoids, geo_info))

# Get characteristics from pheno table
geo_characteristics(runPheno)

(unique( geo_characteristics(runPheno)[,1]))

unique(geo_characteristics(runPheno))

# dim(countTable)
names(runPheno)
# dim(runPheno)
row.names(runPheno) <- NULL

########################################################
### How to build data.frame from GEO's characteristics for a given sample

## Load required library
library('SummarizedExperiment')

## Get the GEO accession ids
geoids < sapply(recount.metadata$run[1:2], find_geo)
## Get the data from GEO
geodata <- do.call(rbind, sapply(geoids, geo_info))

## Add characteristics in a way that we can access easily later on
geodata <- cbind(geodata, geo_characteristics(geodata))
# View(as.data.frame(geodata))
## Explore the original characteristics and the result from 
## geo_characteristics()
geodata[, c('characteristics', 'cells',  'treatment')]

Filtering out the NA values

## Select the subset of metadata for which the field "characteristics" is defined
metadata.nona <- recount.metadata[!is.na(recount.metadata.frame$characteristics), ]
dim(metadata.nona)
message("Number of entries in recount.medatata is :", nrow(recount.metadata.frame))
message("Number of entries in metadata.nona is :", nrow(metadata.nona))

## count the number of projects with no NA value
nona.projects <- unique(metadata.nona$project)
message("Recount contains ", length (unique(recount.metadata$project)), " among which ", length(nona.projects), " projects have defined characteristics. These are the ones we can work with ")

## Count th number of distinct classes (characteristics) for each project
length(nona.projects)
# View((nona.projects))
project <- "SRP008145"
x <- metadata.nona[project == nona.projects, ]
# dim(x)
# View(as.data.frame(x))

new.selected.project <- data.frame()
metadata.names <-names(metadata.nona)
if (is.null(nona.projects) || length(nona.projects) < 1) {
  message("There are no meta data for such project")
} else  {
  pro <- 1

  for (pro in 1:length(nona.projects)) {
     selected.project  <- metadata.nona$project[pro]
     new.project <- subset(metadata.nona, project == selected.project)
     new.project$sample.nb <- length(unique(new.project$sample))
     new.project$geo.acc.nb <- length(unique(na.omit(new.project$geo_accession)))
     new.project$characteristic.nb <- length(unique(new.project$characteristics))
     if( pro ==1 ){
       new.selected.project <- data.frame(new.project)
     } else {
       new.selected.project <- rbind(new.selected.project, new.project )
     } # end else 
  } # end for
}

new.selected.project<- asS4(new.selected.project)


## Select projects with at least 3 classes
x <- subset(new.selected.project,  new.selected.project$characteristic.nb > 3)

## Select projects with precisely 2 classes

## Select projects where there are at least 2 classes with the user-specified min number of samples (parameter previously defined)


# View(as.data.frame(metadata.nona))

Recount contains r length (unique(recount.metadata$project)) among which r length(nona.projects) projects have defined characteristics.

Exporation of a problematic dataset (with NA values in the characteristics column)

library("IRanges")
## In the future, we wold like to understand why some GSE do work andother ones not
# abstract_search(recountID)
project_info <- abstract_search('GSE32465') ## This one works
# project_info <- abstract_search('GSE58135') ## Supposedly breast cancer
recountID <- project_info$project

## Select a Recount ID
recountID <- "SRP003611"
studyPath <- paste0("~/test_recount/",recountID)
dir.create(studyPath, recursive = TRUE, showWarnings = FALSE)
# dir.exists(studyPath)
# list.files(studyPath)


## TO DO for Mustaa: adapt the 2 lines bore in order to ensure that you get the right GSE for the recountID
selectedGSMs <- as.vector(unlist(subset(recount.metadata, project == recountID, select="geo_accession")))

## project_info <- abstract_search(selectedGSMs[1]) ## Does not work with GSM instead of GSE

## Download data from Recount
url <- download_study(recountID, outdir = studyPath)

## Load data
rseFile <- file.path(studyPath, "rse_gene.Rdata")
load(rseFile)
colData(rse_gene)
## hereafter are just the classes for each project
# View(as.data.frame(rse_gene@colData@listData$characteristics@unlistData))
length(rse_gene@colData@listData$characteristics@unlistData)
row.names(unique(rse_gene@colData))
colnames(rse_gene)
# View(as.data.frame(rse_gene))

# to retrieve the raw count Table
# View(assay(rse_gene))


# explore the pheno Table
rse_gene[, rse_gene$characteristics@elementMetadata]
rse_gene$characteristics@metadata
rse_gene$characteristics@partitioning
rse_gene$characteristics@unlistData
rse_gene$characteristics@elementType


names(colData(rse_gene))
find_geo()
x <-sapply(colData(rse_gene)$run[1:2], find_geo)
# View(as.data.frame(x))
length(unique( colData(rse_gene)$project))
length(unique(colData(rse_gene)$experiment))
length( unique(colData(rse_gene)$run))
table( unique( colData(rse_gene)$characteristics))
Na.characteristics <- sum( is.na(colData(rse_gene)$characteristics))

# To explor which regions we are interest 
rowData(rse_gene)
rowRanges(rse_gene)

# Experiment-wide metadata
metadata(rse_gene)

# How to subsetting the SE
rse_gene[1:5 , 1:4]
rse_gene[, rse_gene$characteristics != "NA"]


## put the code to extract the pheno table from this dataset, and explore it!Are there several interesting columns tere ? 

## Count samples for this project
selected.project.metadata <- subset(recount.metadata, project == recountID)
dim(selected.project.metadata)
sample.nb <- length(unique(selected.project.metadata$sample))
geo.acc.nb <- length(unique(na.omit(selected.project.metadata$geo_accession)))
characteristic.nb <- length(unique(selected.project.metadata$characteristics))
message(sample.nb, " samples, ", geo.acc.nb, " non-NA geo accession numbers, ", "classes Number:", characteristic.nb  )


elqumsan/RNAseqMVA documentation built on March 10, 2021, 8:10 a.m.