RNAseqMVA: Application of Multivariate Analysis methods to RNA-seq Data

Documented in loadRecountExperiment

#' @title  Loading one count table from Recount repository by given experiment ID and then having two option one made merge run to prevent
#' redundancy between multipl runs per sample.
#' @author Jacques van Helden and Mustafa Abuelqumsan
#' @description Load one count table from Recount for a given experiment ID, and optionally
#' merge the counts in order to avoid redundancy between multipl runs per sample.
#' @param recountID identifier of one study in ReCount database
#' @param parameters global and specific parameters for the analysis of this recountID
#' @param forceDownload=FALSE by default, the data is downloaded only if it is not found in the studyPath folder.
#' If forceDownload is TRUE, the data will be downloaded irrespective of existing files.
#'
#' @examples
#' ## Load required packages
#' require("RNAseqMVA")
#' require(yaml)
#'
#' ## Load parameters
#' project.parameters <- yaml.load_file("misc/00_project_parameters.yml")
#'
#' ## Load counts for a Leukemia dataset
#' recountID <- "SRP048759"
#' recountData <- loadRecountExperiment(recountID, parameters = project.parameters$global)
#'
#' ## Check the dimension of the table with counts per run
#' dim(recountData$result$runs$dataTable)
#'
#' ## Check the number of runs per sample
#' table(recountData$runPheno$geo_accession)
#'
#' ## Count the number of unique sample IDs in the run-wise description table (runPheno)
#' message("Number of runs = ", length(recountData$runPheno$geo_accession))
#' message("Number unique geo_accession values: ", length(unique(sort(recountData$runPheno$geo_accession))))
#'
#' ## Check the dimension of the table with counts per sample
#' dim(recountData$merged$sampleCounts)
#'
#' cor(log(recountData$result$runs$dataTable[, recountData$runPheno$geo_accession == "GSM1521620"]+1))
#'
#' ## Test correlation between 8 randomly selected biological replicates (distinct samples)
#' cor(log(recountData$merged$sampleCounts[, sample(1:ncol(recountData$merged$sampleCounts), size=8)]+1))
#'
#'
#' @return
#' A list containing: the count table, the pheno table, and some additional parameters (study ID, ...).
#'    \itemize{
#'         \item countsPerRun:  count table before any pre-proccessing procedure
#'         \item originalCounts: count table after merge runs proccess in order to get ride of multipl runs per sample.
#'    }
#'
#' @import S4Vectors
#' @export
loadRecountExperiment <- function(recountID,
                                  parameters,
                                  forceDownload = FALSE,
                                  ...) {

  message.with.time("loadRecountExperiment()\trecountID = ", recountID, "\tfeature type = ", parameters$feature)

  ## Check required parameters
  for (p in c("mergeRuns", "sampleIdColumn", "classColumn", "verbose", "studyPath", "feature")) {
    if (is.null(parameters[[p]])) {
      stop("Missing required parameter: '", p,
           "'.\n\tPlease check configuration file. ")
    } else {
      assign(p, parameters[[p]])
    }
  }

  ## Check required directory
  if (is.null(parameters$dir$workspace)) {
    stop("Missing required parameter: 'parameters$dir$workspace'.\n\tPlease check configuration file. ")
  } else {
    dir.workspace = parameters$dir$workspace
    dir.create(dir.workspace, recursive = TRUE, showWarnings = FALSE)
  }


  result <- list()

  #### Create studyPath directory ####
  if (!file.exists(parameters$studyPath)) {
    message("\tCreating directory to store Recount dataset ", recountID," in ", parameters$studyPath)
    dir.create(parameters$studyPath, recursive = TRUE, showWarnings = FALSE)
  }


  #### Define the file where the downloaded counts will be stored ####
  if (parameters$feature == "gene") {
    rse.type <- "gene"
  } else if (parameters$feature == "exon") {
    rse.type <- "exon"
  } else if (parameters$feature == "transcript") {
    rse.type <- "tx"
  } else if (parameters$feature == "junction") {
    rse.type <- "jx"
  } else {
    stop("\t", parameters$feature, " is not a valid feature type. Supported: gene, exon, transcript, junction.")
  }

  #### Get the URL of the recount file ####
  ## We do this because on the recount site the uppercases of the extensions
  ## are inconsistent between data types:
  ## - Rdata for genes and exons
  ## - RData for transcripts
  recountURL <- download_study(
    recountID,
    outdir = parameters$studyPath,
    type = paste0("rse-", rse.type),
    download = FALSE)
  rseFile <- file.path(parameters$studyPath, basename(recountURL))
  parameters$recountURL <- recountURL
  parameters$rseFile <- rseFile

  #### Add parameters to the result ####
  result$parameters <- parameters

  #### Download the counts if required ####
  if ((forceDownload) || (!file.exists(rseFile))) {
    if (verbose) {
      message("\tDowloading counts from ReCount for study ", recountID)
    }
    url <- download_study(
      recountID,
      outdir = parameters$studyPath,
      type = paste0("rse-", rse.type))
    result$param["url"] <- url
  }

  #### Download phenotype TSV file if required ####
  recountPhenoURL <- download_study(recountID, outdir = parameters$studyPath, type = "phenotype", download = FALSE)
  phenoFile <- file.path(parameters$studyPath, basename(recountPhenoURL))
  parameters$recountPhenoURL <- recountPhenoURL
  parameters$phenoFile <- phenoFile
  if ((forceDownload) || (!file.exists(phenoFile))) {
    if (verbose) {
      message("\tDowloading phenotypes from ReCount for study ", recountID)
    }
    recountPhenoURL <- download_study(recountID, outdir = parameters$studyPath, type = "phenotype", download = TRUE)
    result$param["recountPhenoURL"] <- recountPhenoURL
  }


  #### Load in memory data from the recount database ####
  if (verbose) {
    message("\tLoading counts per ", parameters$feature," from local file ", rseFile)
  }
  load(rseFile)


  #### Scale counts by mapped reads, in order to to get read counts per gene ####
  rse.variable <- paste0("rse_", rse.type)
  if (verbose) {
    message("\tScaling counts by mapped reads")
  }
  rse <- scale_counts(get(rse.variable), by = "mapped_reads")
  # class(rse)

  #### Extract a matrix with the counts per feature for each run ####
  if (verbose) {
    message("\tExtracting table of counts per run")
  }

  ## Extract the count table
  dataTable <- assay(rse)
  if (verbose) {
    message("\tLoaded counts per run: ", nrow(dataTable), " features x ", ncol(dataTable), " runs.")
  }

#
#   ## Check consistency between pheno tables in  genes and transcripts
#   stopifnot(identical(
#     geo_characteristics(colData(rse_gene)),
#     geo_characteristics(colData(rse_tx))
#   ))
#

  #### Extract pheno table ####
  ## The pheno table contains information about the columns of the RangedSeummaryExperiment.
  if (verbose) {
    message("\tBuilding pheno table")
  }
  phenoTable <- colData(rse) ## phenotype per run
  # dim(phenoTable)
  # class(phenoTable)
  # names(phenoTable)
  # View(phenoTable)
  # class(phenoTable$characteristics)
  # table(phenoTable$characteristics)
  # is.character(phenoTable$characteristics)

  #### Fix a problem with the structure of the characteristics field in some recount records ####
  ## See here for details
  ##     https://support.bioconductor.org/p/116480/#124335
  ## and here (repost)
  ##     https://support.bioconductor.org/p/127123/
  if (is.character(phenoTable$characteristics)) {
    ## Solves https://support.bioconductor.org/p/116480/
    phenoTable$characteristics <- IRanges::CharacterList(
      lapply(lapply(phenoTable$characteristics, str2lang), eval)
    )
  }
  # class(phenoTable$characteristics)
  # View(phenoTable$characteristics)

  ## PATCH JvH 2019-01-03: I fix a bug with the phenotable characteristics in the transcript rse, which contains quotes
  # phenoTable$characteristics <- gsub(x = phenoTable$characteristics, pattern = '"', replacement = '')
  # geo.characteristics <- recount::geo_characteristics(phenoTable)
  geochar <- geocharFromPheno(runPheno = phenoTable)
  # class(geochar)
  # dim(geochar)
  # head(geochar)
  # recountID
  # parameters$short_label
  # geochar[1:10, "tissue"]
  phenoTable <- cbind(phenoTable, geochar)
  # View(phenoTable)
  # table(phenoTable$characteristics)
  # names(phenoTable)
  # dim(phenoTable)

  # phenoTable2 <- read.delim(file = phenoFile, header = 1, sep = "\t")
  # View(phenoTable2)
  # names(phenoTable2)


  #### Extract a matrix with the counts per feature for each run ####
  if (verbose) {
    message("\tExtracting table of counts per run")
  }

  ## Extract the count table
  dataTable <- assay(rse)
  if (verbose) {
    message("\tLoaded counts per run: ", nrow(dataTable), " features x ", ncol(dataTable), " runs.")
  }



  #### Instantiate a DataTableWithClasses object with the counts and pheno data ###
  ## This object will be further used to test classifiers.
  countsPerRun <- DataTableWithClasses(dataTable = dataTable,
                                       phenoTable = phenoTable,
                                       dataType = "raw_counts_per_run",
                                       parameters = parameters)
  # class(countsPerRun)
  summary(countsPerRun)

  ################################################################
  ## Use either the merged or the original runs as original count table
  if (mergeRuns) {
    ## Merge runs if required
    if (verbose) { message("\tMerging run-wise counts by sample") }
    result$countsPerRun <- countsPerRun
    result$originalCounts <- MergeRuns(runs = countsPerRun) #,
                                  # classColumn  = classColumn ,
                                  # sampleIdColumn = sampleIdColumn,
                                  # verbose = verbose)
  } else {
    result$originalCounts <- countsPerRun
  }
  # class(result$originalCounts)
  # summary(result$originalCounts)



  message.with.time("Finished loading Recount experiment ID ", parameters$recountID)
  return(result)
}