R/SummarizedExperiment_helpers.R

Defines functions countTable_regions countTable scoreSummarizedExperiment makeSummarizedExperimentFromBam

Documented in countTable countTable_regions makeSummarizedExperimentFromBam scoreSummarizedExperiment

#' Make a count matrix from a library or experiment
#'
#' Make a summerizedExperiment / matrix object from bam files
#'
#' If txdb or gtf path is added, it is a rangedSummerizedExperiment
#' NOTE: If the file called saveName exists, it will then load file,
#' not remake it!
#' @param df an ORFik \code{\link{experiment}}
#' @param saveName a character (default NULL),
#' if set save experiment to path given. Always saved as .rds.,
#' it is optional to add .rds, it will be added for you if not present.
#' Also used to load existing file with that name.
#' @param longestPerGene a logical (default TRUE), if FALSE all transcript
#' isoforms per gene.
#' @param geneOrTxNames a character vector (default "tx"), should row names
#' keep trancript names ("tx") or change to gene names ("gene")
#' @param region a character vector (default: "mrna"), make raw count matrices
#' of whole mrnas or one of (leaders, cds, trailers).
#' Can also be a \code{\link{GRangesList}}, then it uses this region directly.
#' @param type default: "count" (raw counts matrix), alternative is "fpkm",
#' "log2fpkm" or "log10fpkm"
#' @param lib.type a character(default: "default"), load files in experiment
#' or some precomputed variant, either "ofst", "bedo", "bedoc" or "pshifted".
#' These are made with ORFik:::convertLibs() or shiftFootprintsByExperiment().
#' Can also be custom user made folders inside the experiments bam folder.
#' @param weight numeric or character, a column to score overlaps by. Default "score",
#' will check for a metacolumn called "score" in libraries. If not found,
#' will not use weights.
#' @import SummarizedExperiment
#' @export
#' @return a \code{\link{SummarizedExperiment}} object or data.table if
#' "type" is not "count, with rownames as transcript / gene names.
#' @examples
#' ##Make experiment
#' df <- ORFik.template.experiment()
#' # makeSummarizedExperimentFromBam(df)
#' # Only cds (coding sequences):
#' # makeSummarizedExperimentFromBam(df, region = "cds")
#' # FPKM instead of raw counts on whole mrna regions
#' # makeSummarizedExperimentFromBam(df, type = "fpkm")
makeSummarizedExperimentFromBam <- function(df, saveName = NULL,
                                            longestPerGene = TRUE,
                                            geneOrTxNames = "tx",
                                            region = "mrna", type = "count",
                                            lib.type = "ofst",
                                            weight = "score") {
  if(!is.null(saveName)) {
    if (file_ext(saveName) != "rds") saveName <- paste0(saveName,".rds")
    if (file.exists(saveName)) return(readRDS(saveName))
  }
  validateExperiments(df)

  if (is(region, "character")) {
    txdb <- loadTxdb(df@txdb)
    tx <- loadRegion(txdb, region)
  } else tx <- region

  if (geneOrTxNames == "gene") {
    if (!is(region, "character")) txdb <- loadTxdb(df@txdb)
    names(tx) <- txNamesToGeneNames(names(tx), txdb)
  }

  varNames <- bamVarName(df)
  outputLibs(df, chrStyle = tx, type = lib.type)

  rawCounts <- data.table(matrix(0, ncol = length(varNames),
                                 nrow = length(tx)))
  for (i in seq(length(varNames))) { # For each sample
    print(varNames[i])
    if (is.character(weight) & length(weight) == 1) {
      if (!(weight %in% colnames(mcols(get(varNames[i])))))
        weight <- NULL
    }

    co <- countOverlapsW(tx, get(varNames[i]), weight = weight)
    rawCounts[, (paste0("V",i)) := co]
  }
  mat <- as.matrix(rawCounts);colnames(mat) <- NULL

  colData <- DataFrame(SAMPLE = as.factor(bamVarName(df, TRUE)),
                       row.names=varNames)
  # Add sample columns
  if (!is.null(df$rep)) colData$replicate <- as.factor(df$rep)
  if (!is.null(df$stage)) colData$stage <- as.factor(df$stage)
  if (!is.null(df$libtype)) colData$libtype <- as.factor(df$libtype)
  if (!is.null(df$condition)) colData$condition <- as.factor(df$condition)
  if (!is.null(df$fraction)) colData$fraction <- as.factor(df$fraction)

  res <- SummarizedExperiment(assays=list(counts=mat), rowRanges=tx,
                              colData=colData)
  if (type %in% c("fpkm", "log2fpkm", "log10fpkm")) {
    res <- as.data.table(scoreSummarizedExperiment(res, score = type))
    rownames(res) <- names(tx)
  }
  if(!is.null(saveName)) {
    if (!dir.exists(dirname(saveName))) {
      dir.create(dirname(saveName), showWarnings = FALSE, recursive = TRUE)
    }
    if (file_ext(saveName) != "rds") saveName <- paste0(saveName,".rds")
    saveRDS(res, file = saveName)
  }
  return(res)
}

#' Helper function for makeSummarizedExperimentFromBam
#'
#' If txdb or gtf path is added, it is a rangedSummerizedExperiment
#' For FPKM values, DESeq2::fpkm(robust = FALSE) is used
#' @param final ranged summarized experiment object
#' @param score default: "transcriptNormalized"
#' (row normalized raw counts matrix),
#' alternative is "fpkm", "log2fpkm" or "log10fpkm"
#' @param collapse a logical/character (default FALSE), if TRUE all samples
#' within the group SAMPLE will be collapsed to one. If "all", all
#' groups will be merged into 1 column called merged_all. Collapse is defined
#' as rowSum(elements_per_group) / ncol(elements_per_group)
#' @import SummarizedExperiment DESeq2
#' @export
#' @return a DEseq summerizedExperiment object (transcriptNormalized)
#'  or matrix (if fpkm input)
scoreSummarizedExperiment <- function(final, score = "transcriptNormalized",
                                      collapse = FALSE) {
  if (is.factor(final$SAMPLE)) {
    lvls <- levels(final$SAMPLE) %in% unique(colData(final)$SAMPLE)
    final$SAMPLE <- factor(final$SAMPLE, levels = levels(final$SAMPLE)[lvls])
  }
  if (collapse %in% c(TRUE, "all")) {
    if (collapse == TRUE) {
      collapsedAll <- collapseReplicates(final, final$SAMPLE)
      nlibs <- t(matrix(as.double(table(colData(final)$SAMPLE)),
                        ncol = nrow(assay(collapsedAll)) ,
                        nrow = length(unique(colData(final)$SAMPLE))))
    } else { # all
      collapsedAll <- collapseReplicates(final, rep("merged_all",
                                                    ncol(final)))
      nlibs <- ncol(final)
    }
    # Number of samples per group as matrix

    assay(collapsedAll) <- ceiling(assay(collapsedAll) / nlibs)
  } else collapsedAll <- final

  only.one.group <- length(unique(collapsedAll$SAMPLE) == 1) |
    (ncol(collapsedAll) == 1)
  if ((collapse == "all") | only.one.group) {
    dds <- DESeqDataSet(collapsedAll, design = ~ 1)
  } else {
    dds <- DESeqDataSet(collapsedAll, design = ~ SAMPLE)
  }

  if (score %in% c("transcriptNormalized", "fpkm", "log2fpkm", "log10fpkm")) {
    fpkmCollapsed <- DESeq2::fpkm(dds, robust = FALSE)
    if (score == "transcriptNormalized") {
      normalization <- matrix(rep(rowSums2(fpkmCollapsed),
                                  ncol(fpkmCollapsed)),
                              ncol = ncol(fpkmCollapsed))
      fpkmTranscriptNormalized <- fpkmCollapsed / normalization
      assay(dds) <- fpkmTranscriptNormalized
      return(dds)
    } else if (score == "fpkm") {
      return(fpkmCollapsed)
    } else if (score == "log2fpkm") {
      return(log2(fpkmCollapsed))
    } else if (score == "log10fpkm") {
      return(log10(fpkmCollapsed))
    }
  }
  return(dds)
}

#' Extract count table directly from experiment
#'
#' Used to quickly load read count tables to R. \cr
#' If df is experiment:
#' Extracts by getting /QC_STATS directory, and searching for region
#' Requires \code{\link{ORFikQC}} to have been run on experiment!
#'
#' If df is path to folder:
#' Loads the the file in that directory with the regex region.rds,
#' where region is what is defined by argument. If loaded as SummarizedExperiment
#' or deseq, the colData will be made from ORFik.experiment information.
#' @param df an ORFik \code{\link{experiment}} or path to folder with
#' countTable, use path if not same folder as experiment libraries. Will subset to
#' the count tables specified if df is experiment. If experiment has 4 rows and you subset it
#' to only 2, then only those 2 count tables will be outputted.
#' @param region a character vector (default: "mrna"), make raw count matrices
#'  of whole mrnas or one of (leaders, cds, trailers).
#' @param type character, default: "count" (raw counts matrix).
#' Which object type and normalization do you want ?
#' "summarized" (SummarizedExperiment object),
#' "deseq" (Deseq2 experiment, design will be all valid non-unique
#' columns except replicates, change by using DESeq2::design,
#' normalization alternatives are: "fpkm", "log2fpkm" or "log10fpkm".
#' @param collapse a logical/character (default FALSE), if TRUE all samples
#' within the group SAMPLE will be collapsed to one. If "all", all
#' groups will be merged into 1 column called merged_all. Collapse is defined
#' as rowSum(elements_per_group) / ncol(elements_per_group)
#' @return a data.table/SummarizedExperiment/DESeq object
#' of columns as counts / normalized counts per library, column name
#' is name of library. Rownames must be unique for now. Might change.
#' @importFrom DESeq2 DESeqDataSet
#' @importFrom stats as.formula
#' @export
#' @family countTable
#' @examples
#' # Make experiment
#' ORFik.template.experiment()
#' # Make QC report to get counts ++
#' # ORFikQC(df)
#'
#' # Get count Table of mrnas
#' # countTable(df, "mrna")
#' # Get count Table of cds
#' # countTable(df, "cds")
#' # Get count Table of mrnas as fpkm values
#' # countTable(df, "mrna", type = "count")
#' # Get count Table of mrnas with collapsed replicates
#' # countTable(df, "mrna", collapse = TRUE)
#' # Get count Table of mrnas as summarizedExperiment
#' # countTable(df, "mrna", type = "summarized")
#' # Get count Table of mrnas as DESeq2 object,
#' # for differential expression analysis
#' # countTable(df, "mrna", type = "deseq")
countTable <- function(df, region = "mrna", type = "count",
                       collapse = FALSE) {
  # TODO fix bug if deseq!
  df.temp <- NULL
  if (is(df, "experiment")) {
    if (nrow(df) == 0) stop("df experiment has 0 rows (samples)!")
    df.temp <- df
    dir = dirname(df$filepath[1])
    df <- paste0(dir, "/QC_STATS")
  }
  if (is(df, "character")) {
    if (dir.exists(df)) {
      df <- list.files(path = df, pattern = paste0(region, ".rds"),
                       full.names = TRUE)
    }
    if (length(df) == 1) {
      res <- readRDS(df)
      # Subset to samples wanted
      if (!is.null(df.temp)) {
        if ((ncol(res) != nrow(df.temp))) {
          subset <- if (sum(colnames(res) %in% bamVarName(df.temp, FALSE)) == nrow(df.temp)) {
            colnames(res) %in% bamVarName(df.temp, FALSE)
          } else if (sum(colnames(res) %in%
                         bamVarName(df.temp, FALSE, skip.experiment = FALSE)) == nrow(df.temp)) {
            colnames(res) %in% bamVarName(df.temp, FALSE, skip.experiment = FALSE)
          } else if (sum(colnames(res) %in%
                         bamVarName(df.temp)) == nrow(df.temp)) {
            colnames(res) %in% bamVarName(df.temp)
          } else if (sum(colnames(res) %in%
                         bamVarName(df.temp, FALSE, FALSE)) == nrow(df.temp)) {
            colnames(res) %in% bamVarName(df.temp, FALSE, FALSE)
          } else if (sum(colnames(res) %in%
                         bamVarName(df.temp, TRUE, FALSE, FALSE)) == nrow(df.temp)) {
            colnames(res) %in% bamVarName(df.temp, TRUE, FALSE, FALSE)
          } else if (sum(colnames(res) %in%
                         bamVarName(df.temp, FALSE, FALSE, FALSE)) == nrow(df.temp)) {
            colnames(res) %in% bamVarName(df.temp, FALSE, FALSE, FALSE)
          } else stop("No valid names for count tables found from experiment")
          res <- res[, subset]
        }
      }
      # Add all sample columns if not existing and it is possible
      if (is.null(colData(res)$stage)) {
        if (length(unique(df.temp$stage)) > 1) {
          colData(res)$stage <- as.factor(df.temp$stage)
          if (anyNA(colData(res)$stage))
            colData(res)$stage[is.na(colData(res)$stage)] <- ""
        }
        if ((length(unique(df.temp$libtype)) > 0) & (type != "deseq")) {
          colData(res)$libtype <- as.factor(df.temp$libtype)
          if (anyNA(colData(res)$libtype))
            colData(res)$libtype[is.na(colData(res)$libtype)] <- ""
        }
        if (length(unique(df.temp$condition)) > 1) {
          colData(res)$condition <- as.factor(df.temp$condition)
          if (anyNA(colData(res)$condition))
            colData(res)$condition[is.na(colData(res)$condition)] <- ""
        }
        if (length(unique(df.temp$fraction)) > 1) {
          colData(res)$fraction <- as.factor(df.temp$fraction)
          if (anyNA(colData(res)$fraction))
            colData(res)$fraction[is.na(colData(res)$fraction)] <- ""
        }
        colData(res)$replicate <- as.factor(df.temp$rep)
      }

      # Decide output format
      if (type == "summarized") return(res)
      if (type == "deseq") {
        # remove replicate from formula
        formula <- colnames(colData(res))
        if ("replicate" %in% formula)
          formula <- formula[-grep("replicate", formula)]
        formula <- as.formula(paste(c("~", paste(formula,
                                    collapse = " + ")), collapse = " "))
        return(DESeqDataSet(res, design = formula))
      }
      ress <- scoreSummarizedExperiment(res, type, collapse)
      if (is(ress, "matrix")) {
        ress <- as.data.table(ress)
      } else { # is deseq
        ress <- as.data.table(assay(ress))
      }
      rownames(ress) <- names(ranges(res))
      return(ress)
    }
  }
  message(paste("Invalid count table:", df))
  stop("df must be filepath to directory with countTable, the path
       to the countTable or ORFik experiment with a QC_STATS folder!")
}

#' Make a list of count matrices from experiment
#'
#' @inheritParams makeSummarizedExperimentFromBam
#' @inheritParams QCreport
#' @param regions a character vector, default:
#'  c("mrna", "leaders", "cds", "trailers"), make raw count matrices
#' of whole regions specified.
#' @param BPPARAM how many cores/threads to use? default: bpparam()
#' @return a list of data.table, 1 data.table per region. The regions
#' will be the names the list elements.
#' @family countTable
countTable_regions <- function(df, out.dir = dirname(df$filepath[1]),
                               longestPerGene = TRUE,
                               geneOrTxNames = "tx",
                               regions = c("mrna", "leaders", "cds",
                                           "trailers"),
                               type = "count", lib.type = "ofst",
                               weight = "score",
                               BPPARAM = bpparam()) {

  stats_folder <- pasteDir(out.dir, "/QC_STATS/")
  countDir <- paste0(stats_folder, "countTable_")
  libs <- bplapply(
    regions,
    function(region, countDir, df, geneOrTxNames, longestPerGene) {
     message("- Creating read count tables for region:")
     message(region)
     path <- paste0(countDir, region)
     makeSummarizedExperimentFromBam(df, region = region,
                                     geneOrTxNames = "tx",
                                     longestPerGene = FALSE,
                                     saveName = path, lib.type = lib.type)
    },
    countDir = countDir, df = df,
    geneOrTxNames = geneOrTxNames,
    longestPerGene = longestPerGene, BPPARAM = BPPARAM
  )
  names(libs) <- regions
  return(libs)
}

Try the ORFik package in your browser

Any scripts or data that you put into this service are public.

ORFik documentation built on March 27, 2021, 6 p.m.