R/SummarizedExperiment_helpers.R

Defines functions metadata_count_table subset_count_table countTable_regions countTable scoreSummarizedExperiment makeSummarizedExperimentFromBam

Documented in countTable countTable_regions makeSummarizedExperimentFromBam scoreSummarizedExperiment

#' Make a count matrix from a library or experiment
#'
#' Make a summerizedExperiment / matrix object from bam files or
#' other library formats sepcified by lib.type argument. Works
#' like HTSeq, to give you count tables per library.
#'
#' If txdb or gtf path is added, it is a rangedSummerizedExperiment
#' NOTE: If the file called saveName exists, it will then load file,
#' not remake it!\cr
#' There are different ways of counting hits on transcripts, ORFik does
#' it as pure coverage (if a single read aligns to a region with 2 genes, both
#' gets a count of 1 from that read).
#' This is the safest way to avoid false negatives
#' (genes with no assigned hits that actually have true hits).
#' @param df an ORFik \code{\link{experiment}}
#' @param saveName a character (default NULL),
#' if set save experiment to path given. Always saved as .rds.,
#' it is optional to add .rds, it will be added for you if not present.
#' Also used to load existing file with that name.
#' @param longestPerGene a logical (default FALSE), if FALSE all transcript
#' isoforms per gene. Ignored if "region" is not a character of either:
#' "mRNA","tx", "cds", "leaders" or "trailers".
#' @param geneOrTxNames a character vector (default "tx"), should row names
#' keep trancript names ("tx") or change to gene names ("gene")
#' @param region a character vector (default: "mrna"), make raw count matrices
#' of whole mrnas or one of (leaders, cds, trailers).
#' Can also be a \code{\link{GRangesList}}, then it uses this region directly.
#' Can then be uORFs or a subset of CDS etc.
#' @param type default: "count" (raw counts matrix), alternative is "fpkm",
#' "log2fpkm" or "log10fpkm"
#' @param lib.type a character(default: "default"), load files in experiment
#' or some precomputed variant, either "ofst", "bedo", "bedoc" or "pshifted".
#' These are made with ORFik:::convertLibs() or shiftFootprintsByExperiment().
#' Can also be custom user made folders inside the experiments bam folder.
#' @param weight numeric or character, a column to score overlaps by. Default "score",
#' will check for a metacolumn called "score" in libraries. If not found,
#' will not use weights.
#' @param forceRemake logical, default FALSE. If TRUE, will not look for existing file count table files.
#' @param force logical, default TRUE. IF TRUE, will not use existing libraries found in environment
#' of experiment. See argument 'force' in \code{link{outputLibs}}
#' @param BPPARAM how many cores/threads to use? default: BiocParallel::SerialParam()
#' @import SummarizedExperiment
#' @export
#' @return a \code{\link{SummarizedExperiment}} object or data.table if
#' "type" is not "count, with rownames as transcript / gene names.
#' @examples
#' ##Make experiment
#' df <- ORFik.template.experiment()
#' # makeSummarizedExperimentFromBam(df)
#' ## Only cds (coding sequences):
#' # makeSummarizedExperimentFromBam(df, region = "cds")
#' ## FPKM instead of raw counts on whole mrna regions
#' # makeSummarizedExperimentFromBam(df, type = "fpkm")
#' ## Make count tables of pshifted libraries over uORFs
#' uorfs <- GRangesList(uorf1 = GRanges("chr23", 17599129:17599156, "-"))
#' #saveName <- file.path(dirname(df$filepath[1]), "uORFs", "countTable_uORFs")
#' #makeSummarizedExperimentFromBam(df, saveName, region = uorfs)
#' ## To load the uORFs later
#' # countTable(df, region = "uORFs", count.folder = "uORFs")
makeSummarizedExperimentFromBam <- function(df, saveName = NULL,
                                            longestPerGene = FALSE,
                                            geneOrTxNames = "tx",
                                            region = "mrna", type = "count",
                                            lib.type = "ofst",
                                            weight = "score", forceRemake = FALSE,
                                            force = TRUE,
                                            BPPARAM = BiocParallel::SerialParam()) {
  stopifnot(length(geneOrTxNames) == 1)
  stopifnot(geneOrTxNames %in% c("tx", "gene"))

  if(!is.null(saveName)) {
    if (file_ext(saveName) != "rds") saveName <- paste0(saveName,".rds")
    if (file.exists(saveName) & !forceRemake) {
      message("Loading existing count table, set forceRemake=TRUE if you want to remake")
      return(readRDS(saveName))
    }
  }
  validateExperiments(df)

  if (is(region, "character")) {
    txdb <- loadTxdb(df)
    if (longestPerGene) {
      longestTxNames <- filterTranscripts(txdb, 0, 1, 0, longestPerGene = TRUE)
      tx <- loadRegion(txdb, region, names.keep = longestTxNames)
    } else tx <- loadRegion(txdb, region)
  } else tx <- region

  if (geneOrTxNames == "gene") {
    if (!is(region, "character")) txdb <- loadTxdb(df)
    names(tx) <- txNamesToGeneNames(names(tx), txdb)
  }

  varNames <- bamVarName(df)
  outputLibs(df, chrStyle = tx, type = lib.type, force = force,
             BPPARAM = BPPARAM)

  rawCounts <- data.table(matrix(0, ncol = length(varNames),
                                 nrow = length(tx)))
  message("    - Counting overlaps")
  for (i in seq(length(varNames))) { # For each sample
    message(varNames[i])
    if (is.character(weight) & length(weight) == 1) {
      if (!(weight %in% colnames(mcols(get(varNames[i], envir = envExp(df))))))
        weight <- NULL
    }

    co <- countOverlapsW(tx, get(varNames[i], envir = envExp(df)), weight = weight)
    rawCounts[, (paste0("V",i)) := co]
  }
  mat <- as.matrix(rawCounts);colnames(mat) <- NULL

  colData <- DataFrame(SAMPLE = as.factor(bamVarName(df, TRUE)),
                       row.names=varNames)
  # Add sample columns
  if (!is.null(df$rep)) colData$replicate <- as.factor(df$rep)
  if (!is.null(df$stage)) colData$stage <- as.factor(df$stage)
  if (!is.null(df$libtype)) colData$libtype <- as.factor(df$libtype)
  if (!is.null(df$condition)) colData$condition <- as.factor(df$condition)
  if (!is.null(df$fraction))
    colData$fraction <- as.factor(as.character(df$fraction))

  res <- SummarizedExperiment(assays=list(counts=mat), rowRanges=tx,
                              colData=colData)
  if (type %in% c("fpkm", "log2fpkm", "log10fpkm")) {
    res <- as.data.table(scoreSummarizedExperiment(res, score = type))
    rownames(res) <- names(tx)
  }
  if(!is.null(saveName)) {
    if (!dir.exists(dirname(saveName))) {
      dir.create(dirname(saveName), showWarnings = FALSE, recursive = TRUE)
    }
    if (file_ext(saveName) != "rds") saveName <- paste0(saveName,".rds")
    saveRDS(res, file = saveName)
  }
  return(res)
}

#' Helper function for makeSummarizedExperimentFromBam
#'
#' If txdb or gtf path is added, it is a rangedSummerizedExperiment
#' For FPKM values, DESeq2::fpkm(robust = FALSE) is used
#' @param final ranged summarized experiment object
#' @param score default: "transcriptNormalized"
#' (row normalized raw counts matrix),
#' alternative is "fpkm", "log2fpkm" or "log10fpkm"
#' @param collapse a logical/character (default FALSE), if TRUE all samples
#' within the group SAMPLE will be collapsed to one. If "all", all
#' groups will be merged into 1 column called merged_all. Collapse is defined
#' as rowSum(elements_per_group) / ncol(elements_per_group)
#' @import SummarizedExperiment DESeq2
#' @export
#' @return a DEseq summerizedExperiment object (transcriptNormalized)
#'  or matrix (if fpkm input)
scoreSummarizedExperiment <- function(final, score = "transcriptNormalized",
                                      collapse = FALSE) {
  if (is.factor(final$SAMPLE)) {
    lvls <- levels(final$SAMPLE) %in% unique(colData(final)$SAMPLE)
    final$SAMPLE <- factor(final$SAMPLE, levels = levels(final$SAMPLE)[lvls])
  }
  if (collapse %in% c(TRUE, "all")) {
    if (collapse == TRUE) {
      collapsedAll <- collapseReplicates(final, final$SAMPLE)
      nlibs <- t(matrix(as.double(table(colData(final)$SAMPLE)),
                        ncol = nrow(assay(collapsedAll)) ,
                        nrow = length(unique(colData(final)$SAMPLE))))
    } else { # all
      collapsedAll <- collapseReplicates(final, rep("merged_all",
                                                    ncol(final)))
      nlibs <- ncol(final)
    }
    # Number of samples per group as matrix

    assay(collapsedAll) <- ceiling(assay(collapsedAll) / nlibs)
  } else collapsedAll <- final

  only.one.group <- (length(unique(collapsedAll$SAMPLE)) == 1) |
    (ncol(collapsedAll) == 1)
  if ((collapse == "all") | only.one.group) {
    dds <- DESeqDataSet(collapsedAll, design = ~ 1)
  } else {
    dds <- DESeqDataSet(collapsedAll, design = ~ SAMPLE)
  }

  if (score %in% c("transcriptNormalized", "fpkm", "log2fpkm", "log10fpkm")) {
    fpkmCollapsed <- DESeq2::fpkm(dds, robust = FALSE)
    fpkmCollapsed[is.nan(fpkmCollapsed)] <- 0
    if (score == "transcriptNormalized") {
      normalization <- matrix(rep(rowSums2(fpkmCollapsed),
                                  ncol(fpkmCollapsed)),
                              ncol = ncol(fpkmCollapsed))
      fpkmTranscriptNormalized <- fpkmCollapsed / normalization
      assay(dds) <- fpkmTranscriptNormalized
      return(dds)
    } else if (score == "fpkm") {
      return(fpkmCollapsed)
    } else if (score == "log2fpkm") {
      return(log2(fpkmCollapsed))
    } else if (score == "log10fpkm") {
      return(log10(fpkmCollapsed))
    }
  }
  return(dds)
}

#' Extract count table directly from experiment
#'
#' Used to quickly load pre-created read count tables to R. \cr
#' If df is experiment:
#' Extracts by getting /QC_STATS directory, and searching for region
#' Requires \code{\link{ORFikQC}} to have been run on experiment,
#' to get default count tables!
#'
#' If df is path to folder:
#' Loads the the file in that directory with the regex region.rds,
#' where region is what is defined by argument, if multiple exist,
#' see if any start with "countTable_", if so, subset. If loaded as SummarizedExperiment
#' or deseq, the colData will be made from ORFik.experiment information.
#' @param df an ORFik \code{\link{experiment}} or path to folder with
#' countTable, use path if not same folder as experiment libraries. Will subset to
#' the count tables specified if df is experiment. If experiment has 4 rows and you subset it
#' to only 2, then only those 2 count tables will be outputted.
#' @param region a character vector (default: "mrna"), make raw count matrices
#'  of whole mrnas or one of (leaders, cds, trailers).
#' @param type character, default: "count" (raw counts matrix).
#' Which object type and normalization do you want ?
#' "summarized" (SummarizedExperiment object),
#' "deseq" (Deseq2 experiment, design will be all valid non-unique
#' columns except replicates, change by using DESeq2::design,
#' normalization alternatives are: "fpkm", "log2fpkm" or "log10fpkm".
#' @param collapse a logical/character (default FALSE), if TRUE all samples
#' within the group SAMPLE will be collapsed to one. If "all", all
#' groups will be merged into 1 column called merged_all. Collapse is defined
#' as rowSum(elements_per_group) / ncol(elements_per_group)
#' @param count.folder character, default "auto" (Use count tables from
#' original bam files stored in "QC_STATS", these are like HTseq count tables).
#' To load your custome count tables from pshifted reads, set to "pshifted"
#' (remember to create the pshifted tables first!). If you
#' have custom ranges, like reads over uORFs stored in a folder called
#' "/uORFs" relative to the bam files, set to "uORFs". Always create these
#' custom count tables with \code{\link{makeSummarizedExperimentFromBam}}.
#' Always make the location of the folder directly
#' inside the bam file directory!
#' @return a data.table/SummarizedExperiment/DESeq object
#' of columns as counts / normalized counts per library, column name
#' is name of library. Rownames must be unique for now. Might change.
#' @importFrom DESeq2 DESeqDataSet
#' @importFrom stats as.formula
#' @export
#' @family countTable
#' @examples
#' # Make experiment
#' df <- ORFik.template.experiment()
#' # Make QC report to get counts ++ (not needed for this template)
#' # ORFikQC(df)
#'
#' # Get count Table of mrnas
#' # countTable(df, "mrna")
#' # Get count Table of cds
#' # countTable(df, "cds")
#' # Get count Table of mrnas as fpkm values
#' # countTable(df, "mrna", type = "count")
#' # Get count Table of mrnas with collapsed replicates
#' # countTable(df, "mrna", collapse = TRUE)
#' # Get count Table of mrnas as summarizedExperiment
#' # countTable(df, "mrna", type = "summarized")
#' # Get count Table of mrnas as DESeq2 object,
#' # for differential expression analysis
#' # countTable(df, "mrna", type = "deseq")
countTable <- function(df, region = "mrna", type = "count",
                       collapse = FALSE,
                       count.folder = "default") {
  # TODO fix bug if deseq!
  df.temp <- NULL
  if (is(df, "experiment")) {
    if (nrow(df) == 0) stop("df experiment has 0 rows (samples)!")
    df.temp <- df
    df <-
      if (count.folder == "default") {
        QCfolder(df)
      } else paste0(libFolder(df), "/", count.folder)
  }
  if (is(df, "character")) {
    if (dir.exists(df)) {
      df <- list.files(path = df, pattern = paste0(region, ".rds"),
                       full.names = TRUE)
      if (length(df) > 1) {
        hits <- grep("^countTable_", basename(df))
        if (length(hits) == 1) {
          df <- df[hits]
        }
      }
    }
    if (length(df) == 1) {
      res <- readRDS(df)
      # Subset to samples wanted
      if (!is.null(df.temp)) {
        if ((ncol(res) != nrow(df.temp))) {
          res <- subset_count_table(res, df.temp)
        }
      }
      res <- metadata_count_table(res, df.temp, type)
      # Give important sanity check info:
      is_ribo <- any(c("RFP", "RPF", "LSU","80S") %in% colData(res)$libtype, na.rm = TRUE)
      if(count.folder != "pshifted" & is_ribo)
        message("Loading default 80S counts, update count.folder to pshifted if wanted?")

      # Decide output format
      if (type == "summarized") return(res)
      if (type == "deseq") {
        # remove replicate from formula
        formula <- colnames(colData(res))
        if ("replicate" %in% formula)
          formula <- formula[-grep("replicate", formula)]
        formula <- as.formula(paste(c("~", paste(formula,
                                    collapse = " + ")), collapse = " "))
        return(DESeqDataSet(res, design = formula))
      }
      ress <- scoreSummarizedExperiment(res, type, collapse)
      if (is(ress, "matrix")) {
        ress <- as.data.table(ress)
      } else { # is deseq
        ress <- as.data.table(assay(ress))
      }
      rownames(ress) <- names(ranges(res))
      return(ress)
    } else if (length(df) > 1) {
      message(paste("More than 1 count table: ", df))
      stop("Folder contains multiple count tables for the same region, ORFik does not
           know which to pick. Delete or move the one that is not supposed to be there!")
    }
  }
  message(paste("Invalid count table:", df))
  stop("Table not found!",
      " Must be either: filepath to directory with defined countTable of region, the full path
       to the countTable, run ORFikQC to get default countTables!")
}

#' Make a list of count matrices from experiment
#'
#' By default will make count tables over mRNA, leaders, cds and trailers for
#' all libraries in experiment. region
#'
#' @inheritParams makeSummarizedExperimentFromBam
#' @inheritParams QCreport
#' @param regions a character vector, default:
#'  c("mrna", "leaders", "cds", "trailers"), make raw count matrices
#' of whole regions specified. Can also be a custom GRangesList of
#' for example uORFs or a subset of cds etc.
#' @param rel.dir relative output directory for out.dir, default:
#' "QC_STATS". For pshifted, write "pshifted".
#' @param BPPARAM how many cores/threads to use? default: bpparam()
#' @return a list of data.table, 1 data.table per region. The regions
#' will be the names the list elements.
#' @family countTable
#' @export
#' @examples
#' ##Make experiment
#' df <- ORFik.template.experiment()
#' ## Create count tables for all default regions
#' # countTable_regions(df)
#' ## Pshifted reads (first create pshiftead libs)
#' # countTable_regions(df, lib.type = "pshifted", rel.dir = "pshifted")
countTable_regions <- function(df, out.dir = libFolder(df),
                               longestPerGene = FALSE,
                               geneOrTxNames = "tx",
                               regions = c("mrna", "leaders", "cds",
                                           "trailers"),
                               type = "count", lib.type = "ofst",
                               weight = "score",
                               rel.dir = "QC_STATS", forceRemake = FALSE,
                               BPPARAM = bpparam()) {

  countDir <- pasteDir(file.path(out.dir, rel.dir, "countTable_"))
  libs <- bplapply(
    regions,
    function(region, countDir, df, geneOrTxNames, longestPerGene, forceRemake) {
     message("- Creating read count tables for region:")
     message("  - ", region)
     path <- paste0(countDir, region)
     makeSummarizedExperimentFromBam(df, region = region,
                                     geneOrTxNames = geneOrTxNames,
                                     longestPerGene = longestPerGene,
                                     saveName = path, lib.type = lib.type,
                                     forceRemake = forceRemake, force = FALSE)
    },
    countDir = countDir, df = df,
    geneOrTxNames = geneOrTxNames,
    longestPerGene = longestPerGene, forceRemake = forceRemake, BPPARAM = BPPARAM
  )
  names(libs) <- regions
  return(libs)
}

subset_count_table <- function(res, df.temp) {
  subset <- df.temp$index
  if (length(subset) > ncol(res)) {
    # Fall back to old method, useful if you made some edits and forgot to
    # update count table
    subset <- if (sum(colnames(res) %in% bamVarName(df.temp, FALSE)) == nrow(df.temp)) {
      colnames(res) %in% bamVarName(df.temp, FALSE)
    } else if (sum(colnames(res) %in%
                   bamVarName(df.temp, FALSE, skip.experiment = FALSE)) == nrow(df.temp)) {
      colnames(res) %in% bamVarName(df.temp, FALSE, skip.experiment = FALSE)
    } else if (sum(colnames(res) %in%
                   bamVarName(df.temp)) == nrow(df.temp)) {
      colnames(res) %in% bamVarName(df.temp)
    } else if (sum(colnames(res) %in%
                   bamVarName(df.temp, FALSE, FALSE)) == nrow(df.temp)) {
      colnames(res) %in% bamVarName(df.temp, FALSE, FALSE)
    } else if (sum(colnames(res) %in%
                   bamVarName(df.temp, TRUE, FALSE, FALSE)) == nrow(df.temp)) {
      colnames(res) %in% bamVarName(df.temp, TRUE, FALSE, FALSE)
    } else if (sum(colnames(res) %in%
                   bamVarName(df.temp, FALSE, FALSE, FALSE)) == nrow(df.temp)) {
      colnames(res) %in% bamVarName(df.temp, FALSE, FALSE, FALSE)
    } else if (sum(colnames(res) %in%
                   bamVarName(df.temp, FALSE, FALSE, FALSE, FALSE)) == nrow(df.temp)) {
      colnames(res) %in% bamVarName(df.temp, FALSE, FALSE, FALSE, FALSE)
    } else if (sum(colnames(res) %in%
                   bamVarName(df.temp, TRUE, FALSE, TRUE, FALSE)) == nrow(df.temp)) {
      colnames(res) %in% bamVarName(df.temp, TRUE, FALSE, TRUE, FALSE)
    } else if (sum(colnames(res) %in%
                   bamVarName(df.temp, FALSE, TRUE, TRUE, FALSE)) == nrow(df.temp)) {
      colnames(res) %in% bamVarName(df.temp, FALSE, TRUE, TRUE, FALSE)
    } else if (sum(colnames(res) %in%
                   bamVarName(df.temp, FALSE, FALSE, TRUE, FALSE)) == nrow(df.temp)) {
      colnames(res) %in% bamVarName(df.temp, FALSE, FALSE, TRUE, FALSE)
    } else stop("No valid names for count tables found from experiment")
  }
  return(res[, subset])
}

metadata_count_table <- function(res, df.temp, type) {
  # Add all sample columns if not existing and it is possible
  if (is.null(colData(res)$stage)) {
    if (length(unique(df.temp$stage)) > 1) {
      colData(res)$stage <- as.factor(df.temp$stage)
      if (anyNA(colData(res)$stage))
        colData(res)$stage[is.na(colData(res)$stage)] <- ""
    }
    if ((length(unique(df.temp$libtype)) > 0) & (type != "deseq")) {
      colData(res)$libtype <- as.factor(df.temp$libtype)
      if (anyNA(colData(res)$libtype))
        colData(res)$libtype[is.na(colData(res)$libtype)] <- ""
    }
    if (length(unique(df.temp$condition)) > 1) {
      colData(res)$condition <- as.factor(df.temp$condition)
      if (anyNA(colData(res)$condition))
        colData(res)$condition[is.na(colData(res)$condition)] <- ""
    }
    if (length(unique(df.temp$fraction)) > 1) {
      colData(res)$fraction <- as.factor(as.character(df.temp$fraction))
      if (anyNA(colData(res)$fraction))
        colData(res)$fraction[is.na(colData(res)$fraction)] <- ""
    }
    colData(res)$replicate <- as.factor(df.temp$rep)
  }
  return(res)
}
Roleren/ORFik documentation built on April 25, 2024, 8:41 p.m.