R/gemini_create_input.R

Defines functions gemini_create_input

Documented in gemini_create_input

#' gemini_create_input
#'
#' @description Creates a gemini.input object from a counts matrix with given annotations.
#' @param counts.matrix a matrix of counts with rownames corresponding to features (e.g. guides) and colnames corresponding to samples.
#' @param sample.replicate.annotation a data.frame of annotations for each sample/replicate pair.
#' Note that at least one column in \code{sample.replicate.annotation} must correspond to the colnames of \code{counts.matrix} (see Details) (default = NULL)
#' @param guide.annotation a data.frame of annotations for each guide.  Note that at least one column in \code{guide.annotation} must correspond to the rownames of counts.matrix (default = NULL)
#' @param samplesAreColumns a logical indicating if samples are on the columns or rows of counts.matrix. (default = TRUE)
#' @param sample.column.name a character or integer indicating which column of \code{sample.replicate.annotation} describes the samples.
#' @param gene.column.names a character or integer vector of length(2) indicating which columns of \code{guide.annotation} describe the genes being targeted.
#' @param ETP.column a character or integer vector indicating which column(s) of \code{counts.matrix} contain the early time-point(s) of the screen (i.e. pDNA, early sequencing, etc.).  Defaults to the first column.
#' @param LTP.column a character or integer vector indicating which column(s) is the later time-point of the screen (i.e. day21, post-treatment, etc.).  Defaults to \code{(1:ncol(counts.matrix))[-ETP.column]}, or all other columns except for those specified by \code{ETP.column}.
#' @param verbose Verbosity (default FALSE)
#' @return a gemini.input object
#'
#' @details
#' This function initializes a gemini.input object from a counts matrix. There are a few key assumptions made in the input format.
#' \itemize{
#' \item The counts matrix is regular.
#' \item The counts matrix structure is in accordance with the \code{samplesAreColumns} parameter.
#' \item The first column of \code{sample.replicate.annotation} matches with the existing dimension names of the counts matrix.
#' \item The first column of \code{guide.annotations} matches with the existing dimension names of the counts matrix.
#' \item \code{sample.column.name} must specify a column in \code{sample.replicate.annotation} (either by name or index) that describes unique samples.
#' \item \code{gene.column.names} must specify two columns in \code{sample.replicate.annotation} (either by name or index) that describe genes.
#' }
#'
#' @importFrom dplyr mutate
#'
#' @examples
#' data("counts", package = "gemini")
#' data("sample.replicate.annotation", package = "gemini")
#' data("guide.annotation", package = "gemini")
#' Input <- gemini_create_input(
#'     counts.matrix = counts,
#'     sample.replicate.annotation = sample.replicate.annotation,
#'     guide.annotation = guide.annotation,
#'     sample.column.name = "samplename",
#'     gene.column.names = c("U6.gene", "H1.gene")
#' )
#'
#' @export
gemini_create_input <-
  function(counts.matrix,
           sample.replicate.annotation = NULL,
           guide.annotation = NULL,
           samplesAreColumns = TRUE,
           sample.column.name = "samplename",
           gene.column.names = NULL,
           ETP.column = 1,
           LTP.column = NULL,
           verbose = FALSE) {
    # Check ETP/LTP column identification
    if (is.numeric(ETP.column) & is.null(LTP.column)) {
      LTP.column <- seq(from = 1, to = ncol(counts.matrix))[-ETP.column]
    } else if (is.character(ETP.column) & is.null(LTP.column)) {
      ETP.column <- which(colnames(counts.matrix) %in% ETP.column)
      LTP.column <-
        seq(from = 1, to = ncol(counts.matrix))[-ETP.column]
    }
    
    # Require dimension names for counts matrix if no guide and replicate annotations provided
    if (is.null(dimnames(counts.matrix)) |
        is.null(guide.annotation) | is.null(sample.replicate.annotation))
      stop("No dimnames for counts.matrix - no annotations available.", "")
    
    # Require sample.column.name and gene.column.names specification
    if (is.null(gene.column.names) | is.null(sample.column.name)) {
      stop("Did you provide gene.column.names and/or sample.column.name?")
    }
    
    # transpose matrix
    if (!samplesAreColumns) {
      if (verbose)
        message("Transposing matrix...")
      # transpose and preserve dimnames
      dn <- dimnames(counts.matrix)
      counts.matrix <- t(counts.matrix)
      dimnames(counts.matrix) <- rev(dn)
    }
    
    # default guide annotations to rownames of counts matrix
    gannot <-
      data.frame(rowname = rownames(counts.matrix),
                 stringsAsFactors = FALSE)
    
    # Default sample annotations to column names of counts matrix, ordering by ETP -> LTP
    sannot <-
      data.frame(
        colname = colnames(counts.matrix)[c(ETP.column, LTP.column)],
        stringsAsFactors = FALSE,
        row.names = seq(from = 1, to = length(c(
          ETP.column, LTP.column
        )))
      ) %>%
      dplyr::mutate(TP = c(rep("ETP", length(ETP.column)), rep("LTP", length(LTP.column))))
    
    # Merge existing sample annotations with colnames, ensuring formatting and matching names
    if (!is.null(sample.replicate.annotation) &
        !is.null(sample.column.name)) {
      colnames(sample.replicate.annotation)[colnames(sample.replicate.annotation) == sample.column.name] <-
        "samplename" # Set sample column name to "samplename"
      if (verbose)
        message("Merging sample annotations with colnames of counts.matrix...")
      i = which(apply(sample.replicate.annotation, 2, function(x)
        all(x %in% sannot[, 1])))
      if (!length(i) > 0) {
        if (verbose)
          message(
            "No columns found in sample.replicate.annotation which completely match colnames of counts.matrix..."
          )
      }
      sannot <-
        merge(
          sannot,
          sample.replicate.annotation,
          by.x = 1,
          by.y = i[1],
          no.dups = FALSE,
          all = FALSE,
          sort = FALSE,
          suffixes = c("", ".y")
        )
    } else{
      stop(
        "Could not determine samplename.  Please add sample/replicate annotation and specify and sample.column.name.  See ?gemini_create_input."
      )
    }
    
    # Merge guide annotations with existing rownames, ensuring formatting and matching names
    if (!is.null(guide.annotation)) {
      if (verbose)
        message("Merging guide annotations with rownames()...")
      i = which(apply(guide.annotation, 2, function(x)
        all(x %in% gannot[, 1])))
      if (!length(i) > 0) {
        if (verbose)
          message("No columns found in guide.annotation which completely match rownames()...")
      }
      gannot <-
        merge(
          gannot,
          guide.annotation,
          by.x = 1,
          by.y = i,
          no.dups = FALSE,
          all = FALSE,
          sort = FALSE,
          suffixes = c("", ".y")
        )
    } else{
      stop(
        "Could not determine gene/guide data.  Please add guide annotation and specify and gene.column.names. See ?gemini_create_input."
      )
    }
    
    # Create new Input object
    Output <- list(
      counts = data.matrix(counts.matrix[, c(ETP.column, LTP.column)]),
      replicate.map = as.data.frame(
        sannot,
        optional = TRUE,
        row.names = seq(from = 1, to = nrow(sannot))
      ),
      guide.pair.annot = as.data.frame(
        gannot,
        optional = TRUE,
        rownames = seq(from = 1, to = nrow(gannot))
      )
    )
    
    Output <-
      gemini_prepare_input(Output, gene.columns = gene.column.names)
    
    class(Output) <- union(class(Output), "gemini.input")
    if (verbose)
      message("Created gemini input object.")
    return(Output)
  }
sellerslab/gemini documentation built on Dec. 5, 2022, 8:56 a.m.