R/utils.R

Defines functions do_quickQC trim_variables trim_samples make_asurat_obj

Documented in do_quickQC make_asurat_obj trim_samples trim_variables

#' Make an ASURAT object.
#'
#' This function creates an ASURAT object.
#'
#' An ASURAT object consists of history, variable, sample, and data slots
#' at the beginning, and several others in the future.
#' Here, the parameters of ASURAT's functions that users run are recorded in
#' history, while information of genes, samples, and gene expression matrices
#' are written in variable, sample, and data, respectively.
#'
#' @param mat_expression A numeric matrix or data frame of raw read counts,
#'   where rows and columns stand for genes and samples, respectively.
#' @param gene_symbol A one-dimensional vector of gene symbols.
#' @param gene_entrez A one-dimensional vector of Entrez gene IDs.
#' @param sample_identity A one-dimensional vector of sample identities.
#'
#' @return An ASURAT object in the form of a list.
#' @export
#'
#' @examples
#' mat_expression <- matrix(1:12, nrow = 3, ncol = 4)
#' gene_symbol <- data.frame(c("gene_A", "gene_B", "gene_C"))
#' gene_entrez <- data.frame(c("1", "2", "3"))
#' sample_identity <- data.frame(c("cell_1", "cell_2", "cell_3", "cell_4"))
#' myobj <- asurat_make_obj(
#'   mat_expression = mat_expression,
#'   gene_symbol = gene_symbol,
#'   gene_entrez = gene_entrez,
#'   sample_identity = sample_identity
#' )
#'
make_asurat_obj <- function(
  mat_expression, gene_symbol, gene_entrez, sample_identity
){
  #-----------------------------------------------
  # Error check
  #-----------------------------------------------
  if(anyNA(gene_symbol)){
    stop("gene_symbol is not permitted to contain NAs.")
  }else if(anyNA(sample_identity)){
    stop("sample_identity is not permitted to contain NAs.")
  }else if(anyNA(mat_expression)){
    stop("mat_expression is not permitted to contain NAs.")
  }
  #-----------------------------------------------
  # Definition
  #-----------------------------------------------
  obj <- list(
    history = list(),
    variable = data.frame(
      symbol = gene_symbol,
      entrez = gene_entrez
    ),
    sample = data.frame(
      identity = sample_identity
    ),
    data = c()
  )
  obj[["data"]][["raw"]] <- as.data.frame(mat_expression)
  #-----------------------------------------------
  # Names
  #-----------------------------------------------
  names(obj[["variable"]]) <- c("symbol", "entrez")
  names(obj[["sample"]]) <- c("identity")
  rownames(obj[["data"]][["raw"]]) <- obj[["variable"]][["symbol"]]
  colnames(obj[["data"]][["raw"]]) <- obj[["sample"]][["identity"]]
  #-----------------------------------------------
  # Message
  #-----------------------------------------------

  return(obj)
}



#' Filter samples by previously calculated single-cell metrics
#'
#' @param obj ASURAT object
#' @param min_nReads Minimal number of total read counts per cell.
#' @param max_nReads Maximal number of total read counts per cell.
#' @param min_nGenes Minimal number of detected genes per cell.
#' @param max_nGenes Maximal number of detected genes per cell.
#' @param min_percent_MT Minimal percentage of mitochondrial gene expression per cell.
#' @param max_percent_MT Maximal percentage of mitochondrial gene expression per cell.
#'
#' @return ASURAT object.
#' @export
#'
#' @examples trim_samples(obj = pbmc_4000,
#' min_nReads = 2606, max_nReads = 30000,
#' min_nGenes = 993, max_nGenes = 1e+10,
#' min_percent_MT = 0, max_percent_MT = 14)
trim_samples <- function(obj,
                         min_nReads = 0, max_nReads = 1e+10,
                         min_nGenes = 0, max_nGenes = 1e+10,
                         min_percent_MT = 0, max_percent_MT = 100
){
  #--------------------------------------------------
  # History
  #--------------------------------------------------
  slot_name <- "trim_samples"
  obj[["history"]][[slot_name]][["min_nReads"]] <- min_nReads
  obj[["history"]][[slot_name]][["max_nReads"]] <- max_nReads
  obj[["history"]][[slot_name]][["min_nGenes"]] <- min_nGenes
  obj[["history"]][[slot_name]][["max_nGenes"]] <- max_nGenes
  obj[["history"]][[slot_name]][["min_percent_MT"]] <- min_percent_MT
  obj[["history"]][[slot_name]][["max_percent_MT"]] <- max_percent_MT
  #--------------------------------------------------
  # Reduce count matrices
  #--------------------------------------------------
  inds_1 <- which(
    (obj[["sample"]][["nReads"]] >= min_nReads) &
      (obj[["sample"]][["nReads"]] <= max_nReads)
  )
  inds_2 <- which(
    (obj[["sample"]][["nGenes"]] >= min_nGenes) &
      (obj[["sample"]][["nGenes"]] <= max_nGenes)
  )
  inds_3 <- which(
    (obj[["sample"]][["percent_MT"]] >= min_percent_MT) &
      (obj[["sample"]][["percent_MT"]] <= max_percent_MT)
  )
  inds <- intersect(intersect(inds_1, inds_2), inds_3)
  obj[["data"]][["raw"]] <- as.data.frame(obj[["data"]][["raw"]][,inds])
  #--------------------------------------------------
  # The other information
  #--------------------------------------------------
  obj[["sample"]] <- data.frame(
    barcode = obj[["sample"]][["barcode"]][inds],
    nReads = as.integer(obj[["sample"]][["nReads"]][inds]),
    nGenes = as.integer(obj[["sample"]][["nGenes"]][inds]),
    percent_MT = obj[["sample"]][["percent_MT"]][inds]
  )

  return(obj)
}


#' Remove low quality genes
#'
#' Remove low quality genes based on the number of mean read counts across all cells
#'
#' @param obj ASURAT object.
#' @param min_meanReads Minimal mean read counts per gene across all cells.
#'
#' @return ASURAT object.
#' @export
#'
#' @examples trim_variables(obj = pbmc_4000, min_meanReads = 0.05)
trim_variables <- function(obj, min_meanReads){
  #--------------------------------------------------
  # History
  #--------------------------------------------------
  slot_name <- "trim_variables"
  obj[["history"]][[slot_name]][["min_meanReads"]] <- min_meanReads
  #--------------------------------------------------
  # Reduce count matrix
  #--------------------------------------------------
  tmp <- as.matrix(obj[["data"]][["raw"]])
  inds <- which(apply(tmp, 1, mean) >= min_meanReads)
  mat <- tmp[inds, ]
  obj[["data"]][["raw"]] <- as.data.frame(mat)
  #--------------------------------------------------
  # The others
  #--------------------------------------------------
  genes <- data.frame(
    symbol  = obj[["variable"]][["symbol"]][inds],
    entrez  = obj[["variable"]][["entrez"]][inds]
  )
  obj[["variable"]] <- genes

  return(obj)
}


#' Perform quick QC
#'
#' Perform quick quality control by removing variables for which the number of
#' non-zero expressing samples is less than min_nsamples. ***Explanation unclear***
#'
#' @param obj ASURAT object.
#' @param min_nsamples ***Need clarification***
#' @param mitochondria_symbol Regex for mitochondrial genes (e.g. "^MT-").
#'
#' @return ASURAT object.
#' @export
#'
#' @examples do_quickQC(obj = pbmc_4000, min_nsamples = 10,
#' mitochondria_symbol = "^MT-")
do_quickQC <- function(obj, min_nsamples, mitochondria_symbol){
  #--------------------------------------------------
  # History
  #--------------------------------------------------
  slot_name <- "do_quickQC"
  obj[["history"]][[slot_name]][["min_nsamples"]] <- min_nsamples
  #--------------------------------------------------
  # Reduce variables
  #--------------------------------------------------
  tmp <- as.matrix(obj[["data"]][["raw"]])
  inds <- which(apply(tmp, 1, function(x) sum(x>0)) >= min_nsamples)
  mat <- tmp[inds,]
  obj[["data"]][["raw"]] <- as.data.frame(mat)
  #--------------------------------------------------
  # Other information
  #--------------------------------------------------
  obj[["variable"]] <- data.frame(
    symbol = obj[["variable"]][["symbol"]][inds],
    entrez = obj[["variable"]][["entrez"]][inds]
  )
  obj[["sample"]] <- data.frame(
    barcode = colnames(mat),
    nReads = as.integer(apply(mat, 2, function(x) sum(x))),
    nGenes = as.integer(apply(mat, 2, function(x) sum(x > 0))),
    percent_MT = apply(
      mat[grepl(mitochondria_symbol, rownames(mat)),], 2, function(x) 100 * sum(x)
    ) / apply(mat, 2, function(x) sum(x))
  )

  return(obj)
}
johannesnicolaus/ASURAT_source documentation built on Dec. 21, 2021, 2:11 a.m.