R/ascend_cellranger.R

Defines functions loadCellRanger

Documented in loadCellRanger

################################################################################
#
# ascend_cellranger.R
# description: Functions related to the loading and preparation of Chromium
# data generated by the Cell Ranger pipeline
#
################################################################################

#' loadCellRanger
#' 
#' Automatically loads and prepares data from a folder containing Cell Ranger
#' output. Filtered data should be used as raw data is too large for most 
#' desktop systems. 
#' 
#' The folder is expected to contain a sparse count matrix (\code{matrix.mtx}),
#' a barcode list (\code{barcodes.tsv}) and a gene annotation table
#' (\code{genes.tsv}, or \code{features.tsv} as written by Cell Ranger 3.0.0).
#' If an uncompressed file is not found, its gzipped counterpart (same name
#' with a \code{.gz} suffix) is used instead.
#' 
#' The batch of each cell is taken from the numeric suffix that follows the
#' \code{-} in its barcode (eg. \code{AAACCTGAGAAACCAT-1} belongs to batch 1).
#' Gene names are made unique before being used as row names. Genes whose names
#' start with \code{Mt-} are registered as mitochondrial controls, and genes 
#' whose names start with \code{Rps} or \code{Rpl} as ribosomal controls 
#' (matching is case-insensitive).
#' 
#' @param x Path to Cell Ranger filtered output 
#' (eg. outs/filtered_gene_bc_matrices_mex/GRCh38p7)
#' @return An \linkS4class{EMSet} with Mt and Rb genes set as controls
#' 
#' @examples
#' \dontrun{
#' # Output folder generated by Cell Ranger
#' cellranger_dir <- "CellRangerOutput/outs/filtered_gene_bc_matrices_mex/GRCh38p7"
#' em_set <- loadCellRanger(cellranger_dir)
#' }
#' @include ascend_objects.R
#' @include ascend_methods.R
#' @importFrom S4Vectors DataFrame
#' @export
loadCellRanger <- function(x){
  if (!is.character(x) || length(x) != 1L || is.na(x)) {
    stop("x must be a single path to a folder of Cell Ranger output.",
         call. = FALSE)
  }

  # Returns the first existing file among the supplied names, trying the
  # plain names first and their gzipped (.gz) counterparts afterwards.
  # Stops with an informative message if none of them is present.
  locateFile <- function(file_names) {
    candidates <- c(file_names, paste0(file_names, ".gz"))
    for (candidate in candidates) {
      candidate_path <- joinPaths(c(x, candidate))
      if (file.exists(candidate_path)) {
        return(candidate_path)
      }
    }
    stop("Could not find ", paste(candidates, collapse = " or "),
         " in ", x, call. = FALSE)
  }

  matrix_file <- locateFile("matrix.mtx")
  barcodes_file <- locateFile("barcodes.tsv")

  # Gene annotations are called genes.tsv up to Cell Ranger 2 and features.tsv
  # from Cell Ranger 3.0.0 onwards
  genes_file <- locateFile(c("genes.tsv", "features.tsv"))

  # Create things from scratch to ensure nothing is missed
  barcodes <- utils::read.csv(barcodes_file, header = FALSE, sep = "\t", 
                              stringsAsFactors = FALSE)
  colnames(barcodes) <- c("cell_barcode")

  # The batch identifier is the second "-"-separated field of the barcode;
  # barcodes without such a suffix get NA
  barcode_fields <- strsplit(as.character(barcodes$cell_barcode), "-", 
                             fixed = TRUE)
  barcodes$batch <- as.numeric(vapply(barcode_fields, `[`, character(1), 2))

  genes <- utils::read.csv(genes_file, 
                           header = FALSE, 
                           sep = "\t", 
                           stringsAsFactors = FALSE)

  if (ncol(genes) < 2L) {
    stop("Expected at least two tab-separated columns (gene identifier and ",
         "gene name) in ", genes_file, call. = FALSE)
  }

  # features.tsv carries extra columns (eg. feature type); only the identifier
  # and the name are used
  genes <- genes[, seq_len(2), drop = FALSE]
  colnames(genes) <- c("ensembl_gene_id", "gene_id")

  # Gene names are used as row names and therefore have to be unique
  genes$gene_id <- make.unique(genes$gene_id)
  genes <- genes[ , c("gene_id", "ensembl_gene_id")]

  # Read in sparseMatrix
  expression_matrix <- Matrix::readMM(matrix_file)

  if (nrow(expression_matrix) != nrow(genes) || 
      ncol(expression_matrix) != nrow(barcodes)) {
    stop("Dimensions of the matrix (", nrow(expression_matrix), " x ", 
         ncol(expression_matrix), ") do not match the number of genes (", 
         nrow(genes), ") and barcodes (", nrow(barcodes), ").", call. = FALSE)
  }

  # Coerce into dgCMatrix. Direct coercion to "dgCMatrix" is deprecated in
  # recent versions of Matrix, so go through the virtual classes instead.
  expression_matrix <- methods::as(
    methods::as(methods::as(expression_matrix, "dMatrix"), "generalMatrix"), 
    "CsparseMatrix")
  colnames(expression_matrix) <- barcodes[, "cell_barcode"]
  rownames(expression_matrix) <- genes[, "gene_id"]

  # Create EMSet
  controls <- list(
    Mt = grep("^Mt-", genes[, "gene_id"], ignore.case = TRUE, value = TRUE), 
    Rb = grep("^Rps|^Rpl", genes[, "gene_id"], ignore.case = TRUE, 
              value = TRUE))

  barcodes <- S4Vectors::DataFrame(barcodes, row.names = barcodes[, 1])
  genes <- S4Vectors::DataFrame(genes, row.names = genes[, 1])

  object <- EMSet(list(counts = expression_matrix), 
                  colInfo = barcodes,
                  rowInfo = genes,
                  controls = controls)

  return(object)
}
IMB-Computational-Genomics-Lab/ascend documentation built on Aug. 29, 2019, 4:10 a.m.