# R/scLoad.R
#
# Defines functions: readMultipleInDropMatrices, readInDropMatrix, readMultipleKleinMatrices, readKleinMatrix, readMultiple10XmatricesAsList, read10Xmatrices, read10xMatrix
#
# Documented in: read10Xmatrices, read10xMatrix, readInDropMatrix, readKleinMatrix, readMultiple10XmatricesAsList, readMultipleInDropMatrices, readMultipleKleinMatrices

#' @importFrom utils read.table
#' @importFrom Matrix readMM
NULL

#' @title Read 10x matrix
#' @description Reads a matrix generated by the 10x processing pipeline
#' from the specified directory and returns it. It aborts if any of the
#' required files in the specified directory does not exist.
#' @details Version 'V2' expects the files \code{matrix.mtx},
#' \code{genes.tsv} and \code{barcodes.tsv}; version 'V3' expects the
#' gzip-compressed files \code{matrix.mtx.gz}, \code{features.tsv.gz} and
#' \code{barcodes.tsv.gz}. The gene and barcode files are parsed strictly as
#' tab-separated text, without quote or comment processing, so identifiers
#' containing characters such as \code{#} or \code{'} are kept intact.
#' @param path location of 10x output (directory containing the files)
#' @param version version of 10x output to read, must be one of 'V2' or 'V3'
#' @param transcript.id transcript identifier to use for the row names, can be
#' 'SYMBOL' (second column of the genes file) or 'ENSEMBL' (first column)
#' @return the matrix as a \code{dgCMatrix}, with row names taken from the
#' genes file and column names from the barcodes file (returned invisibly)
#' @import Matrix
#' @import methods
#' @export read10xMatrix
read10xMatrix <- function(path, version='V2', transcript.id = 'SYMBOL') {
    # Per-version file layout: V3 files are gzip-compressed and the gene
    # annotation file is called 'features.tsv' instead of 'genes.tsv'.
    if (version == 'V2') {
        openInput <- identity
        suffix <- ''
        genesName <- 'genes.tsv'
    } else if (version == 'V3') {
        openInput <- gzfile
        suffix <- '.gz'
        genesName <- 'features.tsv'
    } else {
        stop('Unknown file version!')
    }

    # Column of the genes file that supplies the row names
    if (transcript.id == 'SYMBOL') {
        transcript.id.col.idx <- 2
    } else if (transcript.id == 'ENSEMBL') {
        transcript.id.col.idx <- 1
    } else {
        stop('Unknown transcript identifier')
    }

    matrixFile <- file.path(path, paste0('matrix.mtx', suffix))
    genesFile <- file.path(path, paste0(genesName, suffix))
    barcodesFile <- file.path(path, paste0('barcodes.tsv', suffix))

    if (!file.exists(matrixFile)) { stop('Matrix file does not exist') }
    if (!file.exists(genesFile)) { stop('Genes file does not exist') }
    if (!file.exists(barcodesFile)) { stop('Barcodes file does not exist') }

    # Read a headerless TSV verbatim: the read.table() defaults would split on
    # any whitespace, treat ' and " as quotes and '#' as a comment start, and
    # (before R 4.0) turn identifiers into factors.
    readTsv <- function(file) {
        utils::read.table(openInput(file), header = FALSE, sep = '\t',
                          quote = '', comment.char = '',
                          stringsAsFactors = FALSE)
    }

    x <- as(Matrix::readMM(openInput(matrixFile)), 'dgCMatrix')

    genes <- readTsv(genesFile)
    if (ncol(genes) < transcript.id.col.idx) {
        stop('Genes file does not contain the requested transcript identifier column')
    }
    rownames(x) <- genes[, transcript.id.col.idx]

    barcodes <- readTsv(barcodesFile)
    colnames(x) <- barcodes[, 1]

    invisible(x)
}


#' @title read multiple 10x matrices into a single sparse array
#' @description given a named list of paths of 10X matrices, read each of them
#' and optionally prefix the cells with the corresponding sample name, subset
#' all matrices to their common genes and merge them into one large matrix
#' @param paths named vector of location of the data (readable by read10xMatrix())
#' @param min.common.genes minimum number of common genes to allow
#' @param common.genes logical, subset all matrices to common genes, required for merge
#' @param merge logical, merge all the matrices to one, requires common.genes and prefix.cells
#' @param prefix.cells prefix all cells with the name of the respective path in paths
#' @param prefix.sep separator for prefix of cells
#' @param ... further arguments passed on to read10xMatrix() (e.g. version,
#' transcript.id)
#' @return if merge is TRUE, a single sparse matrix of the Matrix package that
#' contains all the data, with cells prefixed by the corresponding sample
#' name; otherwise a list with one matrix per entry of paths
#' @export read10Xmatrices
read10Xmatrices <- function(paths, min.common.genes = 1000, common.genes = FALSE, merge = FALSE, prefix.cells = FALSE,
                            prefix.sep = '_', ...) {
  if (merge && !common.genes) stop("Can't merge matrices if common.genes is not set. Aborting.")
  if (merge && !prefix.cells) stop("Can't merge matrices if prefix.cells is not set. Aborting.")

  # Read the matrices one by one. simplify = FALSE guarantees a list while
  # keeping sapply()'s naming (names of paths, or the paths themselves).
  matrices <- sapply(paths, read10xMatrix, ..., simplify = FALSE)

  ## Prefix the cell names with the sample name
  if (prefix.cells) {
    if (is.null(names(matrices))) stop('paths must be named in order to prefix cells')
    matrices <- Map(
      function(m, name) {
        colnames(m) <- paste(name, colnames(m), sep = prefix.sep)
        m
      },
      matrices,
      names(matrices)
    )
  }

  ## Subset to the genes shared by all matrices
  if (common.genes) {
    commongenes <- Reduce(intersect, lapply(matrices, rownames))
    ## Stop if common genes too low
    if (length(commongenes) < min.common.genes) stop('The number of common genes is too low!')
    # drop = FALSE keeps a matrix even for a single gene or a single cell
    matrices <- lapply(matrices, function(m) m[commongenes, , drop = FALSE])
  }

  ## Merge the arrays column-wise (rows are aligned by the subsetting above)
  if (merge) {
    matrices <- Reduce(cbind, matrices)
  }

  ## Return
  matrices
}

#' @title read multiple 10x matrices and return as a list
#' @description given a named list of paths of 10X matrices return a list of matrices
#' @param pathList a named list of paths to the matrices (that can be read by read10xMatrix)
#' @return a list with one entry per element of pathList, holding the result
#' of read10xMatrix for that path (returned invisibly)
#' @export readMultiple10XmatricesAsList
readMultiple10XmatricesAsList <- function(pathList) {
  # Read the matrices one by one; simplify = FALSE guarantees that a list is
  # returned regardless of the shape of the individual results.
  matrices <- sapply(pathList, read10xMatrix, simplify = FALSE)

  invisible(matrices)
}

#' Reads in an expression matrix as formatted by the Klein lab
#' pipeline
#'
#' The file is read as a tab-separated table with a header line, using the
#' first column as row names. The row names are prefixed and the table is
#' transposed, so the columns of the file become the rows of the result.
#' Note that read.table() is called with its default check.names = TRUE, so
#' column names of the file are made syntactically valid.
#' @param path path of the file
#' @param prefix prefix to add to the cell names (joined with '_')
#' @return a sparse matrix
#' @import Matrix
#' @export readKleinMatrix
readKleinMatrix <- function(path, prefix) {
  counts <- utils::read.table(path, header = TRUE, row.names = 1, sep = '\t',
                              as.is = TRUE, stringsAsFactors = FALSE)
  counts <- data.matrix(counts)
  rownames(counts) <- paste0(prefix, '_', rownames(counts))

  # Namespace-qualified call: no need to attach Matrix to the search path
  Matrix::Matrix(t(counts), sparse = TRUE)
}

#' Reads in multiple klein matrices and returns them in a list
#' @param file.names named list of the files to load, the names will become prefixes
#' @return a list of sparse matrices, named like file.names
#' @export readMultipleKleinMatrices
readMultipleKleinMatrices <- function(file.names) {
  if (length(file.names) > 0 && is.null(names(file.names))) {
    stop('file.names must be named, the names are used as cell prefixes')
  }
  # Map() never simplifies, so the result is always a list
  Map(readKleinMatrix, file.names, names(file.names))
}

#' Reads in a data matrix from the in house indrop pipeline
#'
#' Loads the RDS file at path, takes its cm element and prefixes the column
#' names with name, separated by an underscore.
#' @param name prefix to give to cells
#' @param path the file path
#' @return a sparse expression matrix
#' @export readInDropMatrix
readInDropMatrix <- function(name, path) {
  counts <- readRDS(path)$cm
  colnames(counts) <- paste0(name, '_', colnames(counts))
  counts
}

#' Reads in multiple indrop matrices
#' @param file.names named vector of filenames, names will become prefixes
#' @return list of matrices, named like file.names
#' @export readMultipleInDropMatrices
readMultipleInDropMatrices <- function(file.names) {
  if (length(file.names) > 0 && is.null(names(file.names))) {
    stop('file.names must be named, the names are used as cell prefixes')
  }
  # Map() never simplifies; plain mapply() would collapse equally sized dense
  # matrices into a single array instead of returning a list.
  Map(readInDropMatrix, names(file.names), file.names)
}
# barkasn/nbHelpers documentation built on Oct. 10, 2020, 9:46 p.m.