# R/io.R

#' @include indRop.R
NULL

#' Import indRop processed data
#'
#' \code{readIndrop} takes a (vector of) file-path(s) that point to .tsv files
#' that were the output of indrops pre-processing pipeline (python package). 
#'
#' @param paths File paths (one or more) to .tsv files generated by indrops analyses
#' @param whichFile Vector that corresponds to which files each cell came from
#' (default \code{FALSE}; currently not used by the character method)
#'
#' @return Minimally, a sparse matrix that can be used for Seurat. May be contained
#' inside an S3 object if other parameters are set to TRUE 
#' @import Matrix
#' @import data.table
#' @examples
#' file1 <- paste(system.file("extdata", package = "indRop"),
#' "small1.counts.tsv", sep = "/")
#' file2 <- paste(system.file("extdata", package = "indRop"),
#' "small2.counts.tsv", sep = "/")
#' dat <- readIndrop(c(file1,file2))
#' # now make a Seurat object
#' # library(Seurat)
#' # seuratObject <- new("seurat", raw.data = dat) 
#' 
#' @export
# S4 generic for readIndrop; the body is kept as a bare standardGeneric() call
# so that the object remains a standard generic function.
setGeneric(
  "readIndrop",
  function(paths, whichFile = FALSE) standardGeneric("readIndrop")
)

#' @rdname readIndrop
setMethod("readIndrop", signature("character", "ANY"),
          definition = function(paths, whichFile = FALSE) {

            # Reads one or more indrops count tables (.tsv; first column holds
            # the cell barcodes, remaining columns hold per-gene counts) and
            # returns a single sparse genes x cells matrix.
            # `whichFile` is part of the generic's signature but is not used
            # by this method.

            # Fail early and name the offending inputs
            if (length(paths) == 0L) {
              stop("No file paths were supplied to readIndrop", call. = FALSE)
            }
            missingFiles <- paths[!file.exists(paths)]
            if (length(missingFiles) > 0L) {
              stop("Cannot find file(s): ",
                   paste(missingFiles, collapse = ", "), call. = FALSE)
            }

            # Read in files; each element holds barcodes, genes and the
            # transposed (genes x cells) count matrix for one file
            allData <- lapply(paths, function(file) {
              # NOTE: data.frame() uses its default check.names = TRUE, so
              # column (gene) names that are not syntactic R names are altered
              d <- data.frame(data.table::fread(file))
              # sparse = TRUE: always produce a sparse matrix, even when fewer
              # than half of the entries are zero (Matrix() would otherwise
              # return a dense object, which Matrix::writeMM cannot export)
              counts <- Matrix::Matrix(data.matrix(d[, -1, drop = FALSE]),
                                       sparse = TRUE)
              list(barcodes = d[, 1], genes = colnames(d)[-1], dat = t(counts))
            })

            # Process the gene lists; every file must share the first file's
            # genes (same names, same order); exit out if not
            geneVector <- allData[[1]][["genes"]]
            sameGenes <- vapply(allData, function(run) {
              identical(run[["genes"]], geneVector)
            }, logical(1))
            if (!all(sameGenes)) {
              stop("Cannot import samples as they have different gene lists (columns)")
            }

            # Process the sample names
            allBarcodes <- lapply(allData, function(run) run[["barcodes"]])
            barcodeVector <- unlist(allBarcodes, recursive = TRUE, use.names = TRUE)

            # When several files are combined and barcodes collide, prefix
            # every barcode with the index of the file it came from
            if (length(allData) > 1 && anyDuplicated(barcodeVector) > 0) {
              message("Found matching barcode IDs; changing the name to verify uniqueness")
              perFile <- lengths(allBarcodes)
              fileIndex <- rep(seq_along(perFile), times = perFile)
              barcodeVector <- paste0("file", fileIndex, "_", barcodeVector)
            }

            # Process the data matrices; update names; return
            full_data <- do.call(cbind, lapply(allData, function(d) d[["dat"]]))
            colnames(full_data) <- barcodeVector
            rownames(full_data) <- geneVector
            return(full_data)
})


#' Export data into three files similar to that of 10X pre-processing
#'
#' \code{export10X} takes a sparse matrix and then
#' exports the object into the files that 10X data typically comes in. 
#'
#' @param obj A sparse matrix object to be exported
#' @param folder Which folder should the data be exported to
#'
#' @return Nothing; just text saying that it was successful or not. 
#' This function will write files to the disk though.
#' 
#' @import Matrix
#' @import methods
#' @importFrom utils write.table
#' @examples
#' file1 <- paste(system.file("extdata", package = "indRop"),
#' "small1.counts.tsv", sep = "/")
#' file2 <- paste(system.file("extdata", package = "indRop"),
#' "small2.counts.tsv", sep = "/")
#' dat <- readIndrop(c(file1,file2))
#' export10X(dat, "dataFolder")
#' 
#' @export
# S4 generic for export10X; the body is kept as a bare standardGeneric() call
# so that the object remains a standard generic function.
setGeneric(
  "export10X",
  function(obj, folder) standardGeneric("export10X")
)

#' @rdname export10X
setMethod("export10X", signature("ANY", "character"),
          definition = function(obj, folder) {

            # Writes `obj` into `folder` as three files: matrix.mtx (Matrix
            # Market format), genes.tsv (row names, written twice as two
            # columns) and barcodes.tsv (column names). Returns NULL invisibly.

            # Create the output folder, including missing parent directories.
            # Warnings are suppressed because an existing folder is fine; a
            # genuine failure is caught by the explicit check below.
            dir.create(folder, showWarnings = FALSE, recursive = TRUE)
            if (!dir.exists(folder)) {
              stop("Could not create output folder: ", folder, call. = FALSE)
            }

            # make data.frames
            geneDF <- data.frame(genes1 = rownames(obj), genes2 = rownames(obj))
            sampleDF <- data.frame(samples = colnames(obj))

            # write files
            Matrix::writeMM(obj, file = file.path(folder, "matrix.mtx"))
            write.table(geneDF, file = file.path(folder, "genes.tsv"),
                        quote = FALSE, sep = "\t", col.names = FALSE, row.names = FALSE)
            write.table(sampleDF, file = file.path(folder, "barcodes.tsv"),
                        quote = FALSE, sep = "\t", col.names = FALSE, row.names = FALSE)

            invisible(NULL)
          })
# caleblareau/indRop documentation built on May 29, 2019, 1:17 p.m.