R/preprocessCSV.R

Defines functions runLTMG preprocessCSV

Documented in preprocessCSV runLTMG

#' Pre-process scRNA-seq Count Matrix in CSV File
#'
#' A function that reads a CSV file containing scRNA-seq counts
#' and pre-process the scRNA-seq matrix with the given criteria.
#' before feeding it to the scGNN framework for analysis.
#'
#' @param path A string the represents the path to the input file, where
#'     directories in the path are separated by "/", and the input data file
#'     should have extension .csv or .gz
#' @param log_transform A Boolean value indicating whether to perform natural
#'     logarithmic transformation for every value of the data at the end of
#'     pre-processing or not. If TRUE, the log(x+1) of the original expression
#'     level will be calculated.
#' @param cell_zero_ratio A double-precision value strictly greater than 0 and
#'     less than 1. It defines the maximum proportion of genes with zeros in cells.
#'     Default is 0.99, which means cells with more than 99% genes that are
#'     zeros will be filtered out.
#' @param gene_zero_ratio A double-precision value strictly greater than 0 and
#'     less than 1. It defines the maximum proportion of zeros in genes.
#'     Default is 0.99, which means genes with more than 99% zero values
#'     will be filtered out.
#' @param geneSelectNum A positive integer that defines how many most variant
#'     genes to keep. The default is 2000, which 2000 genes, ranked from
#'     most variant to least by their variances of expression values in each
#'     sample, will be retained in the final pre-processed data and be used
#'     for later analysis.
#' @param transpose A Boolean value telling the function whether the input
#'     scRNA-seq matrix is given in a transposed manner, i.e.
#'     cells as rows and genes as columns.
#'     If TRUE, a transpose operation will be performed. Default is FALSE.
#' @param toCSV A Boolean value indicating whether to save the pre-processed
#'     expression matrix to a CSV file. If TRUE a CSV file with <savename> will
#'     be saved to /output directory under user's current working directory.
#' @param outdir_path A character vector that designates the path to the directory where
#'     the output CSV will be saved if toCSV is TRUE. This argument won't be
#'     used if toCSV is FALSE. There must be no path separator at the end.
#'     If the designated does not exist in the parent directory,
#'     it will create one automatically. Default is a directory named "output"
#'     under user's current R working directory.
#' @param savename A character vector representing the name of the csv file that saves
#'     pre-processed scRNA-seq data. There is no strict rule for this. However,
#'     it is highly recommended to give it a meaningful name.
#'
#' @return Returns a scDataset with the pre-processed scRNA-seq expression values,
#'     where row names are cells and column names are genes.
#'
#' @examples
#' # Example 2:
#' \dontrun{
#' # Accessing the demo gene_counts_small dataset available with the package
#' inputCountsPath <- system.file("extdata", "GSE138852_small.csv", package = "scRGNet")
#' # Preprocess the raw counts
#' counts <- preprocessCSV(path = inputCountsPath)
#'}
#' # Example 1:
#' \dontrun{
#' # Preprocess the whole scRNA-seq dataset
#' inputCountsPath <- system.file("extdata", "GSE138852_counts.csv.gz", package = "scRGNet")
#' counts <- preprocessCSV(path = inputCountsPath)
#' # Take ~20 seconds to process.
#' }
#' @references
#' \insertRef{scGNN}{scRGNet}
#'
#' @export
#' @import R.utils Matrix
#' @importClassesFrom data.table data.table
#' @importFrom Rdpack reprompt
#' @importFrom data.table fread transpose
#' @importFrom utils write.csv
preprocessCSV <- function(path,
                          log_transform   = TRUE,
                          cell_zero_ratio = 0.99,
                          gene_zero_ratio = 0.99,
                          geneSelectNum   = 2000L,
                          transpose       = FALSE,
                          toCSV           = FALSE,
                          outdir_path     = file.path(getwd(), "output"),
                          savename        = "preprocessedCSV") {

    if (!(is.character(path) & (nchar(path) > 0))) {
        stop("Invalid file path. Must be a string of non-zero length.")
    } else {
        if (file.exists(path)) {
            ## check whether given data file is in csv format
            if (!grepl(pattern = "\\.(csv|gz)$", path))
                stop("Input file is not a valid csv file.")
        } else {
            stop(paste("File not found:", path, "does not exist."))
        }
    }

    if (is.numeric(gene_zero_ratio)) {
        if (gene_zero_ratio <= 0 || gene_zero_ratio >= 1)
            stop("gene_zero_ratio out of range: Must be within 0 to 1")
    } else {
        stop("Invalid input for gene_zero_ratio: Must be numeric")
    }
    if (is.numeric(cell_zero_ratio)) {
        if (cell_zero_ratio <= 0 || cell_zero_ratio >= 1)
            stop("cell_zero_ratio out of range: Must be within 0 to 1")
    } else {
        stop("Invalid input for cell_zero_ratio: Must be numeric")
    }

    if (is.numeric(geneSelectNum)) {
        if (geneSelectNum < 128) {
            stop("Too few genes selected. Must be at least 128 genes to perform analysis.")
        } else {
            geneSelectNum <- as.integer(geneSelectNum)
        }
    } else {
        stop("Invalid input for geneSelectNum: Must be numeric")
    }

    if (!is.logical(log_transform))
        stop("Invalid input for log_transform: Must be logical")
    if (!is.logical(transpose))
        stop("Invalid input for transpose: Must be logical")

    if (is.logical(toCSV)) {
        if (toCSV) {
            if (!is.character(savename))
                stop("Invalid input for savename: Must be a string")
            if (!is.character(outdir_path))
                stop("Invalid input for output directory path: Must be a string")
        }
    } else {
        stop("Invalid input for toCSV: Must be logical")
    }


    message("CSV file found. Start pre-processing data...")
    start_time <- Sys.time() ## measure CSV processing time

    ## Load scRNA-seq matrix using data.table for faster loading time
    counts_raw <- data.table::fread(path)
    ## transpose data table if raw matrix given in transposed manner
    if (transpose)
        counts_raw <- data.table::transpose(counts_raw)

    ## using matrix for consequential processing
    ## since data.table doesn't support row names indexing
    counts_raw <- as.matrix(counts_raw, rownames = 1)

    ## Find genes with zero ratio over gene_ratio
    valid_gene <- apply(
        counts_raw,
        MARGIN = 1,
        function(x){
            ## Proportion of non-zero counts for a gene
            nonzero_ratio <- mean(as.logical(x))
            if (nonzero_ratio >= (1 - gene_zero_ratio)) {
                return(TRUE)
            } else {
                return(FALSE)
            }
        }
    )

    ## filters out genes with more than (gene_ratio x 100%) genes that are zeros
    counts_filtered_gene <- counts_raw[which(valid_gene), ]

    n_gene <- dim(counts_filtered_gene)[1] ## number of genes after filtering
    if (n_gene == 0) {
        stop("No genes left after filtering. Consider using a higher gene_zero_ratio.")
    } else {
        message(
        sprintf(
            "After preprocessing, %i genes with at most %f zero count remaining",
            n_gene,
            gene_zero_ratio
            )
        )
    }

    ## free up memory
    rm(list = c("valid_gene", "counts_raw"))
    invisible(gc())

    valid_cell <- apply(
        counts_filtered_gene,
        MARGIN = 2,
        function(x) {
            ## Proportion of non-zero genes in a cell
            cell_non_zero_gene_ratio <- mean(as.logical(x))
            if (cell_non_zero_gene_ratio >= (1 - cell_zero_ratio)) {
                return(TRUE)
            } else {
                return(FALSE)
            }
        }
    )

    ## filters out cells with more than (cell_ratio x 100%) genes that are zeros
    counts_filtered_cell <- counts_filtered_gene[, which(valid_cell)]

    ## number of cell samples remaining after filtering
    n_cell <- dim(counts_filtered_cell)[2]
    if (n_cell == 0) {
        stop("No cell left after filtering. Consider using a higher cell_zero_ratio.")
    } else {
        message(
        sprintf(
            "After preprocessing, %i cells have at most %f zero gene counts",
            n_cell, cell_zero_ratio
            )
        )
    }

    ## free up memory
    rm(list = c("counts_filtered_gene", "valid_cell"))
    invisible(gc())

    ## Manually compute gene-wise variance rather than using apply + stats::var
    ## 200x faster than apply(counts_filtered_cell, 1, stats::var)
    ## Improved computation run-time
    ## Source:
    ## https://stackoverflow.com/questions/25099825/row-wise-variance-of-a-matrix-in-r
    gene_means <- rowMeans(counts_filtered_cell)
    n_cell     <- dim(counts_filtered_cell)[2]
    varGene    <- rowSums((counts_filtered_cell - gene_means)^2)/(n_cell - 1)

    ## Check how many genes available to select
    n_gene <- dim(counts_filtered_cell)[1]
    if (n_gene < 128) {
        stop("Not enough genes remaining to perform analysis. Consider raising gene_zero_ratio or cell_zero_ratio")
    }
    if (geneSelectNum > n_gene) {
        message(
            sprintf("Remaining genes are fewer than %i. All remaining %i genes will be selected.",
                    geneSelectNum, n_gene)
            )
        geneSelectNum <- n_gene
    }

    ## select <geneSelectNum> of most variant genes
    varGene <- names(sort(varGene, decreasing = TRUE)[1:geneSelectNum])
    counts_processed <- counts_filtered_cell[varGene, ]

    ## free up memory
    rm(list = c("gene_means", "varGene", "counts_filtered_cell", "n_cell", "n_gene"))

    if (log_transform)
        counts_processed <- log1p(counts_processed)

    if (toCSV) {
        if (!dir.exists(outdir_path))
            dir.create(outdir_path)

        utils::write.csv(counts_processed,
                         file = file.path(outdir_path,
                                          paste(savename, "csv", sep = ".")
                                          )
                         )
    }

    counts_processed <- Matrix::Matrix(counts_processed, sparse = TRUE)
    scData           <- scDataset(data = counts_processed)

    rm(counts_processed)
    invisible(gc())

    finish_time  <- Sys.time()
    process_time <- finish_time - start_time
    message(sprintf("Time taken for pre-processing data: %f", process_time))

    return(scData)
}

#' Get discretised regulatory signals from LTMG model
#'
#' A wrapper function for running scGNNLTMG that uses
#' the Left-Trunctruncated-Mixed-Gaussian(LTMG) model to
#' translate the input gene expression data pre-processed by preprocessCSV into
#' a discretised regulatory signal as the regularizer for
#' the feature autoencoder of scGNN.
#' It will take around 10-15 minutes for inferring LTMG tags, depending on
#' the dataset.
#' This step is optional but highly recommended prior to scGNN analysis.
#'
#' @param scDataset A scDataset object containing a matrix with row names as cell samples
#'     and column names as genes containing scRNA-seq expression values
#'     pre-processed by preprocessCSV. If reading from a expression file,
#'     this argument needs not be provided.
#' @param fromFile A Boolean value indicating whether to read RNA-seq matrix
#'     from a expression file. If TRUE, expr_mat needs not be provided, and
#'     it will check whether the expression file exists in readPath.
#'     If FALSE, expr_mat must be provided.
#' @param readPath A character vector representing path to the expression file.
#'     If fromFile is TRUE, it must be provided. Otherwise optional.
#' @param toFile   A Boolean value indicating whether to write the returned
#'     LTMG object to file. If TRUE, outdir_path and savename must be provided.
#' @param outdir_path A character vector representing the path to the parent
#'     directory where the object will be saved as a file. If toFile is TRUE,
#'     it must be provided. Otherwise optional.
#' @param savename A character vector representing the name of the saved
#'     LTMG object file. If toFile is TRUE, it must be provided.
#'     Otherwise optional.
#'
#' @return Returns an LTMG matrix containing the inferred LTMG tags of expr_mat.
#'
#' @examples
#' # Example 1:
#' # Tested examples. Not run for fast package compiling
#' \dontrun{
#' # Accessing the demo gene_counts_small dataset available with the package
#' inputCountsPath <- system.file("extdata", "GSE138852_small.csv", package = "scRGNet")
#' # Preprocess the raw counts
#' counts <- preprocessCSV(path = inputCountsPath)
#' LTMG <- runLTMG(counts)
#' # ~30 seconds
#'}
#' # Example 2:
#' # Preprocess the whole scRNA-seq dataset
#' \dontrun{
#' inputCountsPath <- system.file("extdata", "GSE138852_counts.csv.gz", package = "scRGNet")
#' counts <- preprocessCSV(path = inputCountsPath)
#' LTMG <- runLTMG(counts)
#' # Take 10-14 minutes to finish running
#' }
#'
#' @references
#' \insertRef{LTMG}{scRGNet}
#' \insertRef{scGNN}{scRGNet}
#'
#' @export
#' @importFrom Rdpack reprompt
#' @import scGNNLTMG
#' @importClassesFrom data.table data.table
#' @importFrom data.table fread
runLTMG <- function(scDataset   = NULL,
                    fromFile    = FALSE,
                    readPath    = NULL,
                    toFile      = FALSE,
                    outdir_path = file.path(getwd(), "output"),
                    savename    = "ltmg") {
    ## checking validity for inputs
    if (is.logical(fromFile) && !is.na(fromFile)) {
        if (fromFile) {
            if (is.null(readPath)) {
                stop("No path to data file provided for argument readPath.")
            } else if (is.character(readPath)) {
                stop("Invalid argument for readPath. Must be a character vector.")
            } else {
                if (file.exists(readPath)) {
                    expr_mat = as.matrix(data.table::fread(readPath),
                                         rownames = 1)
                } else {
                    stop(paste("File not found:", readPath, "does not exist."))
                }
            }
        } else {
            if (is.null(scDataset)) {
                stop("No scDataset object provided for argument scDataset.")
            } else {
                expr_mat <- Matrix::t(scDataset$features)
            }
        }
    } else {
        stop("Invalid argument fromFile. Must be a boolean value.")
    }

    if (is.logical(toFile) & !is.na(toFile)) {
        if (toFile) {
            ## validate save path and save name only if user choose to save file.
            if (!(is.character(outdir_path) & (nchar(outdir_path) > 0)))
                stop("Invalid argument outdir_path. Must be a string of non-zero length.")
            if (!(is.character(savename) & (nchar(savename) > 0)))
                stop("Invalid argument savename. Must be a string of non-zero length.")
        }
    }

    message("Starting inferring LTMG from expression matrix...")
    start_time <- Sys.time()
    LTMGObject <- scGNNLTMG::CreateLTMGObject(expr_mat)
    LTMGObject <- scGNNLTMG::RunLTMG(LTMGObject, Gene_use = 'all')
    if (toFile) {
        message("Writing LTMG object to file...")
        scGNNLTMG::WriteSparse(LTMGObject, gene.name=FALSE, cell.name=FALSE)
    }

    finish_time  <- Sys.time()
    process_time <- finish_time - start_time
    message(sprintf("Time taken for inferring LTMG: %f", process_time))

    return(LTMGObject@OrdinalMatrix)
}

# [END]
ff98li/scRGNet documentation built on Jan. 14, 2022, 4:58 a.m.