R/preprocessing_filtering_reduction.R

Defines functions subsample_scExp num_cell_after_QC_filt_scExp num_cell_scExp sweep_sparse pca_irlba_for_sparseMatrix reduce_dim_batch_correction reduce_dims_scExp choose_perplexity feature_annotation_scExp normalize_scExp preprocess_feature_size_only preprocess_TFIDF preprocess_CPM preprocess_RPKM preprocess_TPM exclude_features_scExp get_genomic_coordinates has_genomic_coordinates remove_chr_M_fun remove_non_canonical_fun create_scExp combine_datamatrix check_correct_datamatrix read_count_mat_with_separated_chr_start_end separator_count_mat import_scExp generate_count_matrix generate_feature_names create_scDataset_raw rebin_matrix rebin_helper beds_to_matrix_indexes index_peaks_barcodes_to_matrix_indexes bams_to_matrix_indexes import_count_input_files wrapper_Signac_FeatureMatrix define_feature warning_raw_counts_to_sparse_matrix raw_counts_to_sparse_matrix read_sparse_matrix create_sample_name_mat detect_samples

Documented in bams_to_matrix_indexes beds_to_matrix_indexes check_correct_datamatrix choose_perplexity combine_datamatrix create_sample_name_mat create_scDataset_raw create_scExp define_feature detect_samples exclude_features_scExp feature_annotation_scExp generate_count_matrix generate_feature_names get_genomic_coordinates has_genomic_coordinates import_count_input_files import_scExp index_peaks_barcodes_to_matrix_indexes normalize_scExp num_cell_after_QC_filt_scExp num_cell_scExp pca_irlba_for_sparseMatrix preprocess_CPM preprocess_feature_size_only preprocess_RPKM preprocess_TFIDF preprocess_TPM raw_counts_to_sparse_matrix read_count_mat_with_separated_chr_start_end read_sparse_matrix rebin_helper rebin_matrix reduce_dim_batch_correction reduce_dims_scExp remove_chr_M_fun remove_non_canonical_fun separator_count_mat subsample_scExp warning_raw_counts_to_sparse_matrix wrapper_Signac_FeatureMatrix

## Authors : PacĂ´me Prompsy Title : Wrappers & functions to preprocess & reduce
## dimensionality of single-cell Matrix Description : Funtions to load, filter,
## normalize & reduce single-cell Epigenetic Matrices prior to analysis

#' Heuristic discovery of samples based on cell labels
#'
#' Identify a fixed number of common string (samples) in a set of varying
#' strings (cells). E.g. in the set
#' "Sample1_cell1","Sample1_cell2","Sample2_cell1","Sample2_cell2" and with
#' nb_samples=2, the function returns "Sample1","Sample1","Sample2","Sample2".
#'
#'
#' @param barcodes Vector of cell barcode names (e.g. Sample1_cell1,
#'   Sample1_cell2...)
#' @param nb_samples Number of samples to find
#'
#' @return character vector of sample names the same length as cell labels
#' @export
#'
#' @importFrom stringdist stringdistmatrix
#' @importFrom qualV LCS
#' @examples
#'
#' barcodes = c(paste0("HBCx22_BC_",seq_len(100)),
#' paste0("mouse_sample_XX",208:397))
#' samples = detect_samples(barcodes, nb_samples=2)
#' 
detect_samples <- function(barcodes, nb_samples = 1) {
    t = system.time({
        mat = stringdist::stringdistmatrix(
            barcodes, barcodes, useNames = "strings",
            method = "lv", weight = c(1, 1, 1))
        
        dist = stats::as.dist(1 - stats::cor(mat))
        hc = stats::hclust(dist, method = 'ward.D')
        hc$labels = rep("", ncol(mat))
    })
    message("ChromSCape::detect_samples - found samples in ", t[3], " secs.\n")
    sample_groups = stats::cutree(hc, k = nb_samples)
    samp_name = c()
    for (i in seq_len(nb_samples)) {
        x = barcodes[which(sample_groups == i)]
        samp = as.character(sample(x, min(length(x), 10), replace = TRUE))
        lcs = strsplit(samp[1], "")[[1]]
        for (j in 2:min(length(x), 10)) {
            lcs = unlist(qualV::LCS(lcs, strsplit(samp[j], "")[[1]])[4])
        }
        samp_name = c(samp_name, paste(lcs, collapse = ""))
    }
    samples_names = gsub("[[:punct:]]$", "", samp_name)
    samples_names = gsub("^[[:punct:]]", "", samples_names)
    if (nb_samples > 1) {
        mat <- create_sample_name_mat(nb_samples,samples_names)
        longest_common_between_samples = 
            mat[which.min(unlist(lapply(mat, nchar)))][1]
        if (length(longest_common_between_samples) > 2) {
            lc <- grep(pattern = longest_common_between_samples, samples_names)
            if (length(lc) == nb_samples)
                samples_names = gsub(
                    longest_common_between_samples, "", samples_names)
        }
    }
    samples_names = samples_names[sample_groups]
    return(samples_names)
}

#' Create a sample name matrix 
#'
#' @param nb_samples Number of samples
#' @param samples_names Character vector of sample names
#'
#' @return A matrix
#'
create_sample_name_mat <- function(nb_samples, samples_names){
    mat = matrix("", nrow = nb_samples, ncol = nb_samples)
    for (i in seq_len(nb_samples)) {
        for (j in seq_len(nb_samples)) {
            x = as.character(samples_names[i])
            y = as.character(samples_names[j])
            x = strsplit(x, "")[[1]]
            y = strsplit(y, "")[[1]]
            lcs = qualV::LCS(x, y)
            ls = split(lcs[6]$va, cumsum(c(TRUE, diff(lcs[6]$va) != 1)))
            ls = ls[[which.max(lengths(ls))]]
            x = as.character(samples_names[i])
            x = strsplit(x, "")[[1]]
            lcs = paste0(x[ls], collapse = "")
            if (length(lcs) == 0) lcs = ""
            mat[i, j] = lcs
        }
    }
    
    mat = as.character(mat)
    return(mat)
}


#' Read in one or multiple sparse matrices (10X format)
#' 
#' @description 
#' Given one or multiple directories, look in each directory for a combination 
#' of the following files :
#' - A 'features' file containing unique feature genomic locations 
#' -in tab separated format ( *_features.bed / .txt / .tsv / .gz), 
#' e.g. chr, start and end
#' - A 'barcodes' file containing unique barcode names 
#' ( _barcode.txt / .tsv / .gz)
#' - A 'matrix' A file containing indexes of non zero entries 
#' (_matrix.mtx / .gz)
#'
#' @param files_dir_list A named character vector containing the full path 
#' towards folders. Each folder should contain only the Feature file, the 
#' Barcode file and the Matrix file (see description). 
#' @param verbose Print ?
#' @param ref Reference genome (used to filter non-canonical chromosomes).
#'
#' @return Returns a list containing a datamatrix and cell annotation
#' @export
#'
#' @importFrom Matrix readMM
#' @importFrom GenomicRanges seqnames start end
#' @importFrom rtracklayer import.bed
#' @examples
#' \dontrun{
#' sample_dirs = c("/path/to/folder1/", "/path/to/folder2/")
#' names(sample_dirs) = c("sample_1", "sample_2")
#' out <- read_sparse_matrix(sample_dirs, ref = "hg38")
#' head(out$datamatrix)
#' head(out$annot_raw)
#' }
#' 
read_sparse_matrix <- function(files_dir_list,
                               ref = c("hg38","mm10", "ce11")[1],
                               verbose = TRUE
                              ){
    feature_file <- barcode_file <- matrix_file <- NULL
    
    if(is.null(names(files_dir_list))){
        names(files_dir_list) = gsub("[^[:alnum:]|^_|^\\. ]","", basename(files_dir_list))
    }
    
    feature_indexes = list()
    samples_ids = barcodes = c()
    for(i in seq_along(files_dir_list)){
        
        
        pattern = ".*features.tsv|.*features.txt|.*features.bed|.*features.*.gz|.*peaks.bed|.*peaks.bed.gz"
        feature_file = list.files(path = files_dir_list[i], full.names = TRUE,
                                  pattern = pattern)
        pattern = ".*barcodes.tsv|.*barcodes.txt|.*barcodes.*.gz"
        barcode_file = list.files(path = files_dir_list[i], full.names = TRUE,
                                  pattern = pattern)
        pattern = ".*matrix.mtx|.*matrix.*.gz"
        matrix_file = list.files(path = files_dir_list[i], full.names = TRUE,
                                 pattern = pattern)
        
        if (length(c(feature_file, matrix_file, barcode_file)) != 3)
            stop(
                "ChromSCape::read_sparse_matrix - For ",
                "SparseMatrix Count type, the folder must contain exactly three files ",
                "matching respectively :\n",
                "- Matrix:  .*matrix.mtx|.*matrix.*.gz\n",
                "- Features: .*features.tsv|.*features.txt|.*features.bed|.*features.*.gz|.*peaks.bed|.*peaks.bed.gz\n",
                "- Barcodes: .*barcodes.tsv|.*barcodes.txt|.*barcodes.*.gz"
                )
        
        t1 = system.time({
            sample_id = names(files_dir_list)[i]
            
            mat = Matrix::readMM(matrix_file)
            barcode = as.character(read.table(barcode_file)[,1])
            name_cell <- paste0(sample_id, "_", barcode)
            barcodes <- c(barcodes,barcode)
            
            which = tryCatch({rtracklayer::import.bed(feature_file)},
                             error = function(e) NULL) 
            if(is.null(which)){
                separator <- separator_count_mat(feature_file)
                format_test = read.table(feature_file, header = TRUE,
                                         sep = separator, nrows = 5)
                separated_chr_start_end = c(
                    grep("chr", colnames(format_test)[seq_len(3)]),
                    grep("start|begin", colnames(format_test)[seq_len(3)]),
                    grep("end|stop", colnames(format_test)[seq_len(3)]))
                if (length(separated_chr_start_end) > 0 &&
                    all.equal(separated_chr_start_end, c(1, 2, 3)) == TRUE) {
                    val = c("character",
                            "integer",
                            "integer",
                            rep("NULL", ncol(format_test) -3))
                    which = read.table(
                        feature_file,
                        header = FALSE,
                        sep = separator,
                        colClasses = val
                    )
                    which = as(which, "GRanges")
                } else{
                    which = read.table(feature_file,
                                       sep = separator,
                                       header = FALSE
                                       )
                    if(ncol(which) > 3 ){
                        which = as(
                        setNames(which[,1:3],c("chr","start","end")), "GRanges")
                    } else {
                        which = as(which[,1, drop=FALSE] %>% tidyr::separate(
                        col = 1, into = c("chr","start","end"),sep =":|-"),
                        "GRanges")
                    }
                }
                
            }
            rownames(mat) = paste0(GenomicRanges::seqnames(which),"_",
                                   GenomicRanges::start(which),"_",
                                   GenomicRanges::end(which))
            colnames(mat) = name_cell
            feature_indexes[[sample_id]] = mat
            samples_ids = c(samples_ids, rep(sample_id, length(name_cell)))
        })
        if (verbose)
            message("ChromSCape::read_sparse_matrix - ",
                    "Count matrix ", sample_id ,
                    " created from Index-Peak-Barcodes files in ",
                    round(t1[3],3), " sec.")
    }

    common_features = lapply(feature_indexes, rownames) 

    if(!isTRUE(all.equal(max(sapply(common_features, length)), min(sapply(common_features, length))))){
        common_features = Reduce(intersect,common_features)
        percent_in_common =  100 * length(common_features) / max(sapply(feature_indexes, nrow))
      warning("ChromSCape::read_sparse_matrix - ",
              "Not all features are found in all the samples - found ",
              round(percent_in_common,2), " % common features.")
      if(percent_in_common < 5) stop("ChromSCape::read_sparse_matrix - ",
                                     "Not enough features in common.")
    } else{
        common_features = rownames(feature_indexes[[1]])
    }
      
    feature_indexes = sapply(feature_indexes, function(i) {
      df = i[match(common_features,rownames(i)),] 
      return(df)
      }) 
    tryCatch({mat <- do.call("cbind", feature_indexes)}, error = function(e){
        paste0("ChromSCape::read_sparse_matrix - All feature file do not have ",
               "the same number of features", e)})
    
    eval(parse(text = paste0("data(", ref, ".chromosomes)")))
    chr <- eval(parse(text = paste0("", ref, ".chromosomes")))

    regions_to_remove = which(!as.character(which@seqnames) %in% chr$chr)
    if(length(regions_to_remove) > 0) mat = mat[-regions_to_remove,]
    annot_raw = data.frame("barcode" = barcodes,
                           "cell_id" = colnames(mat),
                           "sample_id" = samples_ids,
                           "batch_id" = factor(rep(1, ncol(mat)))
    )
    out = list("datamatrix" = mat, "annot_raw" = annot_raw)
    return(out)
}

#' Create a sparse count matrix from various format of input data.
#'
#' This function takes three different type of single-cell input: - Single cell
#' BAM files (sorted) - Single cell BED files (gzipped) - A combination of an
#' index file, a peak file and cell barcode file (The index file is composed of
#' three column: index i, index j and value x for the non zeroes entries in the
#' sparse matrix.)
#'
#' This functions re-counts signal on either fixed genomic bins, a set of
#' user-defined peaks or around the TSS of genes.
#'
#' @param ref reference genome to use (hg38)
#' @param verbose Verbose (TRUE)
#' @param use_Signac Use Signac wrapper function 'FeatureMatrix' if the Signac
#' package is installed (TRUE).
#' @param files_dir_list A named character vector of directories containing
#'  the files. The names correspond to sample names.
#' @param peak_file A file containing genomic location of peaks (NULL)
#' @param n_bins The number of bins to tile the genome (NULL)
#' @param bin_width The size of bins to tile the genome (NULL)
#' @param genebody Count on genes (body + promoter) ? (NULL)
#' @param extendPromoter If counting on genes, number of base pairs to extend up or
#'  downstream of TSS (2500).
#' @param file_type Input file(s) type(s) ('scBED','scBAM','FragmentFile')
#' @param progress Progress object for Shiny
#' @param BPPARAM BPPARAM object for multiprocessing. See
#'  \link[BiocParallel]{bpparam} for more informations. Will take the default
#'  BPPARAM set in your R session.
#'  
#' @return A sparse matrix of features x cells
#' @source 
#' @references Stuart el al.,  Multimodal single-cell chromatin analysis with 
#' Signac bioRxiv \url{https://doi.org/10.1101/2020.11.09.373613}
#' @importFrom IRanges IRanges
#' @importFrom parallel detectCores mclapply
#' @importFrom GenomicRanges GRanges tileGenome width seqnames GRangesList
#' sort.GenomicRanges
#' 
raw_counts_to_sparse_matrix <- function(
    files_dir_list, file_type = c("scBED", "scBAM", "FragmentFile"), use_Signac = TRUE,
    peak_file = NULL, n_bins = NULL, bin_width = NULL, genebody = NULL,
    extendPromoter = 2500, verbose = TRUE, ref = c("hg38","mm10", "ce11")[1],
    progress = NULL, BPPARAM = BiocParallel::bpparam()) {
    warning_raw_counts_to_sparse_matrix(
        files_dir_list, file_type, peak_file, n_bins, bin_width, genebody,
        extendPromoter, verbose, ref)
    
    if (!is.null(progress)) progress$set(detail = "Defining features...",
                                         value = 0.1)
    which <- define_feature(ref, peak_file, bin_width, genebody,
                            extendPromoter)
    
    if(is.null(names(files_dir_list))){
        names(files_dir_list) = gsub("[^[:alnum:]|^_|^\\. ]","", basename(files_dir_list))
    }
    if(use_Signac &&file_type == "FragmentFile"){
        if( !requireNamespace("Signac", quietly=TRUE) ){
            stop("ChromSCape::raw_counts_to_sparse_matrix - Signac package not",
            " found but required for treating Fragment files.")
        } else{
            out <- wrapper_Signac_FeatureMatrix(files_dir_list = files_dir_list,
                                                which = which,
                                                ref = ref,
                                                verbose = verbose,
                                                progress = progress)
        }

    } else {
        out <- import_count_input_files(
            files_dir_list, file_type, which, ref, verbose, progress,
            BPPARAM = BPPARAM)
        
        if (!is.null(progress)) progress$set(detail = "Combining matrices...",
                                             value = 0.05)
        mat = list()
        samples_ids = barcodes = c()
        for(i in seq_along(out$feature_indexes)){
            sample_id = names(out$feature_indexes)[i]
            feature_indexes <- out$feature_indexes[[i]]
            name_cells <- paste0(sample_id, "_", out$name_cells[[i]])
            barcodes <- c(barcodes,out$name_cells[[i]])
            samples_ids = c(samples_ids, rep(sample_id, length(name_cells)))
            
            mat[[sample_id]] = Matrix::sparseMatrix(
                i = as.numeric(feature_indexes$feature_index),
                j = feature_indexes$barcode_index,
                x = feature_indexes$counts,
                dims = c(length(which), length(name_cells)),
                dimnames = list(
                    rows = gsub(":|-", "_", as.character(which)),
                    cols = name_cells
                ))
        }
        mat <- do.call("cbind", mat)
        
        
        # If Gene information is present in metadata, add it to the rownames
        is_gene = ifelse("Gene" %in% colnames(GenomicRanges::elementMetadata(which)),
                         TRUE, FALSE)
        if(is_gene) rownames(mat) = paste0(which$Gene,":",rownames(mat))
        
        eval(parse(text = paste0("data(", ref, ".chromosomes)")))
        chr <- eval(parse(text = paste0("", ref, ".chromosomes")))
        mat = mat[which(as.character(which@seqnames) %in% chr$chr),]
        
        annot_raw = data.frame(barcode = barcodes,
                               cell_id = colnames(mat),
                               sample_id = samples_ids,
                               batch_id = factor(rep(1, ncol(mat)))
        )
        out = list("datamatrix" = mat, "annot_raw" = annot_raw)
    }
    return(out)
}


#' Warning for raw_counts_to_sparse_matrix
#'
#' @param ref reference genome to use (hg38)
#' @param verbose Verbose (TRUE)
#' @param files_dir_list A named character vector of directory containing
#'  the raw files
#' @param peak_file A file containing genomic location of peaks (NULL)
#' @param n_bins The number of bins to tile the genome (NULL)
#' @param bin_width The size of bins to tile the genome (NULL)
#' @param genebody Count on genes (body + promoter) ? (NULL)
#' @param extendPromoter If counting on genes, number of base pairs to extend up or
#'  downstream of TSS (2500).
#' @param file_type Input file(s) type(s) ('scBED','scBAM','SparseMatrix')
#'
#' @return Error or warnings if the input are not correct
warning_raw_counts_to_sparse_matrix <- function(
    files_dir_list, file_type = c("scBAM", "scBED", "SparseMatrix"),
    peak_file = NULL, n_bins = NULL, bin_width = NULL, genebody = NULL,
    extendPromoter = 2500, verbose = TRUE, ref = "hg38"){
        stopifnot(any(dir.exists(files_dir_list)), is.numeric(extendPromoter),
                ref %in% c("hg38", "mm10", "ce11"))
        
        if (!is.null(peak_file) && !file.exists(peak_file))
            stop("ChromSCape::raw_counts_to_sparse_matrix - 
            Can't find peak file.")
        
        if (!is.null(n_bins) && !is.numeric(n_bins))
            stop("ChromSCape::raw_counts_to_sparse_matrix - 
            n_bins must be a number.")
        
        if (!is.null(bin_width) && !is.numeric(bin_width))
            stop("ChromSCape::raw_counts_to_sparse_matrix - 
            bin_width must be a number.")
        
        if (!is.null(genebody) && !is.logical(genebody))
            stop(paste0(
                "ChromSCape::raw_counts_to_sparse_matrix - genebody ",
                "must be a TRUE or FALSE"))
    }

#' Define the features on which reads will be counted
#' 
#' @usage define_feature(ref = c("hg38","mm10", "ce11")[1],
#'  peak_file = NULL,
#'  bin_width  = NULL,
#'  genebody = FALSE,
#'  extendPromoter = 2500)
#' 
#' @param ref Reference genome
#' @param peak_file A bed file if counting on peaks
#' @param bin_width A number of bins if divinding genome into fixed width bins
#' @param genebody A logical indicating if feature should be counted in 
#' genebodies and promoter.
#' @param extendPromoter Extension length before TSS (2500).
#'
#' @return A GRanges object
#'   
#' @importFrom rtracklayer import
#' @importFrom GenomicRanges GRanges duplicated tileGenome width seqnames
#' @importFrom IRanges subsetByOverlaps
#' 
#' @export
#' @examples
#' gr_bins = define_feature("hg38", bin_width = 50000)
#' gr_genes = define_feature("hg38", genebody = TRUE, extendPromoter = 5000)
#' 
define_feature <- function(ref = c("hg38","mm10", "ce11")[1], peak_file = NULL,
                           bin_width  = NULL, genebody = FALSE,
                           extendPromoter = 2500){
    eval(parse(text = paste0("data(", ref, ".chromosomes)")))
    chr <- eval(parse(text = paste0("", ref, ".chromosomes")))
    chr <- GenomicRanges::GRanges(chr)
    if (!is.null(peak_file)) {
        message('ChromSCape::define_feature - ',
                'Reading in peaks file...')
        features <- rtracklayer::import(peak_file, format="bed")
        which <- IRanges::subsetByOverlaps(features,chr)
        if(length(which(GenomicRanges::duplicated(which)))>0) 
            message("ChromSCape::define_feature - Removing ",
                    length(which(GenomicRanges::duplicated(which))), 
                    " duplicated regions from peak file.")
        which = which[!GenomicRanges::duplicated(which)]
    } else if (!is.null(bin_width)) {
        message('ChromSCape::define_feature - ',
                'Counting on genomic bins...')
            which <- unlist(GenomicRanges::tileGenome(
                setNames(
                    GenomicRanges::width(chr),
                    GenomicRanges::seqnames(chr)
                ),
                tilewidth = bin_width
            ))
    } else if (genebody == TRUE) {
        # Retrieve gene TSS from ref and create GRanges
        eval(parse(text = paste0("data(", ref, ".GeneTSS)")))
        geneTSS_df <- eval(parse(text = paste0("", ref, ".GeneTSS")))

        geneTSS_df$start = ifelse(geneTSS_df$strand == "+",
                                  geneTSS_df$start - extendPromoter,
                                  geneTSS_df$start)
        
        geneTSS_df$end = ifelse(geneTSS_df$strand == "-",
                                  geneTSS_df$end + extendPromoter,
                                  geneTSS_df$end)
    
        geneTSS_df$strand = NULL
        geneTSS_df$distanceToTSS = 0
        which = GenomicRanges::GRanges(geneTSS_df)
        which = which[!GenomicRanges::duplicated(which)]
        names(which@ranges) = NULL
    }
    return(which)
}

#' Wrapper around 'FeatureMatrix' function from Signac Package
#' 
#' @param files_dir_list A named character vector of directories containing
#'  the files. The names correspond to sample names.
#' @param which A GenomicRanges containing the features to count on.
#' @param ref Reference genome to use (hg38).Chromosomes that are not present in 
#' the canonical chromosomes of the given reference genome will be excluded from
#'  the matrix.
#' @param process_n Number of regions to load into memory at a time, per thread.
#'  Processing more regions at once can be faster but uses more memory. (2000)
#' @param set_future_plan Set 'multisession' plan within the function (TRUE). 
#' If TRUE, the previous plan (e.g. future::plan()) will be set back on exit.
#' @param verbose Verbose (TRUE).
#' @param progress Progress object for Shiny.
#'
#' @return A sparse matrix of features x cells
#' 
#' @details Signac & future are not required packages for ChromSCape as they are
#' required only for the fragment matrix calculations. To use this function,
#' install Signac package first (future will be installed as a dependency).  
#' For the simplicity of the application & optimization, the function
#' by defaults sets future::plan("multisession") with workers = 
#' future::availableCores(omit = 1) in order to allow parallel processing
#' with Signac. On exit the plan is re-set to the previously set future plan.
#' Note that future multisession may have trouble running when VPN is on. To 
#' run in parallel, first deactivate your VPN if you encounter long runtimes.
#' 
#' @references Stuart el al.,  Multimodal single-cell chromatin analysis with 
#' Signac bioRxiv \url{https://doi.org/10.1101/2020.11.09.373613}
#' 
#' @export
#' @examples 
#' \dontrun{
#' gr_bins = define_feature("hg38", bin_width = 50000)
#' wrapper_Signac_FeatureMatrix("/path/to/dir_containing_fragment_files",
#'  gr_bins, ref = "hg38")
#' }
wrapper_Signac_FeatureMatrix <- function(files_dir_list, which, ref = "hg38",
                                         process_n = 2000,
                                         set_future_plan = TRUE, verbose = TRUE, 
                                         progress = NULL){
    
    fragment_files = lapply(files_dir_list, function(dir) 
        list.files(dir, full.names = TRUE, recursive = FALSE,
                   include.dirs = FALSE, pattern = ".tsv$|.tsv.gz$")
    )
    
    # Number of workers equal to number of workers in BPPARAM
    if(set_future_plan){
        cl <- future::availableCores(omit = 1)
        oplan <- future::plan("multisession", workers = cl,  gc = TRUE)
    }
    
    message("ChromSCape::wrapper_Signac_FeatureMatrix - Running ",
            "Signac::FeatureMatrix with ", cl,
            " workers...")
    
    if(any(lapply(fragment_files, length) > 1)){
        message("ChromSCape::wrapper_Signac_FeatureMatrix - Multiple files ",
                "detected per folder, assigning sample name based on files",
                " names.")
        fragment_files = unlist(fragment_files)
        names(fragment_files) = gsub(".tsv|.gz","",basename(fragment_files))
    } else{
        names(fragment_files) = basename(files_dir_list)
    }
    
    mat_list = list()
    samples_ids = barcodes = c()
    for(sample_name in names(fragment_files)){
        message("ChromSCape::wrapper_Signac_FeatureMatrix - Running Signac for",
                " sample ", sample_name, "...")
        
        if(!is.null(progress)) progress$inc(
            amount = 0.6/(length(fragment_files)),
            detail = paste0("Counting sample - ", sample_name))
        
        fragment_file = fragment_files[[sample_name]]
        dir = dirname(fragment_file)
        
        # if needed index file using samtools tabix
        if(!file.exists(file.path(dir, paste0(basename(fragment_file),".tbi")))){
            message("ChromSCape::wrapper_Signac_FeatureMatrix - creating tabix ",
                    "index...")
            idx = Rsamtools::indexTabix(fragment_file, format="bed")
        }
        
        # create Signac Fragment 
        fragments = Signac::CreateFragmentObject(fragment_file)
       
        
        # create count matrix
        system.time({
            mat = Signac::FeatureMatrix(fragments = fragments,
                                        features = which,
                                        process_n = process_n,
                                        verbose = TRUE)
        })
        rownames(mat) = gsub("-","_", rownames(mat))
        sample_id = sample_name
        barcodes <- c(barcodes, colnames(mat))
        colnames(mat) = paste0(sample_id,"_", colnames(mat))
        samples_ids = c(samples_ids, rep(sample_id, ncol(mat)))
        mat_list[[sample_id]] = mat
        
        gc()
    }
    if(set_future_plan) future::plan(oplan)
    
    if (!is.null(progress)) progress$set(detail = "Combining matrices...",
                                         value = 0.05)
    mat <- do.call("cbind", mat_list)
    
    # If Gene information is present in metadata, add it to the rownames
    is_gene = ifelse(
        "Gene" %in% colnames(GenomicRanges::elementMetadata(which)),TRUE, FALSE)
    if(is_gene) rownames(mat) = paste0(which$Gene,":",rownames(mat))
    
    
    eval(parse(text = paste0("data(", ref, ".chromosomes)")))
    chr <- eval(parse(text = paste0("", ref, ".chromosomes")))

    mat = mat[which(gsub("_.*","", rownames(mat)) %in% chr$chr),]
    
    annot_raw = data.frame(barcode = barcodes,
                           cell_id = colnames(mat),
                           sample_id = samples_ids,
                           batch_id = factor(rep(1, ncol(mat)))
                           
    )
    
    out <- list("datamatrix" = mat,
                "annot_raw" = annot_raw)
    
    return(out)
}

#' Import and count input files depending on their format
#'
#' @param files_dir_list A named list of directories containing the input files.
#' @param which A GRanges object of features. 
#' @param ref Reference genome.
#' @param verbose Print ?
#' @param file_type Input file type.
#' @param progress A progress object for Shiny.
#' @param BPPARAM BPPARAM object for multiprocessing. See
#'  \link[BiocParallel]{bpparam} for more informations. Will take the default
#'  BPPARAM set in your R session.
#'  
#' @return A list with the 
#' feature indexes data.frame containing non-zeroes entries in the count matrix
#' and the cell names
#'
import_count_input_files <- function(files_dir_list, file_type,
                                    which, ref, verbose, progress,
                                    BPPARAM = BiocParallel::bpparam()){

    feature_indexes = list()
    name_cells = list()
    for(i in seq_along(files_dir_list)){
        dir = files_dir_list[[i]]
        sample_id = names(files_dir_list)[i]
        if(!is.null(progress)) progress$inc(
            detail=paste0("Computing ", sample_id, "...",
            "\nLook for progress in the console."),
            amount = 0.6/length(files_dir_list))
        message("ChromSCape::import_count_input_files - Computing ",
                sample_id, "...")
        if (file_type == "scBAM") {
            t1 = system.time({
                l = bams_to_matrix_indexes(dir, which, BPPARAM = BPPARAM)
            })
            if (verbose)
                message("ChromSCape::raw_counts_to_sparse_matrix - ",
                        "Count matrix ", sample_id ,"created from BAM files in "
                        , t1[3], " sec.\n")
        }
        else if (file_type == "scBED") {
            t1 = system.time({
                l = beds_to_matrix_indexes(dir, which, BPPARAM = BPPARAM)
            })
            if (verbose)
                message("ChromSCape::raw_counts_to_sparse_matrix - ",
                        "Count matrix ", sample_id ,"
                        created from BED files in ", t1[3], " sec.")}
        
        feature_indexes[[i]] = l[[1]]
        names(feature_indexes)[i] = sample_id
        name_cells[[i]] = l[[2]]
        names(name_cells)[i] = sample_id
    }
    
    out <- list("feature_indexes" = feature_indexes,
                "name_cells" = name_cells)

    return(out)
}

#' Count bam files on interval to create count indexes
#'
#' @param dir A directory containing single cell  BAM files and BAI files
#' @param which Genomic Range on which to count
#' @param BPPARAM BPPARAM object for multiprocessing. See
#'  \link[BiocParallel]{bpparam} for more informations. Will take the default
#'  BPPARAM set in your R session.
#'  
#' @return A list containing a "feature index" data.frame and a 
#' count vector for non 0 entries, both used to form the sparse matrix
#'
#' @importFrom Rsamtools BamFileList indexBam ScanBamParam countBam
#' @importFrom BiocParallel bplapply
bams_to_matrix_indexes = function(dir, which, 
                                  BPPARAM = BiocParallel::bpparam()) {
    single_cell_bams = list.files(dir,
                                full.names = TRUE,
                                pattern = paste0(".*.bam$"))
    bam_files = Rsamtools::BamFileList(single_cell_bams)
    name_cells = gsub("*.bam$", "", basename(as.character(single_cell_bams)))
    names(bam_files) = name_cells
    
    indexes = list.files(dir,
                        full.names = TRUE,
                        pattern = paste0(".*.bai$"))
    if (length(indexes) < length(bam_files)) {
        message("ChromSCape::bams_to_matrix_indexes - indexing BAM files...")
        BiocParallel::bplapply(bam_files, Rsamtools::indexBam)
        indexes = list.files(files_dir_list,
                            full.names = TRUE,
                            pattern = paste0(".*.bai$"))
    }
    names_index = gsub("*.bam.bai$", "", basename(as.character(indexes)))
    names(indexes) = names_index
    if (!all.equal(names_index, name_cells))
        stop("Different number of BAM and indexes files. Stopping.")
    param = Rsamtools::ScanBamParam(which = which)
    
    BiocParallel::bpprogressbar(BPPARAM) <- TRUE
    if(BiocParallel::bpworkers(BPPARAM) > 1) 
        BiocParallel::bptasks(BPPARAM) <- ceiling(length(single_cell_bams) /
                                    (10*BiocParallel::bpworkers(BPPARAM)))
    system.time({
        feature_list = BiocParallel::bplapply(BPPARAM = BPPARAM,
            names(bam_files), function(bam_name, bam_files, param) {
                bam_files[[bam_name]]$index = indexes[[bam_name]]
                tmp = Rsamtools::countBam(
                    file = bam_files[[bam_name]], param = param)
                tmp$feature_index = rownames(tmp)
                tmp$cell_id = bam_name
                sel = which(tmp$records > 0)
                if (length(sel) > 0)
                    tmp = tmp[sel, c("cell_id", "feature_index", "records")]
                tmp
            }, bam_files = bam_files, param = param)
    })
    
    feature_indexes = do.call(rbind, feature_list)
    feature_indexes$barcode_index = 
        as.numeric(as.factor(feature_indexes$cell_id))
    colnames(feature_indexes)[3] = "counts"
    return(list(feature_indexes, name_cells))
}

#' Read index-peaks-barcodes trio files on interval to create count indexes
#'
#'
#' @param feature_file A file containing the features genomic locations
#' @param matrix_file A file containing the indexes of non-zeroes values and
#'   their value (respectively i,j,x,see sparseMatrix)
#' @param barcode_file A file containing the barcode ids
#' @param binarize Binarize matrix ?
#'
#' @importFrom GenomicRanges GRanges
#'
#' @return A list containing a "feature index" data.frame, name_cells, and a
#'  region GenomicRange object used to form the sparse matrix
#'   
index_peaks_barcodes_to_matrix_indexes = function(
    feature_file, matrix_file, barcode_file, binarize = FALSE) {
    regions = GenomicRanges::GRanges(setNames(
        read.table(feature_file, sep = "\t", quote = "")[, seq_len(3)],
        c("chr", "start", "end")
    ))
    
    if(any(GenomicRanges::duplicated(regions))){
        stop("ChromSCape::index_peaks_barcodes_to_matrix_indexes - The ",
             "features are not unique, check your file : ", feature_file)
    }
    
    name_cells = read.table(barcode_file, sep = "", quote = "",
                            stringsAsFactors = FALSE)[, 1]
    if(length(unique(name_cells)) != length(name_cells)){
        stop("ChromSCape::index_peaks_barcodes_to_matrix_indexes - The ",
             "barcode IDs are not unique, check your file : ",barcode_file)
    }
    feature_indexes = setNames(
        Matrix::readMM(matrix_file),
        c("feature_index", "barcode_index", "counts")
    )
    
    feature_indexes$cell_id = name_cells[feature_indexes$barcode_index]
    feature_indexes = feature_indexes[, c("cell_id", "feature_index",
                                        "counts", "barcode_index")]
    return(list(feature_indexes, name_cells, regions))
}

#' Count bed files on interval to create count indexes
#'
#' @param dir A directory containing the single cell BED files
#' @param which Genomic Range on which to count
#' @param BPPARAM BPPARAM object for multiprocessing. See
#'  \link[BiocParallel]{bpparam} for more informations. Will take the default
#'  BPPARAM set in your R session.
#'
#' @importFrom BiocParallel bplapply
#' @importFrom rtracklayer import
#' @importFrom GenomicRanges GRanges countOverlaps
#' 
#' @return A list containing a "feature index" data.frame and a 
#' names of cells as vector both used to form the sparse matrix
#' 
beds_to_matrix_indexes <- function(dir, which,
                                   BPPARAM = BiocParallel::bpparam()) {
    single_cell_beds = list.files(
        dir, full.names = TRUE, pattern = ".*.bed$|.*.bed.gz$")
    names_cells = gsub(".bed$|.bed.gz$", 
                    "", basename(as.character(single_cell_beds)))
    names(single_cell_beds) = names_cells

    BiocParallel::bpprogressbar(BPPARAM) <- TRUE
    # BiocParallel::bptasks(BPPARAM) <- ceiling(length(single_cell_beds) /
    #                                 (10*BiocParallel::bpworkers(BPPARAM)))

    system.time({
    feature_list = BiocParallel::bplapply( BPPARAM = BPPARAM,
            names(single_cell_beds),
            function(bed_name, single_cell_beds, which) {
                bed = rtracklayer::import(single_cell_beds[[bed_name]],
                                          format = "bed")
                suppressWarnings({
                    tmp = data.frame(
                        counts = GenomicRanges::countOverlaps(
                            which, bed, minoverlap = 1))
                })
                tmp$feature_index = as.numeric(rownames(tmp))
                tmp = tmp[which(tmp$counts > 0), ]
                if(nrow(tmp)>0) {
                    tmp$cell_id = bed_name 
                    tmp = tmp[, c("cell_id", "feature_index", "counts")]
                } else { tmp = NULL}
                tmp
            }, single_cell_beds = single_cell_beds,
            which = which)
    })
    
    gc()
    message("ChromSCape::import_count_input_files - Merging single-cells into ",
            "indexes...")
    feature_indexes = do.call(rbind, feature_list)
    feature_indexes$barcode_index = as.numeric(
        as.factor(feature_indexes$cell_id))
    out = list(feature_indexes, names_cells)
    gc()
    return(out)
}



#' Rebin Helper for rebin_matrix function
#'
#' @param mat_df A data.frame corresponding to sparse matrix indexes & values.
#'
#' @return a data.frame grouped mean-summarised  by col and new_row
#' 
rebin_helper = function(mat_df){
  library(dplyr)
  mat_df_grouped = mat_df %>% dplyr::group_by(col, new_row) %>%
    dplyr::summarise(new_value = sum(origin_value))
}

#' Transforms a bins x cells count matrix into a larger bins x cells count matrix.
#' 
#' 
#' This functions is best used to re-count large number of small bins or peaks
#'  (e.g. <= 5000bp) into equal or larger sized bins. The genome is either cut in
#'   fixed bins (e.g. 50,000bp) or into an user defined number of bins. Bins are
#'  calculated  based on the canconical chromosomes. Note that if peaks are larger
#'   than bins,  or if peaks are overlapping multiple bins, the signal is added 
#'   to each bin.
#' Users can increase the minimum overlap to consider peaks overlapping bins (by
#' default 150bp, size of a nucleosome) to disminish the number of peaks
#' overlapping multiple region. Any peak smaller than the minimum overlapp
#' threshold will be dismissed. Therefore, library size might be slightly
#' different from peaks to bins if signal was duplicated into multiple bins or
#' ommitted due to peaks smaller than minimum overlap.
#'
#' @param mat A matrix of peaks x cells
#' @param bin_width Width of bins to produce in base pairs (minimum 500) (50000)
#' @param ref Reference genome to use (hg38)
#' @param minoverlap Minimum overlap between the original bins and the new features
#'  to consider the peak as overlapping the bin . We recommand to put this
#'  number at exactly half of the original bin size (e.g. 500bp for original bin
#'  size of 1000bp) so that no original bins are counted twice. (500)
#' @param custom_annotation A GenomicRanges object specifying the new features 
#' to count the matrix on instead of recounting on genomic bins. If not NULL,
#' takes predecency over bin_width.
#' @param verbose Verbose
#' @param nthreads Number of threads to use for paralell processing
#'
#' @return A sparse matrix of larger bins or peaks.
#' @export
#'
#' @importFrom IRanges IRanges
#' @importFrom dplyr group_by summarise group_split mutate bind_rows
#' @importFrom GenomicRanges GRanges tileGenome width seqnames findOverlaps start
#' end
#' @importFrom stringr str_split_fixed
#'
#' @examples
#' mat = create_scDataset_raw()$mat
#' binned_mat = rebin_matrix(mat,bin_width = 10e6)
#' dim(binned_mat)
#' 
rebin_matrix <- function(mat,
                         bin_width = 50000,
                         custom_annotation = NULL,
                         minoverlap = 500,
                         verbose = TRUE,
                         ref = "hg38",
                         nthreads = 1,
                         rebin_function = rebin_helper
                         ){
  
  stopifnot(!is.null(mat), ref %in% c("hg38", "mm10", "ce11"))
  
  if (is.matrix(mat)) mat = as(mat, "CsparseMatrix")
  mat = as(mat, "CsparseMatrix")
  
  if (is.null(bin_width) & is.null(custom_annotation)) stop("One of bin_width or custom_annotation must be set")
  
  eval(parse(text = paste0("data(", ref, ".chromosomes)")))
  chr <- eval(parse(text = paste0("", ref, ".chromosomes")))
  chr <- GenomicRanges::GRanges(chr)
  
  if(is.null(custom_annotation)){
    bin_ranges <- unlist(GenomicRanges::tileGenome(
      setNames( GenomicRanges::width(chr), GenomicRanges::seqnames(chr)),
      tilewidth = bin_width )) 
  } else{
    bin_ranges <- custom_annotation
    if(!is(bin_ranges, "GenomicRanges")) stop("ChromSCape::rebin_matrix",
                                              "custom_annotation must be a GenomicRanges object.")
  }
  
  t1 = system.time({
    original_bins = rownames(mat)
    original_bins = stringr::str_split_fixed(original_bins, pattern = "_", n = 3)
    original_bins_chr = as.character(original_bins[,1])
    original_bins_start = as.numeric(original_bins[,2])
    original_bins_end = as.numeric(original_bins[,3])
  })
  message("ChromSCape::rebin_matrix - ",
          "Read the original bins in ", round(t1[3],4), " sec.")
  
  if (anyNA(original_bins_start) | anyNA(original_bins_end))
    stop("The rows of mat should be regions in format chr_start_end or",
         "chr:start-end, without non canonical chromosomes")
  
  if (verbose) {
    message("ChromSCape::rebin_matrix - converting ", dim(mat)[1],
            " original_bins into ", length(bin_ranges)[1], " bins of ",
            floor(mean(bin_ranges@ranges@width)), " bp in average.") }
  
  original_bins = GenomicRanges::GRanges(
    seqnames = original_bins_chr, ranges = IRanges::IRanges(original_bins_start, original_bins_end))
  
  hits <- GenomicRanges::findOverlaps(
    bin_ranges, original_bins, minoverlap = minoverlap)
  
  bins_names = paste0(bin_ranges@seqnames, "_", 
                      GenomicRanges::start(bin_ranges), "_",
                      GenomicRanges::end(bin_ranges))
  hits = as.data.frame(hits)
  hits$bins_names = bins_names[hits$queryHits]
  
  mat_df = data.frame(origin_row = mat@i+1, col = 0, origin_value = mat@x)
  sizes = mat@p[2:length(mat@p)] - mat@p[1:(length(mat@p)-1)]
  mat_df$col = as.numeric(unlist(lapply(1:(length(mat@p)-1), function(i) rep(i,sizes[i]))))
  
  match_hits = match(mat_df$origin_row, hits$subjectHits)
  
  if(length(which(is.na(match_hits)))>0){
    message("ChromSCape::rebin_matrix - Warning ! ", length(unique((mat@i+1)[which(is.na(match_hits))])),
            " original features were not found in the new bins",
            ".\nContinuing without these features...\n")
    mat_df = mat_df[-which(is.na(match_hits)),]
    gc()
  }
  
  mat_df$bins_names = hits$bins_names[match(mat_df$origin_row, hits$subjectHits)]
  mat_df$new_row = hits$queryHits[match(mat_df$origin_row, hits$subjectHits)]
  mat_df$new_row= seq_along(unique(mat_df$new_row))[match(mat_df$new_row, sort(unique(mat_df$new_row)))]
  bins_names = mat_df$bins_names[match(sort(unique(mat_df$new_row)), mat_df$new_row)]
  
  message("Finished matching original feature to new features...")
  
  mat_df$bins_names = NULL
  gc()
  
  
  # Transform mat_df into chunks of cells
  message("Splitting matrix in chunks of 500 cells...")
  mat_df = mat_df %>% dplyr::mutate(group = ceiling(col / 500))
  list_mat_df = mat_df %>% dplyr::group_split(group, .keep = FALSE)
  rm(mat_df)
  gc()
  
  # Starting Parallelization - sending chunks of mat by columns
  message("Starting parallelization, nthreads = ", nthreads, "...")
  message("Length list_mat_df", length(list_mat_df), "...")
  message("Ncell list_mat_df[[1]]", max(list_mat_df[[1]]$col), "...")
  
  cl <- parallel::makeCluster(nthreads)
  list_rebinned_mat = parallel::clusterApply(cl = cl, x = list_mat_df, fun = rebin_function)
  
  # Stopping multithreading
  message("Finished parallel rebinning...")
  parallel::stopCluster(cl)
  gc()
  
  # Regrouping
  message("Regrouping chunks...")
  mat_df_grouped = dplyr::bind_rows(list_rebinned_mat)
  gc()
  
  message("Creating new matrix...")
  # Creating the matrix with correct names
  new_mat = Matrix::sparseMatrix(i = mat_df_grouped$new_row, j = mat_df_grouped$col, 
                                 x = mat_df_grouped$new_value,
                                 dimnames = list(bins_names, colnames(mat)))
  
  
  gc()
  if (verbose) message("ChromSCape::rebin_matrix - Generated ",
                       nrow(new_mat) ," new non-zero features from ", length(original_bins),
                       " original features.")
  return(new_mat)  
}

#' Create a simulated single cell datamatrix & cell annotation
#'
#' @param cells Number of cells (300)
#' @param features Number of features (600)
#' @param featureType Type of feature (window)
#' @param sparse Is matrix sparse ? (TRUE)
#' @param nsamp Number of samples (4)
#' @param ref Reference genome ('hg38')
#' @param batch_id Batch origin (factor((1,1,1,1))
#'
#' @return A list composed of * mat : a sparse matrix following an approximation
#'   of the negative binomial law (adapted to scChIPseq) * annot : a data.frame
#'   of cell annotation * batches : an integer vector with the batch number for
#'   each cell
#' @export
#'
#' @importFrom GenomicRanges GRanges tileGenome width seqnames GRangesList
#'   sort.GenomicRanges
#'
#' @examples
#' # Creating a basic sparse 600 genomic bins x 300 cells matrix and annotation
#' l = create_scDataset_raw()
#' head(l$mat)
#' head(l$annot)
#' head(l$batches)
#'
#' # Specifying number of cells, features and samples
#' l2 = create_scDataset_raw(cells = 500, features = 500, nsamp=2)
#'
#' # Specifying species
#' mouse_l = create_scDataset_raw(ref="mm10")
#' 
#' # Specifying batches
#' batch_l = create_scDataset_raw(nsamp=4, batch_id = factor(c(1,1,2,2)))
#' 
#' # Peaks of different size as features
#' peak_l = create_scDataset_raw(featureType="peak")
#' head(peak_l$mat)
#' 
#' # Genes as features
#' gene_l = create_scDataset_raw(featureType="gene")
#' head(gene_l$mat)
create_scDataset_raw <- function(
    cells = 300, features = 600, featureType = c("window", "peak", "gene"),
    sparse = TRUE, nsamp = 4, ref = "hg38", batch_id = factor(rep(1, nsamp))){
    stopifnot( featureType %in% c("window", "peak", "gene"),
            ref %in% c("hg38", "mm10", "ce11"), nsamp >= 1, cells >= nsamp,
            features >= 1, length(batch_id) == nsamp)
    stopifnot(is.factor(batch_id))
    # Create cell names
    cell_counts <-
        as.numeric(lapply(split(
            seq_len(cells), sample(nsamp, cells, repl = TRUE)), length))
    cell_names <- sample <- batches <- list()
    for (i in seq_along(cell_counts))
    {
        cell_names[[i]] <- paste0("sample_", i, "_c", seq_len(cell_counts[i]))
        sample[[i]] <- rep(paste0("sample_", i), cell_counts[i])
        batches[[i]] <- rep(batch_id[i], cell_counts[i])
    }
    cell_names <- as.character(unlist(cell_names))
    sample <- as.character(unlist(sample))
    batches <- factor(as.numeric(unlist(batches)))
    
    feature_names <- generate_feature_names(featureType, ref, features)
    mat <- generate_count_matrix(cells, features, sparse,
                                cell_names, feature_names)
    if (length(levels(batches)) > 1)
    {
        mat <- mat %*% as(Matrix::diag(as.numeric(batches) * 
                                            as.numeric(batches)), "dgCMatrix")
    }
    colnames(mat) <- cell_names
    annot <- data.frame(
        barcode = paste0("BC", seq_len(cells)),
        cell_id = cell_names,
        sample_id = sample,
        batch_id = batches,
        total_counts = Matrix::colSums(mat)
    )
    return(list(
        mat = mat,
        annot = annot,
        batches = batches
    ))
}


#' Generate feature names
#'
#' @param featureType Type of feature
#' @param ref Reference genome
#' @param features Number of features to generate
#'
#' @return A character vector of feature names
#'
generate_feature_names <- function(featureType, ref,
                                features){
    
    eval(parse(text = paste0("data(", ref, ".chromosomes)")))
    chr <- eval(parse(text = paste0("", ref, ".chromosomes")))
    chr <- GenomicRanges::GRanges(chr)
    
    if (featureType[1] == "window"){
        chr_ranges <- unlist(GenomicRanges::tileGenome(
            setNames(GenomicRanges::width(chr), GenomicRanges::seqnames(chr)),
            ntile = features
        ))[seq_len(features)]
        
        feature_names <- paste(as.data.frame(chr_ranges)$seqnames,
                                as.data.frame(chr_ranges)$start,
                                as.data.frame(chr_ranges)$end, sep = "_")
    }
    if (featureType[1] == "peak"){
        size_peaks <- c(1000, 2500, 7999, 10000, 150000, 10 ^ 6)
        
        peaks <- as.numeric(lapply(split(seq_len(features), sample(
            length(size_peaks), features, replace = TRUE)), length))
        chr_ranges_list <- GenomicRanges::GRangesList()
        for (i in seq_along(peaks)){
            chr_ranges <- unlist(
                GenomicRanges::tileGenome(
                    setNames(GenomicRanges::width(chr),
                            GenomicRanges::seqnames(chr)),
                    tilewidth = size_peaks[i],
                    cut.last.tile.in.chrom = FALSE))
            chr_ranges_list[[i]] <-
                chr_ranges[sample(seq_along(chr_ranges),
                                size = peaks[i]),]
        }
        chr_ranges <-
            GenomicRanges::sort.GenomicRanges(
                unlist(chr_ranges_list))[seq_len(features)]
        
        feature_names <- paste(as.data.frame(chr_ranges)$seqnames,
                                as.data.frame(chr_ranges)$start,
                                as.data.frame(chr_ranges)$end,
                                sep = "_")
    }
    if (featureType[1] == "gene"){
        eval(parse(text = paste0("data(", ref, ".GeneTSS)")))
        chr <- eval(parse(text = paste0("", ref, ".GeneTSS")))
        feature_names <- as.character(sample(
            chr$Gene, features, replace = FALSE))
    }
    return(feature_names)
}

#' Generate count matrix
#'
#' @param cells Number of cells
#' @param features Number of features
#' @param sparse Is matrix sparse ?
#' @param cell_names Cell names
#' @param feature_names Feature names
#'
#' @return A matrix or a sparse matrix
#'
generate_count_matrix <- function(cells, features, sparse,
                                cell_names, feature_names){
    vec <- stats::rpois(cells * features, 0.5)
    # Add count to values > 0, iteratively
    for (i in seq_len(10))
    {
        vec[vec >= i] <- vec[vec >= i] + i ^ 2 *
            stats::rpois(length(vec[vec >= i]),
                        0.5)
    }
    
    indices_vec <- which(vec > 0)
    j <- ceiling(indices_vec / features)
    i <- ceiling(indices_vec %% (features))
    i[i == 0] <- features
    if (sparse)
    {
        mat <- Matrix::sparseMatrix(i,
                                    j,
                                    x = vec[indices_vec],
                                    dimnames = list(feature_names,
                                                    cell_names))
    } else
    {
        mat <- matrix(
            vec,
            nrow = features,
            ncol = cells,
            dimnames = list(feature_names,
                            cell_names)
        )
    }
    return(mat)
}


#' Read single-cell matrix(ces) into scExp
#'
#' Combine one or multiple matrices together to create a sparse matrix and cell
#' annotation data.frame.
#'
#' @param file_paths A character vector of file names towards single cell
#' epigenomic matrices (features x cells) (must be .txt / .tsv)
#' @param temp_path In case matrices are stored in temporary folder,
#' a character vector of path towards temporary files. (NULL)
#' @param remove_pattern A string pattern to remove from the sample names. Can
#' be a regexp.
#' 
#' @return A list containing:  
#' * datamatrix: a sparseMatrix of features x cells  
#' * annot_raw: an annotation of cells as data.frame  
#'  
#' @export
#' @examples 
#' mat1 = mat2 = create_scDataset_raw()$mat
#' tmp1 = tempfile(fileext = ".tsv")
#' tmp2 = tempfile(fileext = ".tsv")
#' write.table(as.matrix(mat1),file=tmp1,sep = "\t",
#' row.names = TRUE,col.names = TRUE,quote = FALSE)
#' write.table(as.matrix(mat2),file=tmp2, sep = "\t",
#' row.names = TRUE,col.names = TRUE,quote = FALSE)
#' file_paths = c(tmp1,tmp2)
#' out = import_scExp(file_paths)
#' 
#' @importFrom scater readSparseCounts
#' @md
import_scExp <- function(file_paths,
                        remove_pattern = "",
                        temp_path = NULL) {
    stopifnot(is.character(file_paths))
    if (length(grep("(.tsv$)|(.txt$)|(.csv$)|(.gz$)", file_paths)) 
        < length(file_paths))
        stop(paste0("ChromSCape::import_scExp - Matrix files must be in",
                    " .txt, .csv or .tsv or .gz format."))
    if (is.null(temp_path)) temp_path = file_paths
    if (FALSE %in% as.logical(lapply(temp_path, file.exists))) 
        stop("ChromSCape::import_scExp - can't find one of the matrix files.")
    datamatrix = annot_raw = NULL
    for (i in seq_along(file_paths)) {
        sample_name <- gsub('(.tsv$)|(.txt$)|(.csv$)', "", gsub('(.gz$)', '',
                                               basename(file_paths[i])))
        if(remove_pattern != "") sample_name = gsub(remove_pattern, "",
                                                    sample_name)
        
        separator <- separator_count_mat(temp_path[i])
        format_test = read.table(temp_path[i], header = TRUE,
                                sep = separator, nrows = 5)
        separated_chr_start_end = c(
            grep("chr", colnames(format_test)[seq_len(3)]),
            grep("start|begin", colnames(format_test)[seq_len(3)]),
            grep("end|stop", colnames(format_test)[seq_len(3)]))
        if (length(separated_chr_start_end) > 0 &&
            all.equal(separated_chr_start_end, c(1, 2, 3))) {
            datamatrix_single = read_count_mat_with_separated_chr_start_end(
                temp_path[i], format_test, separator)
        } else{
            datamatrix_single <- scater::readSparseCounts(
                temp_path[i], sep = separator, chunk = 1000L)
        }
        gc()
        datamatrix_single <- check_correct_datamatrix(datamatrix_single,
                                                      sample_name)
        gc()
        total_cell <- length(datamatrix_single[1, ])
        annot_single <- data.frame(
            barcode = colnames(datamatrix_single),
            cell_id = paste0(sample_name, "_", colnames(datamatrix_single)),
            sample_id = rep(sample_name, total_cell), batch_id = 1)
        colnames(datamatrix_single) <- annot_single$cell_id
        datamatrix <- combine_datamatrix(datamatrix,
                                        datamatrix_single, file_paths, i)
        rm(datamatrix_single)
        if (is.null(annot_raw)) annot_raw <- annot_single
        else annot_raw <- rbind(annot_raw, annot_single)
        rm(annot_single)
        gc()
    }
    annot_raw$batch_id <- as.factor(annot_raw$batch_id)
    out = list("datamatrix" = datamatrix, "annot_raw" = annot_raw)
    return(out)
}



#' Determine Count matrix separator ("tab" or ",")
#'
#' @param path_to_matrix A path towards the count matrix to check
#'
#' @return A character separator
#'
separator_count_mat <- function(path_to_matrix){
  format_test = as.character(
    read.table(path_to_matrix, header = TRUE, sep = "\t", nrows = 5,
               comment.char = "#")[4, ])
  if (length(format_test) >= 3 & !is.na(as.numeric(format_test[2])) &
      !is.na(as.numeric(format_test[3]))) return("\t")
    
    format_test = as.character(
      read.table(path_to_matrix, header = TRUE, sep = ",", nrows = 5,
                 comment.char = "#")[4, ])
    if (length(format_test) >= 3 & !is.na(as.numeric(format_test[2])) &
        !is.na(as.numeric(format_test[3]))) return(",")
    
    format_test = as.character(
      read.table(path_to_matrix, header = TRUE, sep = "-", nrows = 5,
                 comment.char = "#")[4, ])
    if (length(format_test) >= 3 & !is.na(as.numeric(format_test[2])) &
        !is.na(as.numeric(format_test[3]))) return("-")
    
    format_test = as.character(
      read.table(path_to_matrix, header = TRUE, sep = "_", nrows = 5,
                 comment.char = "#")[4, ])
    if (length(format_test) >= 3 & !is.na(as.numeric(format_test[2])) &
        !is.na(as.numeric(format_test[3]))) return("_")
    
    return("\t")
}

#' Read a count matrix with three first columns (chr,start,end)
#'
#' @param path_to_matrix Path to the count matrix
#' @param format_test Sample of the read.table
#' @param separator Separator character
#'
#' @return A sparseMatrix with rownames in the form "chr1:1222-55555"
#'
read_count_mat_with_separated_chr_start_end <- function(
    path_to_matrix,
    format_test,
    separator
){
    val = c("NULL", "NULL", "NULL",
            rep("integer", length(colnames(
                format_test
            )) - 3))
    
    datamatrix_single = as(as.matrix(
        read.table(
            path_to_matrix,
            header = TRUE,
            sep = separator,
            colClasses = val
        )
    ), "dgCMatrix")
    gc()
    
    val = c("character",
            "integer",
            "integer",
            rep("NULL", ncol(datamatrix_single)))
    regions = read.table(
        path_to_matrix,
        header = TRUE,
        sep = separator,
        colClasses = val
    )
    regions = paste0(regions[, 1], ":", regions[, 2], "-", regions[, 3])
    rownames(datamatrix_single) = regions
    
    return(datamatrix_single)
}


#' Check if matrix rownames are well formated and correct if needed
#'
#' Throws warnings / error if matrix is in the wrong format
#' 
#' @param datamatrix_single A sparse matrix
#' @param sample_name Matrix sample name for warnings
#' 
#' 
#' @return A sparseMatrix in the right rownames format 
#'
check_correct_datamatrix <- function(datamatrix_single,sample_name=""){
    matchingRN <-
        grep("[[:alnum:]]+(:|_|-)[[:digit:]]+(-|_)[[:digit:]]+",
            rownames(datamatrix_single)) # check rowname format
    if (length(matchingRN) < length(rownames(datamatrix_single))) {
        warning(paste0(
            "ChromSCape::import_scExp - ", sample_name, " contains ",(length(
                rownames(datamatrix_single)) - length(matchingRN)),
            " rownames that do not conform to the required format.
                    Please check your data matrix and try again."))
        if (length(matchingRN) < 5) {
            stop(
                "ChromSCape::import_scExp - Maybe your rownames are 
                contained in the first column instead? In this case, remove the 
                header of this column so that they are interpreted as rownames."
            )
        }
        return()
    }
    numericC <-
        apply(datamatrix_single[seq_len(5), seq_len(5)], MARGIN = 2, is.numeric)
    if (sum(numericC) < 5) {
        stop(
            paste0(
                "ChromSCape::import_scExp - ", sample_name,
                " contains non-numeric columns at the following indices: ",
                which(numericC == FALSE), 
                ". Please check your data matrix and try again."))
    }
    if (all.equal(rownames(datamatrix_single)[seq_len(5)],
                c("1", "2", "3", "4", "5")) == TRUE) {
        warning(
            paste0("ChromSCape::import_scExp - ", sample_name, " contains ",
                "numeric rownames, taking first column as rownames."))
        names = datamatrix_single$X0
        datamatrix_single = datamatrix_single[, -1]
        rownames(datamatrix_single) = names
    }
    datamatrix_single <-
        datamatrix_single[!duplicated(rownames(datamatrix_single)), ]
    if (length(grep("chr", rownames(datamatrix_single)[seq_len(10)], 
                    perl = TRUE)) >= 9) {
        rownames(datamatrix_single) <-
            gsub(":|-", "_", rownames(datamatrix_single))
        rownames(datamatrix_single) <-
            gsub("-", "_", rownames(datamatrix_single))
    }
    return(datamatrix_single)
}

#' Combine two matrices and emit warning if no regions are in common
#'
#' @param datamatrix A sparse matrix or NULL if empty
#' @param datamatrix_single Another sparse matrix
#' @param file_names File name corresponding to the matrix for warnings
#' @param i file number
#'
#' @return A combined sparse matrix
#'
combine_datamatrix <- function(datamatrix, datamatrix_single,
                            file_names, i){
    if (is.null(datamatrix)) {datamatrix <- datamatrix_single
    } else {
        
        common_regions <- intersect(rownames(datamatrix),
                                    rownames(datamatrix_single))
        if (length(common_regions) > 0) {
            datamatrix <-
                Matrix::cbind2(datamatrix[common_regions, ],
                            datamatrix_single[common_regions, ])
        } else {
            stop(paste0(
                "ChromSCape::import_scExp - ", file_names[i],
                " contains no common regions with ",file_names[i - 1]))
        }
        if (length(common_regions) < nrow(datamatrix)) {
            warning(paste0(
                "ChromSCape::import_scExp - ", file_names[i],
                " contains less than ", ceiling(
                    100 * length(common_regions) / (nrow(datamatrix))),
                " common regions with ", file_names[i - 1]))
        }
    }
    return(datamatrix)
}

#' Wrapper to create the single cell experiment from count matrix and feature
#' dataframe
#'
#' Create the single cell experiment from (sparse) datamatrix and feature
#' dataframe containing feature names and location. Also optionally removes zero
#' count Features, zero count Cells, non canconical chromosomes, and chromosome
#' M. Calculates QC Metrics (scran).
#'
#' @param datamatrix A matrix or sparseMatrix of raw counts. Features x Cells
#'   (rows x columns).
#' @param annot A data.frame containing informations on cells. Should have the
#'   same number of rows as the number of columns in datamatrix.
#' @param remove_zero_cells remove cells with zero counts ? (TRUE)
#' @param remove_zero_features remove cells with zero counts ? (TRUE)
#' @param remove_non_canonical remove non canonical chromosomes ?(TRUE)
#' @param remove_chr_M remove chromosomes M ? (TRUE)
#' @param mainExpName Name of the mainExpName e.g. 'bins', 'peaks'... 
#' ("default")
#' @param verbose (TRUE)
#'
#' @return Returns a SingleCellExperiment object.
#' @export
#'
#'
#' @importFrom SingleCellExperiment SingleCellExperiment counts colData
#' @importFrom SummarizedExperiment rowRanges colData
#' @importFrom Matrix rowSums colSums
#'
#' @examples
#' raw <- create_scDataset_raw()
#' scExp = create_scExp(raw$mat, raw$annot)
#' scExp
#' 
create_scExp <- function(
    datamatrix, annot, remove_zero_cells = TRUE, remove_zero_features = TRUE,
    remove_non_canonical = TRUE, remove_chr_M = TRUE, mainExpName = "main",
    verbose = TRUE)
{
    stopifnot(is.data.frame(annot), remove_zero_cells %in% c(TRUE, FALSE),
            remove_zero_features %in% c(TRUE, FALSE))
    if (ncol(datamatrix) != nrow(annot)) 
        stop(
            "ChromSCape::create_scExp - datamatrix and annot should contain",
            "the same number of cells")
    if (length(match(c("cell_id", "sample_id"), colnames(annot))) < 2) 
        stop("ChromSCape::create_scExp - annot should contain cell_id &
            sample_id as column names")
    if (length(intersect(colnames(datamatrix), annot$cell_id)) < 
        ncol(datamatrix)) 
        stop("ChromSCape::create_scExp - annot$cell_id and colnames do",
                " not match perfectly !")
    annot = annot[match(colnames(datamatrix), annot$cell_id),]
    if (is(datamatrix, "data.frame")) datamatrix <- as.matrix(datamatrix)
    if(verbose) message("ChromSCape::create_scExp - the matrix has ",
        dim(datamatrix)[2], " cells and ", dim(datamatrix)[1], " features.")
    scExp <- SingleCellExperiment::SingleCellExperiment(
        assays = list(counts = datamatrix), colData = annot)
    
    # If rownames is formatted as gene:chr_start_end, put genes into rowRanges
    contains_genes = all(grepl(":chr",rownames(scExp)[1:5]))
    if(contains_genes){
      if(verbose) message("ChromSCape::create_scExp - Genes detected in rownames...")
        Genes = gsub(":.*","",rownames(scExp))
        rownames(scExp) = gsub(".*:","",rownames(scExp))
        SummarizedExperiment::rowRanges(scExp) = get_genomic_coordinates(scExp)
        SummarizedExperiment::rowRanges(scExp)$Gene = Genes
        SummarizedExperiment::rowRanges(scExp)$distanceToTSS = 0
        rownames(scExp) = SummarizedExperiment::rowRanges(scExp)$ID
    }
    if (has_genomic_coordinates(scExp)) {
        if (remove_non_canonical) scExp <- remove_non_canonical_fun(scExp,
                                                                    verbose)
        if (remove_chr_M) scExp <- remove_chr_M_fun(scExp, verbose)}
    dim_b <- dim(scExp)
    if (remove_zero_features)
        scExp <- scExp[(Matrix::rowSums(
            SingleCellExperiment::counts(scExp) > 0) > 0), ]
    if (remove_zero_cells) 
        scExp <- scExp[, (Matrix::colSums(
            SingleCellExperiment::counts(scExp) > 0) > 0)]
    if (dim(scExp)[2] != dim_b[2]){
      if(verbose) message("ChromSCape::create_scExp - ", dim_b[2] - dim(scExp)[2],
            " cells with 0 signals were removed.")
    }
    if (dim(scExp)[1] != dim_b[1]){
      if(verbose) message("ChromSCape::create_scExp - ",
            dim_b[1] - dim(scExp)[1], " features with 0 signals were removed.")
    }
    if (has_genomic_coordinates(scExp) && !contains_genes){
        rows <- rownames(scExp)
        SummarizedExperiment::rowRanges(scExp) <- get_genomic_coordinates(scExp)
        rownames(scExp) <- rows
    }
    SummarizedExperiment::colData(scExp)$total_counts = 
        colSums(SingleCellExperiment::counts(scExp))
    # if(!is.na(ncol(scExp) * nrow(scExp))){
    # SummarizedExperiment::colData(scExp)$detected = 
    #     apply(SingleCellExperiment::counts(scExp), 2,
    #           function(i) length(which(i>0)))
    # } else{
    #     SummarizedExperiment::colData(scExp)$detected = 
    #         SummarizedExperiment::colData(scExp)$total_counts
    # }
    SingleCellExperiment::mainExpName(scExp) <- mainExpName
    return(scExp)
}

#' Remove non canonical chromosomes from scExp
#'
#' @param scExp A SingleCellExperiment
#' @param verbose Print ?
#'
#' @return A SingleCellExperiment without non canonical chromosomes 
#' (random,unknown, contigs etc...)
#'
remove_non_canonical_fun <- function(scExp, verbose){
    # Removing non-canonical chromosomes
    splitID <- lapply(rownames(scExp), function(x)
    {
        unlist(strsplit(as.character(x),
                        split = "_|:|-",
                        fixed = FALSE))
    })
    normal_chr <- which(as.numeric(lapply(splitID, length)) <= 3)
    # weird chromosomes contain _/:/- in the name
    nrow_init <- nrow(scExp)
    scExp <- scExp[normal_chr, ]
    if (length(normal_chr) < nrow_init && verbose)
    {
        message("ChromSCape::create_scExp - ",nrow(scExp) - length(normal_chr),
                " non canonical regions were removed.\n")
    }
    return(scExp)
}

#' Remove chromosome M from scExprownames
#'
#' @param scExp A SingleCellExperiment
#' @param verbose Print ?
#'
#' @return A SingleCellExperiment without chromosome M (mitochondrial chr)
#'
remove_chr_M_fun <- function(scExp, verbose){
    # Remove chrM from mat if it is inside
    chrM_regions <- grep("chrM", rownames(scExp))
    if (length(chrM_regions) > 0)
    {
        scExp <- scExp[-chrM_regions, ]
        if (verbose)
        {
            message("ChromSCape::create_scExp - ", length(chrM_regions),
                " chromosome M regions were removed.\n")
        }
    }
    return(scExp)
}

#' Filter cells and features
#'
#' Function to filter out cells & features from SingleCellExperiment based on
#' total count per cell, number of cells 'ON' in features and top covered cells
#' that might be doublets.
#'
#' @param scExp A SingleCellExperiment object.
#' @param min_cov_cell Minimum counts for each cell. (1600)
#' @param quant_removal Centile of cell counts above which cells are removed.
#'   (95)
#' @param min_count_per_feature Minimum number of reads per feature (10).
#' @param verbose (TRUE)
#'
#' @return Returns a filtered SingleCellExperiment object.
#'
#' @export
#'
#' @examples
#'
#' raw <- create_scDataset_raw()
#' scExp = create_scExp(raw$mat, raw$annot)
#' scExp. = filter_scExp(scExp)
#'
#' # No feature filtering (all features are valuable)
#' scExp. = filter_scExp(scExp,min_count_per_feature=30)
#'
#' # No cell filtering (all features are valuable)
#' scExp. = filter_scExp(scExp,min_cov_cell=0,quant_removal=100)
#'
#' @importFrom SingleCellExperiment SingleCellExperiment counts colData
#' @importFrom Matrix colSums rowSums
filter_scExp =  function (
    scExp, min_cov_cell = 1600, quant_removal = 95, min_count_per_feature = 10,
    verbose = TRUE){
    stopifnot(is(scExp, "SingleCellExperiment"), is.numeric(min_cov_cell),
            is.numeric(quant_removal),
            is.numeric(min_count_per_feature), verbose %in% c(FALSE, TRUE))
    if (is.null(scExp)) warning(
        "ChromSCape::filter_scExp - Please specify a  SingleCellExperiment")
    
    cellCounts <- Matrix::colSums(SingleCellExperiment::counts(scExp))
    thresh <- stats::quantile(cellCounts, probs = seq(0, 1, 0.01))
    sel <- (cellCounts > min_cov_cell & cellCounts <= thresh[quant_removal + 1])

    scExp <- scExp[, sel]
    counts <- SingleCellExperiment::counts(scExp)
    sel_feature <- (Matrix::rowSums(counts) >= min_count_per_feature)
    
    scExp <- scExp[sel_feature,]
    empty_cells = (Matrix::colSums(SingleCellExperiment::counts(scExp)) < 100)
    if(any(empty_cells)) scExp <- scExp[,!empty_cells]
    empty_features = (Matrix::rowSums(counts(scExp)) < min_count_per_feature)
    if(any(empty_features)) scExp <- scExp[!empty_features,]
    
    SummarizedExperiment::colData(scExp)$total_counts = 
        Matrix::colSums(SingleCellExperiment::counts(scExp))
    
    if (verbose) message(
        "ChromSCape::filter_scExp - ", ncol(scExp), " cells pass the ",
        " threshold of ", min_cov_cell, " minimum reads and are lower than ",
        "the ", quant_removal, "th centile of library size ~= ",
        round(thresh[quant_removal + 1]), " reads.")
    
    if (verbose) message(
        "ChromSCape::filter_scExp - ",nrow(scExp)," features pass ",
        "the threshold of ", min_count_per_feature," count per feature.")
    
    return(scExp)
}

#' Find most covered features
#'
#' @description  
#' Find the top most covered features that will be used for dimensionality 
#' reduction. Optionally remove non-top features. 
#'
#' @param scExp A SingleCellExperiment.
#' @param n Either an integer indicating the number of top covered regions to 
#' find or a character vector of the top percentile of features to keep (e.g.
#' 'q20' to keep top 20% features). 
#' @param keep_others Logical indicating if non-top regions are to be removed 
#' from the SCE or not (FALSE). 
#' @param prioritize_genes First filter by loci being close to genes ? E.g. for
#' differential analysis, it is more relevant to keep features close to genes
#' @param max_distanceToTSS If prioritize_genes is TRUE, the maximum distance to 
#' consider a feature close to a gene.
#' 
#' @param verbose Print ?
#' 
#' @return A SCE with top features
#' @export
#'
#' @examples
#' data(scExp)
#' scExp_top = find_top_features(scExp, n = 4000, keep_others = FALSE)
#' 
find_top_features <- function (scExp, n = 20000, keep_others = FALSE,
                               prioritize_genes = FALSE, 
                               max_distanceToTSS = 10000,
                               verbose = TRUE){
    scExp. = scExp
    if(prioritize_genes){
        # filter by loci close to genes
        scExp. = scExp.[which(
            SingleCellExperiment::rowData(scExp)$distanceToTSS <= max_distanceToTSS),]
        if(verbose) message("ChromScape::find_top_features - ", 
                            nrow(scExp.), " features kept as ",
                            " being closer than ", max_distanceToTSS,
                            "bp to genes TSS...")
    }
    n = min(n, nrow(scExp.))
    feature_counts = Matrix::rowSums(counts(scExp.))
    
    if(is.numeric(n)){
        feature_counts_top = sort(feature_counts,decreasing = TRUE)[seq_len(n)]   
    } else{
        n = as.numeric(gsub("q","",n))
        feature_counts_top = feature_counts[which(
            feature_counts>quantile(feature_counts,1-(n/100) ) )]
    }
    SummarizedExperiment::rowData(scExp)[,"top_feature"] = FALSE
    SummarizedExperiment::rowData(scExp)[names(feature_counts_top),
                                         "top_feature"] = TRUE
    if(keep_others == FALSE) scExp = scExp[names(feature_counts_top),]
    
    if(verbose) message("ChromScape::find_top_features - ", 
                        length(feature_counts_top), " features kept as ",
                        " most covered features...")
    return(scExp)
}

#' Does SingleCellExperiment has genomic coordinates in features ?
#'
#' @param scExp A SingleCellExperiment object
#'
#' @return TRUE or FALSE
#' @export
#' @examples 
#' 
#' raw <- create_scDataset_raw()
#' scExp = create_scExp(raw$mat, raw$annot)
#' has_genomic_coordinates(scExp)
#' raw_genes = create_scDataset_raw(featureType="gene")
#' scExp_gene = create_scExp(raw_genes$mat, raw_genes$annot)
#' has_genomic_coordinates(scExp_gene)
#' 
has_genomic_coordinates <- function(scExp)
{
    stopifnot(is(scExp, "SingleCellExperiment"), !is.null(rownames(scExp)))
    ID <- rownames(scExp)[seq_len(min(10, length(rownames(scExp))))]
    chr <- unlist(lapply(
        strsplit(ID, split = "_|-|:"),
        FUN = function(x)
            unlist(x)[1]
    ))
    
    if (length(grep("chr|(^\\d+$|^X$|^Y$)",
                    chr[seq_len(min(10, length(chr)))], ignore.case = TRUE)) >=
        min(10, length(chr)))
    {
        return(TRUE)
    } else
    {
        return(FALSE)
    }
}

#' Get SingleCellExperiment's genomic coordinates
#'
#' @param scExp A SingleCellExperiment object.
#'
#' @return A GRanges object of genomic coordinates.
#' @importFrom GenomicRanges GRanges
#' @export
#' @examples 
#' 
#' raw <- create_scDataset_raw()
#' scExp = create_scExp(raw$mat, raw$annot)
#' feature_GRanges = get_genomic_coordinates(scExp)
#' 
get_genomic_coordinates <- function(scExp)
{
    stopifnot(is(scExp, "SingleCellExperiment"))
    if (!has_genomic_coordinates(scExp))
    {
        stop("Feature names are not genomic coordinates")
    }
    
    ID <- rownames(scExp)
    chr <- unlist(lapply(
        strsplit(ID, split = "_|-|:"),
        FUN = function(x)
            unlist(x)[1]
    ))
    start <- unlist(lapply(
        strsplit(ID, split = "_|-|:"),
        FUN = function(x)
            unlist(x)[2]
    ))
    end <- unlist(lapply(
        strsplit(ID, split = "_|-|:"),
        FUN = function(x)
            unlist(x)[3]
    ))
    feature <-
        GRanges(data.frame(
            ID = ID,
            chr = chr,
            start = start,
            end = end
        ))
    return(feature)
}

#' Remove specific features (CNA, repeats)
#'
#' @param scExp A SingleCellExperiment object.
#' @param features_to_exclude A GenomicRanges object or data.frame containing
#' genomic regions or features to exclude or path towards a BED file containing
#' the features to exclude.
#' @param by Type of features. Either 'region' or 'feature_name'. If 'region',
#'  will look for genomic coordinates in columns 1-3 (chr,start,stop).
#' If 'feature_name', will look for a genes in first column. ('region')
#' @param verbose (TRUE)
#'
#' @return A SingleCellExperiment object without features to exclude.
#' @export
#'
#' @importFrom GenomicRanges GRanges makeGRangesFromDataFrame findOverlaps
#'  intersect
#' @importFrom SummarizedExperiment rowRanges
#' @importFrom rtracklayer import
#'
#' @examples 
#' 
#' raw <- create_scDataset_raw()
#' scExp = create_scExp(raw$mat, raw$annot)
#' features_to_exclude = data.frame(chr=c("chr4","chr7","chr17"),
#' start=c(50000,8000000,2000000),
#' end=c(100000,16000000,2500000))
#' features_to_exclude = as(features_to_exclude,"GRanges")
#' scExp = exclude_features_scExp(scExp,features_to_exclude)
#' scExp
#' 
exclude_features_scExp <-
    function(scExp, features_to_exclude, by = "region", verbose = TRUE){
        stopifnot(is(scExp, "SingleCellExperiment"), is.character(by[1]))
        if(!is(features_to_exclude,"GenomicRanges") & 
           !is(features_to_exclude,"data.frame")) {
            if(!file.exists(features_to_exclude)) {
                stop(
                "ChromSCape::exclude_features_scExp - features_to_exclude must be
           either GenomicRanges or data.frame containg feature names.")
            } else {
                features_to_exclude = rtracklayer::import(features_to_exclude)
            }
        }
        if (!by[1] %in% c("region", "feature_name")) 
            stop("ChromSCape::exclude_features_scExp - by must be either
            'region' or 'feature_name'")
        if (by[1] == "region")
        {
            if (!has_genomic_coordinates(scExp))
                stop(paste0("ChromSCape::exclude_features_scExp -",
                    "Feature names are not genomic coordinates"))
            regions <- SummarizedExperiment::rowRanges(scExp)
            
            suppressWarnings({
                ovrlps <- as.data.frame(
                    GenomicRanges::findOverlaps(regions, features_to_exclude))[,1]
            })
            if (length(unique(ovrlps) > 0)) scExp <- scExp[-unique(ovrlps), ]
            if (verbose) 
                message("ChromSCape::exclude_features_scExp - Removed ",
                    length(unique(ovrlps)), " regions from the analysis.")}
        if (by[1] == "feature_name"){
            if (has_genomic_coordinates(scExp)) 
                warning(
                    "ChromSCape::exclude_features_scExp - Excluding by feature
                    name while object feature names are genomic coordinates !")
            features <- rownames(scExp)
            features_to_exclude <-
                as.character(features_to_exclude[, 1])
            suppressWarnings({
                ovrlps <-
                    GenomicRanges::intersect(features, features_to_exclude)
            })
            if (length(unique(ovrlps) > 0)){
                scExp <- scExp[-which(rownames(scExp) %in% ovrlps), ]
            }
            if (verbose)
                message("ChromSCape::exclude_features_scExp - Removed ",
                    length(unique(ovrlps)), " features from the analysis.")
        }
        return(scExp)
    }

#' Preprocess scExp - Transcripts per Million (TPM)
#'
#' @param scExp A SingleCellExperiment Object
#'
#' @return A SingleCellExperiment object.
#' @importFrom GenomicRanges width
#' @importFrom SummarizedExperiment rowRanges
#' @importFrom SingleCellExperiment counts normcounts
#' @importFrom Matrix t colSums
#' @export
#' @examples 
#' 
#' raw <- create_scDataset_raw()
#' scExp = create_scExp(raw$mat, raw$annot)
#' scExp = preprocess_TPM(scExp)
#' head(SingleCellExperiment::normcounts(scExp))
#' 
preprocess_TPM <- function(scExp)
{
    size <- GenomicRanges::width(SummarizedExperiment::rowRanges(scExp))
    SummarizedExperiment::assay(scExp, "normcounts", withDimnames = FALSE) <-
        SingleCellExperiment::counts(scExp) / size
    SummarizedExperiment::assay(scExp, "normcounts", withDimnames = FALSE) <-
        10 ^ 6 * Matrix::t(Matrix::t(SingleCellExperiment::normcounts(scExp)) /
                    Matrix::colSums(SingleCellExperiment::normcounts(scExp)))
    return(scExp)
}

#' Preprocess scExp - Read per Kilobase Per Million (RPKM)
#'
#' @param scExp A SingleCellExperiment Object
#'
#' @return A SingleCellExperiment object.
#' @importFrom GenomicRanges width
#' @importFrom SummarizedExperiment rowRanges assay
#' @importFrom SingleCellExperiment counts normcounts
#' @importFrom Matrix t colSums
#' @export
#' @examples 
#' 
#' raw <- create_scDataset_raw()
#' scExp = create_scExp(raw$mat, raw$annot)
#' scExp = preprocess_RPKM(scExp)
#' head(SingleCellExperiment::normcounts(scExp))
preprocess_RPKM <- function(scExp)
{
    SummarizedExperiment::assay(scExp, "normcounts", withDimnames = FALSE) <-
        10 ^ 9 * Matrix::t(
            Matrix::t(SingleCellExperiment::counts(scExp)) /
                Matrix::colSums(SingleCellExperiment::counts(scExp))
        )
    size <-
        GenomicRanges::width(SummarizedExperiment::rowRanges(scExp))
    SummarizedExperiment::assay(scExp, "normcounts", withDimnames = FALSE) <-
        SingleCellExperiment::normcounts(scExp) / size
    
    return(scExp)
}

#' Preprocess scExp - Counts Per Million (CPM)
#'
#' @param scExp A SingleCellExperiment Object
#'
#' @return A SingleCellExperiment object.
#' @importFrom SummarizedExperiment assay
#' @importFrom SingleCellExperiment counts normcounts
#' @importFrom Matrix t colSums
#' @export
#' @examples 
#' raw <- create_scDataset_raw()
#' scExp = create_scExp(raw$mat, raw$annot)
#' scExp = preprocess_CPM(scExp)
#' head(SingleCellExperiment::normcounts(scExp))
#' 
preprocess_CPM <- function(scExp)
{
    SummarizedExperiment::assay(scExp, "normcounts", withDimnames = FALSE) <- 
        10 ^ 6 * Matrix::t(Matrix::t(SingleCellExperiment::counts(scExp)) /
                            Matrix::colSums(SingleCellExperiment::counts(scExp))
        )
    return(scExp)
}


#' Preprocess scExp - TF-IDF
#'
#' @param scExp A SingleCellExperiment Object
#' @param scale A numeric to multiply the matrix in order to have human readeable
#' numbers. Has no impact on the downstream analysis
#' @param log Wether to use neperian log on the TF-IDF normalized data or not.
#'
#' @return A SingleCellExperiment object.
#' @importFrom GenomicRanges width
#' @importFrom SummarizedExperiment rowRanges assay
#' @importFrom SingleCellExperiment counts normcounts
#' @importFrom Matrix t colSums
#' @export
#' @examples 
#' raw <- create_scDataset_raw()
#' scExp = create_scExp(raw$mat, raw$annot)
#' scExp = preprocess_TFIDF(scExp)
#' head(SingleCellExperiment::normcounts(scExp))
#'
preprocess_TFIDF <- function(scExp, scale = 10000, log = TRUE)
{
        counts = SingleCellExperiment::counts(scExp)
        n_features <- Matrix::colSums(counts)
        tf <- Matrix::t(Matrix::t(counts) / n_features)
        idf <- 1 + ncol(counts) / Matrix::rowSums(counts)
        normcounts <- Matrix::Diagonal(length(idf), idf) %*% tf
        if(log) normcounts = log1p(normcounts * scale) else normcounts = 
            normcounts * scale
        SummarizedExperiment::assay(scExp, "normcounts",
                                    withDimnames = FALSE) <- normcounts
        return(scExp)
}

#' Preprocess scExp - size only
#'
#' @param scExp A SingleCellExperiment Object
#'
#' @return A SingleCellExperiment object.
#' @importFrom GenomicRanges width
#' @importFrom SummarizedExperiment rowRanges assay
#' @importFrom SingleCellExperiment counts normcounts
#' @importFrom Matrix t colSums
#' @export
#' @examples 
#' raw <- create_scDataset_raw()
#' scExp = create_scExp(raw$mat, raw$annot)
#' scExp = preprocess_feature_size_only(scExp)
#' head(SingleCellExperiment::normcounts(scExp))
#'
preprocess_feature_size_only <- function(scExp)
{
    size <- GenomicRanges::width(SummarizedExperiment::rowRanges(scExp))
    SummarizedExperiment::assay(scExp, "normcounts", withDimnames = FALSE) <-
        SingleCellExperiment::counts(scExp) / size
    return(scExp)
}

#' Normalize counts
#'
#' @param scExp A SingleCellExperiment object.
#' @param type Which normalization to apply. Either 'CPM', 'TFIDF','RPKM', 'TPM' or
#'  'feature_size_only'. Note that for all normalization by size
#'  (RPKM, TPM, feature_size_only), the features must have defined
#'  genomic coordinates.
#'
#' @return A SingleCellExperiment object containing normalized counts.
#'  (See ?normcounts())
#' @export
#'
#' @examples 
#' raw <- create_scDataset_raw()
#' scExp = create_scExp(raw$mat, raw$annot)
#' scExp = normalize_scExp(scExp)
#' head(SingleCellExperiment::normcounts(scExp))
#'
normalize_scExp <- function(scExp,
                            type = c("CPM", "TFIDF", "RPKM", "TPM",
                                    "feature_size_only"))
{
    stopifnot(
        type[1] %in% c("CPM", "TFIDF", "RPKM", "TPM", "feature_size_only"),
        is(scExp, "SingleCellExperiment")
    )
    if (!is(SingleCellExperiment::counts(scExp), "dgCMatrix")) {
        SingleCellExperiment::counts(scExp) <- as(
            SingleCellExperiment::counts(scExp), "dgCMatrix")
    }
    
    if (!has_genomic_coordinates(scExp))
    {
        warning(
            "ChromSCape::normalize_scExp - Switching to CPM normalization
            as features are not genomic coordinates."
        )
        type <- "CPM"
    }
    switch(
        type[1],
        RPKM = return(preprocess_RPKM(scExp)),
        TPM = return(preprocess_TPM(scExp)),
        feature_size_only = return(preprocess_feature_size_only(scExp)),
        CPM = return(preprocess_CPM(scExp)),
        TFIDF = return(preprocess_TFIDF(scExp))
    )
}


#' Add gene annotations to features
#'
#' @param scExp A SingleCellExperiment object.
#' @param ref Reference genome. Either 'hg38', 'mm10' or 'ce11'. ('hg38')
#' @param reference_annotation A data.frame containing gene (or else) annotation
#'   with genomic coordinates.
#'
#' @return A SingleCellExperiment object with annotated rowData.
#' @export
#'
#' @importFrom GenomicRanges makeGRangesFromDataFrame distanceToNearest
#' @importFrom SummarizedExperiment rowRanges
#' @importFrom dplyr mutate select group_by summarise_all
#'
#' @examples
#' raw <- create_scDataset_raw()
#' scExp = create_scExp(raw$mat, raw$annot)
#' scExp = feature_annotation_scExp(scExp)
#' head(SummarizedExperiment::rowRanges(scExp))
#'
#' # Mouse
#' raw = create_scDataset_raw(ref = "mm10")
#' scExp = create_scExp(raw$mat, raw$annot)
#' scExp = feature_annotation_scExp(scExp,ref="mm10")
#' head(SummarizedExperiment::rowRanges(scExp))
feature_annotation_scExp <- function(scExp, ref = "hg38",
                                    reference_annotation = NULL)
{
    stopifnot(is(scExp, "SingleCellExperiment"), is.character(ref))
    if (is.null(SummarizedExperiment::rowRanges(scExp)))
        stop("ChromSCape::feature_annotation_scExp - The object doesn't have
            ranges of coordinates as rowData")
    if (is.null(reference_annotation) & !(ref %in% c("hg38", "mm10", "ce11")))
        stop("ChromSCape::feature_annotation_scExp - If reference_annotation is
            null, ref must be either 'hg38' or 'mm10' to automatically load
            reference gene annotation.")
    if (is.null(reference_annotation) & (ref %in% c("hg38", "mm10", "ce11")))
    {
        message("ChromSCape::feature_annotation_scExp - Selecting ",
                ref, " genes from Gencode.")
        eval(parse(text = paste0("data(", ref, ".GeneTSS)")))
        reference_annotation <- eval(parse(text = paste0("", ref, ".GeneTSS")))
        start = reference_annotation$start
        reference_annotation$start = ifelse(reference_annotation$strand == "+",
                                            reference_annotation$start,
                                            reference_annotation$end)
        reference_annotation$end = ifelse(reference_annotation$strand == "+",
                                start + 1,
                                reference_annotation$end + 1)
        reference_annotation$strand = NULL
    }
    if (is.data.frame(reference_annotation)) 
        reference_annotation <-
            GenomicRanges::makeGRangesFromDataFrame(reference_annotation,
                                                    keep.extra.columns = TRUE)
    
    feature_ranges <- SummarizedExperiment::rowRanges(scExp)[, 1]
    hits <- GenomicRanges::distanceToNearest(feature_ranges, 
                                                reference_annotation,
                                                ignore.strand = TRUE,
                                                select = "all")
    q_hits <- S4Vectors::queryHits(hits)
    s_hits <- S4Vectors::subjectHits(hits)
    annotFeat <- data.frame(
        "chr" = as.character(GenomicRanges::seqnames(feature_ranges[q_hits])),
        "start" = as.character(GenomicRanges::start(feature_ranges[q_hits])),
        "end" = as.character(GenomicRanges::end(feature_ranges[q_hits])),
        "Gene" = as.character(
            reference_annotation@elementMetadata$Gene)[s_hits],
        "distanceToTSS" = hits@elementMetadata$distance)
    annotFeat <- annotFeat %>% dplyr::mutate(
        "ID" = paste(.data$chr, .data$start, .data$end, sep = "_")) %>%
        dplyr::select("ID", "chr", "start", "end", "Gene", "distanceToTSS")
    annotFeat <- annotFeat %>% dplyr::group_by(.data$ID, .data$chr,
                                                .data$start, .data$end) %>% 
        dplyr::summarise("Gene" = paste(.data$Gene, collapse = ", "),
                            "distanceToTSS" = max(.data$distanceToTSS)) %>%
        as.data.frame()
    annotFeat <- annotFeat[match(rownames(scExp), annotFeat$ID), ]
    duplicated_features = which(duplicated(annotFeat$ID))
    if(length(duplicated_features) > 0){
      message("ChromSCape::feature_annotation_scExp - ",
              length(duplicated_features), " features",
              " are duplicated, unifying theses features...")
      annotFeat = annotFeat[-duplicated_features,]
      scExp = scExp[-duplicated_features,]
    } 
    rownames(annotFeat) <- annotFeat$ID
    if(all(c("ID","Gene","distanceToTSS") %in%
           colnames(SummarizedExperiment::rowData(scExp)))){
        SummarizedExperiment::rowData(scExp) <- annotFeat[,c("ID","Gene","distanceToTSS")]
    } else{
        SummarizedExperiment::rowData(scExp) <- cbind(
            SummarizedExperiment::rowData(scExp),
            annotFeat[,c("Gene","distanceToTSS")]
        )
    }
    
    return(scExp)
}

#' Choose perplexity depending on number of cells for Tsne
#'
#' @param dataset A matrix of features x cells (rows x columns)
#'
#' @return A number between 5 and 30 to use in Rtsne function
choose_perplexity <- function(dataset)
{
    stopifnot(!is.null(dataset), !is.null(dim(dataset)))
    perplexity <- 30
    if (nrow(dataset) <= 200)
    {
        perplexity <- 20
    }
    if (nrow(dataset) <= 250)
    {
        perplexity <- 25
    }
    if (nrow(dataset) <= 150)
    {
        perplexity <- 15
    }
    if (nrow(dataset) <= 100)
    {
        perplexity <- 10
    }
    if (nrow(dataset) <= 50)
    {
        perplexity <- 5
    }
    perplexity
}

#' Reduce dimensions (PCA, TSNE, UMAP)
#'
#' @param scExp A SingleCellExperiment object.
#' @param dimension_reductions A character vector of methods to apply.
#' (c('PCA','TSNE','UMAP'))
#' @param n Numbers of dimensions to keep for PCA. (50)
#' @param batch_correction Do batch correction ? (FALSE)
#' @param batch_list List of characters. Names are batch names, characters are
#'  sample names.
#' @param remove_PC A vector of string indicating which principal components to 
#' remove before downstream analysis as probably correlated to 
#' library size. Should be under the form : 'Component_1', 'Component_2', ...
#' Recommended when using 'TFIDF' normalization method. (NULL)
#' @param verbose Print messages ?(TRUE)
#'
#' @return A SingleCellExperiment object containing feature spaces. See
#'  ?reduceDims().
#'
#' @export
#'

#' @importFrom Rtsne Rtsne
#' @importFrom umap umap umap.defaults
#' @importFrom SummarizedExperiment assays
#' @importFrom SingleCellExperiment counts normcounts reducedDims
#' @importFrom batchelor fastMNN
#' @importFrom Matrix t
#' 
#' @examples 
#' 
#' raw <- create_scDataset_raw()
#' scExp = create_scExp(raw$mat, raw$annot)
#' scExp =  normalize_scExp(scExp, "CPM")
#' scExp = reduce_dims_scExp(scExp,dimension_reductions=c("PCA","UMAP"))
#' 
reduce_dims_scExp <-
    function(scExp, dimension_reductions = c("PCA", "UMAP"), n = 10,
            batch_correction = FALSE, batch_list = NULL,
            remove_PC = NULL, verbose = TRUE)
    {
        stopifnot(is(scExp, "SingleCellExperiment"), is.numeric(n),
            dimension_reductions[1] %in% c("PCA", "TSNE", "UMAP"))
        
        if(!is.null(remove_PC)) stopifnot(is.character(remove_PC), 
                                          all(remove_PC %in% paste0("Component_",1:100)))
        if (!"normcounts" %in% names(SummarizedExperiment::assays(scExp))){
            warning("ChromSCape::reduce_dims_scExp - The raw counts are not
                normalized, running dimensionality reduction on raw counts.")
            mat <- SingleCellExperiment::counts(scExp)
        } else{
            mat <- SingleCellExperiment::normcounts(scExp)
            }
        if (batch_correction && !is.list(batch_list)){
            stop("ChromSCape::reduce_dims_scExp - If doing batch correction,
                batch_list must be a list of the samples IDs of each batch.")}
        if (batch_correction){
            out <- reduce_dim_batch_correction(scExp, mat, batch_list, n)
            scExp <- out$scExp
            pca <- out$pca
        } else{
            scExp$batch_id <- factor(1)
            if ( is(mat, "dgCMatrix") | is(mat, "dgTMatrix")) {
                pca <- pca_irlba_for_sparseMatrix(Matrix::t(mat), n)
            } else{
                pca <- stats::prcomp(Matrix::t(mat),center = TRUE,
                                    scale. = FALSE)
                pca <- pca$x[, seq_len(n)]}}
        pca <- as.data.frame(as.matrix(pca))
        colnames(pca) <- paste0("Component_", seq_len(ncol(pca)))
        if(!is.null(remove_PC)) {
            pca <- pca[,-which(colnames(pca) %in% remove_PC)]
            if(verbose) message("ChromSCape::reduce_dims_scExp - removing ",
                                paste(remove_PC, collapse = ", "), " as probably correlated to library size.")
        }
        rownames(pca) <- colnames(scExp)
        if ("TSNE" %in% dimension_reductions){
            tsne <- Rtsne::Rtsne(pca, dims = 2, pca = FALSE, theta = 0,
                perplexity = choose_perplexity(pca), verbose = verbose,
                max_iter = 1000)
            tsne <- as.data.frame(tsne$Y)
            rownames(tsne) <- rownames(pca)
            colnames(tsne) <- c("Component_1", "Component_2")
        }
        if ("UMAP" %in% dimension_reductions){
            config <- umap::umap.defaults
            config$metric <- "cosine"
            umap <- umap::umap(pca, config = config, method = "naive")
            umap <- as.data.frame(umap$layout)
            colnames(umap) <- c("Component_1", "Component_2")
        }
        listReducedDim <- list(PCA = pca)
        if ("TSNE" %in% dimension_reductions) listReducedDim$TSNE <- tsne
        if ("UMAP" %in% dimension_reductions) listReducedDim$UMAP <- umap
        SingleCellExperiment::reducedDims(scExp) <- listReducedDim
        return(scExp)
    }

#' Reduce dimension with batch corrections
#'
#' @param scExp SingleCellExperiment
#' @param batch_list List of batches
#' @param mat The normalized count matrix
#' @param n Number of PCs to keep
#'
#' @return A list containing the SingleCellExperiment with batch info and
#' the corrected pca
reduce_dim_batch_correction <- function(scExp, mat, batch_list, n){
    print("Running Batch Correction ...")
    num_batches <- length(batch_list)
    batch_names <- names(batch_list)
    batches <- list()
    scExp. <- scExp
    scExp.$batch_name <- "unspecified"
    for (i in seq_len(num_batches))
    {
        for (s_id in batch_list[[i]])
        {
            SummarizedExperiment::colData(
                scExp.)[scExp.$sample_id == s_id,
                        "batch_name"] <- batch_names[i]
        }
    }
    adj_annot <- data.frame()
    b_names <- unique(scExp.$batch_name)
    scExp.$batch_id <- as.factor(as.numeric(as.factor(scExp.$batch_name)))
    for (i in seq_along(b_names))
    {
        b_name <- b_names[i]
        batches[[i]] <- mat[,which(scExp.$batch_name == b_name)]
        adj_annot <- rbind(adj_annot, as.data.frame(
            SummarizedExperiment::colData(
                scExp.)[scExp.$batch_name ==b_name, ]))
    }
    mnn.out <- batchelor::fastMNN(batches, k = 25, d = n, ndist = 3,
                auto.merge = FALSE, cos.norm = FALSE)
    pca <- SingleCellExperiment::reducedDim(mnn.out,"corrected")
    SummarizedExperiment::colData(scExp.) <- as(adj_annot,
                                            "DataFrame")
    out = list("scExp" = scExp., "pca"= pca)
    return(out)
}

#' Run sparse PCA using irlba SVD
#' 
#' @description 
#' This function allows to run a PCA using IRLBA Singular Value Decomposition
#' in a fast & memory efficient way. The increamental Lanczos bidiagonalisation
#' algorithm allows to keep the matrix sparse as the "loci" centering is 
#' implicit. 
#' The function then multiplies by the approximate singular values (svd$d) in 
#' order to get more importance to the first PCs proportionnally to their 
#' singular values. This step is crucial for downstream approaches, e.g. UMAP or
#' T-SNE.
#' 
#' @param x A sparse normalized matrix (features x cells)
#' @param n_comp The number of principal components to keep
#' @param work Working subspace dimension, larger values can speed convergence
#'  at the cost of more memory use.
#'
#' @return The rotated data, e.g. the cells x PC column in case of sc data.
#'
#' @importFrom irlba irlba
#' @importFrom Matrix colMeans
#'
pca_irlba_for_sparseMatrix <- function(x, n_comp, work = 3 * n_comp)
{
    x.means <- Matrix::colMeans(x)
    svd.0 <- irlba::irlba(x, center = x.means, nv = n_comp, work = work)
    pca <- svd.0$u %*% diag(svd.0$d)
    return(pca)
}

sweep_sparse <- function(x, margin, stats, fun = "*") {
    f <- match.fun(fun)
    if (margin == 1) {
        idx <- x@i + 1
    } else {
        idx <- x@p + 1
    }
    x@x <- f(x@x, stats[idx])
    return(x)
}

#' Table of cells
#'
#' @param annot An annotation of cells. Can be obtain through 'colData(scExp)'. 
#' @param datamatrix A matrix of cells per regions  before filtering.
#' 
#' @export
#' @return A formatted kable in HTML.
#' @importFrom SingleCellExperiment colData
#' @importFrom dplyr bind_rows tibble left_join n summarise
#' @importFrom kableExtra kable kable_styling group_rows
#' 
#' @examples 
#' raw <- create_scDataset_raw()
#' scExp = create_scExp(raw$mat, raw$annot)
#' \dontrun{num_cell_scExp(SingleCellExperiment::colData(scExp))}
num_cell_scExp <- function(annot, datamatrix)
{
    stopifnot(!is.null(annot), !is.null(datamatrix))
    annot$total_counts = colSums(datamatrix)
    table <- as.data.frame(
        annot %>% dplyr::group_by(sample_id) %>% 
            dplyr::summarise("#Cells" = n(),
                             "Median"= round(median(total_counts),0),
                             "Std"= round(stats::sd(total_counts),0)))
    
    rownames(table) <- NULL
    
    table[, 1] <- as.character(table[, 1])
    table[nrow(table) + 1, ] = c(
        "", sum(table[, 2]), round(median(annot$total_counts),0),
        round(stats::sd(annot$total_counts),0))
    
    table %>% kableExtra::kable(escape = FALSE, align = "c") %>%
        kableExtra::kable_styling(c("striped",
                                    "condensed"), full_width = TRUE) %>%
        kableExtra::group_rows("Total",
                            dim(table)[1], dim(table)[1])
}

#' Table of cells before / after QC
#'
#' @param scExp A SingleCellExperiment object.
#' @param annot A raw annotation data.frame of cells before filtering.
#' @param datamatrix A matrix of cells per regions  before filtering.
#'
#' @export
#' @return A formatted kable in HTML.
#' @importFrom SingleCellExperiment colData
#' @importFrom dplyr bind_rows tibble left_join n summarise
#' @importFrom kableExtra kable kable_styling group_rows
#' @importFrom kableExtra kable kable_styling group_rows
#'
#' @examples 
#' raw <- create_scDataset_raw()
#' scExp = create_scExp(raw$mat, raw$annot)
#' scExp_filtered = filter_scExp(scExp)
#' \dontrun{ num_cell_after_QC_filt_scExp(
#' scExp_filtered,as.data.frame(SingleCellExperiment::colData(scExp)), counts(scExp))}
#' 
num_cell_after_QC_filt_scExp <- function(scExp, annot, datamatrix)
{
    stopifnot(is(scExp, "SingleCellExperiment"), !is.null(annot))
    annot$total_counts = colSums(datamatrix)
    table <- as.data.frame(
        annot %>% dplyr::group_by(sample_id) %>% 
        dplyr::summarise("#Cells" = n(),
                         "Median"= round(median(total_counts),0),
                         "Std"= round(stats::sd(total_counts),0)))
    table_filtered <-
        as.data.frame(SingleCellExperiment::colData(scExp))
    table_filtered <- as.data.frame(
        table_filtered %>% dplyr::group_by(sample_id) %>% 
            dplyr::summarise("#Cells (filt.)" = n(),
                             "Median (filt.)"= round(median(total_counts),0),
                             "Std (filt.)"= round(stats::sd(total_counts),0)))
    
    colnames(table)[1] <- c("Sample")
    rownames(table) <- NULL
    colnames(table_filtered) [1] <- c("Sample")
    rownames(table_filtered) <- NULL
    
    table_both <-
        dplyr::left_join(table, table_filtered, by = c("Sample"))
    table_both[, 1] <- as.character(table_both[, 1])
    table_both <- dplyr::bind_rows(
        table_both,
        dplyr::tibble(
            "Sample" = "",
            "#Cells" = sum(table_both[,2]),
            "Median"= round(median(annot$total_counts),0),
            "Std"= round(stats::sd(annot$total_counts),0),
            "#Cells (filt.)" = sum(table_both[, 5]),
            "Median (filt.)"= round(median(scExp$total_counts),0),
            "Std (filt.)"= round(stats::sd(scExp$total_counts),0)))
    table_both %>% kableExtra::kable(escape = FALSE, align = "c") %>%
        kableExtra::kable_styling(c("striped",
                                    "condensed"), full_width = TRUE) %>%
        kableExtra::group_rows("Total",
                            dim(table_both)[1], dim(table_both)[1])
}

#' Subsample scExp
#'
#' Randomly sample x cells from each sample in a SingleCellExperiment to return
#' a subsampled SingleCellExperiment with all samples having maximum n cells. If
#' n is higher than the number of cell in a sample, this sample will not be
#' subsampled.
#'
#' @param scExp A SingleCellExperiment
#' @param n_cell_per_sample An integer number of cells to subsample for each sample.
#'  Exclusive with n_cells_total. (500)
#' @param n_cell_total An integer number of cells to subsample in total. Exclusive
#' with n_cell_per_sample (NULL).
#'
#' @return A subsampled SingleCellExperiment
#' @export
#'
#' @importFrom SingleCellExperiment colData
#' 
#' @examples 
#' raw <- create_scDataset_raw()
#' scExp = create_scExp(raw$mat, raw$annot)
#' scExp_sub = subsample_scExp(scExp,50)
#' \dontrun{num_cell_scExp(scExp_sub)}
#' 
subsample_scExp <- function(scExp, n_cell_per_sample = 500, n_cell_total = NULL) {
    stopifnot(is(scExp, "SingleCellExperiment"), is.numeric(n_cell_per_sample))
    
    if(!is.null(n_cell_total) & !is.numeric(n_cell_total)) stop(
      "ChromSCape::subsample_scExp - n_cell_total must be numeric.")
  
    annot = as.data.frame(SingleCellExperiment::colData(scExp))
    counts = SingleCellExperiment::counts(scExp)
    samples = as.character(unique(annot$sample_id))
    counts. = NULL
    annot. = NULL

    
    if(is.numeric(n_cell_total)){
      message("ChromSCape::subsample_scExp - Subsampling cells to a total of ",
              n_cell_total)
      cells = as.character(annot$cell_id)

      cells = sample(cells, min(n_cell_total, length(cells)), replace = FALSE)
    
      counts. = counts[, cells]
      annot. = annot[match(cells, annot$cell_id),]
    } else{
      message("ChromSCape::subsample_scExp - Subsampling each sample to ", n_cell_per_sample)
      for (samp in samples) {
        cells = as.character(annot$cell_id[which(annot$sample_id == samp)])
        cells = sample(cells, min(n_cell_per_sample, length(cells)), replace = FALSE)
        if (is.null(counts.))
          counts. = counts[, cells]
        else
          counts. = Matrix::cbind2(counts., counts[, cells])
        if (is.null(annot.))
          annot. = annot[match(cells, annot$cell_id),]
        else
          annot. = Matrix::rbind2(annot., annot[match(cells, annot$cell_id),])
      }
    }
    
    ord = order(colnames(counts.))
    ord2 = order(rownames(annot.))

    counts. = counts.[, ord]
    annot. = annot.[ord2,]
    scExp. = create_scExp(
        counts.,
        annot.,
        remove_zero_cells = FALSE,
        remove_zero_features = FALSE,
        remove_non_canonical = FALSE,
        remove_chr_M = FALSE,
        verbose = FALSE
    )
    
    return(scExp.)
}
vallotlab/ChromSCape documentation built on Oct. 15, 2023, 1:47 p.m.