R/utilities.R
In Signac: Analysis of Single-Cell Chromatin Data

Documented in AccessiblePeaks AddChromatinModule AverageCounts CellsPerGroup ClosestFeature corSparse CountsInRegion Extend FractionCountsInRegion GeneActivity GetCellsInRegion GetGRangesFromEnsDb GetIntersectingFeatures GetTSSPositions GRangesToString IntersectMatrix LookupGeneCoords MatchRegionStats StringToGRanges SubsetMatrix UnifyPeaks

#' @include generics.R
#' @importFrom utils globalVariables
NULL


#' Add chromatin module
#'
#' Compute chromVAR deviations for groups of peaks. The goal of this function is
#' similar to that of \code{\link[Seurat]{AddModuleScore}} except that it is
#' designed for single-cell chromatin data. The chromVAR deviations for each
#' group of peaks will be added to the object metadata.
#'
#' @param object A Seurat object
#' @param features A named list of features to include in each module. The name
#' of each element in the list will be used to name the modules computed, which
#' will be stored in the object metadata.
#' @param genome A BSgenome object
#' @param assay Name of assay to use. If NULL, use the default assay.
#' @param verbose Display messages
#' @param ... Additional arguments passed to \code{RunChromVAR}
#'
#' @return Returns a Seurat object
#'
#' @importFrom fastmatch fmatch
#' @importFrom Matrix sparseMatrix
#' @importFrom SeuratObject DefaultAssay GetAssayData AddMetaData
#'
#' @export
#' @concept utilities
AddChromatinModule <- function(
  object,
  features,
  genome,
  assay = NULL,
  verbose = TRUE,
  ...
) {
  assay <- SetIfNull(x = assay, y = DefaultAssay(object = object))
  if (!inherits(x = object[[assay]], what = "ChromatinAssay")) {
    stop("The requested assay is not a ChromatinAssay.")
  }

  # first find index of each feature
  feat.idx <- sapply(X = features, FUN = fmatch, rownames(x = object[[assay]]))
  j <- sapply(X = seq_along(along.with = features), FUN = function(x) {
    rep(x = x, length(x = features[[x]]))
  })

  # construct sparse matrix with features
  mat <- sparseMatrix(
    i = unlist(x = feat.idx, use.names = FALSE),
    j = unlist(x = j, use.names = FALSE),
    x = 1,
    dims = c(nrow(x = object[[assay]]), length(x = features))
  )
  rownames(x = mat) <- rownames(x = object[[assay]])
  colnames(x = mat) <- names(x = features)

  # run chromVAR
  cv <- RunChromVAR(
    object = object[[assay]],
    motif.matrix = mat,
    genome = genome,
    verbose = verbose,
    ...
  )

  # add module scores to metadata
  chromvar.data <- GetAssayData(object = cv, slot = "data")
  object <- AddMetaData(
    object = object,
    metadata = as.data.frame(x = t(x = chromvar.data))
  )
  return(object)
}

globalVariables(names = c("group", "readcount"), package = "Signac")
#' Average Counts
#'
#' Compute the mean counts per group of cells for a given assay
#'
#' @param object A Seurat object
#' @param assay Name of assay to use. Default is the active assay
#' @param group.by Grouping variable to use. Default is the active identities
#' @param verbose Display messages
#'
#' @importFrom SeuratObject DefaultAssay Idents GetAssayData
#' @importFrom dplyr group_by summarize
#' @export
#' @concept utilities
#' @return Returns a dataframe
#' @examples
#' AverageCounts(atac_small)
AverageCounts <- function(
  object,
  assay = NULL,
  group.by = NULL,
  verbose = TRUE
) {
  assay <- SetIfNull(x = assay, y = DefaultAssay(object = object))
  if (is.null(x = group.by)) {
    group.by <- Idents(object = object)
  } else {
    group.by <- object[[group.by, drop = TRUE]]
  }
  # pull nCount_ column
  col.use <- paste0("nCount_", assay)
  total.df <- object[[col.use]]
  colnames(x = total.df) <- "readcount"
  total.df$cell <- rownames(x = total.df)
  total.df$group <- group.by[total.df$cell]
  total.df <- group_by(total.df, group)
  if (verbose) {
    message("Computing average counts per group")
  }
  group.means <- summarize(.data = total.df, mn = mean(x = readcount))
  results <- group.means$mn
  names(x = results) <- group.means$group
  return(results)
}

#' Accessible peaks
#'
#' Find accessible peaks in a set of cells
#'
#' @param object A Seurat object
#' @param assay Name of assay to use
#' @param idents A set of identity classes to find accessible peaks for
#' @param cells A vector of cells to find accessible peaks for
#' @param min.cells Minimum number of cells with the peak accessible (>0 counts)
#' for the peak to be called accessible
#' @export
#' @concept utilities
#' @importFrom SeuratObject WhichCells DefaultAssay GetAssayData
#' @importFrom Matrix rowSums
#' @return Returns a vector of peak names
AccessiblePeaks <- function(
  object,
  assay = NULL,
  idents = NULL,
  cells = NULL,
  min.cells = 10
) {
  assay <- SetIfNull(x = assay, y = DefaultAssay(object = object))
  cells <- SetIfNull(x = cells, y = WhichCells(object, idents = idents))
  open.peaks <- GetAssayData(
    object = object,
    assay = assay,
    slot = "counts"
  )[, cells]
  peaks <- names(x = which(x = rowSums(x = open.peaks > 0) > min.cells))
  return(peaks)
}

#' Cells per group
#'
#' Count the number of cells in each group
#'
#' @param object A Seurat object
#' @param group.by A grouping variable. Default is the active identities
#' @importFrom SeuratObject Idents
#' @export
#' @concept utilities
#' @return Returns a vector
#' @examples
#' CellsPerGroup(atac_small)
CellsPerGroup <- function(
  object,
  group.by = NULL
) {
  if (is.null(x = group.by)) {
    cellgroups <- Idents(object = object)
  } else {
    meta.data <- object[[]]
    cellgroups <- meta.data[[group.by]]
  }
  cells.per.group <- table(cellgroups, useNA = "always")
  lut <- as.vector(x = cells.per.group)
  names(x = lut) <- names(x = cells.per.group)
  return(lut)
}

#' Closest Feature
#'
#' Find the closest feature to a given set of genomic regions
#'
#' @param object A Seurat object
#' @param regions A set of genomic regions to query
#' @param annotation A GRanges object containing annotation information. If
#' NULL, use the annotations stored in the object.
#' @param ... Additional arguments passed to \code{\link{StringToGRanges}}
#'
#' @importMethodsFrom GenomicRanges distanceToNearest
#' @importFrom S4Vectors subjectHits mcols
#' @importFrom methods is
#' @importFrom SeuratObject DefaultAssay
#' @importFrom GenomeInfoDb dropSeqlevels
#' @return Returns a dataframe with the name of each region, the closest feature
#' in the annotation, and the distance to the feature.
#' @export
#' @concept utilities
#' @examples
#' \donttest{
#' ClosestFeature(
#'   object = atac_small,
#'   regions = head(granges(atac_small))
#' )
#' }
ClosestFeature <- function(
  object,
  regions,
  annotation = NULL,
  ...
) {
  if (!is(object = regions, class2 = 'GRanges')) {
    regions <- StringToGRanges(regions = regions, ...)
  }
  if (inherits(x = object, what = "Seurat")) {
    # running on Seurat object, extract the assay
    assay <- DefaultAssay(object = object)
    object <- object[[assay]]
  }
  if (!inherits(x = object, what = "ChromatinAssay")) {
    stop("The requested assay is not a ChromatinAssay.")
  }
  if (length(x = regions) == 0) {
    stop("No query regions supplied")
  }
  annotation <- SetIfNull(x = annotation, y = Annotation(object = object))
  missing_seqlevels <- setdiff(
    x = seqlevels(x = regions), y = seqlevels(x = annotation)
  )
  if (length(x = missing_seqlevels) > 0) {
    warning(
      "The following seqlevels present in query regions are not present\n ",
      "in the supplied gene annotations and will be removed: ",
      paste(missing_seqlevels, collapse = ", ")
    )
    regions <- dropSeqlevels(
      x = regions,
      value = missing_seqlevels,
      pruning.mode = "coarse"
    )
    if (length(x = regions) == 0) {
      stop("None of the supplied regions were found in the supplied annotation")
    }
  }
  nearest_feature <- distanceToNearest(x = regions, subject = annotation)
  feature_hits <- annotation[subjectHits(x = nearest_feature)]
  df <- as.data.frame(x = mcols(x = feature_hits))
  df$closest_region <- GRangesToString(grange = feature_hits, ...)
  df$query_region <- GRangesToString(grange = regions, ...)
  df$distance <- mcols(x = nearest_feature)$distance
  return(df)
}


#' Sparse matrix correlation
#' 
#' Compute the Pearson correlation matrix between
#' columns of two sparse matrices.
#' 
#' Originally from 
#' \url{https://stackoverflow.com/questions/5888287/running-cor-or-any-variant-over-a-sparse-matrix-in-r}
#' and the qlcMatrix package.
#' 
#' @param X A matrix
#' @param Y A matrix
#' @param cov return covariance matrix
#' @export
#' @keywords internal
#' @author Michael Cysouw, Karsten Looschen
#' @importMethodsFrom Matrix colMeans
# covmat uses E[(X-muX)'(Y-muY)] = E[X'Y] - muX'muY
# with sample correction n/(n-1) this leads to cov = ( X'Y - n*muX'muY ) / (n-1)
#
# the sd in the case Y!=NULL uses E[X-mu]^2 = E[X^2]-mu^2
# with sample correction n/(n-1) this leads to sd^2 = ( X^2 - n*mu^2 ) / (n-1)
#
# Note that results larger than 1e4 x 1e4 will become very slow, because the resulting matrix is not sparse anymore.
corSparse <- function(X, Y = NULL, cov = FALSE) {
  X <- as(object = X, Class = "CsparseMatrix")
  n <- nrow(x = X)
  muX <- colMeans(x = X)
  
  if (!is.null(x = Y)) {
    if (nrow(x = X) != nrow(x = Y)) {
      stop("Matrices must contain the same number of rows")
    }
    Y <- as(object = Y, Class = "CsparseMatrix")
    muY <- colMeans(x = Y)
    covmat <- ( as.matrix(x = crossprod(x = X, y = Y)) - n * tcrossprod(x = muX, y = muY) ) / (n-1)
    sdvecX <- sqrt( (colSums(x = X^2) - n*muX^2) / (n-1) )
    sdvecY <- sqrt( (colSums(x = Y^2) - n*muY^2) / (n-1) )
    cormat <- covmat / tcrossprod(x = sdvecX, y = sdvecY)
  } else {		
    covmat <- ( as.matrix(crossprod(x = X)) - n * tcrossprod(x = muX) ) / (n-1)
    sdvec <- sqrt(x = diag(x = covmat))
    cormat <- covmat / tcrossprod(x = sdvec)
  }
  if (cov) {
    dimnames(x = covmat) <- NULL
    return(covmat)
  } else {
    dimnames(x = cormat) <- NULL
    return(cormat)
  }
}

#' Create gene activity matrix
#'
#' Compute counts per cell in gene body and promoter region.
#'
#' @param object A Seurat object
#' @param assay Name of assay to use. If NULL, use the default assay
#' @param features Genes to include. If NULL, use all protein-coding genes in
#' the annotations stored in the object
#' @param extend.upstream Number of bases to extend upstream of the TSS
#' @param extend.downstream Number of bases to extend downstream of the TTS
#' @param biotypes Gene biotypes to include. If NULL, use all biotypes in the
#' gene annotation.
#' @param max.width Maximum allowed gene width for a gene to be quantified.
#' Setting this parameter can avoid quantifying extremely long transcripts that
#' can add a relatively long amount of time. If NULL, do not filter genes based
#' on width.
#' @param process_n Number of regions to load into memory at a time, per thread.
#' Processing more regions at once can be faster but uses more memory.
#' @param gene.id Record gene IDs in output matrix rather than gene name.
#' @param verbose Display messages
#'
#' @return Returns a sparse matrix
#'
#' @concept utilities
#' @export
#' @importFrom SeuratObject DefaultAssay
#' @examples
#' fpath <- system.file("extdata", "fragments.tsv.gz", package="Signac")
#' fragments <- CreateFragmentObject(
#'   path = fpath,
#'   cells = colnames(atac_small),
#'   validate.fragments = FALSE
#' )
#' Fragments(atac_small) <- fragments
#' GeneActivity(atac_small)
GeneActivity <- function(
  object,
  assay = NULL,
  features = NULL,
  extend.upstream = 2000,
  extend.downstream = 0,
  biotypes = "protein_coding",
  max.width = 500000,
  process_n = 2000,
  gene.id = FALSE,
  verbose = TRUE
) {
  if (!is.null(x = features)) {
    if (length(x = features) == 0) {
      stop("Empty list of features provided")
    }
  }
  # collapse to longest protein coding transcript
  assay <- SetIfNull(x = assay, y = DefaultAssay(object = object))
  if (!inherits(x = object[[assay]], what = "ChromatinAssay")) {
    stop("The requested assay is not a ChromatinAssay.")
  }
  annotation <- Annotation(object = object[[assay]])
  # replace NA names with gene ID
  annotation$gene_name <- ifelse(
    test = is.na(x = annotation$gene_name) | (annotation$gene_name == ""),
    yes = annotation$gene_id,
    no = annotation$gene_name
  )
  if (length(x = annotation) == 0) {
    stop("No gene annotations present in object")
  }
  if (verbose) {
    message("Extracting gene coordinates")
  }
  transcripts <- CollapseToLongestTranscript(ranges = annotation)
  if (gene.id) {
    transcripts$gene_name <- transcripts$gene_id
  }
  if (!is.null(x = biotypes)) {
    transcripts <- transcripts[transcripts$gene_biotype %in% biotypes]
    if (length(x = transcripts) == 0) {
      stop("No genes remaining after filtering for requested biotypes")
    }
  }

  # filter genes if provided
  if (!is.null(x = features)) {
    transcripts <- transcripts[transcripts$gene_name %in% features]
    if (length(x = transcripts) == 0) {
      stop("None of the requested genes were found in the gene annotation")
    }
  }
  if (!is.null(x = max.width)) {
    transcript.keep <- which(x = width(x = transcripts) < max.width)
    transcripts <- transcripts[transcript.keep]
    if (length(x = transcripts) == 0) {
      stop("No genes remaining after filtering for max.width")
    }
  }

  # extend to include promoters
  transcripts <- Extend(
    x = transcripts,
    upstream = extend.upstream,
    downstream = extend.downstream
  )

  # quantify
  frags <- Fragments(object = object[[assay]])
  if (length(x = frags) == 0) {
    stop("No fragment information found for requested assay")
  }
  cells <- colnames(x = object[[assay]])
  counts <- FeatureMatrix(
    fragments = frags,
    features = transcripts,
    process_n = process_n,
    cells = cells,
    verbose = verbose
  )
  # set row names
  gene.key <- transcripts$gene_name
  names(x = gene.key) <- GRangesToString(grange = transcripts)
  rownames(x = counts) <- as.vector(x = gene.key[rownames(x = counts)])
  counts <- counts[rownames(x = counts) != "", ]

  return(counts)
}

#' Extract genomic ranges from EnsDb object
#'
#' Pulls the transcript information for all chromosomes from an EnsDb object.
#' This wraps \code{\link[biovizBase]{crunch}} and applies the extractor
#' function to all chromosomes present in the EnsDb object.
#'
#' @param ensdb An EnsDb object
#' @param standard.chromosomes Keep only standard chromosomes
#' @param biotypes Biotypes to keep
#' @param verbose Display messages
#'
#' @importFrom GenomeInfoDb keepStandardChromosomes seqinfo
#' @importFrom pbapply pblapply
#' @concept utilities
#' @export
GetGRangesFromEnsDb <- function(
  ensdb,
  standard.chromosomes = TRUE,
  biotypes = c("protein_coding", "lincRNA", "rRNA", "processed_transcript"),
  verbose = TRUE
) {
  if (!requireNamespace("biovizBase", quietly = TRUE)) {
    stop("Please install biovizBase\n",
         "https://www.bioconductor.org/packages/biovizBase/")
  }
  # convert seqinfo to granges
  whole.genome <-  as(object = seqinfo(x = ensdb), Class = "GRanges")
  if (standard.chromosomes) {
    whole.genome <- keepStandardChromosomes(whole.genome, pruning.mode = "coarse")
  }

  # extract genes from each chromosome
  my_lapply <- ifelse(test = verbose, yes = pblapply, no = lapply)
  tx <- my_lapply(X = seq_along(whole.genome), FUN = function(x){
        suppressMessages(expr = biovizBase::crunch(
          obj = ensdb,
          which = whole.genome[x],
          columns = c("tx_id", "gene_name", "gene_id", "gene_biotype")))
      })
  
  # combine
  tx <- do.call(what = c, args = tx)
  tx <- tx[tx$gene_biotype %in% biotypes]
  return(tx)
}

#' Find transcriptional start sites
#'
#' Get the TSS positions from a set of genomic ranges containing gene positions.
#' Ranges can contain exons, introns, UTRs, etc, rather than the whole
#' transcript. Only protein coding gene biotypes are included in output.
#'
#' @param ranges A GRanges object containing gene annotations.
#' @param biotypes Gene biotypes to include. If NULL, use all biotypes in the
#' supplied gene annotation.
#' @importFrom GenomicRanges resize
#' @importFrom S4Vectors mcols
#' @export
#' @concept utilities
GetTSSPositions <- function(ranges, biotypes = "protein_coding") {
  if (!("gene_biotype" %in% colnames(x = mcols(x = ranges)))) {
    stop("Gene annotation does not contain gene_biotype information")
  }
  if (!is.null(x = biotypes)){
    ranges <- ranges[ranges$gene_biotype == "protein_coding"]
  }
  gene.ranges <- CollapseToLongestTranscript(ranges = ranges)
  # shrink to TSS position
  tss <- resize(gene.ranges, width = 1, fix = 'start')
  return(tss)
}

#' Find intersecting regions between two objects
#'
#' Intersects the regions stored in the rownames of two objects and
#' returns a vector containing the names of rows that intersect
#' for each object. The order of the row names return corresponds
#' to the intersecting regions, i.e. the nth feature of the first vector
#' will intersect the nth feature in the second vector. A distance
#' parameter can be given, in which case features within the given
#' distance will be called as intersecting.
#'
#' @param object.1 The first Seurat object
#' @param object.2 The second Seurat object
#' @param assay.1 Name of the assay to use in the first object. If NULL, use
#' the default assay
#' @param assay.2 Name of the assay to use in the second object. If NULL, use
#' the default assay
#' @param distance Maximum distance between regions allowed for an intersection
#' to be recorded. Default is 0.
#' @param verbose Display messages
#'
#' @importMethodsFrom GenomicRanges distanceToNearest
#' @importFrom S4Vectors subjectHits queryHits mcols
#' @importFrom SeuratObject DefaultAssay
#' @export
#' @concept utilities
#' @return Returns a list of two character vectors containing the row names
#' in each object that overlap each other.
#' @examples
#' GetIntersectingFeatures(
#'   object.1 = atac_small,
#'   object.2 = atac_small,
#'   assay.1 = 'peaks',
#'   assay.2 = 'bins'
#' )
GetIntersectingFeatures <- function(
  object.1,
  object.2,
  assay.1 = NULL,
  assay.2 = NULL,
  distance = 0,
  verbose = TRUE
) {
  assay.1 <- SetIfNull(x = assay.1, y = DefaultAssay(object = object.1))
  assay.2 <- SetIfNull(x = assay.2, y = DefaultAssay(object = object.2))
  if (!inherits(x = object.1[[assay.1]], what = "ChromatinAssay")) {
    stop("Requested assay in object 1 is not a ChromatinAssay.")
  }
  if (!inherits(x = object.2[[assay.2]], what = "ChromatinAssay")) {
    stop("Requested assay in object 2 is not a ChromatinAssay")
  }
  regions.1 <- GetAssayData(object = object.1, assay = assay.1, slot = "ranges")
  regions.2 <- GetAssayData(object = object.2, assay = assay.2, slot = "ranges")
  if (verbose) {
    message("Intersecting regions across objects")
  }
  region.intersections <- distanceToNearest(x = regions.1, subject = regions.2)
  keep.intersections <- mcols(x = region.intersections)$distance <= distance
  region.intersections <- region.intersections[keep.intersections, ]
  intersect.object1 <- queryHits(x = region.intersections)
  intersect.object2 <- subjectHits(x = region.intersections)
  return(list(intersect.object1, intersect.object2))
}

#' String to GRanges
#'
#' Convert a genomic coordinate string to a GRanges object
#'
#' @param regions Vector of genomic region strings
#' @param sep Vector of separators to use for genomic string. First element is
#' used to separate chromosome and coordinates, second separator is used to
#' separate start and end coordinates.
#' @param ... Additional arguments passed to
#' \code{\link[GenomicRanges]{makeGRangesFromDataFrame}}
#' @return Returns a GRanges object
#' @importFrom GenomicRanges makeGRangesFromDataFrame
#' @importFrom tidyr separate
#' @examples
#' regions <- c('chr1-1-10', 'chr2-12-3121')
#' StringToGRanges(regions = regions)
#' @export
#' @concept utilities
StringToGRanges <- function(regions, sep = c("-", "-"), ...) {
  ranges.df <- data.frame(ranges = regions)
  ranges.df <- separate(
    data = ranges.df,
    col = "ranges",
    sep = paste0(sep[[1]], "|", sep[[2]]),
    into = c("chr", "start", "end")
  )
  granges <- makeGRangesFromDataFrame(df = ranges.df, ...)
  return(granges)
}

#' GRanges to String
#'
#' Convert GRanges object to a vector of strings
#'
#' @param grange A GRanges object
#' @param sep Vector of separators to use for genomic string. First element is
#' used to separate chromosome and coordinates, second separator is used to
#' separate start and end coordinates.
#' @importMethodsFrom GenomicRanges start end seqnames
#' @examples
#' GRangesToString(grange = blacklist_hg19)
#' @return Returns a character vector
#' @export
#' @concept utilities
GRangesToString <- function(grange, sep = c("-", "-")) {
  regions <- paste0(
    as.character(x = seqnames(x = grange)),
    sep[[1]],
    start(x = grange),
    sep[[2]],
    end(x = grange)
  )
  return(regions)
}

#' Extend
#'
#' Resize GenomicRanges upstream and or downstream.
#' From \url{https://support.bioconductor.org/p/78652/}
#'
#' @param x A range
#' @param upstream Length to extend upstream
#' @param downstream Length to extend downstream
#' @param from.midpoint Count bases from region midpoint,
#' rather than the 5' or 3' end for upstream and downstream
#' respectively.
#'
#' @importFrom GenomicRanges trim
#' @importFrom BiocGenerics start strand end width
#' @importMethodsFrom GenomicRanges strand start end width
#' @importFrom IRanges ranges IRanges "ranges<-"
#' @export
#' @concept utilities
#' @return Returns a \code{\link[GenomicRanges]{GRanges}} object
#' @examples
#' Extend(x = blacklist_hg19, upstream = 100, downstream = 100)
Extend <- function(
  x,
  upstream = 0,
  downstream = 0,
  from.midpoint = FALSE
) {
  if (any(strand(x = x) == "*")) {
    warning("'*' ranges were treated as '+'")
  }
  on_plus <- strand(x = x) == "+" | strand(x = x) == "*"
  if (from.midpoint) {
    midpoints <- start(x = x) + (width(x = x) / 2)
    new_start <- midpoints - ifelse(
      test = on_plus, yes = upstream, no = downstream
    )
    new_end <- midpoints + ifelse(
      test = on_plus, yes = downstream, no = upstream
    )
  } else {
    new_start <- start(x = x) - ifelse(
      test = on_plus, yes = upstream, no = downstream
    )
    new_end <- end(x = x) + ifelse(
      test = on_plus, yes = downstream, no = upstream
    )
  }
  ranges(x = x) <- IRanges(start = new_start, end = new_end)
  x <- trim(x = x)
  return(x)
}

#' Get cells in a region
#'
#' Extract cell names containing reads mapped within a given genomic region
#'
#' @param tabix Tabix object
#' @param region A string giving the region to extract from the fragments file
#' @param cells Vector of cells to include in output. If NULL, include all cells
#'
#' @importFrom Rsamtools scanTabix
#' @importFrom methods is
#' @importFrom fastmatch fmatch
#' @export
#' @concept utilities
#' @return Returns a list
#' @examples
#' fpath <- system.file("extdata", "fragments.tsv.gz", package="Signac")
#' GetCellsInRegion(tabix = fpath, region = "chr1-10245-762629")
GetCellsInRegion <- function(tabix, region, cells = NULL) {
  if (!is(object = region, class2 = "GRanges")) {
    region <- StringToGRanges(regions = region)
  }
  reads <- scanTabix(file = tabix, param = region)
  reads <- lapply(X = reads, FUN = ExtractCell)
  if (!is.null(x = cells)) {
    reads <- sapply(X = reads, FUN = function(x) {
      x <- x[fmatch(x = x, table = cells, nomatch = 0L) > 0L]
      if (length(x = x) == 0) {
        return(NULL)
      } else {
        return(x)
      }
    }, simplify = FALSE)
  }
  return(reads)
}

#' Counts in region
#'
#' Count reads per cell overlapping a given set of regions
#'
#' @param object A Seurat object
#' @param assay Name of a chromatin assay in the object to use
#' @param regions A GRanges object
#' @param ... Additional arguments passed to \code{\link[IRanges]{findOverlaps}}
#'
#' @importFrom IRanges findOverlaps
#' @importFrom S4Vectors queryHits
#' @importFrom Matrix colSums
#' @importFrom SeuratObject GetAssayData
#'
#' @export
#' @concept utilities
#' @return Returns a numeric vector
#' @examples
#' \donttest{
#' CountsInRegion(
#'   object = atac_small,
#'   assay = 'bins',
#'   regions = blacklist_hg19
#' )
#' }
CountsInRegion <- function(
  object,
  assay,
  regions,
  ...
) {
  if (!is(object = object[[assay]], class2 = "ChromatinAssay")) {
    stop("Must supply a ChromatinAssay")
  }
  obj.granges <- GetAssayData(object = object, assay = assay, slot = "ranges")
  overlaps <- findOverlaps(query = obj.granges, subject = regions, ...)
  hit.regions <- queryHits(x = overlaps)
  data.matrix <- GetAssayData(
    object = object, assay = assay, slot = "counts"
  )[hit.regions, , drop = FALSE]
  return(colSums(data.matrix))
}

#' Fraction of counts in a genomic region
#'
#' Find the fraction of counts per cell that overlap a given set of genomic
#' ranges
#'
#' @param object A Seurat object
#' @param assay Name of assay to use
#' @param regions A GRanges object containing a set of genomic regions
#' @param ... Additional arguments passed to \code{\link{CountsInRegion}}
#' @importFrom Matrix colSums
#' @importFrom SeuratObject GetAssayData DefaultAssay
#'
#' @export
#' @concept utilities
#' @return Returns a numeric vector
#' @examples
#' \dontrun{
#' FractionCountsInRegion(
#'   object = atac_small,
#'   assay = 'bins',
#'   regions = blacklist_hg19
#' )
#' }
FractionCountsInRegion <- function(
  object,
  regions,
  assay = NULL,
  ...
) {
  assay <- SetIfNull(x = assay, y = DefaultAssay(object = object))
  reads.in.region <- CountsInRegion(
    object = object,
    regions = regions,
    assay = assay,
    ...
  )
  total.reads <- colSums(x = GetAssayData(
    object = object, assay = assay, slot = "counts"
  ))
  return(reads.in.region / total.reads)
}

#' Intersect genomic coordinates with matrix rows
#'
#' Remove or retain matrix rows that intersect given genomic regions
#'
#' @param matrix A matrix with genomic regions in the rows
#' @param regions A set of genomic regions to intersect with regions in the
#' matrix. Either a vector of strings encoding the genomic coordinates, or a
#' GRanges object.
#' @param invert Discard rows intersecting the genomic regions supplied, rather
#' than retain.
#' @param sep A length-2 character vector containing the separators to be used
#' for extracting genomic coordinates from a string. The first element will be
#' used to separate the chromosome name from coordinates, and the second element
#' used to separate start and end coordinates.
#' @param verbose Display messages
#' @param ... Additional arguments passed to \code{\link[IRanges]{findOverlaps}}
#'
#' @importFrom IRanges findOverlaps
#' @importFrom S4Vectors queryHits
#'
#' @export
#' @concept utilities
#' @return Returns a sparse matrix
#' @examples
#' counts <- matrix(data = rep(0, 12), ncol = 2)
#' rownames(counts) <- c("chr1-565107-565550","chr1-569174-569639",
#' "chr1-713460-714823","chr1-752422-753038",
#' "chr1-762106-763359","chr1-779589-780271")
#' IntersectMatrix(matrix = counts, regions = blacklist_hg19)
IntersectMatrix <- function(
  matrix,
  regions,
  invert = FALSE,
  sep = c("-", "-"),
  verbose = TRUE,
  ...
) {
  if (is(object = regions, class2 = "character")) {
    regions <- StringToGRanges(regions = regions, sep = sep)
  }
  rowranges <- StringToGRanges(regions = rownames(x = matrix), sep = sep)
  if (verbose) {
    message("Intersecting genomic regions")
  }
  region.overlaps <- findOverlaps(query = rowranges, subject = regions, ...)
  keep.rows <- queryHits(x = region.overlaps)
  if (invert) {
    all.rows <- seq_len(length.out = nrow(matrix))
    keep.rows <- setdiff(x = all.rows, y = keep.rows)
  }
  if (verbose) {
    message("Subsetting matrix")
  }
  matrix <- matrix[keep.rows, ]
  return(matrix)
}

#' Get gene coordinates
#'
#' Extract the coordinates of the longest transcript for a gene stored in the
#' annotations within an object.
#'
#' @param object A Seurat object
#' @param gene Name of a gene to extract
#' @param assay Name of assay to use
#'
#' @importFrom SeuratObject DefaultAssay
#' @importFrom GenomicRanges GRanges
#' @importFrom IRanges IRanges start end
#' @importFrom GenomeInfoDb seqnames
#' @export
#' @concept utilities
#' @examples
#' LookupGeneCoords(atac_small, gene = "MIR1302-10")
LookupGeneCoords <- function(object, gene, assay = NULL) {
  assay <- SetIfNull(x = assay, y = DefaultAssay(object = object))
  if (!inherits(x = object[[assay]], what = "ChromatinAssay")) {
    stop("The requested assay is not a ChromatinAssay")
  }
  annotations <- Annotation(object = object[[assay]])
  isgene <- annotations$gene_name == gene
  isgene <- !is.na(x = isgene) & isgene
  annot.sub <- annotations[isgene]
  if (length(x = annot.sub) == 0) {
    return(NULL)
  } else {
    gr <- GRanges(seqnames = as.character(x = seqnames(x = annot.sub))[[1]],
                  ranges = IRanges(start = min(start(x = annot.sub)),
                                   end = max(end(x = annot.sub))))
    return(gr)
  }
}

#' Match DNA sequence characteristics
#'
#' Return a vector if genomic regions that match the distribution of a set of
#' query regions for any given set of characteristics, specified in the input
#' \code{meta.feature} dataframe.
#'
#' For each requested feature to match, a density
#' distribution is estimated using the \code{\link[stats]{density}} function,
#' and a set of weights for each feature in the dataset estimated based on the
#' density distribution. If multiple features are to be matched (for example,
#' GC content and overall accessibility), a joint density distribution is then
#' computed by multiplying the individual feature weights. A set of features
#' with characteristics matching the query regions is then selected using the
#' \code{\link[base]{sample}} function, with the probability of randomly
#' selecting each feature equal to the joint density distribution weight.
#'
#' @param meta.feature A dataframe containing DNA sequence information for
#' features to choose from
#' @param query.feature A dataframe containing DNA sequence information for
#' features to match.
#' @param n Number of regions to select, with characteristics matching the query
#' @param features.match Which features of the query to match when selecting a
#' set of regions. A vector of column names present in the feature metadata can
#' be supplied to match multiple characteristics at once. Default is GC content.
#' @param verbose Display messages
#' @param ... Arguments passed to other functions
#' @return Returns a character vector
#'
#' @importFrom stats density approx na.omit
#' @export
#' @concept utilities
#' @concept motifs
#' @examples
#' metafeatures <- SeuratObject::GetAssayData(
#'   object = atac_small[['peaks']], slot = 'meta.features'
#' )
#' query.feature <- metafeatures[1:10, ]
#' features.choose <- metafeatures[11:nrow(metafeatures), ]
#' MatchRegionStats(
#'   meta.feature = features.choose,
#'   query.feature = query.feature,
#'   features.match = "percentile",
#'   n = 10
#' )
MatchRegionStats <- function(
  meta.feature,
  query.feature,
  features.match = c("GC.percent"),
  n = 10000,
  verbose = TRUE,
  ...
) {
  if (!inherits(x = meta.feature, what = 'data.frame')) {
    stop("meta.feature should be a data.frame")
  }
  if (!inherits(x = query.feature, what = "data.frame")) {
    stop("query.feature should be a data.frame")
  }
  if (length(x = features.match) == 0) {
    stop("Must supply at least one sequence characteristic to match")
  }
  # remove features that have NA for any of the features to match
  meta.feature <- na.omit(object = meta.feature[, features.match, drop = FALSE])
  if (nrow(x = meta.feature) < n) {
    n <- nrow(x = meta.feature)
    warning("Requested more features than present in supplied data.
            Returning ", n, " features")
  }
  for (i in seq_along(along.with = features.match)) {
    featmatch <- features.match[[i]]
    if (!(featmatch %in% colnames(x = query.feature))) {
      if (featmatch == "GC.percent") {
        stop("GC.percent not present in meta.features.",
             " Run RegionStats to compute GC.percent for each feature.")
      } else {
        stop(featmatch, " not present in meta.features")
      }
    }
    if (verbose) {
      message("Matching ", featmatch, " distribution")
    }
    density.estimate <- density(
      x = query.feature[[featmatch]], kernel = "gaussian", bw = 1
    )
    weights <- approx(
      x = density.estimate$x,
      y = density.estimate$y,
      xout = meta.feature[[featmatch]],
      yright = 0.0001,
      yleft = 0.0001
    )$y
    if (i > 1) {
      feature.weights <- feature.weights * weights
    } else {
      feature.weights <- weights
    }
  }
  feature.select <- sample.int(
    n = nrow(x = meta.feature),
    size = n,
    prob = feature.weights
  )
  feature.select <- rownames(x = meta.feature)[feature.select]
  return(feature.select)
}

#' Unify genomic ranges
#'
#' Create a unified set of non-overlapping genomic ranges
#' from multiple Seurat objects containing single-cell
#' chromatin data.
#'
#' @param object.list A list of Seurat objects or ChromatinAssay objects
#' @param mode Function to use when combining genomic ranges. Can be "reduce"
#' (default) or "disjoin".
#' See \code{\link[GenomicRanges]{reduce}}
#' and \code{\link[GenomicRanges]{disjoin}}
#' for more information on these functions.
#' @importFrom GenomicRanges reduce disjoin
#' @export
#' @concept utilities
#' @concept preprocessing
#' @return Returns a GRanges object
#' @examples
#' UnifyPeaks(object.list = list(atac_small, atac_small))
UnifyPeaks <- function(object.list, mode = "reduce") {
  peak.ranges <- list()
  for (i in seq_along(along.with = object.list)) {
    peak.ranges[[i]] <- granges(x = object.list[[i]])
  }
  peak.ranges <- Reduce(f = c, x = peak.ranges)
  if (mode == "reduce") {
    return(reduce(x = peak.ranges))
  } else if (mode == "disjoin") {
    return(disjoin(x = peak.ranges))
  } else {
    stop("Unknown mode requested")
  }
}

#' Subset matrix rows and columns
#'
#' Subset the rows and columns of a matrix by removing
#' rows and columns with less than the specified number of
#' non-zero elements.
#'
#' @param mat A matrix
#' @param min.rows Minimum number of non-zero elements for
#' the row to be retained
#' @param min.cols Minimum number of non-zero elements for
#' the column to be retained
#' @param max.row.val Maximum allowed value in a row for the
#' row to be retained. If NULL, don't set any limit.
#' @param max.col.val Maximum allowed value in a column for
#' the column to be retained. If NULL, don't set any limit.
#'
#' @return Returns a matrix
#' @export
#' @concept utilities
#' @importFrom Matrix colSums rowSums
#' @examples
#' SubsetMatrix(mat = volcano)
SubsetMatrix <- function(
  mat,
  min.rows = 1,
  min.cols = 1,
  max.row.val = 10,
  max.col.val = NULL
) {
  rowcount <- rowSums(mat > 0)
  colcount <- colSums(mat > 0)
  keeprows <- rowcount > min.rows
  keepcols <- colcount > min.cols
  if (!is.null(x = max.row.val)) {
    rowmax <- apply(X = mat, MARGIN = 1, FUN = max)
    keeprows <- keeprows & (rowmax < max.row.val)
  }
  if (!is.null(x = max.col.val)) {
    colmax <- apply(X = mat, MARGIN = 2, FUN = max)
    keepcols <- keepcols & (colmax < max.col.val)
  }
  return(mat[keeprows, keepcols])
}

#### Not exported ####

#' @importFrom IRanges isDisjoint
NonOverlapping <- function(x, all.features) {
  # x is list of assays
  diff.features <- names(x = all.features[all.features < length(x = x)])
  if (length(x = diff.features) == 0) {
    return(TRUE)
  } else {
    diff.ranges <- StringToGRanges(regions = diff.features)
    return(isDisjoint(x = diff.ranges))
  }
}

#' @importFrom Matrix sparseMatrix
AddMissing <- function(x, cells, features = NULL) {
  # add columns with zeros for cells or features not in matrix
  missing.cells <- setdiff(x = cells, y = colnames(x = x))
  if (!(length(x = missing.cells) == 0)) {
    null.mat <- sparseMatrix(
      i = c(),
      j = c(),
      dims = c(nrow(x = x), length(x = missing.cells))
    )
    rownames(x = null.mat) <- rownames(x = x)
    colnames(x = null.mat) <- missing.cells
    x <- cbind(x, null.mat)
  }
  x <- x[, cells, drop = FALSE]
  if (!is.null(x = features)) {
    missing.features <- setdiff(x = features, y = rownames(x = x))
    if (!(length(x = missing.features) == 0)) {
      null.mat <- sparseMatrix(
        i = c(),
        j = c(),
        dims = c(length(x = missing.features), ncol(x = x))
      )
      rownames(x = null.mat) <- missing.features
      colnames(x = null.mat) <- colnames(x = x)
      x <- rbind(x, null.mat)
    }
    x <- x[features, , drop = FALSE]
  }
  return(x)
}

#' @importFrom SeuratObject DefaultAssay GetAssayData
#' @importFrom Matrix Diagonal tcrossprod rowSums
AverageCountMatrix <- function(
  object,
  assay = NULL,
  group.by = NULL,
  idents = NULL
) {
  assay = SetIfNull(x = assay, y = DefaultAssay(object = object))
  countmatrix <- GetAssayData(object = object[[assay]], slot = "counts")
  ident.matrix <- BinaryIdentMatrix(
    object = object,
    group.by = group.by,
    idents = idents
  )
  collapsed.counts <- tcrossprod(x = countmatrix, y = ident.matrix)
  avg.counts <- tcrossprod(
    x = collapsed.counts,
    y = Diagonal(x = 1 / rowSums(x = ident.matrix))
  )
  return(as.matrix(x = avg.counts))
}

# Create binary cell x class matrix of group membership
#' @importFrom Matrix sparseMatrix
BinaryIdentMatrix <- function(object, group.by = NULL, idents = NULL) {
  group.idents <- GetGroups(object = object, group.by = group.by, idents = idents)
  cell.idx <- seq_along(along.with = names(x = group.idents))
  unique.groups <- as.character(x = unique(x = group.idents))
  ident.idx <- seq_along(along.with = unique.groups)
  names(x = ident.idx) <- unique.groups
  ident.matrix <- sparseMatrix(
    i = ident.idx[as.character(x = group.idents)],
    j = cell.idx,
    x = 1
  )
  colnames(x = ident.matrix) <- names(x = group.idents)
  rownames(x = ident.matrix) <- unique.groups
  ident.matrix <- as(object = ident.matrix, Class = "CsparseMatrix")
  return(ident.matrix)
}

# Calculate nCount and nFeature
#
# From Seurat
#
# @param object An Assay object
#
# @return A named list with nCount and nFeature
#
#' @importFrom Matrix colSums
#
CalcN <- function(object) {
  if (IsMatrixEmpty(x = GetAssayData(object = object, slot = "counts"))) {
    return(NULL)
  }
  return(list(
    nCount = colSums(x = object, slot = "counts"),
    nFeature = colSums(x = GetAssayData(object = object, slot = "counts") > 0)
  ))
}

globalVariables(
  names = c("start", "end", "seqnames", "strand", "gene_biotype"),
  package = "Signac"
)
#' @importFrom GenomicRanges makeGRangesFromDataFrame
#' @import data.table
CollapseToLongestTranscript <- function(ranges) {
  range.df <- as.data.table(x = ranges)
  range.df$strand <- as.character(x = range.df$strand)
  range.df$strand <- ifelse(
    test = range.df$strand == "*",
    yes = "+",
    no = range.df$strand
  )
  collapsed <- range.df[
    , .(unique(seqnames),
        min(start),
        max(end),
        strand[[1]],
        gene_biotype[[1]],
        gene_name[[1]]),
    "gene_id"
  ]
  colnames(x = collapsed) <- c(
    "gene_id", "seqnames", "start", "end", "strand", "gene_biotype", "gene_name"
  )
  collapsed$gene_name <- make.unique(names = collapsed$gene_name)
  gene.ranges <- makeGRangesFromDataFrame(
    df = collapsed,
    keep.extra.columns = TRUE
  )
  return(gene.ranges)
}

# Chunk GRanges
#
# Split a genomic ranges object into evenly sized chunks
# @param granges A GRanges object
# @param nchunk Number of chunks to split into
#
# @return Returns a list of GRanges objects
# @examples
# ChunkGRanges(blacklist_hg19, n = 10)
ChunkGRanges <- function(granges, nchunk) {
  if (length(x = granges) < nchunk) {
    nchunk <- length(x = granges)
  }
  chunksize <- as.integer(x = (length(granges) / nchunk))
  range.list <- sapply(X = seq_len(length.out = nchunk), FUN = function(x) {
    chunkupper <- (x * chunksize)
    if (x == 1) {
      chunklower <- 1
    } else {
      chunklower <- ((x - 1) * chunksize) + 1
    }
    if (x == nchunk) {
      chunkupper <- length(x = granges)
    }
    return(granges[chunklower:chunkupper])
  })
  return(range.list)
}

# Extract cell
#
# Extract cell barcode from list of tab delimited character
# vectors (output of \code{\link{scanTabix}})
#
# @param x List of character vectors
# @return Returns a string
#' @importFrom stringi stri_split_fixed
ExtractCell <- function(x) {
  if (length(x = x) == 0) {
    return(NULL)
  } else {
    x <- stri_split_fixed(str = x, pattern = "\t")
    n <- length(x = x)
    x <- unlist(x = x)
    return(unlist(x = x)[5 * (1:n) - 1])
  }
}

# Run groupCommand for the first n lines, convert the cell barcodes in the file
# to the cell names that appear in the fragment object, and subset the output to
# cells present in the fragment object
#
# Every cell in the fragment file will be present in the output dataframe. If
# the cell information is not set, every cell barcode that appears in the first
# n lines will be present.
#
# @param fragments A Fragment object
# @param n Number of lines to read from the beginning of the fragment file
# @param verbose Display messages
#
# @return Returns a data.frame
ExtractFragments <- function(fragments, n = NULL, verbose = TRUE) {
  fpath <- GetFragmentData(object = fragments, slot = "path")
  if (isRemote(x = fpath)) {
    stop("Remote fragment files not supported")
  }
  fpath <- normalizePath(path = fpath, mustWork = TRUE)
  cells <- GetFragmentData(object = fragments, slot = "cells")
  if (!is.null(x = cells)) {
    cells.use <- as.character(x = cells)
  } else {
    cells.use <- NULL
  }
  verbose <- as.logical(x = verbose)
  n <- SetIfNull(x = n, y = 0)
  n <- as.numeric(x = n)
  n <- round(x = n, digits = 0)
  counts <- groupCommand(
    fragments = fpath,
    some_whitelist_cells = cells.use,
    max_lines = n,
    verbose = verbose
  )
  # convert cell names
  if (!is.null(x = cells)) {
    # every cell will be present in the output, even if 0 counts
    converter <- names(x = cells)
    names(x = converter) <- cells
    counts$CB <- converter[counts$CB]
  }
  return(counts)
}

# convert region argument to genomic coordinates
# region can be a string, name of a gene, or GRanges object
FindRegion <- function(
  object,
  region,
  sep = c("-", "-"),
  assay = NULL,
  extend.upstream = 0,
  extend.downstream = 0
) {
  if (!is(object = region, class2 = "GRanges")) {
    # first try to convert to coordinates, if not lookup gene
    region <- tryCatch(
      expr = suppressWarnings(
        expr = StringToGRanges(regions = region, sep = sep)
      ),
      error = function(x) {
        region <- LookupGeneCoords(
          object = object,
          assay = assay,
          gene = region
        )
        return(region)
      }
    )
    if (is.null(x = region)) {
      stop("Gene not found")
    }
  }
  region <- suppressWarnings(expr = Extend(
    x = region,
    upstream = extend.upstream,
    downstream = extend.downstream
  )
  )
  return(region)
}

# GetReadsInRegion
#
# Extract reads for each cell within a given genomic region or set of regions
#
# @param cellmap A mapping of cell names in the fragment file to cell names in
# the Seurat object. Should be a named vector where each element is a cell name
# that appears in the fragment file and the name of each element is the
# name of the cell in the Seurat object.
# @param region A genomic region, specified as a string in the format
# 'chr:start-end'. Can be a vector of regions.
# @param tabix.file A TabixFile object.
# @param cells Cells to include. Default is all cells present in the object.
# @param verbose Display messages
# @param ... Additional arguments passed to \code{\link{StringToGRanges}}
#
#' @importFrom Rsamtools TabixFile scanTabix
#' @importFrom SeuratObject Idents
#' @importFrom fastmatch fmatch
#
# @return Returns a data frame
GetReadsInRegion <- function(
  cellmap,
  region,
  tabix.file,
  cells = NULL,
  verbose = TRUE,
  ...
) {
  file.to.object <- names(x = cellmap)
  names(x = file.to.object) <- cellmap

  if (verbose) {
    message("Extracting reads in requested region")
  }
  if (!is(object = region, class2 = "GRanges")) {
    region <- StringToGRanges(regions = region, ...)
  }
  # remove regions that aren't in the fragment file
  common.seqlevels <- intersect(
    x = seqlevels(x = region),
    y = seqnamesTabix(file = tabix.file)
  )
  if (length(common.seqlevels) != 0) {
    region <- keepSeqlevels(
      x = region,
      value = common.seqlevels,
      pruning.mode = "coarse"
    )
    reads <- scanTabix(file = tabix.file, param = region)
    reads <- TabixOutputToDataFrame(reads = reads)
    reads <- reads[
      fmatch(x = reads$cell, table = cellmap, nomatch = 0L) > 0,
    ]
    # convert cell names to match names in object
    reads$cell <- file.to.object[reads$cell]
    if (!is.null(x = cells)) {
      reads <- reads[reads$cell %in% cells, ]
    }
    if (nrow(reads) == 0) {
      reads$ident <- integer()
      reads$length <- numeric()
      return(reads)
    }
    reads$length <- reads$end - reads$start
  } else {
    reads <- data.frame(
      "chr" = character(),
      "start" = numeric(),
      "end" = numeric(),
      "cell" = character(),
      "count" = numeric(),
      "ident" = integer(),
      "length" = numeric()
    )
  }
  return(reads)
}

# Get vector of cell names and associated identity
# @param object A Seurat object
# @param group.by Identity class to group cells by
# @param idents which identities to include
# @return Returns a named vector
#' @importFrom SeuratObject Idents
GetGroups <- function(
  object,
  group.by,
  idents
) {
  if (is.null(x = group.by)) {
    obj.groups <- Idents(object = object)
  } else {
    obj.md <- object[[group.by]]
    obj.groups <- obj.md[, 1]
    names(obj.groups) <- rownames(x = obj.md)
  }
  if (!is.null(idents)) {
    obj.groups <- obj.groups[obj.groups %in% idents]
  }
  return(obj.groups)
}

# Check if path is remote
# @param x path/s to check
isRemote <- function(x) {
  return(grepl(pattern = "^http|^ftp", x = x))
}

# row merge list of matrices
# @param mat.list list of sparse matrices
# @param new.rownames rownames to assign merged matrix
#' @importFrom SeuratObject RowMergeSparseMatrices
MergeMatrixParts <- function(mat.list, new.rownames) {
  # RowMergeSparseMatrices only exported in Seurat release Dec-2019 (3.1.2)
  merged.all <- mat.list[[1]]
  for (i in 2:length(x = mat.list)) {
    merged.all <- RowMergeSparseMatrices(
      mat1 = merged.all,
      mat2 = mat.list[[i]]
    )
  }
  # reorder rows to match genomic ranges
  merged.all <- merged.all[new.rownames, ]
  return(merged.all)
}

# Run GetReadsInRegion for a list of Fragment objects
# concatenate the output dataframes and return
# @param object A Seurat or ChromatinAssay object
# @param region Genomic region to extract fragments for
# @param fragment.list A list of Fragment objects. If NULL, pull them from the
# object
# @param assay Name of assay to use if supplying a Seurat object
#' @importFrom SeuratObject DefaultAssay
#' @importFrom Rsamtools TabixFile
#' @importFrom GenomeInfoDb keepSeqlevels
MultiGetReadsInRegion <- function(
  object,
  region,
  fragment.list = NULL,
  assay = NULL,
  ...
) {
  if (inherits(x = object, what = "Seurat")) {
    # pull the assay
    assay <- SetIfNull(x = assay, y = DefaultAssay(object = object))
    object <- object[[assay]]
  }
  fragment.list <- SetIfNull(
    x = fragment.list,
    y = Fragments(object = object)
  )
  if (length(x = fragment.list) == 0) {
    # no fragments set
    stop("No fragment files found")
  }
  res <- data.frame()
  for (i in seq_along(along.with = fragment.list)) {
    tbx.path <- GetFragmentData(object = fragment.list[[i]], slot = "path")
    cellmap <- GetFragmentData(object = fragment.list[[i]], slot = "cells")
    tabix.file <- TabixFile(
      file = tbx.path,
      index = GetIndexFile(fragment = tbx.path, verbose = FALSE)
    )
    open(con = tabix.file)
    reads <- GetReadsInRegion(
      cellmap = cellmap,
      region = region,
      tabix.file = tabix.file,
      ...
    )
    res <- rbind(res, reads)
    close(con = tabix.file)
  }
  return(res)
}

# Generate matrix of integration sites
#
# Generates a cell-by-position matrix of Tn5 integration sites
# centered on a given region (usually a DNA sequence motif). This
# matrix can be used for downstream footprinting analysis.
#
# @param cellmap A mapping of cell names in the fragment file to cell names in
# the Seurat object. Should be a named vector where each element is a cell name
# that appears in the fragment file and the name of each element is the
# name of the cell in the Seurat object.
# @param region A set of GRanges containing the regions of interest
# @param cells Which cells to include in the matrix. If NULL, use all cells in
# the cellmap
# @param tabix.file A \code{\link[Rsamtools]{TabixFile}} object.
# @param verbose Display messages
#' @importFrom Matrix sparseMatrix
#' @importFrom Rsamtools TabixFile
#' @importMethodsFrom GenomicRanges width start end
# @return Returns a sparse matrix
SingleFileCutMatrix <- function(
  cellmap,
  region,
  cells = NULL,
  tabix.file,
  verbose = TRUE
) {
  # if multiple regions supplied, must be the same width
  cells <- SetIfNull(x = cells, y = names(x = cellmap))
  if (length(x = region) == 0) {
    return(NULL)
  }
  fragments <- GetReadsInRegion(
    region = region,
    cellmap = cellmap,
    cells = cells,
    tabix.file = tabix.file,
    verbose = verbose
  )
  start.lookup <- start(x = region)
  names(start.lookup) <- seq_along(region)
  # if there are no reads in the region
  # create an empty matrix of the correct dimension
  if (nrow(x = fragments) == 0) {
    cut.matrix <- sparseMatrix(
      i = NULL,
      j = NULL,
      dims = c(length(x = cells), width(x = region)[[1]])
    )
  } else {
    fragstarts <- start.lookup[fragments$ident] + 1
    cut.df <- data.frame(
      position = c(fragments$start, fragments$end) - fragstarts,
      cell = c(fragments$cell, fragments$cell),
      stringsAsFactors = FALSE
    )
    cut.df <- cut.df[
      (cut.df$position > 0) & (cut.df$position <= width(x = region)[[1]]),
      ]
    cell.vector <- seq_along(along.with = cells)
    names(x = cell.vector) <- cells
    cell.matrix.info <- cell.vector[cut.df$cell]
    cut.matrix <- sparseMatrix(
      i = cell.matrix.info,
      j = cut.df$position,
      x = 1,
      dims = c(length(x = cells), width(x = region)[[1]])
    )
  }
  rownames(x = cut.matrix) <- cells
  colnames(x = cut.matrix) <- seq_len(width(x = region)[[1]])
  return(cut.matrix)
}

# Generate matrix of integration sites
#
# Generates a cell-by-position matrix of Tn5 integration sites.
#
# @param object A Seurat object
# @param region A GRanges object containing the region of interest
# @param assay A name of assay to use. Must be a \code{\link{ChromatinAssay}}
# containing a list of \code{\link{Fragment}} objects.
# @param cells Which cells to include in the matrix. If NULL (default), use all
# cells in the object
# @param group.by Name of grouping variable to use
# @param verbose Display messages
# @return Returns a sparse matrix
#' @importFrom SeuratObject DefaultAssay
#' @importFrom Rsamtools TabixFile seqnamesTabix
#' @importFrom GenomeInfoDb keepSeqlevels
CutMatrix <- function(
  object,
  region,
  group.by = NULL,
  assay = NULL,
  cells = NULL,
  verbose = TRUE
) {
  # run SingleFileCutMatrix for each fragment file and combine results
  assay <- SetIfNull(x = assay, y = DefaultAssay(object = object))
  cells <- SetIfNull(x = cells, y = colnames(x = object))
  fragments <- Fragments(object = object[[assay]])
  if (length(x = fragments) == 0) {
    stop("No fragment information found for requested assay")
  }
  res <- list()
  for (i in seq_along(along.with = fragments)) {
    fragment.path <- GetFragmentData(object = fragments[[i]], slot = "path")
    cellmap <- GetFragmentData(object = fragments[[i]], slot = "cells")
    tabix.file <- TabixFile(
      file = fragment.path,
      index = GetIndexFile(fragment = fragment.path, verbose = FALSE)
    )
    open(con = tabix.file)
    # remove regions that aren't in the fragment file
    seqnames.in.both <- intersect(
      x = seqnames(x = region),
      y = seqnamesTabix(file = tabix.file)
    )
    region <- keepSeqlevels(
      x = region,
      value = seqnames.in.both,
      pruning.mode = "coarse"
    )
    if (length(x = region) != 0) {
      cm <- SingleFileCutMatrix(
        region = region,
        cellmap = cellmap,
        tabix.file = tabix.file,
        cells = cells,
        verbose = FALSE
      )
      res[[i]] <- cm
    }
    close(con = tabix.file)
  }
  res <- Reduce(f = `+`, x = res)
  return(res)
}

# Generate cut matrix for many regions
#
# Run CutMatrix on multiple regions and add them together.
# Assumes regions are pre-aligned.
#
# @param object A Seurat object
# @param regions A set of GRanges
# @param group.by Name of grouping variable to use
# @param fragments A list of Fragment objects
# @param assay Name of the assay to use
# @param cells Vector of cells to include
# @param verbose Display messages
#' @importFrom Rsamtools TabixFile seqnamesTabix
#' @importFrom SeuratObject DefaultAssay
#' @importFrom GenomeInfoDb keepSeqlevels
MultiRegionCutMatrix <- function(
  object,
  regions,
  group.by = NULL,
  fragments = NULL,
  assay = NULL,
  cells = NULL,
  verbose = FALSE
) {
  if (inherits(x = object, what = "Seurat")) {
    assay <- SetIfNull(x = assay, y = DefaultAssay(object = object))
    object <- object[[assay]]
  }
  fragments <- SetIfNull(x = fragments, y = Fragments(object = object))
  res <- list()
  if (length(x = fragments) == 0) {
    stop("No fragment files present in assay")
  }
  for (i in seq_along(along.with = fragments)) {
    frag.path <- GetFragmentData(object = fragments[[i]], slot = "path")
    cellmap <- GetFragmentData(object = fragments[[i]], slot = "cells")
    if (is.null(x = cellmap)) {
      cellmap <- colnames(x = object)
      names(x = cellmap) <- cellmap
    }
    tabix.file <- TabixFile(
      file = frag.path,
      index = GetIndexFile(fragment = frag.path, verbose = FALSE)
    )
    open(con = tabix.file)
    # remove regions that aren't in the fragment file
    common.seqlevels <- intersect(
      x = seqlevels(x = regions),
      y = seqnamesTabix(file = tabix.file)
    )
    regions <- keepSeqlevels(
      x = regions,
      value = common.seqlevels,
      pruning.mode = "coarse"
    )
    cm <- SingleFileCutMatrix(
      cellmap = cellmap,
      tabix.file = tabix.file,
      region = regions,
      verbose = verbose
    )
    close(con = tabix.file)
    res[[i]] <- cm
  }
  # each matrix contains data for different cells at same positions
  # bind all matrices together
  res <- do.call(what = rbind, args = res)
  return(res)
}

# Create cut site pileup matrix
#
# For a set of aligned genomic ranges, find the total number of
# integration sites per cell per base.
#
# @param object A Seurat object
# @param regions A GRanges object
# @param assay Name of the assay to use
# @param cells Which cells to include. If NULL, use all cells
# @param verbose Display messages
#' @importMethodsFrom GenomicRanges strand
CreateRegionPileupMatrix <- function(
  object,
  regions,
  assay = NULL,
  cells = NULL,
  verbose = TRUE
) {
  if (length(x = regions) == 0) {
    stop("No regions supplied")
  }
  # split into strands
  on_plus <- strand(x = regions) == "+" | strand(x = regions) == "*"
  plus.strand <- regions[on_plus, ]
  minus.strand <- regions[!on_plus, ]

  # get cut matrices for each strand
  if (verbose) {
    message("Finding + strand cut sites")
  }
  cut.matrix.plus <- MultiRegionCutMatrix(
    regions = plus.strand,
    object = object,
    assay = assay,
    cells = cells,
    verbose = FALSE
  )
  if (verbose) {
    message("Finding - strand cut sites")
  }
  cut.matrix.minus <- MultiRegionCutMatrix(
    regions = minus.strand,
    object = object,
    assay = assay,
    cells = cells,
    verbose = FALSE
  )

  # reverse minus strand and add together
  if (is.null(x = cut.matrix.plus)) {
    full.matrix <- cut.matrix.minus[, rev(x = colnames(x = cut.matrix.minus))]
  } else if (is.null(x = cut.matrix.minus)) {
    full.matrix <- cut.matrix.plus
  } else {
    full.matrix <- cut.matrix.plus + cut.matrix.minus[, rev(
      x = colnames(x = cut.matrix.minus)
    )]
  }
  # rename so 0 is center
  region.width <- width(x = regions)[[1]]
  midpoint <- round(x = (region.width / 2))
  colnames(full.matrix) <- seq_len(length.out = region.width) - midpoint
  return(full.matrix)
}

# Apply function to integration sites per base per group
#
# Perform colSums on a cut matrix with cells in the rows
# and position in the columns, for each group of cells
# separately.
#
# @param mat A cut matrix. See \code{\link{CutMatrix}}
# @param groups A vector of group identities, with the name
# of each element in the vector set to the cell name.
# @param fun Function to apply to each group of cells.
# For example, colSums or colMeans.
# @param group.scale.factors Scaling factor for each group. Should
# be computed using the number of cells in the group and the average number of
# counts in the group.
# @param normalize Perform sequencing depth and cell count normalization
# @param scale.factor Scaling factor to use. If NULL (default), will use the
# median normalization factor for all the groups.
ApplyMatrixByGroup <- function(
  mat,
  groups,
  fun,
  normalize = TRUE,
  group.scale.factors = NULL,
  scale.factor = NULL
) {
  if (normalize) {
    if (is.null(x = group.scale.factors) | is.null(x = scale.factor)) {
      stop("If normalizing counts, supply group scale factors")
    }
  }
  all.groups <- as.character(x = unique(x = groups))
  if (any(is.na(x = groups))) {
    all.groups <- c(all.groups, NA)
  }
  ngroup <- length(x = all.groups)
  npos <- ncol(x = mat)

  group <- unlist(
    x = lapply(X = all.groups, FUN = function(x) rep(x, npos))
  )
  position <- rep(x = as.numeric(x = colnames(x = mat)), ngroup)
  count <- vector(mode = "numeric", length = npos * ngroup)

  for (i in seq_along(along.with = all.groups)) {
    grp <- all.groups[[i]]
    if (is.na(x = grp)) {
      pos.cells <- names(x = groups)[is.na(x = groups)]
    } else {
      pos.cells <- names(x = groups)[groups == all.groups[[i]]]
    }
    if (length(x = pos.cells) > 1) {
      totals <- fun(x = mat[pos.cells, ])
    } else {
      totals <- mat[pos.cells, ]
    }
    count[((i - 1) * npos + 1):((i * npos))] <- totals
  }

  # construct dataframe
  coverages <- data.frame(
    "group" = group, "position" = position, "count" = count,
    stringsAsFactors = FALSE
  )

  if (normalize) {
    scale.factor <- SetIfNull(
      x = scale.factor, y = median(x = group.scale.factors)
    )
    coverages$norm.value <- coverages$count /
      group.scale.factors[coverages$group] * scale.factor
  } else {
    coverages$norm.value <- coverages$count
  }
  return(coverages)
}

# TabixOutputToDataFrame
#
# Create a single dataframe from list of character vectors
#
# @param reads List of character vectors (the output of \code{\link{scanTabix}})
# @param record.ident Add a column recording which region the reads overlapped
# with
#' @importFrom stringi stri_split_fixed
#' @importFrom S4Vectors elementNROWS
TabixOutputToDataFrame <- function(reads, record.ident = TRUE) {
  if (record.ident) {
    nrep <- elementNROWS(x = reads)
  }
  original_names = names(reads)
  reads <- unlist(x = reads, use.names = FALSE)
  if (length(x = reads) == 0 | is.null(x = original_names)) {
    df <- data.frame(
      "chr" = "",
      "start" = "",
      "end" = "",
      "cell" = "",
      "count" = ""
    )
    df <- df[-1, ]
    return(df)
  }
  reads <- stri_split_fixed(str = reads, pattern = "\t")
  n <- length(x = reads[[1]])
  unlisted <- unlist(x = reads)
  e1 <- unlisted[n * (seq_along(along.with = reads)) - (n - 1)]
  e2 <- as.numeric(x = unlisted[n * (seq_along(along.with = reads)) - (n - 2)])
  e3 <- as.numeric(x = unlisted[n * (seq_along(along.with = reads)) - (n - 3)])
  e4 <- unlisted[n * (seq_along(along.with = reads)) - (n - 4)]
  e5 <- as.numeric(x = unlisted[n * (seq_along(along.with = reads)) - (n - 5)])
  df <- data.frame(
    "chr" = e1,
    "start" = e2,
    "end" = e3,
    "cell" = e4,
    "count" = e5,
    stringsAsFactors = FALSE,
    check.rows = FALSE,
    check.names = FALSE
  )
  if (record.ident) {
    df$ident <- rep(x = seq_along(along.with = nrep), nrep)
  }
  return(df)
}

# Extract delimiter information from a string.
#
# From Seurat
#
# Parses a string (usually a cell name) and extracts fields based on a delimiter
#
# @param string String to parse.
# @param field Integer(s) indicating which field(s) to extract. Can be a vector
# multiple numbers.
# @param delim Delimiter to use, set to underscore by default.
#
# @return A new string, that parses out the requested fields, and (if multiple),
# rejoins them with the same delimiter
#' @importFrom stringi stri_split_fixed
#
ExtractField <- function(string, field = 1, delim = "_") {
  fields <- as.numeric(
    x = unlist(x = stri_split_fixed(
      str = as.character(x = field), pattern = ",")
    )
  )
  if (length(x = fields) == 1) {
    return(stri_split_fixed(str = string, pattern = delim)[[1]][field])
  }
  return(paste(
    stri_split_fixed(str = string, pattern = delim)[[1]][fields],
    collapse = delim))
}

# Set a default value if an object is null
#
# @param x An object to set if it's null
# @param y The value to provide if x is null
# @return Returns y if x is null, otherwise returns x.
SetIfNull <- function(x, y) {
  if (is.null(x = x)) {
    return(y)
  } else {
    return(x)
  }
}

# Check if a matrix is empty
#
# From Seurat
#
# Takes a matrix and asks if it's empty (either 0x0 or 1x1 with a value of NA)
#
# @param x A matrix
#
# @return Whether or not \code{x} is empty
#
IsMatrixEmpty <- function(x) {
  matrix.dims <- dim(x = x)
  matrix.na <- all(matrix.dims == 1) && all(is.na(x = x))
  return(all(matrix.dims == 0) || matrix.na)
}

# Find matrix indices corresponding to overlapping genomic ranges
# @param assay.list A list of ChromatinAssay objects
# @param all.ranges Combined genomic ranges for all objects. This should be the
# set of ranges that \code{reduce} was run on to get \code{reduced.ranges}
# @param reduced.ranges A set of reduced genomic ranges containing the rev.map
# information
GetRowsToMerge <- function(assay.list, all.ranges, reduced.ranges) {
  revmap <- as.vector(x = reduced.ranges$revmap)

  # get indices of ranges that changed
  revmap.lengths <- sapply(X = revmap, FUN = length)
  changed.ranges <- which(x = revmap.lengths > 1)
  grange.string <- GRangesToString(grange = reduced.ranges[changed.ranges])

  # preallocate
  offsets <- list()
  results <- list()
  matrix.indices <- vector(
    mode = "numeric",
    length = length(x = changed.ranges) * 2
  )
  granges <- vector(
    mode = "character",
    length = length(x = changed.ranges) * 2
  )
  for (i in seq_along(along.with = assay.list)) {
    indices <- which(x = all.ranges$dataset == i)
    offsets[[i]] <- min(indices) - 1
    offsets[[i]][[2]] <- max(indices) + 1
    results[['matrix']][[i]] <- matrix.indices
    results[['grange']][[i]] <- granges
  }

  # find sets of ranges for each dataset
  counter <- vector(mode = "numeric", length = length(x = assay.list))
  for (x in seq_along(along.with = changed.ranges)) {
    idx <- changed.ranges[[x]]
    all.assay <- revmap[[idx]]
    for (i in seq_along(along.with = assay.list)) {
      this.assay <- all.assay[
        (all.assay > offsets[[i]][1]) & (all.assay < offsets[[i]][2])
      ]
      mat.idx <- this.assay - offsets[[i]][1]
      mat.idx <- mat.idx[mat.idx < offsets[[i]][2] & mat.idx > 0]
      for (y in seq_along(along.with = mat.idx)) {
        counter[i] <- counter[i] + 1
        results[['matrix']][[i]][[counter[i]]] <- mat.idx[[y]]
        results[['grange']][[i]][[counter[i]]] <- grange.string[[x]]
      }
    }
  }
  # remove trailing extra values in each vector
  for (i in seq_along(along.with = assay.list)) {
    results$matrix[[i]] <- results$matrix[[i]][1:counter[i]]
    results$grange[[i]] <- results$grange[[i]][1:counter[i]]
  }
  return(results)
}

# Merge rows of count matrices with overlapping genomic ranges
# @param mergeinfo The output of GetRowsToMerge: a list of matrix indices
#  and matrix rownames to be merged for each assay
# @param assay.list List of assays
# @param verbose Display messages
#' @importFrom utils txtProgressBar setTxtProgressBar
#' @importFrom Matrix rowSums
#' @importMethodsFrom Matrix t
MergeOverlappingRows <- function(
  mergeinfo,
  assay.list,
  slot = "counts",
  verbose = TRUE
) {
  merge.counts <- list()
  for (i in seq_along(along.with = assay.list)) {
    # get count matrix
    counts <- GetAssayData(object = assay.list[[i]], slot = slot)

    if (nrow(x = counts) == 0) {
      # no counts, only data
      # skip row merge and return empty counts matrices
      merge.counts <- lapply(
        X = seq_along(along.with = assay.list),
        FUN = matrix,
        nrow = 0,
        ncol = 0
      )
      return(merge.counts)
    }

    # transpose for faster access since matrix is column major
    counts <- t(x = counts)

    # get rows to merge
    mrows <- mergeinfo$matrix[[i]]
    new.rownames <- mergeinfo$grange[[i]]
    nrep <- rle(x = new.rownames)

    # allocate
    todelete <- c()
    newmat <- vector(
      mode = "list",
      length = length(new.rownames)
    )
    newmat.names <- vector(
      mode = "character",
      length = length(x = new.rownames)
    )
    x <- 1  # row index for matrix
    y <- 1  # counter for list index
    if (verbose & length(x = nrep$lengths) > 1) {
      pb <- txtProgressBar(
        min = 1,
        max = length(x = nrep$lengths),
        style = 3,
        file = stderr()
      )
    }
    to.rename.idx <- vector(
      mode = "numeric", length = length(x = nrep$lengths)
    )
    to.rename.names <- vector(
      mode = "character", length = length(x = nrep$lengths)
    )
    idx.counter <- 0
    for (j in seq_along(along.with = nrep$lengths)) {
      rowrun <- nrep$lengths[[j]]
      new.feature.name <- nrep$values[[j]]
      index.range <- x:(x + rowrun - 1)
      matrix.index <- mrows[index.range]
      if (rowrun < 2) {
        idx.counter <- idx.counter + 1
        # no merge needed, just rename row in-place
        # store row indices and names to do the change in one step at the end
        to.rename.idx[idx.counter] <- matrix.index
        to.rename.names[idx.counter] <- new.feature.name
      } else {
        # merge multiple rows and add to list
        newmat[[y]] <- rowSums(x = counts[, matrix.index])
        # mark merged row for deletion
        todelete <- c(todelete, matrix.index)
        # add row names
        newmat.names[y] <- new.feature.name
        y <- y + 1
      }
      if (verbose & length(x = nrep$lengths) > 1) {
        setTxtProgressBar(pb = pb, value = j)
      }
      x <- x + rowrun
    }
    # remove extra elements in vectors
    to.rename.idx <- to.rename.idx[1:idx.counter]
    to.rename.names <- to.rename.names[1:idx.counter]
    newmat <- newmat[1:(y - 1)]
    newmat.names <- newmat.names[1:(y - 1)]

    # transpose back
    counts <- t(x = counts)

    # rename matrix rows that weren't merged
    rownames(counts)[to.rename.idx] <- to.rename.names

    if (y == 1) {
      # no rows were merged, can return counts
      merge.counts[[i]] <- counts
    } else if (y == 2) {
      # only one element
      tomerge <- matrix(data = newmat[[1]], nrow = 1)
      colnames(x = tomerge) <- names(x = newmat[[1]])
      rownames(x = tomerge) <- newmat.names
      tomerge <- tomerge[, colnames(x = counts)]
      counts <- rbind(counts, tomerge)
      merge.counts[[i]] <- counts
    } else {
      # construct sparse matrix
      if (verbose) {
        message("\nBinding matrix rows")
      }
      merged.mat <- Reduce(f = rbind, x = newmat)
      rownames(merged.mat) <- newmat.names
      merged.mat <- as(object = merged.mat, Class = "CsparseMatrix")

      # remove rows from count matrix that were merged
      mat.rows <- seq_len(length.out = nrow(x = counts))
      tokeep <- setdiff(mat.rows, todelete)
      counts <- counts[tokeep, ]

      # add new merged rows to counts
      counts <- rbind(counts, merged.mat)
      merge.counts[[i]] <- counts
    }
  }
  return(merge.counts)
}

#' @importFrom Matrix sparseMatrix
#' @importFrom S4Vectors elementNROWS
PartialMatrix <- function(tabix, regions, sep = c("-", "-"), cells = NULL) {
  # construct sparse matrix for one set of regions
  # names of the cells vector can be ignored here, conversion is handled in
  # the parent functions
  open(con = tabix)
  cells.in.regions <- GetCellsInRegion(
    tabix = tabix,
    region = regions,
    cells = cells
  )
  close(con = tabix)
  gc(verbose = FALSE)
  nrep <- elementNROWS(x = cells.in.regions)
  if (all(nrep == 0) & !is.null(x = cells)) {
    # no fragments
    # zero for all requested cells
    featmat <- sparseMatrix(
      dims = c(length(x = regions), length(x = cells)),
      i = NULL,
      j = NULL
    )
    rownames(x = featmat) <- GRangesToString(grange = regions)
    colnames(x = featmat) <- cells
    featmat <- as(object = featmat, Class = "CsparseMatrix")
    return(featmat)
  } else if (all(nrep == 0)) {
    # no fragments, no cells requested
    # create empty matrix
    featmat <- sparseMatrix(
      dims = c(length(x = regions), 0),
      i = NULL,
      j = NULL
    )
    rownames(x = featmat) <- GRangesToString(grange = regions)
    featmat <- as(object = featmat, Class = "CsparseMatrix")
    return(featmat)
  } else {
    # fragments detected
    if (is.null(x = cells)) {
      all.cells <- unique(x = unlist(x = cells.in.regions))
      cell.lookup <- seq_along(along.with = all.cells)
      names(x = cell.lookup) <- all.cells
    } else {
      cell.lookup <- seq_along(along.with = cells)
      names(cell.lookup) <- cells
    }
    # convert cell name to integer
    cells.in.regions <- unlist(x = cells.in.regions)
    cells.in.regions <- unname(obj = cell.lookup[cells.in.regions])
    all.features <- GRangesToString(grange = regions, sep = sep)
    feature.vec <- rep(x = seq_along(along.with = all.features), nrep)
    featmat <- sparseMatrix(
      i = feature.vec,
      j = cells.in.regions,
      x = rep(x = 1, length(x = cells.in.regions))
    )
    featmat <- as(Class = "CsparseMatrix", object = featmat)
    rownames(x = featmat) <- all.features[1:max(feature.vec)]
    colnames(x = featmat) <- names(x = cell.lookup)[1:max(cells.in.regions)]
    # add zero columns for missing cells
    if (!is.null(x = cells)) {
      featmat <- AddMissing(x = featmat, cells = cells, features = NULL)
    }
    # add zero rows for missing features
    missing.features <- all.features[!(all.features %in% rownames(x = featmat))]
    if (length(x = missing.features) > 0) {
      null.mat <- sparseMatrix(
        i = c(),
        j = c(),
        dims = c(length(x = missing.features), ncol(x = featmat))
      )
      rownames(x = null.mat) <- missing.features
      null.mat <- as(object = null.mat, Class = "CsparseMatrix")
      featmat <- rbind(featmat, null.mat)
    }
    return(featmat)
  }
}

# Convert PFMMatrix to
# @param x A PFMatrix
PFMatrixToList <- function(x) {
  if (!requireNamespace("TFBSTools", quietly = TRUE)) {
    stop("Please install TFBSTools.
         https://www.bioconductor.org/packages/TFBSTools/")
  }
  position.matrix <- TFBSTools::Matrix(x = x)
  name.use <- TFBSTools::name(x = x)
  return(list("matrix" = position.matrix, "name" = name.use))
}

#' @importFrom Matrix rowMeans rowSums
SparseRowVar <- function(x) {
  return(rowSums(x = (x - rowMeans(x = x)) ^ 2) / (dim(x = x)[2] - 1))
}

#' @importMethodsFrom Matrix t
SparseColVar <- function(x) {
  return(SparseRowVar(x = t(x = x)))
}

# Replace non-zero entries in a sparse entries with non-zero ranks
#
# This method creates a rank matrix for a sparse matrix X using the following approach:
# 1. Use non-zero entries in a column to calculate the ranks
# 2. Add (z-1)/2 to the ranks (only non-zero entries are changed). z is the number of zeros
# in the column
# Since all the entries are shifted by the same constant (the zeros
# are already shifted), the covariance matrix of this shifted matrix is
# the same as the rank matrix of the entire matrix (where the zeros would
# all also have a rank = (z+1)/2) where z is the number of zeros
#
# This rank matrix can then be used to calculate pearson correlation
SparsifiedRanks <- function(X){
  if (!inherits(x = X, what = "CsparseMatrix")) {
    X <- as(object = X, Class = "CsparseMatrix")
  }
  non_zeros_per_col <- diff(x = X@p)
  n_zeros_per_col <- nrow(x = X) - non_zeros_per_col
  offsets <- (n_zeros_per_col - 1) / 2
  x <- X@x
  ## split entries to columns
  col_lst <- split(
    x = x,
    f = rep.int(x = 1:ncol(x = X), times = non_zeros_per_col)
  )
  ## calculate sparsified ranks and do shifting
  sparsified_ranks <- unlist(
    x = lapply(
      X = seq_along(col_lst),
      FUN = function(i) rank(x = col_lst[[i]]) + offsets[i]
      )
    )
  ## Create template rank matrix
  X.ranks <- X
  X.ranks@x <- sparsified_ranks
  return(X.ranks)
}

SparseSpearmanCor <- function(X, Y = NULL, cov = FALSE) {
  # Get sparsified ranks
  rankX <- SparsifiedRanks(X = X)
  if (is.null(Y)){
    # Calculate pearson correlation on rank matrices
    return (corSparse(X = rankX, cov = cov))
    }
  rankY <- SparsifiedRanks(X = Y)
  return(corSparse(X = rankX, Y = rankY, cov = cov))
}