ArchR: Analyzing single-cell regulatory chromatin in R.

Documented in addMotifAnnotations addPeakAnnotations getMatches getPeakAnnotation getPositions

##########################################################################################
# Annotation Methods
##########################################################################################

#' Get peakAnnotation from an ArchRProject
#' 
#' This function gets a peakAnnotation from a given ArchRProject.
#' 
#' @param ArchRProj An `ArchRProject` object.
#' @param name The name of the `peakAnnotation` object (i.e. Motifs) to retrieve from the designated `ArchRProject`.
#' 
#' @examples
#'
#' # Get Test ArchR Project
#' proj <- getTestProject()
#'
#' # Get Peak Annotations
#' peakAnno <- getPeakAnnotation(proj)
#'
#' @export
getPeakAnnotation <- function(ArchRProj = NULL, name = NULL){
  .validInput(input = ArchRProj, name = "ArchRProj", valid = c("ArchRProj"))
  .validInput(input = name, name = "name", valid = c("character", "null"))
  if(is.null(name)){
    name <- 1
  }else{
    if(name %ni% names(ArchRProj@peakAnnotation)){
      stop("Name is not in peakAnnotation!")
    }
  }
  ArchRProj@peakAnnotation[[name]]
}

#' Get peak annotation positions from an ArchRProject
#' 
#' This function gets the peak annotation positions (i.e. Motifs) from a given ArchRProject.
#' 
#' @param ArchRProj An `ArchRProject` object.
#' @param name The name of the `peakAnnotation` object (i.e. Motifs) to retrieve from the designated `ArchRProject`.
#' @param annoName The name of a specific annotation to subset within the `peakAnnotation`.
#' 
#' @examples
#'
#' # Get Test ArchR Project
#' proj <- getTestProject()
#'
#' # Get Annotation Positions
#' positions <- getPositions(proj)
#'
#' @export
getPositions <- function(ArchRProj = NULL, name = NULL, annoName = NULL){
  .validInput(input = ArchRProj, name = "ArchRProj", valid = c("ArchRProj"))
  .validInput(input = name, name = "name", valid = c("character", "null"))
  .validInput(input = annoName, name = "annoName", valid = c("character", "null"))
  if(is.null(name)){
    name <- 1
  }else{
    if(name %ni% names(ArchRProj@peakAnnotation)){
      stop("Name is not in peakAnnotation!")
    }
  }
  anno <- ArchRProj@peakAnnotation[[name]]
  idx <- grep("positions", names(anno), ignore.case=TRUE)
  if(length(idx)==0){
    stop("peakAnnotation does not contain positions!")
  }
  positions <- readRDS(anno[[idx]])
  if(!is.null(annoName)){
    idx <- grep(annoName, names(positions), ignore.case=TRUE)
    if(length(idx)==0){
      stop("Positons do not contain annoName!")
    }
    positions <- positions[idx]
  }
  positions
}

#' Get peak annotation matches from an ArchRProject
#' 
#' This function gets peak annotation matches from a given ArchRProject. The peaks in the returned object are in the
#' same order as the peaks returned by `getPeakSet()`.
#' 
#' @param ArchRProj An `ArchRProject` object.
#' @param name The name of the `peakAnnotation` object (i.e. Motifs) to retrieve from the designated `ArchRProject`.
#' @param annoName The name of a specific annotation to subset within the `peakAnnotation`.
#' 
#' @examples
#'
#' # Get Test ArchR Project
#' proj <- getTestProject()
#'
#' # Get Annotation Matches
#' matches <- getMatches(proj)
#'
#' @export
getMatches <- function(ArchRProj = NULL, name = NULL, annoName = NULL){
  .validInput(input = ArchRProj, name = "ArchRProj", valid = c("ArchRProj"))
  .validInput(input = name, name = "name", valid = c("character", "null"))
  .validInput(input = annoName, name = "annoName", valid = c("character", "null"))
  if(is.null(name)){
    name <- 1
  }else{
    if(name %ni% names(ArchRProj@peakAnnotation)){
      stop("Name is not in peakAnnotation!")
    }
  }
  anno <- ArchRProj@peakAnnotation[[name]]
  idx <- grep("matches", names(anno), ignore.case=TRUE)
  if(length(idx)==0){
    stop("peakAnnotation does not contain positions!")
  }
  matches <- readRDS(anno[[idx]])
  if(!is.null(annoName)){
    idx <- grep(annoName, colnames(matches), ignore.case=TRUE)
    if(length(idx)==0){
      stop("Matches do not contain annoName!")
    }
    matches <- matches[, idx, drop=FALSE]
  }
  #Check
  rr1 <- paste0(rowRanges(matches))
  rr2 <- paste0(getPeakSet(ArchRProj))
  if(!all(rr1 %in% rr2)){
    stop("Not all matches in PeakSet")
  }
  rownames(matches) <- rr1
  matches <- matches[rr2,]
  rownames(matches) <- NULL
  matches

}

#' Add peak annotations to an ArchRProject
#' 
#' This function adds information about which peaks contain input regions to a given ArchRProject. For each peak, a
#' binary value is stored indicating whether each region is observed within the peak region.
#' 
#' @param ArchRProj An `ArchRProject` object.
#' @param regions A named `list` of `GRanges` that are to be overlapped with the `peakSet` in the `ArchRProject`.
#' @param name The name of `peakAnnotation` object to be stored as in `ArchRProject`.
#' @param force A boolean value indicating whether to force the `peakAnnotation` object indicated by `name` to be overwritten
#' if it already exists in the given `ArchRProject`.
#' @param logFile The path to a file to be used for logging ArchR output.
#' 
#' @examples
#'
#' # Get Test ArchR Project
#' proj <- getTestProject()
#'
#' # Get Motif Positions Can Be Any Interval GRanges List
#' positions <- getPositions(proj)
#'
#' # Add Peak Annotations
#' proj <- addPeakAnnotations(proj, regions = positions)
#'
#' @export
addPeakAnnotations <- function(
  ArchRProj = NULL,
  regions = NULL,
  name = "Region",
  force = FALSE,
  logFile = createLogFile("addPeakAnnotations")
  ){

  .validInput(input = ArchRProj, name = "ArchRProj", valid = c("ArchRProj"))
  .validInput(input = regions, name = "regions", valid = c("grangeslist", "character"))
  .validInput(input = name, name = "name", valid = c("character"))
  .validInput(input = force, name = "force", valid = c("boolean"))
  .validInput(input = logFile, name = "logFile", valid = c("character"))

  tstart <- Sys.time()
  .startLogging(logFile = logFile)
  .logThis(mget(names(formals()),sys.frame(sys.nframe())), "addPeakAnnotations Input-Parameters", logFile = logFile)

  if(name %in% names(ArchRProj@peakAnnotation)){
    if(force){
      .logMessage("peakAnnotation name already exists! Overriding.", verbose = TRUE, logFile = logFile)
    }else{
      .logStop("peakAnnotation name already exists! set force = TRUE to override!", logFile = logFile)
    }
  }

  if(inherits(regions, "GRanges")){

    regionPositions <- GRangesList(region = regions)

  }else{

    if(is.null(names(regions))){
      names(regions) <- paste0("Region_", seq_along(regions))
    }

    if(any(duplicated(names(regions)))){
      stop("Found duplicated region names! Please make unique!")
    }

    regionPositions <- lapply(seq_along(regions), function(x){
      
      .logThis(regions[[x]], paste0("regions[[x]]-", x), logFile = logFile)

      if(inherits(regions[[x]], "GRanges")){

          gr <- .validGRanges(regions[[x]])

      }else if(is.character(regions[[x]])){

        gr <- tryCatch({
          makeGRangesFromDataFrame(
            df = data.frame(data.table::fread(regions[[x]])), 
            keep.extra.columns = FALSE,
            seqnames.field = "V1",
            start.field = "V2",
            end.field = "V3"
          )
        }, error = function(y){

          .logMessage(paste0("Could not successfully get region : ", regions[[x]]), verbose = TRUE, logFile = logFile)

          if(!file.exists(regions[[x]])){
            .logStop(paste0("If region provided is a path it does not exist!"), logFile = logFile)
          }
          
          .logStop("Could not create GRanges from region", logFile = logFile)

        })

      }else{
        
        .logStop("Unrecognized input in regions please input GRanges, GRangesList, or Paths to bed files!", logFile = logFile)
      
      }

      gr

    }) %>% GRangesList

    names(regionPositions) <- names(regions)

  }

  #############################################################
  # Peak Overlap Matrix
  #############################################################
  peakSet <- getPeakSet(ArchRProj)
  if(is.null(peakSet)){
    .logStop("peakSet is NULL. You need a peakset to run addMotifAnnotations! See addReproduciblePeakSet!", logFile = logFile)
  }
  allPositions <- unlist(regionPositions, use.names=TRUE)

  .logDiffTime("Creating Peak Overlap Matrix", t1 = tstart, verbose = TRUE, logFile = logFile)

  overlapRegions <- findOverlaps(peakSet, allPositions, ignore.strand=TRUE)
  if(length(overlapRegions) == 0){
    stop("No Overlaps Found between regions and peak Matrix!")
  }
  .logThis(overlapRegions, "overlapRegions", logFile = logFile)

  regionMat <- Matrix::sparseMatrix(
    i = queryHits(overlapRegions),
    j = match(names(allPositions),names(regionPositions))[subjectHits(overlapRegions)],
    x = rep(TRUE, length(overlapRegions)),
    dims = c(length(peakSet), length(regionPositions))
  )
  colnames(regionMat) <- names(regionPositions)
  .logThis(regionMat, "regionMat", logFile = logFile)

  regionMat <- SummarizedExperiment::SummarizedExperiment(assays=SimpleList(matches = regionMat), rowRanges = peakSet)
  .logThis(regionMat, "regionSE", logFile = logFile)

  #############################################################
  # Filter Regions With No Matches
  #############################################################

  #Number of Overlaps
  nO <- Matrix::colSums(assay(regionMat))
  rF <- names(which(nO == 0))

  if(all(nO == 0)){
    stop("No Overlaps Found! Please check your peakSet and genome!")
  }

  if(length(rF) > 0){
    .logDiffTime(paste0("Filtering Region Annotations with 0 overlaps :\n\n ", paste(rF, collapse=", "), "\n\n"), t1 = tstart, verbose = TRUE, logFile = logFile)
    #Filter
    regionPositions <- regionPositions[!(names(regionPositions) %in% rF)]
    regionMat <- regionMat[,names(regionPositions),drop=FALSE]
  }else{
    .logDiffTime(paste0("All Regions Overlap at least 1 peak!"), t1 = tstart, verbose = TRUE, logFile = logFile)
  }  

  #############################################################
  # Summarize and Save
  #############################################################

  dir.create(file.path(getOutputDirectory(ArchRProj), "Annotations"), showWarnings=FALSE)
  savePositions <- file.path(getOutputDirectory(ArchRProj), "Annotations", paste0(name,"-Positions-In-Peaks.rds"))
  saveMatches <- file.path(getOutputDirectory(ArchRProj), "Annotations", paste0(name,"-Matches-In-Peaks.rds"))

  out <- SimpleList(
      regionMatches = regionMat,
      regionPositions = regionPositions,
      date = Sys.Date()
    )

  ArchRProj@peakAnnotation[[name]]$Name <- name
  ArchRProj@peakAnnotation[[name]]$Positions <- savePositions
  ArchRProj@peakAnnotation[[name]]$Matches <- saveMatches

  .safeSaveRDS(out, file.path(getOutputDirectory(ArchRProj),  "Annotations", paste0(name,"-In-Peaks-Summary.rds")), compress = FALSE)
  .safeSaveRDS(out$regionPositions, savePositions, compress = FALSE)
  .safeSaveRDS(out$regionMatches, saveMatches, compress = FALSE)

  return(ArchRProj)

}

#' Add motif annotations to an ArchRProject
#' 
#' This function adds information about which peaks contain motifs to a given ArchRProject. For each peak, a binary value
#' is stored indicating whether each motif is observed within the peak region.
#' 
#' @param ArchRProj An `ArchRProject` object.
#' @param motifSet The name of a curated motif set to be used for annotation. Options include: (i) "JASPAR2016", "JASPAR2018", "JASPAR2020"
#' which gives the 2016, 2018, or 2020 version of JASPAR motifs, (ii) one of "cisbp", "encode", or "homer" which gives the
#' corresponding motif sets from the `chromVAR` package, or (iii) "vierstra" which gives the clustered archetype motifs
#' created by Jeff Vierstra (https://github.com/jvierstra/motif-clustering). 
#' @param annoName The name of the `peakAnnotation` object to be stored in the provided `ArchRProject`
#' @param species The latin name of the species relevant to the supplied `ArchRProject`. This is used for identifying which motif to be
#' used from CisBP/JASPAR. For JASPAR, `species` is passed to `TFBS::getMatrixSet` and some species names are not recognized. In these cases
#' it is possible to use the NCBI taxonomy ID. By default, this function will attempt to guess the species based on the value from `getGenome()`.
#' @param collection If one of the JASPAR motif sets is used via `motifSet`, this parameter allows you to indicate the JASPAR
#' collection to be used. See `getMatrixSet()` from `TFBSTools` for all options to supply for collection. If `motifSet` is
#' "vierstra", then this must either be "archetype" (for the v2.1 clustered models) or "individual" (for the original v1 individual motif models). 
#' NOTE: vierstra archetype motifs are currently in beta and have not been finalized by Jeff Vierstra.
#' @param motifPWMs A custom set of motif PWMs as a PWMatrixList to be used instead of `motifSet` for adding motif annotations.
#' If `motifPWMs` is used, `motifSet` will be ignored.
#' @param cutOff The p-value cutoff to be used for motif search. The p-value is determined vs a background set of sequences
#' (see `MOODS` for more details on this determination).
#' @param width The width in basepairs to consider for motif matches. See the `motimatchr` package for more information.
#' @param version An integer specifying version 1 or version 2 of chromVARmotifs see github for more info GreenleafLab/chromVARmotifs.
#' @param force A boolean value indicating whether to force the `peakAnnotation` object indicated by `annoName` to be overwritten if
#' it already exists in the given `ArchRProject`.
#' @param logFile The path to a file to be used for logging ArchR output.
#' @param ... Additional parameters to be passed to `TFBSTools::getMatrixSet` for getting a JASPAR PWM object.
#' 
#' @examples
#'
#' # Get Test ArchR Project
#' proj <- getTestProject()
#'
#' # Add Motif Annotations
#' proj <- addMotifAnnotations(proj, motifSet = "cisbptest", annoName = "test")
#'
#' @export
addMotifAnnotations <- function(
  ArchRProj = NULL,
  motifSet = "cisbp",
  annoName = "Motif",
  species = NULL,
  collection = "CORE",
  motifPWMs = NULL,
  cutOff = 5e-05, 
  width = 7,
  version = 2,
  force = FALSE,
  logFile = createLogFile("addMotifAnnotations"),
  ...
  ){

  .validInput(input = ArchRProj, name = "ArchRProj", valid = c("ArchRProj"))
  .validInput(input = motifSet, name = "motifSet", valid = c("character", "null"))
  .validInput(input = annoName, name = "annoName", valid = c("character"))
  .validInput(input = species, name = "species", valid = c("character", "null"))
  .validInput(input = collection, name = "collection", valid = c("character", "null"))
  .validInput(input = cutOff, name = "cutOff", valid = c("numeric"))
  .validInput(input = width, name = "width", valid = c("integer"))
  .validInput(input = force, name = "force", valid = c("boolean"))
  .validInput(input = logFile, name = "logFile", valid = c("character"))

  if(!is.null(motifPWMs)){
    if(!is(motifPWMs, "PWMatrixList")){
      stop("User Supplied motifPWMS must be a PWMatrixList!")
    }
    motifSet <- "Custom"
  }

  if(is.null(motifSet)){
    stop("Must provide motifSet or motifPWMs!")
  }

  .requirePackage("motifmatchr", installInfo='BiocManager::install("motifmatchr")')

  tstart <- Sys.time()
  .startLogging(logFile = logFile)
  .logThis(mget(names(formals()),sys.frame(sys.nframe())), "addMotifAnnotations Input-Parameters", logFile = logFile)

  if(annoName %in% names(ArchRProj@peakAnnotation)){
    if(force){
      message("peakAnnotation name already exists! Overriding.")
    }else{
      stop("peakAnnotation name already exists! set force = TRUE to override!")
    }
  }

  if(grepl("JASPAR|CISBP", motifSet, ignore.case = TRUE) & is.null(species)){
    if(grepl("hg19",getGenomeAnnotation(ArchRProj)$genome, ignore.case = TRUE)){
      species <- "Homo sapiens"
    }
    if(grepl("hg38",getGenomeAnnotation(ArchRProj)$genome, ignore.case = TRUE)){
      species <- "Homo sapiens"
    }
    if(grepl("mm9",getGenomeAnnotation(ArchRProj)$genome, ignore.case = TRUE)){
      species <- "Mus musculus"
    }
    if(grepl("mm10",getGenomeAnnotation(ArchRProj)$genome, ignore.case = TRUE)){
      species <- "Mus musculus"
    }
  }

  #############################################################
  # Get PWM List adapted from chromVAR!
  #############################################################

  .logDiffTime(paste0("Getting Motif Set, Species : ", species), t1 = tstart, verbose = TRUE, logFile = logFile)

  if(tolower(motifSet)=="jaspar2020"){
    
    .requirePackage("JASPAR2020",installInfo='BiocManager::install("JASPAR2020")')
    args <- list(species = species, collection = collection, ...)
    motifs <- TFBSTools::getMatrixSet(JASPAR2020::JASPAR2020, args)
    obj <- .summarizeJASPARMotifs(motifs)
    motifs <- obj$motifs
    motifSummary <- obj$motifSummary

  }else if(tolower(motifSet)=="jaspar2018"){

    .requirePackage("JASPAR2018",installInfo='BiocManager::install("JASPAR2018")')
    args <- list(species = species, collection = collection, ...)
    motifs <- TFBSTools::getMatrixSet(JASPAR2018::JASPAR2018, args)
    obj <- .summarizeJASPARMotifs(motifs)
    motifs <- obj$motifs
    motifSummary <- obj$motifSummary

  }else if(tolower(motifSet)=="jaspar2016"){

    .requirePackage("JASPAR2016",installInfo='BiocManager::install("JASPAR2018")')
    args <- list(species = species, collection = collection, ...)
    motifs <- TFBSTools::getMatrixSet(JASPAR2016::JASPAR2016, args)
    obj <- .summarizeJASPARMotifs(motifs)
    motifs <- obj$motifs
    motifSummary <- obj$motifSummary

  }else if(tolower(motifSet) %in% c("cisbp", "cisbptest")){

    .requirePackage("chromVARmotifs",installInfo='devtools::install_github("GreenleafLab/chromVARmotifs")')

    if(tolower(motifSet) == "cisbptest"){
        
        message("Using version 2 motifs!")
        data("human_pwms_v2")
        motifs <- human_pwms_v2
        subset <- grep("PAX5|CEBPA|CEBPB|IRF4|ETS1|EOMES", names(motifs), value=TRUE)
        motifs <- motifs[subset]
        obj <- .summarizeChromVARMotifs(motifs)
        motifs <- obj$motifs
        motifSummary <- obj$motifSummary

    }else{

      if(tolower(species) == "mus musculus"){
        if(version == 1){
          message("Using version 1 motifs!")
          data("mouse_pwms_v1")
          motifs <- mouse_pwms_v1        
        }else if(version == 2){
          message("Using version 2 motifs!")
          data("mouse_pwms_v2")
          motifs <- mouse_pwms_v2
        }else{
          stop("Only versions 1 and 2 exist!")
        }
        obj <- .summarizeChromVARMotifs(motifs)
        motifs <- obj$motifs
        motifSummary <- obj$motifSummary
      }else if(tolower(species) == "homo sapiens"){
        if(version == 1){
          message("Using version 1 motifs!")
          data("human_pwms_v1")
          motifs <- human_pwms_v1        
        }else if(version == 2){
          message("Using version 2 motifs!")
          data("human_pwms_v2")
          motifs <- human_pwms_v2
        }else{
          stop("Only versions 1 and 2 exist!")
        }
        obj <- .summarizeChromVARMotifs(motifs)
        motifs <- obj$motifs
        motifSummary <- obj$motifSummary
      }else{
        stop(paste0("Species ",species," not available in CisBP! Currently, only homo sapiens and mus musculus are supported. Please use a different motifSet."))
      }

    }

  }else if(tolower(motifSet)=="encode"){

    .requirePackage("chromVARmotifs",installInfo='devtools::install_github("GreenleafLab/chromVARmotifs")')
    data("encode_pwms")
    motifs <- encode_pwms
    obj <- .summarizeChromVARMotifs(motifs)
    motifs <- obj$motifs
    motifSummary <- obj$motifSummary

  }else if(tolower(motifSet)=="homer"){

    .requirePackage("chromVARmotifs",installInfo='devtools::install_github("GreenleafLab/chromVARmotifs")')
    data("homer_pwms")
    motifs <- homer_pwms
    obj <- .summarizeChromVARMotifs(motifs)
    motifs <- obj$motifs
    motifSummary <- obj$motifSummary

  }else if(tolower(motifSet)=="vierstra"){
    if(tolower(collection)=="individual"){
      url = "https://jeffgranja.s3.amazonaws.com/ArchR/Annotations/Vierstra_Individual_Motifs.rds"
      message("Using Vierstra v1.0 motifs. See https://www.vierstra.org/resources/motif_clustering for more details.")
    } else if(tolower(collection == "archetype")){
      url = "https://jeffgranja.s3.amazonaws.com/ArchR/Annotations/Vierstra_Archetype_Motifs_v2.1.rds"
      message("Using Vierstra v2.1beta motifs. See https://resources.altius.org/~jvierstra/projects/motif-clustering-v2.1beta/ for more details.")
    } else {
      stop(paste0("Error! collection ", collection, " not recognized for motifSet ",motifSet,
        ". Accepted values are 'individual' and 'archetype'"))
    }

    annoPath <- file.path(find.package("ArchR", NULL, quiet = TRUE), "data", "Annotations")
    dir.create(annoPath, showWarnings = FALSE)

    #Download
    if(!file.exists(file.path(annoPath, basename(url)))){
      message("Motif file ", basename(url)," does not exist! Downloading..")
      download.file(
        url = url, 
        destfile = file.path(annoPath, basename(url)),
        quiet = FALSE
      )
    }
    motifFile <- file.path(annoPath, basename(url))

    motifs <- readRDS(motifFile)
    obj <- NULL
    motifSummary <- NULL

  }else if(tolower(motifSet)=="custom"){

    obj <- NULL
    motifs <- motifPWMs
    motifSummary <- NULL

  }else{

    stop("Error MotifSet Not Recognized!")

  }

  .logThis(motifs, "motifs", logFile = logFile)
  .logThis(obj, "obj", logFile = logFile)
  .logThis(motifSummary, "motifSummary", logFile = logFile)

  #############################################################
  # Get BSgenome Information!
  #############################################################
  genome <- ArchRProj@genomeAnnotation$genome
  BSgenome <- tryCatch({
    eval(parse(text = paste0(genome)))
  }, error = function(e){
    eval(parse(text = paste0(genome,"::",genome)))
  })
  BSgenome <- validBSgenome(BSgenome)

  #############################################################
  # Calculate Motif Positions
  #############################################################
  .logDiffTime("Finding Motif Positions with motifmatchr!", t1 = tstart, verbose = TRUE, logFile = logFile)
  peakSet <- ArchRProj@peakSet
  if(is.null(peakSet)){
    .logStop("peakSet is NULL. You need a peakset to run addMotifAnnotations! See addReproduciblePeakSet!", logFile = logFile)
  }
  motifPositions <- motifmatchr::matchMotifs(
      pwms = motifs,
      subject = peakSet,
      genome = BSgenome, 
      out = "positions", 
      p.cutoff = cutOff, 
      w = width
    )

  #############################################################
  # Filter Motifs With No Matches
  #############################################################

  #Number of Overlaps
  nO <- lapply(motifPositions, length) %>% unlist
  mF <- names(which(nO == 0))

  if(all(nO == 0)){
    stop("No Overlaps Found! Please check your peakSet and genome!")
  }

  if(length(mF) > 0){
    .logDiffTime(paste0("Filtering Motif Annotations with 0 overlaps :\n\n ", paste(mF, collapse=", "), "\n\n"), t1 = tstart, verbose = TRUE, logFile = logFile)
    #Filter
    motifPositions <- motifPositions[nO > 0]
    motifSummary <- motifSummary[names(motifPositions),,drop=FALSE]
    motifs <- motifs[names(motifPositions)]
  }else{
    .logDiffTime(paste0("All Motifs Overlap at least 1 peak!"), t1 = tstart, verbose = TRUE, logFile = logFile)
  }  

  #############################################################
  # Motif Overlap Matrix
  #############################################################
  .logDiffTime("Creating Motif Overlap Matrix", t1 = tstart, verbose = TRUE, logFile = logFile)
  allPositions <- unlist(motifPositions)
  overlapMotifs <- findOverlaps(peakSet, allPositions, ignore.strand=TRUE)
  motifMat <- Matrix::sparseMatrix(
    i = queryHits(overlapMotifs),
    j = match(names(allPositions),names(motifPositions))[subjectHits(overlapMotifs)],
    x = rep(TRUE, length(overlapMotifs)),
    dims = c(length(peakSet), length(motifPositions))
  )
  colnames(motifMat) <- names(motifPositions)
  motifMat <- SummarizedExperiment::SummarizedExperiment(assays=SimpleList(matches = motifMat), rowRanges = peakSet)
  .logDiffTime("Finished Getting Motif Info!", t1 = tstart, verbose = TRUE, logFile = logFile)

  out <- SimpleList(
      motifSummary = motifSummary,
      motifMatches = motifMat,
      motifPositions = motifPositions,
      motifList = motifs,
      date = Sys.Date()
    )

  dir.create(file.path(getOutputDirectory(ArchRProj), "Annotations"), showWarnings=FALSE)
  savePositions <- file.path(getOutputDirectory(ArchRProj), "Annotations", paste0(annoName,"-Positions-In-Peaks.rds"))
  saveMatches <- file.path(getOutputDirectory(ArchRProj), "Annotations", paste0(annoName,"-Matches-In-Peaks.rds"))

  ArchRProj@peakAnnotation[[annoName]]$Name <- annoName
  ArchRProj@peakAnnotation[[annoName]]$motifs <- motifs
  ArchRProj@peakAnnotation[[annoName]]$motifSummary <- motifSummary
  ArchRProj@peakAnnotation[[annoName]]$Positions <- savePositions
  ArchRProj@peakAnnotation[[annoName]]$Matches <- saveMatches

  .safeSaveRDS(out, file.path(getOutputDirectory(ArchRProj),  "Annotations", paste0(annoName,"-In-Peaks-Summary.rds")), compress = FALSE)
  .safeSaveRDS(out$motifPositions, savePositions, compress = FALSE)
  .safeSaveRDS(out$motifMatches, saveMatches, compress = FALSE)

  .endLogging(logFile = logFile)

  return(ArchRProj)

}

.summarizeJASPARMotifs <- function(motifs = NULL){

  motifNames <- lapply(seq_along(motifs), function(x){
    namex <- make.names(motifs[[x]]@name)
    if(substr(namex,nchar(namex),nchar(namex))=="."){
      namex <- substr(namex,1,nchar(namex)-1)
    }
    namex <- paste0(namex, "_", x)
    namex
  }) %>% unlist(.)

  motifDF <- lapply(seq_along(motifs), function(x){
    data.frame(
      row.names = motifNames[x],
      name = motifs[[x]]@name[[1]],
      ID = motifs[[x]]@ID,
      strand = motifs[[x]]@strand,
      symbol = ifelse(!is.null(motifs[[x]]@tags$symbol[1]), motifs[[x]]@tags$symbol[1], NA) ,
      family = ifelse(!is.null(motifs[[x]]@tags$family[1]), motifs[[x]]@tags$family[1], NA),
      alias = ifelse(!is.null(motifs[[x]]@tags$alias[1]), motifs[[x]]@tags$alias[1], NA),
      stringsAsFactors = FALSE
    )
  }) %>% Reduce("rbind", .) %>% DataFrame
  
  names(motifs) <- motifNames

  out <- list(motifs = motifs, motifSummary = motifDF)

  return(out)
  
}

.summarizeChromVARMotifs <- function(motifs = NULL){

  motifNames <- lapply(seq_along(motifs), function(x){
    namex <- make.names(motifs[[x]]@name)
    if(grepl("LINE", namex)){
      splitNamex <- stringr::str_split(motifs[[x]]@ID, pattern="\\_", simplify = TRUE)
      namex <- splitNamex[1, grep("LINE",splitNamex[1,]) + 1]
    }
    if(substr(namex,nchar(namex),nchar(namex))=="."){
      namex <- substr(namex,1,nchar(namex)-1)
    }
    namex <- paste0(namex, "_", x)
    namex
  }) %>% unlist(.)

  motifNames2 <- lapply(seq_along(motifs), function(x){
    namex <- make.names(motifs[[x]]@name)
    if(grepl("LINE", namex)){
      splitNamex <- stringr::str_split(motifs[[x]]@ID, pattern="\\_", simplify = TRUE)
      namex <- splitNamex[1, grep("LINE",splitNamex[1,]) + 1]
    }
    if(substr(namex,nchar(namex),nchar(namex))=="."){
      namex <- substr(namex,1,nchar(namex)-1)
    }
    namex
  }) %>% unlist(.)

  motifDF <- lapply(seq_along(motifs), function(x){
    df <- data.frame(
      row.names = motifNames[x],
      name = motifNames2[[x]],
      ID = motifs[[x]]@ID,
      strand = motifs[[x]]@strand,
      stringsAsFactors = FALSE
    )
  }) %>% Reduce("rbind", .) %>% DataFrame

  names(motifs) <- motifNames

  out <- list(motifs = motifs, motifSummary = motifDF)

  return(out)

}

#' Add ArchR annotations to an ArchRProject
#' 
#' This function adds information about which peaks in the ArchR database contain input regions to a given ArchRProject.
#' For each peak, a binary value is stored indicating whether each region is observed within the peak region.
#' 
#' @param ArchRProj An `ArchRProject` object.
#' @param db A string indicating the database or a path to a database to use for peak annotation. Options include ArchR,
#' LOLA, and a valid path to a file of class `ArchRAnno`.
#' @param collection A string indicating which collection within the database to collect for annotation. 
#' For ArchR, options are "ATAC", "EncodeTFBS", "CistromeTFBS", or "Codex".
#' For LOLA, options include "EncodeTFBS" "CistromeTFBS", "CistromeEpigenome", "Codex", or "SheffieldDnase".
#' If supplying a custom `ArchRAnno` file please select a valid collection from within that database.
#' @param name The name of the `peakAnnotation` object to be stored in the `ArchRProject`.
#' @param force A boolean value indicating whether to force the `peakAnnotation` object indicated by `name` to be
#' overwritten if it already exists in the given `ArchRProject`.
#' @param logFile The path to a file to be used for logging ArchR output.
#' 
#' @examples
#'
#' # Get Test ArchR Project
#' proj <- getTestProject()
#'
#' # Add Motif Annotations
#' proj <- addArchRAnnotations(proj, name = "test")
#'
#' @export
addArchRAnnotations <- function(
  ArchRProj = NULL,
  db = "ArchR",
  collection = "EncodeTFBS",
  name = collection,
  force = FALSE,
  logFile = createLogFile("addArchRAnnotations")
  ){

  .validInput(input = ArchRProj, name = "ArchRProj", valid = c("ArchRProj"))
  .validInput(input = db, name = "db", valid = c("character"))
  .validInput(input = collection, name = "collection", valid = c("character"))
  .validInput(input = name, name = "name", valid = c("character"))
  .validInput(input = force, name = "force", valid = c("boolean"))
  .validInput(input = logFile, name = "logFile", valid = c("character"))

  tstart <- Sys.time()
  .startLogging(logFile = logFile)
  .logThis(mget(names(formals()),sys.frame(sys.nframe())), "addArchRAnnotations Input-Parameters", logFile = logFile)

  if(name %in% names(ArchRProj@peakAnnotation)){
    if(force){
      message("peakAnnotation name already exists! Overriding.")
    }else{
      stop("peakAnnotation name already exists! set force = TRUE to override!")
    }
  }

  genome <- tolower(tryCatch({
    validBSgenome(getGenome(ArchRProj))$provider_version
  }, error = function(e){
    strsplit(validBSgenome(getGenome(ArchRProj))@pkgname,"\\.")[[1]][4]
  }))

  annoPath <- file.path(find.package("ArchR", NULL, quiet = TRUE), "data", "Annotations")
  dir.create(annoPath, showWarnings = FALSE)
  
  if(tolower(db) == "lola"){
  
    if(genome == "hg19"){
      url <- "https://jeffgranja.s3.amazonaws.com/ArchR/Annotations/LOLA-Hg19-v1.Anno"
    }else if(genome == "hg38"){
      url <- "https://jeffgranja.s3.amazonaws.com/ArchR/Annotations/LOLA-Hg38-v1.Anno"
    }else if(genome == "mm9"){
      url <- "https://jeffgranja.s3.amazonaws.com/ArchR/Annotations/LOLA-Mm9-v1.Anno"
    }else if(genome == "mm10"){
      url <- "https://jeffgranja.s3.amazonaws.com/ArchR/Annotations/LOLA-Mm10-v1.Anno"
    }
    
    #Download
    if(!file.exists(file.path(annoPath, basename(url)))){
      message("Annotation ", basename(url)," does not exist! Downloading..")
      download.file(
        url = url, 
        destfile = file.path(annoPath, basename(url)),
        quiet = FALSE
      )
    }
    AnnoFile <- file.path(annoPath, basename(url))

  }else if(tolower(db) == "archr"){

    if(genome == "hg19"){
      url <- "https://jeffgranja.s3.amazonaws.com/ArchR/Annotations/ArchR-Hg19-v1.Anno"
    }else if(genome == "hg38"){
      url <- "https://jeffgranja.s3.amazonaws.com/ArchR/Annotations/ArchR-Hg38-v1.Anno"
    }else if(genome == "mm9"){
      stop("ArchR mm9 annotations not yet supported! Try LOLA for now!")
      url <- "https://jeffgranja.s3.amazonaws.com/ArchR/Annotations/ArchR-Mm9-v1.Anno"
    }else if(genome == "mm10"){
      stop("ArchR mm10 annotations not yet supported! Try LOLA for now!")
      url <- "https://jeffgranja.s3.amazonaws.com/ArchR/Annotations/ArchR-Mm10-v1.Anno"
    }

    #Download
    if(!file.exists(file.path(annoPath, basename(url)))){
      oldTimeout <- getOption('timeout')
      options(timeout=10000)
      message("Annotation ", basename(url)," does not exist! Downloading..")
      download.file(
        url = url, 
        destfile = file.path(annoPath, basename(url)),
        quiet = FALSE
      )
      options(timeout=oldTimeout)
    }
    AnnoFile <- file.path(annoPath, basename(url))

  }else if(tolower(db) %ni% c("archr", "lola")){

    if(!file.exists(db)){
      stop("Database path does not exist! Please supply valid database for Annotations!")
    }
    AnnoFile <- db

  }else{

    stop("Please supply valid database for Annotations!")

  }

  #Check AnnoFile is ArchRAnno
  if (h5read(AnnoFile, "Class") != "ArchRAnno") {
    stop("Not Valid ArchRAnno!")
  }

  #Check if Collection is Valid
  h5d <- h5ls(AnnoFile)
  collections <- h5d[h5d$group == "/" & h5d$otype == "H5I_GROUP",]$name
  if(any(tolower(collections) == tolower(collection))){
    collection <- collections[tolower(collections) %in% tolower(collection)]
  }else{
    stop("Not Valid Collection in ArchRAnno Database!")
  }

  #############################################################
  # Peak Overlap Matrix
  #############################################################
  peakSet <- getPeakSet(ArchRProj)
  if(is.null(peakSet)){
    .logStop("peakSet is NULL. You need a peakset to run addMotifAnnotations! See addReproduciblePeakSet!", logFile = logFile)
  }
  chr <- paste0(unique(seqnames(peakSet)))

  .logMessage("Annotating Chromosomes", verbose = TRUE, logFile = logFile)
  regionMat <- lapply(seq_along(chr), function(i){
    .logMessage(paste0("\tAnnotating Chr: ", chr[i]), verbose = TRUE, logFile = logFile)
    regions <- .getRegionsFromAnno(AnnoFile = AnnoFile, Group = collection, chr = chr[i])
    o <- DataFrame(findOverlaps(query = peakSet, subject = regions, ignore.strand = TRUE))
    o$ID <- as.integer(mcols(regions)[o$subjectHits, "ID"])
    o$subjectHits <- NULL
    o
  }) %>% Reduce("rbind", .)
  .logThis(regionMat, "regionMat-Overlaps", logFile=logFile)

  .logThis(range(regionMat[, 1]), "regionMat-Overlaps-1", logFile=logFile)
  .logThis(range(regionMat[, 2]), "regionMat-Overlaps-2", logFile=logFile)
  .logThis(peakSet, "regionMat-Overlaps-1-Peaks", logFile=logFile)
  .logThis(unique(regionMat$ID), "regionMat-Overlaps-2-IDs", logFile=logFile)
  regionMat <- Matrix::sparseMatrix(
    i = regionMat[, 1], 
    j = regionMat[, 2], 
    x = rep(TRUE, nrow(regionMat)), 
    dims = c(length(peakSet), max(regionMat$ID))
  )
  .logThis(regionMat, "regionMat", logFile=logFile)

  #Get Region Metadata
  regionMetadata <- DataFrame(h5read(AnnoFile, paste0(collection,"/Info")))
  rownames(regionMetadata) <- regionMetadata$name
  colnames(regionMat) <- rownames(regionMetadata)
  regionMetadata$name <- NULL

  #Matches Summarized Experiment
  regionMat <- SummarizedExperiment::SummarizedExperiment(
    assays=SimpleList(matches = regionMat), 
    rowRanges = peakSet, 
    colData = regionMetadata
  )
  .logThis(regionMat, "regionSE", logFile=logFile)

  #############################################################
  # Filter Regions With No Matches
  #############################################################

  #Number of Overlaps
  nO <- Matrix::colSums(assay(regionMat))
  rF <- names(which(nO == 0))

  if(all(nO == 0)){
    stop("No Overlaps Found! Please check your peakSet and genome!")
  }

  if(length(rF) > 0){
    .logDiffTime(paste0("Filtering Region Annotations with 0 overlaps :\n\n ", paste(rF, collapse=", "), "\n\n"), t1 = tstart, verbose = TRUE, logFile = logFile)
    #Filter
    regionMat <- regionMat[,nO > 0,drop=FALSE]
  }else{
    .logDiffTime(paste0("All Regions Overlap at least 1 peak!"), t1 = tstart, verbose = TRUE, logFile = logFile)
  }  

  #############################################################
  # Save
  #############################################################

  dir.create(file.path(getOutputDirectory(ArchRProj), "Annotations"), showWarnings=FALSE)
  saveMatches <- file.path(getOutputDirectory(ArchRProj), "Annotations", paste0(name,"-Matches-In-Peaks.rds"))

  out <- SimpleList(
      regionMatches = regionMat,
      regionPositions = "None",
      date = Sys.Date()
    )

  ArchRProj@peakAnnotation[[name]]$Name <- name
  ArchRProj@peakAnnotation[[name]]$Positions <- "None"
  ArchRProj@peakAnnotation[[name]]$Matches <- saveMatches

  .safeSaveRDS(out, file.path(getOutputDirectory(ArchRProj),  "Annotations", paste0(name,"-In-Peaks-Summary.rds")), compress = FALSE)
  .safeSaveRDS(out$regionMatches, saveMatches, compress = FALSE)

  .endLogging(logFile = logFile)

  return(ArchRProj)

}

.getRegionsFromAnno <- function(
  AnnoFile = NULL, 
  Group = NULL,
  chr = NULL, 
  out = "GRanges", 
  method = "fast"
  ){

  if(is.null(chr)){
    stop("Need to provide chromosome to read!")
  }

  o <- h5closeAll()
  if (h5read(AnnoFile, "Class") != "ArchRAnno") {
  stop("Not Valid ArchRAnno!")
  }
  
  if(chr %ni% .availableSeqnames(AnnoFile, Group)){
    stop(paste("Error! Chromosome ",chr," not in AnnoFile!"))
  }

  o <- h5closeAll()
  nRegions <- sum(.h5read(AnnoFile, paste0(Group,"/",chr,"/IDLengths"), method = method))

  if(nRegions==0){
    if(tolower(out)=="granges"){
      output <- GRanges(seqnames = chr, IRanges(start = 1, end = 1), ID = "tmp")
      output <- output[-1,]
    }else{
      output <- IRanges(start = 1, end = 1)
      mcols(output)$ID <- c("tmp")
      output <- output[-1,]
    }
    return(output)
  }


  if(tolower(method) == "fast"){
    
    output <- .h5read(AnnoFile, paste0(Group,"/",chr,"/Ranges"), method = method) %>% 
      {IRanges(start = .[,1], width = .[,2])}
    mcols(output)$ID <- Rle(
      values =  .h5read(AnnoFile, paste0(Group,"/",chr,"/IDValues"), method = method), 
      lengths = .h5read(AnnoFile, paste0(Group,"/",chr,"/IDLengths"), method = method)
    )

  }else{
    o <- h5closeAll()
  idRle <- Rle(h5read(AnnoFile, paste0(Group,"/",chr,"/IDValues")), h5read(AnnoFile, paste0(Group,"/",chr,"/IDLengths")))
  idx <- seq_along(idRle)
  if(length(idx) > 0){
    output <- h5read(AnnoFile, paste0(Group,"/",chr,"/Ranges"), index = list(idx, 1:2)) %>% 
      {IRanges(start = .[,1], width = .[,2])}
    mcols(output)$ID <- idRle[idx]
  }else{
    output <- IRanges(start = 1, end = 1)
    mcols(output)$ID <- c("tmp")
    output <- output[-1,]
  }

  }
  
  o <- h5closeAll()

  if(tolower(out)=="granges"){
    if(length(output) > 0){
      output <- GRanges(seqnames = chr, ranges(output), ID = mcols(output)$ID)    
    }else{
      output <- IRanges(start = 1, end = 1)
      mcols(output)$ID <- c("tmp")
      output <- GRanges(seqnames = chr, ranges(output), ID = mcols(output)$ID)
      output <- output[-1,]
    }
  }

  return(output)

}

#' Hypergeometric Enrichment in input peak ranges.
#' 
#' This function will perform hypergeometric enrichment of a given peak matches object and ranges.
#' 
#' @param ranges  A `GenomicRanges` object of peaks/regions to overlap with peaks.
#' @param matches A custom `peakAnnotation` matches object used as input for the hypergeometric test. See
#' `motifmatchr::matchmotifs()` for additional information.
#' @param bgdPeaks A `SummarizedExperiment` of background peaks from `getBgdPeaks` can be NULL for using all peaks.
#' 
#' @examples
#' #Project
#' proj <- getTestProject()
#' 
#' #Get Peaks
#' peaks <- getPeakSet(proj)
#' 
#' #Custom C1 Mono
#' peaks1 <- peaks[names(peaks)=="C1"]
#' 
#' #All Peaks
#' customEnrichment(
#'   ranges = peaks1, 
#'   matches = getMatches(proj)
#' )
#' #         feature CompareFrequency nCompare CompareProportion BackgroundFrequency
#' # CEBPB_1 CEBPB_1               70      635        0.11023622                 122
#' # CEBPA_2 CEBPA_2               81      635        0.12755906                 156
#' # IRF4_4   IRF4_4              103      635        0.16220472                 276
#' # EOMES_6 EOMES_6               36      635        0.05669291                 120
#' # ETS1_3   ETS1_3               38      635        0.05984252                 149
#' # PAX5_5   PAX5_5               23      635        0.03622047                 124
#' #         nBackground BackgroundProporition Enrichment mlog10p mlog10Padj
#' # CEBPB_1        2142            0.05695612  1.9354589 10.3279   9.373657
#' # CEBPA_2        2142            0.07282913  1.7514839  8.9394   7.985157
#' # IRF4_4         2142            0.12885154  1.2588497  2.6938   1.739557
#' # EOMES_6        2142            0.05602241  1.0119685  0.3001   0.000000
#' # ETS1_3         2142            0.06956116  0.8602864  0.0487   0.000000
#' # PAX5_5         2142            0.05788982  0.6256795  0.0006   0.000000
#' 
#' #Background Peaks
#' customEnrichment(
#'  ranges = peaks1, 
#'   matches = getMatches(proj), 
#'  bgdPeaks = getBgdPeaks(proj, force=TRUE)
#' )
#' #         feature CompareFrequency nCompare CompareProportion BackgroundFrequency
#' # CEBPB_1 CEBPB_1               70      635        0.11023622                2459
#' # CEBPA_2 CEBPA_2               81      635        0.12755906                3154
#' # IRF4_4   IRF4_4              103      635        0.16220472                4836
#' # ETS1_3   ETS1_3               38      635        0.05984252                1781
#' # EOMES_6 EOMES_6               36      635        0.05669291                1975
#' # PAX5_5   PAX5_5               23      635        0.03622047                1495
#' #         nBackground BackgroundProporition Enrichment mlog10p mlog10Padj
#' # CEBPB_1       32385            0.07593021  1.4518097  2.9544   2.000157
#' # CEBPA_2       32385            0.09739077  1.3097654  2.1341   1.179857
#' # IRF4_4        32385            0.14932839  1.0862283  0.7142   0.000000
#' # ETS1_3        32385            0.05499460  1.0881527  0.4975   0.000000
#' # EOMES_6       32385            0.06098502  0.9296203  0.1551   0.000000
#' # PAX5_5        32385            0.04616335  0.7846154  0.0422   0.000000
#' #
#' @export
customEnrichment <- function(
  ranges = NULL,
  matches = NULL,
  bgdPeaks = NULL
  ){

  if(is.null(matches)){
    stop("Please supply matches! Try `matches` = getMatches(ArchRProj)!")
  }

  .validInput(input = ranges, name = "ranges", valid = c("granges"))
  .validInput(input = matches, name = "matches", valid = c("SummarizedExperiment"))
  .validInput(input = bgdPeaks, name = "bgdPeaks", valid = c("SummarizedExperiment", "null"))

  if(!is.null(bgdPeaks)){
    
    rownames(matches) <- paste0(rowRanges(matches))
    rownames(bgdPeaks) <- paste0(rowRanges(bgdPeaks))
    
    bgdPeaks <- bgdPeaks[rownames(matches), ]
    idx <- unique(queryHits(findOverlaps(matches, ranges, ignore.strand=TRUE)))
    
    .computeEnrichment(matches, idx, c(idx, as.vector(assay(bgdPeaks)[idx,])))
  
  }else{
   
    idx <- unique(queryHits(findOverlaps(matches, ranges, ignore.strand=TRUE)))
    
    .computeEnrichment(matches, idx, seq_len(nrow(matches)))
  
  }

}

#' Peak Annotation Hypergeometric Enrichment in Marker Peaks.
#' 
#' This function will perform hypergeometric enrichment of a given peak annotation within the defined marker peaks.
#' 
#' @param seMarker  A `SummarizedExperiment` object returned by `markerFeatures()`.
#' @param ArchRProj An `ArchRProject` object.
#' @param peakAnnotation A `peakAnnotation` object in the provided `ArchRProject` to be used for hypergeometric testing.
#' @param matches A custom `peakAnnotation` matches object used as input for the hypergeometric test. See
#' `motifmatchr::matchmotifs()` for additional information.
#' @param cutOff A valid-syntax logical statement that defines which marker features from `seMarker` to use.
#' `cutoff` can contain any of the `assayNames` from `seMarker`.
#' @param background A string that indicates whether to use a background set of matched peaks to compare against ("bgdPeaks") or all peaks ("all").
#' @param logFile The path to a file to be used for logging ArchR output.
#' 
#' @examples
#'
#' # Get Test ArchR Project
#' proj <- getTestProject()
#'
#' # Get Markers
#' seMarker <- getMarkerFeatures(
#'   ArchRProj = proj, 
#'   useMatrix = "PeakMatrix", 
#'   testMethod = "binomial", 
#'   binarize = TRUE
#' )
#'
#' # Get Peak Annotation Enrichment
#' annoEnrich <- peakAnnoEnrichment(
#'   seMarker = seMarker, 
#'   ArchRProj = proj,
#'   cutOff = "FDR <= 0.1 & Log2FC >= 0"
#' )
#'
#' @export
peakAnnoEnrichment <- function(
  seMarker = NULL,
  ArchRProj = NULL,
  peakAnnotation = NULL,
  matches = NULL,
  cutOff = "FDR <= 0.1 & Log2FC >= 0.5",
  background = "all",
  logFile = createLogFile("peakAnnoEnrichment")
  ){

  .validInput(input = seMarker, name = "seMarker", valid = c("SummarizedExperiment"))
  .validInput(input = ArchRProj, name = "ArchRProj", valid = c("ArchRProj"))
  .validInput(input = peakAnnotation, name = "peakAnnotation", valid = c("character", "null"))
  .validInput(input = matches, name = "matches", valid = c("SummarizedExperiment", "null"))
  .validInput(input = cutOff, name = "cutOff", valid = c("character"))
  .validInput(input = background, name = "background", valid = c("character"))
  .validInput(input = logFile, name = "logFile", valid = c("character"))

  tstart <- Sys.time()
  .startLogging(logFile = logFile)
  .logThis(mget(names(formals()),sys.frame(sys.nframe())), "peakAnnoEnrichment Input-Parameters", logFile = logFile)

  if(S4Vectors::metadata(seMarker)$Params$useMatrix != "PeakMatrix"){
    stop("Only markers identified from PeakMatrix can be used!")
  }

  if(is.null(matches)){
    matches <- getMatches(ArchRProj, peakAnnotation)
  }
  
  r1 <- SummarizedExperiment::rowRanges(matches)
  pr1 <- paste(seqnames(r1),start(r1),end(r1),sep="_")
  mcols(r1) <- NULL

  r2 <- getPeakSet(ArchRProj)
  pr2 <- paste(seqnames(r2),start(r2),end(r2),sep="_")
  mcols(r2) <- NULL

  r3 <- GRanges(rowData(seMarker)$seqnames, IRanges(rowData(seMarker)$start, rowData(seMarker)$end))
  pr3 <- paste(seqnames(r3),start(r3),end(r3),sep="_")
  mcols(r3) <- NULL

  .logThis(r1, "Peaks-Matches", logFile = logFile)
  .logThis(r2, "Peaks-ArchRProj", logFile = logFile)
  .logThis(r3, "Peaks-SeMarker", logFile = logFile)

  .logThis(pr1, "Peaks-Pasted-Matches", logFile = logFile)
  .logThis(pr2, "Peaks-Pasted-ArchRProj", logFile = logFile)
  .logThis(pr3, "Peaks-Pasted-SeMarker", logFile = logFile)

  if(length(which(pr1 %ni% pr2)) != 0){
    stop("Peaks from matches do not match peakSet in ArchRProj!")
  }

  if(length(which(pr2 %ni% pr3)) != 0){
    stop("Peaks from seMarker do not match peakSet in ArchRProj!")
  }

  rownames(matches) <- pr1
  matches <- matches[pr3, ]

  #Evaluate AssayNames
  assayNames <- names(SummarizedExperiment::assays(seMarker))
  for(an in assayNames){
    eval(parse(text=paste0(an, " <- ", "SummarizedExperiment::assays(seMarker)[['", an, "']]")))
  }
  passMat <- eval(parse(text=cutOff))
  passMat[is.na(passMat)] <- FALSE
  for(an in assayNames){
    eval(parse(text=paste0("rm(",an,")")))
  }

  if(tolower(background) %in% c("backgroundpeaks", "bgdpeaks", "background", "bgd")){
    method <- "bgd"
    bgdPeaks <- SummarizedExperiment::assay(getBgdPeaks(ArchRProj))
  }else{
    method <- "all"
  }

  enrichList <- lapply(seq_len(ncol(seMarker)), function(x){
    .logDiffTime(sprintf("Computing Enrichments %s of %s", x, ncol(seMarker)), t1 = tstart, verbose = TRUE, logFile = logFile)
    idx <- which(passMat[, x])
    if(method == "bgd"){
      .computeEnrichment(matches, idx, c(idx, as.vector(bgdPeaks[idx,])))
    }else{
      .computeEnrichment(matches, idx, seq_len(nrow(matches)))
    }
  }) %>% SimpleList
  names(enrichList) <- colnames(seMarker)

  assays <- lapply(seq_len(ncol(enrichList[[1]])), function(x){
    d <- lapply(seq_along(enrichList), function(y){
      enrichList[[y]][colnames(matches),x,drop=FALSE]
    }) %>% Reduce("cbind",.)
    colnames(d) <- names(enrichList)
    d
  }) %>% SimpleList
  names(assays) <- colnames(enrichList[[1]])
  assays <- rev(assays)
  out <- SummarizedExperiment::SummarizedExperiment(assays=assays)

  .endLogging(logFile = logFile)

  out

}

.computeEnrichment <- function(matches = NULL, compare = NULL, background = NULL){

  matches <- .getAssay(matches,  grep("matches", names(assays(matches)), value = TRUE, ignore.case = TRUE))
  
  #Compute Totals
  matchCompare <- matches[compare, ,drop=FALSE]
  matchBackground <- matches[background, ,drop=FALSE]
  matchCompareTotal <- Matrix::colSums(matchCompare)
  matchBackgroundTotal <- Matrix::colSums(matchBackground)

  #Create Summary DF
  pOut <- data.frame(
    feature = colnames(matches),
    CompareFrequency = matchCompareTotal,
    nCompare = nrow(matchCompare),
    CompareProportion = matchCompareTotal/nrow(matchCompare),
    BackgroundFrequency = matchBackgroundTotal,
    nBackground = nrow(matchBackground),
    BackgroundProporition = matchBackgroundTotal/nrow(matchBackground)
  )
  
  #Enrichment
  pOut$Enrichment <- pOut$CompareProportion / pOut$BackgroundProporition
  
  #Get P-Values with Hyper Geometric Test
  pOut$mlog10p <- lapply(seq_len(nrow(pOut)), function(x){
    p <- -phyper(pOut$CompareFrequency[x] - 1, # Number of Successes the -1 is due to cdf integration
     pOut$BackgroundFrequency[x], # Number of all successes in background
     pOut$nBackground[x] - pOut$BackgroundFrequency[x], # Number of non successes in background
     pOut$nCompare[x], # Number that were drawn
     lower.tail = FALSE, log.p = TRUE)# P[X > x] Returns LN must convert to log10
    return(p/log(10))
  }) %>% unlist %>% round(4)

  #Minus Log10 Padj
  pOut$mlog10Padj <- pmax(pOut$mlog10p - log10(ncol(pOut)), 0)
  pOut <- pOut[order(pOut$mlog10p, decreasing = TRUE), , drop = FALSE]

  pOut

}


#' @export
enrichHeatmap <- function(...){
    .Deprecated("plotEnrichHeatmap")
    plotEnrichHeatmap(...)
}

#' Plot a Heatmap of Peak Annotation Hypergeometric Enrichment in Marker Peaks.
#' 
#' This function will plot a heatmap of hypergeometric enrichment of a given peakAnnotation within the defined marker peaks.
#' 
#' @param seEnrich A `SummarizedExperiment` object containing peak enrichment information returned by `peakAnnoEnrichment()`.
#' @param pal A custom continuous palette (see `paletteContinuous()`) used to override the default continuous palette for the heatmap.
#' @param n The number of top enriched peakAnnotations per column from the `seMarker` to display in the heatmap. This number can
#' be lowered to improve visibility of the heatmap.
#' @param cutOff A numeric cutOff that indicates the minimum P-adj enrichment to be included in the heatmap.
#' @param pMax A numeric representing the maximum P-adj for plotting in the heatmap.
#' @param clusterCols A boolean indicating whether or not to cluster columns in the heatmap.
#' @param binaryClusterRows A boolean indicating whether or not to cluster rows using binary classification in the heatmap.
#' @param labelRows A boolean indicating whether or not to label all rows in the heatmap.
#' @param rastr A boolean value that indicates whether the plot should be rasterized using `ComplexHeatmap`. This does not rasterize
#' lines and labels, just the internal portions of the plot.
#' @param transpose A boolean determining whether to transpose the heatmap in the plot.
#' @param returnMatrix A boolean determining whether to return the matrix corresponding to the heatmap rather than generate a plot.
#' @param logFile The path to a file to be used for logging ArchR output.
#' 
#' @examples
#'
#' # Get Test ArchR Project
#' proj <- getTestProject()
#'
#' # Get Markers
#' seMarker <- getMarkerFeatures(
#'   ArchRProj = proj, 
#'   useMatrix = "PeakMatrix", 
#'   testMethod = "binomial", 
#'   binarize = TRUE
#' )
#'
#' # Get Peak Annotation Enrichment
#' annoEnrich <- peakAnnoEnrichment(
#'   seMarker = seMarker, 
#'   ArchRProj = proj,
#'   cutOff = "FDR <= 0.1 & Log2FC >= 0"
#' )
#' 
#' # Multiply by 50 since this is a super small test sample
#' assay(annoEnrich) <- assay(annoEnrich) * 50
#' 
#' #Plot
#' p <- plotEnrichHeatmap(annoEnrich)
#' 
#' #PDF
#' plotPDF(p, name = "PeakAnnoEnrich", ArchRProj = proj)
#' 
#' @export
plotEnrichHeatmap <- function(
  seEnrich = NULL,
  pal = paletteContinuous(set = "comet", n = 100),
  n = 10,
  cutOff = 20,
  pMax = Inf,
  clusterCols = TRUE,
  binaryClusterRows = TRUE,
  labelRows = TRUE,
  rastr = TRUE,
  transpose = FALSE,
  returnMatrix = FALSE,
  logFile = createLogFile("plotEnrichHeatmap")
  ){

  .validInput(input = seEnrich, name = "seEnrich", valid = c("SummarizedExperiment"))
  .validInput(input = pal, name = "pal", valid = c("character"))
  .validInput(input = n, name = "n", valid = c("integer"))
  .validInput(input = cutOff, name = "cutOff", valid = c("numeric"))
  .validInput(input = pMax, name = "pMax", valid = c("numeric"))
  .validInput(input = clusterCols, name = "clusterCols", valid = c("boolean"))
  .validInput(input = binaryClusterRows, name = "binaryClusterRows", valid = c("boolean"))
  .validInput(input = labelRows, name = "labelRows", valid = c("boolean"))
  .validInput(input = rastr, name = "rastr", valid = c("boolean"))
  .validInput(input = transpose, name = "transpose", valid = c("boolean"))
  .validInput(input = returnMatrix, name = "returnMatrix", valid = c("boolean"))
  .validInput(input = logFile, name = "logFile", valid = c("character"))

  tstart <- Sys.time()
  .startLogging(logFile = logFile)
  .logThis(mget(names(formals()),sys.frame(sys.nframe())), "plotEnrichHeatmap Input-Parameters", logFile = logFile)

  mat <- assays(seEnrich)[["mlog10Padj"]]
  .logThis(mat, "mat-mlog10Padj", logFile = logFile)

  keep <- lapply(seq_len(ncol(mat)), function(x){
    idx <- head(order(mat[, x], decreasing = TRUE), n)
    rownames(mat)[idx[which(mat[idx,x] > cutOff)]]
  }) %>% unlist %>% unique
  mat <- mat[keep, ,drop = FALSE]
  .logThis(mat, "mat-mlog10Padj-Filter", logFile = logFile)

  if(nrow(mat)==0){
    stop("No enrichments found for your cutoff!")
  }

  passMat <- lapply(seq_len(nrow(mat)), function(x){
    (mat[x, ] >= 0.9*max(mat[x, ])) * 1
  }) %>% Reduce("rbind", .) %>% data.frame
  colnames(passMat) <- colnames(mat)
  .logThis(passMat, "passMat", logFile = logFile)

  mat[mat > pMax] <- pMax

  if(nrow(mat)==0){
    stop("No enrichments found for your cutoff!")
  }

  mat <- .rowScale(as.matrix(mat), min = 0)
  if(pMax != 100){
      rownames(mat[[1]]) <- paste0(rownames(mat[[1]]), " (",round(mat$max),")")
      rownames(passMat) <- rownames(mat[[1]])
  }
  .logThis(mat, "mat-mlog10Padj-RowScale", logFile = logFile)

  mat2 <- mat[[1]] * 100

  if(nrow(mat2) > 1 & ncol(mat2) > 1){
    if(binaryClusterRows){
      #cn <- order(colMeans(mat2), decreasing=TRUE)
      bS <- .binarySort(mat2, lmat = passMat[rownames(mat2), colnames(mat2)], clusterCols = clusterCols)
      mat2 <- bS[[1]][,colnames(mat2)]
      clusterRows <- FALSE
      clusterCols <- bS[[2]]
    }else{
      clusterRows <- TRUE
    }
  }else{
    clusterCols <- NULL
    clusterRows <- FALSE
  }

  if(nrow(mat2) > 100){
    borderColor <- FALSE
  }else{
    borderColor <- TRUE
  }
  .logThis(mat2, "mat-mlog10Padj-RowScale-Plot", logFile = logFile)

  if(transpose){

    if(!is.null(clusterCols)){
      mat2 <- t(mat2[,clusterCols$order,drop=FALSE])
    }else{
      mat2 <- t(mat2)
    }

    if(returnMatrix){
      return(mat2)
    }

    ht <- tryCatch({

      .ArchRHeatmap(
        mat = as.matrix(mat2),
        scale = FALSE,
        limits = c(0, max(mat2)),
        color = pal, 
        clusterCols = FALSE, 
        clusterRows = FALSE,
        labelRows = TRUE,
        useRaster = rastr,
        fontSizeCols = 6,
        borderColor = borderColor,
        customColLabel = seq_len(ncol(mat2)),
        showRowDendrogram = FALSE,
        draw = FALSE,
        name = "Norm. Enrichment -log10(P-adj) [0-Max]"
      )

    }, error = function(e){

      errorList = list(
        mat = as.matrix(mat2),
        scale = FALSE,
        limits = c(0, max(mat2)),
        color = pal, 
        clusterCols = FALSE, 
        clusterRows = FALSE,
        labelRows = TRUE,
        useRaster = rastr,
        fontSizeCols = 6,
        borderColor = borderColor,
        customColLabel = seq_len(ncol(mat2)),
        showRowDendrogram = FALSE,
        draw = FALSE,
        name = "Norm. Enrichment -log10(P-adj) [0-Max]"
      )

      .logError(e, fn = ".ArchRHeatmap", info = "", errorList = errorList, logFile = logFile)

    })

  }else{

    if(returnMatrix){
      return(mat2)
    }

    ht <- tryCatch({

      .ArchRHeatmap(
        mat = as.matrix(mat2),
        scale = FALSE,
        limits = c(0, max(mat2)),
        color = pal, 
        clusterCols = clusterCols, 
        clusterRows = clusterRows,
        useRaster = rastr,
        borderColor = borderColor,
        fontSizeRows = 6,
        customRowLabel = seq_len(nrow(mat2)),
        labelCols = TRUE,
        showColDendrogram = TRUE,
        draw = FALSE,
        name = "Norm. Enrichment -log10(P-adj) [0-Max]"
      )

    }, error = function(e){

      errorList = list(
        mat = as.matrix(mat2),
        scale = FALSE,
        limits = c(0, max(mat2)),
        color = pal, 
        clusterCols = clusterCols, 
        clusterRows = clusterRows,
        useRaster = rastr,
        borderColor = borderColor,
        fontSizeRows = 6,
        customRowLabel = seq_len(nrow(mat2)),
        labelCols = TRUE,
        showColDendrogram = TRUE,
        draw = FALSE,
        name = "Norm. Enrichment -log10(P-adj) [0-Max]"
      )

      .logError(e, fn = ".ArchRHeatmap", info = "", errorList = errorList, logFile = logFile)

    })

  }

  return(ht)

}
GreenleafLab/ArchR documentation built on Feb. 21, 2025, 3:05 p.m.
rdrr.io home R language documentation Run R code online
CRAN packages Bioconductor packages R-Forge packages GitHub packages
Note that we can't provide technical support on individual packages. You should contact the package authors for that.
GreenleafLab/ArchR
Analyzing single-cell regulatory chromatin in R.

R/AnnotationPeaks.R
In GreenleafLab/ArchR: Analyzing single-cell regulatory chromatin in R.

Defines functions .summarizeJASPARMotifs addMotifAnnotations addPeakAnnotations getMatches getPositions getPeakAnnotation

Documented in addMotifAnnotations addPeakAnnotations getMatches getPeakAnnotation getPositions

R Package Documentation

Browse R Packages

We want your feedback!

GreenleafLab/ArchR Analyzing single-cell regulatory chromatin in R.

R/AnnotationPeaks.R In GreenleafLab/ArchR: Analyzing single-cell regulatory chromatin in R.

Defines functions .summarizeJASPARMotifs addMotifAnnotations addPeakAnnotations getMatches getPositions getPeakAnnotation

Documented in addMotifAnnotations addPeakAnnotations getMatches getPeakAnnotation getPositions

R Package Documentation

Browse R Packages

We want your feedback!

GreenleafLab/ArchR
Analyzing single-cell regulatory chromatin in R.

R/AnnotationPeaks.R
In GreenleafLab/ArchR: Analyzing single-cell regulatory chromatin in R.