R/helpers.R

Defines functions GetTranscripts RequantifyPeaksLarge RequantifyPeaks PeakJaccard OverlapTotal OverlapQC PercOverlap OverlapDistPlot WelcomePlot ToggleDemos OversizedLegend Oxford Online NNTransform SaveAzimuthReference LoadBridgeReference LoadReference LoadH5ADobs LoadH5AD LoadFileInput HoverBoxStyle FileType Connect ConvertEnsembleToSymbol ConvertGeneNames

Documented in ConvertGeneNames GetTranscripts LoadBridgeReference LoadFileInput LoadH5AD LoadH5ADobs LoadReference NNTransform SaveAzimuthReference

#' Converts gene names of query to match type/species of reference names (human
#' or mouse).
#'
#' @param object Object to convert, must contain only RNA counts matrix
#' @param reference.names Gene names of reference
#' @param homolog.table Location of file (or URL) containing table with
#' human/mouse homologies
#' @return query object with converted feature names, likely subsetted
#'
#' @importFrom stringr str_match
#' @importFrom shiny isRunning
#' @useDynLib Azimuth
#' @export
#'
ConvertGeneNames <- function(object, reference.names, homolog.table) {
  uri <- httr::build_url(url = httr::parse_url(url = homolog.table))
  if (grepl(pattern = '^://', x = uri)) {
    if (!file.exists(homolog.table)) {
      stop("Homolog file doesn't exist at path provided: ", homolog.table)
    }
  } else {
    if (!Online(url = homolog.table)) {
      stop("Cannot find the homolog table at the URL given: ", homolog.table)
    }
    homolog.table <- url(description = homolog.table)
    on.exit(expr = {
      close(con = homolog.table)
    })
  }
  linked <- readRDS(file = homolog.table)
  query.names <- rownames(x = object)
  # remove version numbers
  ensembl.id <- '(?:ENSG|ENSMUS)'
  capture.nonversion <- paste0('(', ensembl.id, '.*)\\.(?:.*)')
  pattern <- paste0('(?:', capture.nonversion,')|(.*)')
  query.names <- Filter(
    f = function(x) !is.na(x = x),
    x = as.vector(x = t(x = str_match(string = query.names, pattern = pattern)[, 2:3]))
  )
  # determine idtype and species of query
  # 5000 because sometimes ensembl IDs are mixed with regular gene names
  query.names.sub = sample(
    x = query.names,
    size = min(length(x = query.names), 5000)
  )
  idtype <- names(x = which.max(x = apply(
    X = linked,
    MARGIN = 2,
    FUN = function(col) {
      length(x = intersect(x = col, y = query.names.sub))
    }
  )))
  species <- ifelse(test = length(x = grep(pattern = '\\.mouse', x = idtype)) > 0, 'mouse', 'human')
  idtype <- gsub(pattern = '\\.mouse|\\.human', replacement = '', x = idtype)
  message('detected inputs from ', toupper(x = species), ' with id type ', idtype)
  totalid <- paste0(idtype, '.', species)

  # determine idtype and species of ref
  reference.names.sub <- sample(x = reference.names, size = min(length(x = reference.names), 5000))
  idtype.ref <- names(x = which.max(x = apply(
    X = linked,
    MARGIN = 2,
    FUN = function(col) {
      length(x = intersect(x = col, y = reference.names))
    }
  )))
  species.ref <- ifelse(test = length(x = grep(pattern = '\\.mouse', x = idtype.ref)) > 0, 'mouse', 'human')
  idtype.ref <- gsub(pattern = '\\.mouse|\\.human', replacement = '', x = idtype.ref)
  message('reference rownames detected ', toupper(x = species.ref),' with id type ', idtype.ref)
  totalid.ref <- paste0(idtype.ref, '.', species.ref)

  if (totalid == totalid.ref) {
    return(object)
  } else {
    display.names <- setNames(
      c('ENSEMBL gene', 'ENSEMBL transcript', 'gene name', 'transcript name'),
      nm = c('Gene.stable.ID', 'Transcript.stable.ID','Gene.name','Transcript.name')
    )
    if (isRunning()) {
      showNotification(
        paste0(
          "Converted ",species," ",display.names[idtype]," IDs to ",
          species.ref," ",display.names[idtype.ref]," IDs"
        ),
        duration = 3,
        type = 'warning',
        closeButton = TRUE,
        id = 'no-progress-notification'
      )
    }
    # set up table indexed by query ids (totalid)
    linked.unique <- linked[!duplicated(x = linked[[totalid]]), ]
    new.indices <- which(query.names %in% linked.unique[[totalid]])
    message("Found ", length(x = new.indices), " out of ",
            length(x = query.names), " total inputs in conversion table")
    query.names <- query.names[new.indices]
    rownames(x = linked.unique) <- linked.unique[[totalid]]
    linked.unique <- linked.unique[query.names, ]
    # get converted totalid.ref
    new.names <- linked.unique[, totalid.ref]
    # remove duplicates
    notdup <- !duplicated(x = new.names)
    new.indices <- new.indices[notdup]
    new.names <- new.names[notdup]
    # subset/rename object accordingly
    counts <- GetAssayData(object = object[["RNA"]], slot = "counts")[rownames(x = object)[new.indices], ]
    rownames(x = counts) <- new.names
    reductions <- slot(object = object, name = "reductions")
    object <- CreateSeuratObject(
      counts = counts,
      meta.data = object[[]]
    )
    slot(object = object, name = "reductions") <- reductions
    return(object)
  }
}

ConvertEnsembleToSymbol <- function(
    mat,
    species = c('human', 'mouse')
) {
  species <- match.arg(arg = species)
  if (species == 'human') {
    database <- 'hsapiens_gene_ensembl'
    symbol <- 'hgnc_symbol'

  } else if (species == 'mouse') {
    database <- 'mmusculus_gene_ensembl'
    symbol <- 'mgi_symbol'

  } else {
    stop('species name not found')
  }

  library("biomaRt")
  library("dplyr")

  name_df <- data.frame(gene_id = c(rownames(mat)))
  name_df$orig.id <- name_df$gene_id
  #make this a character, otherwise it will throw errors with left_join
  name_df$gene_id <- as.character(name_df$gene_id)
  # in case it's gencode, this mostly works
  #if ensembl, will leave it alone
  name_df$gene_id <- sub("[.][0-9]*","",name_df$gene_id)
  mart <- useDataset(dataset = database, useMart("ensembl"))
  genes <-  name_df$gene_id
  gene_IDs <- getBM(filters= "ensembl_gene_id",
                    attributes= c("ensembl_gene_id", symbol),
                    values = genes,
                    mart= mart)
  gene.df <- left_join(name_df, gene_IDs, by = c("gene_id"="ensembl_gene_id"))
  rownames(gene.df) <- make.unique(gene.df$orig.id)
  gene.df <- gene.df[rownames(mat),]
  gene.df <-gene.df[gene.df[,symbol] != '',]
  gene.df <- gene.df[ !is.na(gene.df$orig.id),]
  mat.filter <- mat[gene.df$orig.id,]
  rownames(mat.filter) <- make.unique(gene.df[,symbol])
  return(mat.filter)
}

#' Connect to a single-cell HDF5 dataset
#'
#' @param filename Name of on-disk file
#' @param type Type of single-cell dataset to connect as; choose from:
#' \itemize{
#'  \item h5seurat
#' }
#' Leave as \code{NULL} to guess type from file extension
#' @param mode Mode to connect to data as; choose from:
#' \describe{
#'  \item{r}{Open existing dataset in read-only mode}
#'  \item{r+}{Open existing dataset in read/write mode}
#' }
#' @param force Force a connection if validation steps fail; returns a
#' \code{\link[hdf5r]{H5File}} object
#'
#' @return An object of class \code{type}, opened in mode \code{mode}
#'
#' @importFrom hdf5r H5File
#'
#' @export
#'
Connect <- function(
  filename,
  type = NULL,
  mode = c('r', 'r+'),
  force = FALSE
) {
  type <- type %||% FileType(file = filename)
  mode <- match.arg(arg = mode)
  if (!file.exists(filename)) {
    stop("Cannot find ", type, " file ", filename, call. = FALSE)
  }
  return(tryCatch(
    expr = {
      cls <- GetSCDisk(r6class = type)
      cls$new(filename = filename, mode = mode)
    },
    error = function(err) {
      if (!isTRUE(x = force)) {
        stop(err$message, call. = FALSE)
      }
      warning(err$message, call. = FALSE, immediate. = TRUE)
      return(H5File$new(filename = filename, mode = mode))
    }
  ))
}

#' Determine a filetype based on its extension
#'
#' @param file Name of file
#'
#' @return The extension, all lowercase
#'
#' @importFrom tools file_ext
#'
FileType <- function(file) {
  ext <- file_ext(x = file)
  ext <- ifelse(test = nchar(x = ext), yes = ext, no = basename(path = file))
  return(tolower(x = ext))
}

# Return CSS styling for hover box on interactive plots
#
# @param x X hover position (hover$coords_css$x)
# @param y Y hover position (hover$coords_css$y)
#
# @return Returns a string of CSS to pass to style param
#
HoverBoxStyle <- function(x, y) {
  xpad <- 20 # important to avoid collisions between cursor and hover panel
  ypad <- 20
  paste0(
    "position:absolute; background-color:rgba(245, 245, 245, 0.85); ",
    "left:", (x + xpad), "px; top:",
    (y - ypad), "px;",
    "padding: 5px; margin-bottom: 0px;"
  )
}

#' Load file input into a \code{Seurat} object
#'
#' Take a file and load it into a \code{\link[Seurat]{Seurat}} object. Supports
#' a variety of file types and always returns a \code{Seurat} object
#'
#' @details \code{LoadFileInput} supports several file types to be read in as
#' \code{Seurat} objects. File type is determined by extension, matched in a
#' case-insensitive manner See sections below for details about supported
#' filtypes, required extension, and specifics for how data is loaded
#'
#' @param path Path to input data
#'
#' @return A \code{\link[Seurat]{Seurat}} object
#'
#' @importFrom tools file_ext
#' @importFrom SeuratDisk Connect
#' @importFrom SeuratObject CreateSeuratObject Assays GetAssayData
#' DefaultAssay<-
#' @importFrom Seurat Read10X_h5 as.sparse Assays DietSeurat as.Seurat
#'
#' @section 10X H5 File (extension \code{h5}):
#' 10X HDF5 files are supported for all versions of Cell Ranger; data is read
#' in using \code{\link[Seurat]{Read10X_h5}}. \strong{Note}: for multi-modal
#' 10X HDF5 files, only the \emph{first} matrix is read in
#'
#' @section Rds File (extension \code{rds}):
#' Rds files are supported as long as they contain one of the following data
#' types:
#' \itemize{
#'  \item A \code{\link[Seurat]{Seurat}} V3 object
#'  \item An S4 \code{\link[Matrix]{Matrix}} object
#'  \item An S3 \code{\link[base]{matrix}} object
#'  \item A \code{\link[base]{data.frame}} object
#' }
#' For S4 \code{Matrix}, S3 \code{matrix}, and \code{data.frame} objects, a
#' \code{Seurat} object will be made with
#' \code{\link[Seurat]{CreateSeuratObject}} using the default arguments
#'
#' @section h5Seurat File (extension \code{h5seurat}):
#' h5Seurat files and all of their features are fully supported. They are read
#' in via \code{\link[SeuratDisk]{LoadH5Seurat}}. \strong{Note}: only the
#' \dQuote{counts} matrices are read in and only the default assay is kept
#'
#' @inheritSection LoadH5AD AnnData H5AD File (extension \code{h5ad})
#' @export
#'
LoadFileInput <- function(path, bridge = FALSE) {
  # TODO: add support for loom files
  on.exit(expr = gc(verbose = FALSE))
  type <- tolower(x = tools::file_ext(x = path))
  return(switch(
    EXPR = type,
    'h5' = {
      mat <- Read10X_h5(filename = path)
      if (is.list(x = mat)) {
        mat <- mat[[1]]
      }
      object <- CreateSeuratObject(counts = mat, min.cells = 1, min.features = 1)
      if (inherits(x = object[["RNA"]], what = "Assay5")) {
        object[["RNA"]]$data <- object[["RNA"]]$counts
      }
      object
    },
    'rds' = {
      object <- readRDS(file = path)
      if (inherits(x = object, what = c('Matrix', 'matrix', 'data.frame'))) {
        object <- CreateSeuratObject(counts = as.sparse(x = object), min.cells = 1, min.features = 1)
      } else if (inherits(x = object, what = 'Seurat')) {
        if (isTRUE(x = bridge)){
          if (!'ATAC' %in% Assays(object = object)) {
            stop("No ATAC assay provided", call. = FALSE)
          } else if (Seurat:::IsMatrixEmpty(x = GetAssayData(object = object, slot = 'counts', assay = 'ATAC'))) {
            stop("No ATAC counts matrix present", call. = FALSE)
          }
          assay <- "ATAC"
        } else{
          if (!'RNA' %in% Assays(object = object)) {
            stop("No RNA assay provided", call. = FALSE)
          } else if (Seurat:::IsMatrixEmpty(x = GetAssayData(object = object, slot = 'counts', assay = 'RNA'))) {
            stop("No RNA counts matrix present", call. = FALSE)
          }
          assay <- "RNA"
        }
        object <- tryCatch({
          CreateSeuratObject(
            counts = GetAssayData(object = object[[assay]], slot = "counts"),
            min.cells = 1,
            min.features = 1,
            meta.data = object[[]]
          )
        }, error = function(e){
          object <- UpdateSeuratObject(object)
          CreateSeuratObject(
            counts = GetAssayData(object = object[[assay]], slot = "counts"),
            min.cells = 1,
            min.features = 1,
            meta.data = object[[]]
          )
        })
        if (inherits(x = object[["RNA"]], what = "Assay5")) {
          object[["RNA"]]$data <- object[["RNA"]]$counts
        }
      } else {
        stop("The RDS file must be a Seurat object", call. = FALSE)
      }
      object
    },
    'h5ad' = LoadH5AD(path = path),
    'h5seurat' = {
      if (!requireNamespace("SeuratDisk", quietly = TRUE)) {
        stop("Loading h5Seurat files requires SeuratDisk", call. = FALSE)
      }
      hfile <- suppressWarnings(expr = SeuratDisk::Connect(filename = path))
      on.exit(expr = hfile$close_all())
      if (isTRUE(x = bridge)){
        if (!'ATAC' %in% names(x = hfile[['assays']])) {
          stop("Cannot find the ATAC assay in this h5Seurat file", call. = FALSE)
        } else if (!'counts' %in% names(x = hfile[['assays/ATAC']])) {
          stop("No ATAC counts matrix provided", call. = FALSE)
        }
        object <- as.Seurat(
          x = hfile,
          assays = list(ATAC = 'counts'),
          reductions = FALSE,
          graphs = FALSE,
          images = FALSE
        )
        assay <- 'ATAC'
      } else {
        if (!'RNA' %in% names(x = hfile[['assays']])) {
          stop("Cannot find the RNA assay in this h5Seurat file", call. = FALSE)
        } else if (!'counts' %in% names(x = hfile[['assays/RNA']])) {
          stop("No RNA counts matrix provided", call. = FALSE)
        }
        object <- as.Seurat(
          x = hfile,
          assays = list(RNA = 'counts'),
          reductions = FALSE,
          graphs = FALSE,
          images = FALSE
        )
        assay <- 'RNA'
      }
      if (inherits(x = object[[assay]], what = "Assay5")) {
        if (length(Layers(object, search = "counts")) > 1) {
          object[[assay]] <- JoinLayers(object[[assay]], 
                                        layers = "counts", new = "counts")
        }
      }
      object <- CreateSeuratObject(
        counts = GetAssayData(object = object[[assay]], slot = "counts"),
        min.cells = 1,
        min.features = 1,
        meta.data = object[[]]
      )
      if (inherits(x = object[[assay]], what = "Assay5")) {
        object[[assay]]$data <- object[[assay]]$counts
      }
    },
    stop("Unknown file type: ", type, call. = FALSE)
  ))
}

#' Load a diet H5AD file
#'
#' Read in only the counts matrix and (if present) metadata of an H5AD file and
#' return a \code{Seurat} object
#'
#' @inheritParams LoadFileInput
#'
#' @return A \code{Seurat} object
#'
#' @importFrom hdf5r H5File h5attr
#' @importFrom SeuratObject AddMetaData CreateSeuratObject
#'
#' @keywords internal
#'
#' @section AnnData H5AD File (extension \code{h5ad}):
#' Only H5AD files from AnnData v0.7 or higher are supported. Data is read from
#' the H5AD file in the following manner
#' \itemize{
#'  \item The counts matrix is read from \dQuote{/raw/X}; if \dQuote{/raw/X} is
#'  not present, the matrix is read from \dQuote{/X}
#'  \item Feature names are read from feature-level metadata. Feature level
#'  metadata must be an HDF5 group, HDF5 compound datasets are \strong{not}
#'  supported. If counts are read from \code{/raw/X}, features names are looked
#'  for in \dQuote{/raw/var}; if counts are read from \dQuote{/X}, features
#'  names are looked for in \dQuote{/var}. In both cases, feature names are read
#'  from the dataset specified by the \dQuote{_index} attribute, \dQuote{_index}
#'  dataset, or \dQuote{index} dataset, in that order
#'  \item Cell names are read from cell-level metadata. Cell-level metadata must
#'  be an HDF5 group, HDF5 compound datasets are \strong{not} supported.
#'  Cell-level metadata is read from \dQuote{/obs}. Cell names are read from the
#'  dataset specified by the \dQuote{_index} attribute, \dQuote{_index} dataset,
#'  or \dQuote{index} dataset, in that order
#'  \item Cell-level metadata is read from the \dQuote{/obs} dataset. Columns
#'  will be returned in the same order as in the \dQuote{column-order}, if
#'  present, or in alphabetical order. If a dataset named \dQuote{__categories}
#'  is present, then all datasets in \dQuote{__categories} will serve as factor
#'  levels for datasets present in \dQuote{/obs} with the same name (eg. a
#'  dataset named \dQuote{/obs/__categories/leiden} will serve as the levels for
#'  \dQuote{/obs/leiden}). Row names will be set as cell names as described
#'  above. All datasets in \dQuote{/obs} will be loaded except for
#'  \dQuote{__categories} and the cell names dataset
#' }
#'
LoadH5AD <- function(path) {
  if (!requireNamespace("hdf5r", quietly = TRUE)) {
    stop("Loading H5AD files requires hdf5r", call. = FALSE)
  }
  adata <- hdf5r::H5File$new(filename = path, mode = 'r')
  on.exit(expr = adata$close_all())
  Exists <- function(name) {
    name <- unlist(x = strsplit(x = name[1], split = '/', fixed = TRUE))
    hpath <- character(length = 1L)
    exists <- TRUE
    for (i in seq_along(along.with = name)) {
      hpath <- paste(hpath, name[i], sep = '/')
      exists <- adata$exists(name = hpath)
      if (isFALSE(x = exists)) {
        break
      }
    }
    return(exists)
  }
  GetIndex <- function(md) {
    return(
      if (adata[[md]]$attr_exists(attr_name = '_index')) {
        h5attr(x = adata[[md]], which = '_index')
      } else if (adata[[md]]$exists(name = '_index')) {
        '_index'
      } else if (adata[[md]]$exists(name = 'index')) {
        'index'
      } else {
        stop("Cannot find the rownames for '", md, "'", call. = FALSE)
      }
    )
  }
  GetRowNames <- function(md) {
    return(adata[[md]][[GetIndex(md = md)]][])
  }
  LoadMetadata <- function(md) {
    factor.cols <- if (adata[[md]]$exists(name = '__categories')) {
      names(x = adata[[md]][['__categories']])
    } else {
      NULL
    }
    index <- GetIndex(md = md)
    col.names <- names(x = adata[[md]])
    if (adata[[md]]$attr_exists(attr_name = 'column-order')) {
      tryCatch(
        expr = {
          col.order <- hdf5r::h5attr(x = adata[[md]], which = 'column-order')
          col.names <- c(
            intersect(x = col.order, y = col.names),
            setdiff(x = col.names, y = col.order)
          )
        },
        error = function(...) {
          return(invisible(x = NULL))
        }
      )
    }
    col.names <- col.names[!col.names %in% c('__categories', index)]
    df <- sapply(
      X = col.names,
      FUN = function(i) {
        x <- adata[[md]][[i]][]
        if (i %in% factor.cols) {
          x <- factor(x = x, levels = adata[[md]][['__categories']][[i]][])
        }
        return(x)
      },
      simplify = FALSE,
      USE.NAMES = TRUE
    )
    return(as.data.frame(x = df, row.names = GetRowNames(md = md)))
  }
  if (Exists(name = 'raw/X')) {
    md <- 'raw/var'
    x <- adata[['raw/X']]
  } else if (Exists(name = 'X')) {
    md <- 'var'
    x <- adata[['X']]
  } else {
    stop("Cannot find counts matrix", call. = FALSE)
  }
  # check different possible attributes to try and get matrix shape
  if (isTRUE(x$attr_exists(attr_name = 'h5sparse_shape'))) {
    mtx.shape <- h5attr(x, 'h5sparse_shape')
  } else if (isTRUE(x$attr_exists(attr_name = 'shape'))) {
    mtx.shape <- h5attr(x, 'shape')
  } else {
    warning("Could not determine matrix shape")
  }
  # check different attributes to try and deduce matrix type
  if (isTRUE(x$attr_exists(attr_name = 'h5sparse_format'))) {
    mtx.type <- h5attr(x, 'h5sparse_format')
  } else if (isTRUE(x$attr_exists(attr_name = 'encoding-type'))) {
    mtx.type <- substr(h5attr(x, 'encoding-type'), 0, 3)
  } else {
    mtx.type <- 'csr' # assume matrix is csr
    warning("Could not determine matrix format")
  }
  if (mtx.type != 'csr') {
    p <- as.integer(x[['indptr']][])
    i <- as.integer(x[['indices']][])
    data <- as.double(x[['data']][])
    # csc -> csr
    converted.mtx <- csc_tocsr(
      n_row = as.integer(mtx.shape[1]),
      n_col = as.integer(mtx.shape[2]),
      Ap = p,
      Ai = i,
      Ax = data
    )
    # csr -> dgC
    counts <- new(
      Class = 'dgCMatrix',
      p = converted.mtx$p,
      i = converted.mtx$i,
      x = converted.mtx$x,
      Dim = c(mtx.shape[2], mtx.shape[1])
    )
  } else {
    # x must be a CSR matrix
    counts <- as.matrix(x = x)
  }
  metadata <- LoadMetadata(md = 'obs') # gather additional cell-level metadata
  rownames <- GetRowNames(md = md)
  colnames <- rownames(metadata)
  rownames(x = counts) <- rownames
  colnames(x = counts) <- colnames
  options(Seurat.object.assay.calcn = TRUE)
  object <- CreateSeuratObject(counts = counts)
  if (ncol(x = metadata)) {
    object <- AddMetaData(object = object, metadata = metadata)
  }
  object <- subset(object, subset = nCount_RNA > 0)
  return(object)
}


#' Load obs from a H5AD file
#'
#' Read in only the metadata of an H5AD file and
#' return a data.frame object
#' @section AnnData H5AD File (extension \code{h5ad}):
#' @importFrom rhdf5 h5read
#' @export
#'
LoadH5ADobs <- function(path, cell.groups = NULL) {
  cell.var <- rhdf5::h5readAttributes(file = path, name = 'obs')$`_index`
  cells.index <- h5read(file = path, name = 'obs')[[cell.var]]
  suppressWarnings(expr = hfile <- Connect(filename = path, force = TRUE))
  hfile_obs <- hfile[['obs']]
  #cell.groups <- cell.groups %||% intersect(names(hfile_obs), c('_index', 'cell', 'cell_id'))
  obs_groups <- setdiff(names(hfile_obs), c('__categories',  c('_index', 'cell', 'cell_id')))
  matrix <- as.data.frame(
    x = matrix(data = NA,
               nrow = length(cells.index),
               ncol = length(obs_groups))
  )
  colnames(matrix) <- obs_groups
  rownames(matrix) <- cells.index
  if ('__categories' %in% names(x = hfile_obs)) {
    hfile_cate <- hfile_obs[['__categories']]
    for (i in seq_along(obs_groups)) {
      obs.i <- obs_groups[i]
      obs_value_i <- hfile_obs[[obs.i]][]
      if (obs.i %in% names(x = hfile_cate)){
        obs_value_i <- factor(x = obs_value_i, labels =  hfile_cate[[obs.i]][])
      }
      matrix[,i] <- obs_value_i
    }
  } else {
    for (i in seq_along(obs_groups)) {
      obs.i <- obs_groups[i]
      if (all(names(hfile_obs[[obs.i]]) == c("categories", "codes"))) {
        if (
          length(unique(hfile_obs[[obs.i]][['codes']][])) == length(hfile_obs[[obs.i]][['categories']][])
        ) {
          obs_value_i <- factor(
            x = hfile_obs[[obs.i]][['codes']][],
            labels =  hfile_obs[[obs.i]][['categories']][]
          )
        } else {
          obs_value_i <- hfile_obs[[obs.i]][['codes']][]
        }
      } else {
        # list of list, skip for now
        obs_value_i <- tryCatch(expr = hfile_obs[[obs.i]][],
                                error = function(e) {
          return('unknown')
        })
      }
      matrix[,i] <- obs_value_i
    }
  }
  hfile$close_all()
  return(matrix)
}


#' Load the reference RDS files
#'
#' Read in a reference \code{\link[Seurat]{Seurat}} object and annoy index. This
#' function can read either from URLs or a file path. In order to read properly,
#' there must be the following files:
#' \itemize{
#'  \item \dQuote{ref.Rds} for the downsampled reference \code{Seurat}
#'  object (for mapping)
#'  \item \dQuote{idx.annoy} for the nearest-neighbor index object
#' }
#'
#' @param path Path or URL to the two RDS files
#' @param seconds Timeout to check for URLs in seconds
#'
#' @return A list with two entries:
#' \describe{
#'  \item{\code{map}}{
#'   The downsampled reference \code{\link[Seurat]{Seurat}}
#'   object (for mapping)
#'  }
#'  \item{\code{plot}}{The reference \code{Seurat} object (for plotting)}
#' }
#'
#' @importFrom SeuratObject Idents<- Loadings<- 
#' @importFrom Seurat LoadAnnoyIndex Loadings
#' @importFrom httr build_url parse_url status_code GET timeout
#' @importFrom utils download.file
#' @importFrom Matrix sparseMatrix
#'
#' @export
#' @examples
#' \dontrun{
#' # Load from a URL
#' ref <- LoadReference("https://seurat.nygenome.org/references/pbmc")
#' # Load from a directory
#' ref2 <- LoadReference("/var/www/html")
#' }
#'
LoadReference <- function(path, seconds = 10L) {
  op <- options(Seurat.object.assay.calcn = FALSE)
  on.exit(expr = options(op), add = TRUE)
  ref.names <- list(
    map = 'ref.Rds',
    ann = 'idx.annoy'
  )
  if (substr(x = path, start = nchar(x = path), stop = nchar(x = path)) == '/') {
    path <- substr(x = path, start = 1, stop = nchar(x = path) - 1)
  }
  uri <- httr::build_url(url = httr::parse_url(url = path))
  if (grepl(pattern = '^://', x = uri) | grepl(pattern = '^[a-zA-Z]{1}://', x = uri)) {
    if (!dir.exists(paths = path)) {
      stop("Cannot find directory ", path, call. = FALSE)
    }
    mapref <- file.path(path, ref.names$map)
    annref <- file.path(path, ref.names$ann)
    exists <- file.exists(c(mapref, annref))
    if (!all(exists)) {
      stop(
        "Missing the following files from the directory provided: ",
        Oxford(unlist(x = ref.names)[!exists], join = 'and')
      )
    }
  } else {
    ref.uris <- paste(uri, ref.names, sep = '/')
    names(x = ref.uris) <- names(x = ref.names)
    online <- vapply(
      X = ref.uris,
      FUN = Online,
      FUN.VALUE = logical(length = 1L),
      USE.NAMES = FALSE
    )
    if (!all(online)) {
      stop(
        "Cannot find the following files at the site given: ",
        Oxford(unlist(x = ref.names)[!online], join = 'and')
      )
    }
    mapref <- url(description = ref.uris[['map']])
    annref <- tempfile()
    download.file(url = ref.uris[['ann']], destfile = annref, quiet = TRUE)
    on.exit(expr = {
      close(con = mapref)
      unlink(x = annref)
    })
  }
  # Load the map reference
  map <- readRDS(file = mapref)

  # handle new parameters in uwot models beginning in v0.1.13
  if (!"num_precomputed_nns" %in% names(Misc(map[["refUMAP"]])$model)) {
    Misc(map[["refUMAP"]], slot="model")$num_precomputed_nns <- 1
  }

  # Load the annoy index into the Neighbor object in the neighbors slot
  map[["refdr.annoy.neighbors"]] <- LoadAnnoyIndex(
    object = map[["refdr.annoy.neighbors"]],
    file = annref
  )
  # Validate that reference contains required dims
  if (ncol(x = map[["refDR"]]) < getOption(x = "Azimuth.map.ndims")) {
    stop("Provided reference doesn't contain at least ",
         getOption(x = "Azimuth.map.ndims"), " dimensions. Please either
         regenerate reference with requested dimensionality or adjust ",
         "the Azimuth.map.ndims option.")
  }
  # Fix colnames of 'feature.loadings' in reference 
  key.pattern = "^[^_]*_"
  new.colnames <- gsub(pattern = key.pattern, 
                       replacement = Key(map[["refDR"]]), 
                       x = colnames(Loadings(
                         object = map[["refDR"]],
                         projected = FALSE)))
  colnames(Loadings(object = map[["refDR"]], 
                    projected = FALSE)) <- new.colnames
  # Create plotref
  ad <- Tool(object = map, slot = "AzimuthReference")
  plotref.dr <- GetPlotRef(object = ad)
  cm <- sparseMatrix(
    i = 1, j = 1, x = 0, dims = c(1, nrow(x = plotref.dr)),
    dimnames = list("placeholder", Cells(x = plotref.dr))
  )
  op <- options(Seurat.object.assay.version = "v3")
  on.exit(expr = options(op), add = TRUE)
  plot <- CreateSeuratObject(counts = cm)
  plot[["refUMAP"]] <- plotref.dr
  DefaultAssay(plot[["refUMAP"]]) <- DefaultAssay(plot)
  plot <- AddMetaData(object = plot, metadata = Misc(object = plotref.dr, slot = "plot.metadata"))
  gc(verbose = FALSE)
  return(list(
    map = map,
    plot = plot
  ))
}

#' Load the extended reference RDS file for bridge integration
#'
#' Read in a precomputed extended reference. This function can
#' read either from URLs or a file path. The function looks for a file 
#' called ext.Rds for the extended reference \code{Seurat} object 
#'
#' @param path Path or URL to the RDS file
#' @param seconds Timeout to check for URLs in seconds
#'
#' @return A list with two entries:
#' \describe{
#'  \item{\code{map}}{
#'   The extended reference \code{\link[Seurat]{Seurat}}
#'   object 
#'  }
#'  \item{\code{plot}}{The reference \code{Seurat} object (for plotting)}
#' }
#'
#' @importFrom SeuratObject Idents<- RenameAssays Loadings<- 
#' @importFrom Seurat LoadAnnoyIndex Loadings
#' @importFrom httr build_url parse_url status_code GET timeout
#' @importFrom utils download.file
#' @importFrom Matrix sparseMatrix
#'
#' @export
#' @examples
#' \dontrun{
#' # Load from a URL
#' ref <- LoadBridgeReference("https://seurat.nygenome.org/references/pbmc")
#' # Load a file from the path to a directory 
#' ref2 <- LoadBridgeReference("path/")
#' # Load a file directly
#' ref3 <- LoadBridgeReference("ext.Rds")
#' }

LoadBridgeReference<- function(path, seconds = 10L) {
  op <- options(Seurat.object.assay.calcn = FALSE)
  on.exit(expr = options(op), add = TRUE)
  ref.names <- list(
    map = 'ext.Rds'
  )
  if (substr(x = path, start = nchar(x = path), stop = nchar(x = path)) == '/') {
    path <- substr(x = path, start = 1, stop = nchar(x = path) - 1)
  }
  uri <- httr::build_url(url = httr::parse_url(url = path))
  if (grepl(pattern = '^://', x = uri) | grepl(pattern = '^[a-zA-Z]{1}://', x = uri)) {
    if (file.exists(path) && !dir.exists(path)){
      extref <- path
    } else if (!dir.exists(paths = path)) {
        stop("Cannot find directory ", path, call. = FALSE)
    } else {
      extref <- file.path(path, ref.names$map)
    }
    exists <- file.exists(c(extref))
    if (!all(exists)) {
      stop(
        "Missing the following files from the directory provided: ",
        Oxford(unlist(x = ref.names)[!exists], join = 'and')
      )
    }
  } else {
    ref.uris <- paste(uri, ref.names, sep = '/')
    names(x = ref.uris) <- names(x = ref.names)
    online <- vapply(
      X = ref.uris,
      FUN = Online,
      FUN.VALUE = logical(length = 1L),
      USE.NAMES = FALSE
    )
    if (!all(online)) {
      stop(
        "Cannot find the following files at the site given: ",
        Oxford(unlist(x = ref.names)[!online], join = 'and')
      )
    }
    #mapref <- url(description = ref.uris[['map']])
    on.exit(expr = {
      close(con = extref)
      unlink(x = annref)
    })
  }
  # Load the map reference
  map <- readRDS(file = extref)
  # handle new parameters in uwot models beginning in v0.1.13
  if (!"num_precomputed_nns" %in% names(Misc(map[["refUMAP"]])$model)) {
    Misc(map[["refUMAP"]], slot="model")$num_precomputed_nns <- 1
  }
  
  # Validate that reference contains required dims
  if (ncol(x = map[["refDR"]]) < getOption(x = "Azimuth.map.ndims")) {
    stop("Provided reference doesn't contain at least ",
         getOption(x = "Azimuth.map.ndims"), " dimensions. Please either
         regenerate reference with requested dimensionality or adjust ",
         "the Azimuth.map.ndims option.")
  }
  key.pattern = "^[^_]*_"
  new.colnames <- gsub(pattern = key.pattern, 
                       replacement = Key(map[["refDR"]]), 
                       x = colnames(Loadings(
                         object = map[["refDR"]],
                         projected = FALSE)))
  colnames(Loadings(object = map[["refDR"]], 
                    projected = FALSE)) <- new.colnames
  # Create plotref
  ad <- Tool(object = map, slot = "AzimuthReference")
  plotref.dr <- GetPlotRef(object = ad)
  cm <- sparseMatrix(
    i = 1, j = 1, x = 0, dims = c(1, nrow(x = plotref.dr)),
    dimnames = list("placeholder", Cells(x = plotref.dr))
  )
  op <- options(Seurat.object.assay.version = "v3")
  on.exit(expr = options(op), add = TRUE)
  plot <- CreateSeuratObject(counts = cm)
  plot[["refUMAP"]] <- plotref.dr
  plot <- AddMetaData(object = plot, metadata = Misc(object = plotref.dr, slot = "plot.metadata"))
  gc(verbose = FALSE)
  return(list(
    map = map,
    plot = plot
  ))
}



#' Save \code{Azimuth} references and neighbors index to same folder
#'
#' @param object An \code{\link{Azimuth}} reference
#' @param file Path to save \code{Azimuth} reference to; defaults to
#' \code{file.path(getwd(), "azimuth_reference/"))}
#' @inheritDotParams base::saveRDS
#'
#' @return Invisibly returns \code{file}
#'
#' @export
#' @seealso \code{\link{saveRDS}()} \code{\link{readRDS}()}
#'
#'
#' @examples
#' # Make Azimuth Reference object
#' obj.azimuth <- AzimuthReference(object)
#' 
#' # Save 
#' SaveAzimuthReference(object = obj.azimuth, folder = "azimuth_reference")
#' 
#' # Run Azimuth
#' 
#' query <- RunAzimuth(query = query, 
#'                     reference = "azimuth_reference", 
#'                     ...)
#' 
#'
SaveAzimuthReference <- function(
    object = NULL,
    folder = NULL
) {
  if (is.null(Tool(object, "AzimuthReference"))){
    stop("The object is not an AzimuthReference object.", 
         "Please run AzimuthReference() and try again.")
  }
  folder <- folder %||% file.path(getwd(), "azimuth_reference/")
  base::saveRDS(object = object, file = paste0(folder, "ref.Rds"), compress=F)
  SaveAnnoyIndex(object = object[["refdr.annoy.neighbors"]], file = paste0(folder, "idx.annoy"))
  message("Saved 'ref.Rds' and 'idx.annoy' in ", folder, "folder")
  return(invisible(x = folder))
}



#' Transform an NN index
#'
#' @param object Seurat object
#' @param meta.data Metadata
#' @param neighbor.slot Name of Neighbor slot
#' @param key Column of metadata to use
#'
#' @return \code{object} with transfomed neighbor.slot
#'
#' @importFrom SeuratObject Indices
#'
#' @keywords internal
#'
NNTransform <- function(
  object,
  meta.data,
  neighbor.slot = "query_ref.nn",
  key = 'ori.index'
) {
  on.exit(expr = gc(verbose = FALSE))
  ind <- Indices(object[[neighbor.slot]])
  ori.index <- t(x = sapply(
    X = 1:nrow(x = ind),
    FUN = function(i) {
      return(meta.data[ind[i, ], key])
    }
  ))
  rownames(x = ori.index) <- rownames(x = ind)
  slot(object = object[[neighbor.slot]], name = "nn.idx") <- ori.index
  return(object)
}

# Check if file is available at given URL
#
# @param url URL to file
# @param strict Ensure http code is 200
#
# @return Boolean indicating file presence
#
Online <- function(url, strict = FALSE) {
  if (isTRUE(x = strict)) {
    code <- 200L
    comp <- identical
  } else {
    code <- 404L
    comp <- Negate(f = identical)
  }
  return(comp(
    x = httr::status_code(x = httr::GET(
      url = url,
      httr::timeout(seconds = getOption('timeout')
      ))),
    y = code
  ))
}

# Make An English List
#
# Joins items together to make an English list; uses the Oxford comma for the
# last item in the list.
#
#' @inheritParams base::paste
# @param join either \dQuote{and} or \dQuote{or}
#
# @return A character vector of the values, joined together with commas and
# \code{join}
#
#' @keywords internal
#
# @examples
# \donttest{
# Oxford("red")
# Oxford("red", "blue")
# Oxford("red", "green", "blue")
# }
#
Oxford <- function(..., join = c('and', 'or')) {
  join <- match.arg(arg = join)
  args <- as.character(x = c(...))
  args <- Filter(f = nchar, x = args)
  if (length(x = args) == 1) {
    return(args)
  } else if (length(x = args) == 2) {
    return(paste(args, collapse = paste0(' ', join, ' ')))
  }
  return(paste0(
    paste(args[1:(length(x = args) - 1)], collapse = ', '),
    paste0(', ', join, ' '),
    args[length(x = args)]
  ))
}

# Determine if there are a prohibitive # of annotations for legend
OversizedLegend <- function(annotation.list) {
  return(length(x = unique(x = as.vector(x = annotation.list))) > 50)
}

# Toggle demo button enable/disable
#
# @param action Whether to enable or disable the buttons
# @param demos data.frame containing demo dataset name and file paht
#
# @return No return value
#
ToggleDemos <- function(action = c("enable", "disable"), demos = NULL) {
  if (!is.null(x = demos)) {
    if (action == "enable") {
      for (i in 1:nrow(x = demos)) {
        enable(id = paste0("triggerdemo", i))
      }
    }
   if (action == "disable") {
     for (i in 1:nrow(x = demos)) {
       disable(id = paste0("triggerdemo", i))
     }
   }
  }
}

# Theme for the plot on welcome page
#
WelcomePlot <- function(...) {
  welcomeplot.theme <- theme(
    axis.line = element_blank(), axis.ticks = element_blank(),
    axis.text.x = element_blank(), axis.text.y = element_blank(),
    axis.title.x = element_blank(), axis.title.y = element_blank(),
    legend.position = "none", plot.title = element_blank(),
    # if we want backgroundless... (also replace 'box' with 'fluidRow' in UI)
    panel.background = element_rect(color = '#ecf0f5', fill = '#ecf0f5'),
    plot.background = element_rect(color = '#ecf0f5', fill = '#ecf0f5')
  )
  return(welcomeplot.theme)
}



############### Overlap Functionality ###############
# QC For Overlap between multiomic bridge and atac query 
#
# @param query_assay 
# @param multiome_atac 
#
# @return No return value
#

OverlapDistPlot <- function(query_assay, multiome){
  atac_peaks <- OverlapQC(query = query_assay, subject = multiome)
  d <- density(atac_peaks$perc_overlap)
  plot(d, xlab='Percentage of Overlap', main = 'Distribution of Overlap Percentages')
}

# Calculate Overlap percentage for dataframe of overlap info per peak 
#
# @param atac_peaks dataframe with coordinates for each peak and overlap 
#
# @return Percentage of Overlap 
#
PercOverlap <- function(overlap_df){
  # if no overlap  
  len_overlap <- overlap_df$o_end - overlap_df$o_start
  len_q <- overlap_df$end - overlap_df$start
  perc <- (len_overlap/len_q)
  return(perc)
}

# Calculate Create dataframe with info for each peaks's overlap in multiome 
#
# @param o_hits Iranges object of overlapping hits 
# @param query 
# @param subject multi[["ATAC"]] for example
#
# @return Percentage of Overlap 
#
OverlapQC <- function(query, subject) {
  o_hits <- findOverlaps(query, subject)
  query_inds <- queryHits(o_hits)
  subject_inds <- subjectHits(o_hits)
  overlap_df <- as.data.table(GetAssayData(query, slot = "ranges")[query_inds,])
  subject_peaks <- as.data.table(GetAssayData(subject, assay = "query", slot = "ranges")[subject_inds,])
  overlap_df$o_start <- mapply(max, overlap_df$start, subject_peaks$start)
  overlap_df$o_end <- mapply(min, overlap_df$end, subject_peaks$end)
  overlap_df$perc_overlap <- PercOverlap(overlap_df)
  overlap_df
}

OverlapTotal <- function(query, subject){ 
  overlap_df <- OverlapQC(query, subject)
  q_width <- sum(overlap_df$width) # but there will be repeats in this 
  o_width <- sum(overlap_df$o_end - overlap_df$o_start)
  amount_covered <- (o_width/q_width) * 100
  return(amount_covered)
}

PeakJaccard <- function(query, subject) {
  overlap_df <- OverlapQC(query, subject)
  intersection = sum(overlap_df$o_end - overlap_df$o_start) # this is the total overlap 
  total_query_width = sum(width(query@ranges))
  total_subject_width = sum(width(subject@ranges))
  union = total_query_width + total_subject_width - intersection
  return ((intersection/union) * 100)
}

# Requantify atac peaks to either multiomic peaks or to genes 
#
# @param o_hits Iranges object of overlapping hits (Should use same assay as assay for requantification)
# @param ATAC chromatin assay or Seurat Object
# @param subject ATAC assay from Bridge or Transcripts dataframe 
# @param assay assay to use in requantifying peaks to genes (original peaks "peak.orig" or requantified peaks "ATAC")
# @param verbose
#
# @return Percentage of Overlap 
#
RequantifyPeaks <- function(
    #o_hits, 
    atac, 
    subject,
    assay = "peak.orig",
    verbose = TRUE){
  # Query peaks that have overlap w/ multiome peaks
  if (inherits(x = atac, what = "ChromatinAssay")){
    o_hits <- findOverlaps(atac, subject[["ATAC"]])
    atac <- GetAssayData(atac, assay = "ATAC", slot = "counts")
    atac_inds <- queryHits(o_hits)
    atac_subset <- atac[atac_inds, ]
    new_names <- rownames(subject[["ATAC"]])[subjectHits(o_hits)]
    if (verbose){
      message("Requantifying query peaks to match multiome")
    }
    # Reassign query row names
    row.names(atac_subset) <- new_names
    # Merge duplicates
    row.names <- row.names(atac_subset)
    model.matrix <- sparse.model.matrix(
      object = ~ 0 + row.names
    )  
    colnames(x = model.matrix) <- sapply(
      X = colnames(x = model.matrix),
      FUN = function(name) {
        name <- gsub(pattern = "row.names", replacement = "", x = name)
        return(paste0(rev(x = unlist(x = strsplit(x = name, split = ":"))),
                      collapse = "__"
        ))
      }
    )
    # Multiply matrices to combine counts
    atac_final <- as((Matrix::t(model.matrix) %*% atac_subset), "dgCMatrix")
  } else if (inherits(x = atac, what = "Seurat")){ 
    o_hits <- suppressWarnings(findOverlaps(atac[[assay]], subject))
    atac_inds <- queryHits(o_hits)
    DefaultAssay(atac) <- assay
    print(atac)
    atac_data <- GetAssayData(atac, assay = assay, slot = "counts")
    atac_final <- atac_data[atac_inds, ]
    new_names <- GRangesToString(subject[subjectHits(o_hits)])
    if (verbose){
      message("Requantifying query peaks to genes")
    }
    # Reassign query row names
    rownames(atac_final) <- new_names
    # Merge duplicates
    atac_final <- rowsum(atac_final, row.names(atac_final), reorder=FALSE)  
    atac_final <- Matrix::Matrix(atac_final, sparse = TRUE) 
  } else{
    stop("Incorrect object type ")
  }
  ##### code from signac 
  if (inherits(x = subject, what = "GRanges")){
    gene.key <- subject$gene_name
    names(x = gene.key) <- GRangesToString(grange = subject)
    rownames(x = atac_final) <- as.vector(x = gene.key[rownames(x = atac_final)])
    atac_final <- atac_final[rownames(x = atac_final) != "", ]
  }
  return(atac_final)
}

# Requantify atac peaks to either multiomic peaks or to genes (optimized for a large set of features)
#
# @param o_hits Iranges object of overlapping hits (Should use same assay as assay for requantification)
# @param ATAC chromatin assay or Seurat Object
# @param subject ATAC assay from Bridge or Transcripts dataframe 
# @param assay assay to use in requantifying peaks to genes (original peaks "peak.orig" or requantified peaks "ATAC")
# @param verbose
#
# @return Percentage of Overlap 
#
RequantifyPeaksLarge <- function(
    atac, 
    subject,
    assay = "peak.orig",
    verbose = TRUE){
  # Query peaks that have overlap w/ multiome peaks
  if (inherits(x = atac, what = "ChromatinAssay")){
    o_hits <- findOverlaps(atac, subject[["ATAC"]])
    atac <- GetAssayData(atac, assay = "ATAC", slot = "counts")
    atac_inds <- queryHits(o_hits)
    atac_subset <- atac[atac_inds, ]
    new_names <- rownames(subject[["ATAC"]][subjectHits(o_hits)]) 
    if (verbose){
      message("Requantifying query peaks to match multiome")
    }
  } else if (inherits(x = atac, what = "Seurat")){ 
    o_hits <- suppressWarnings(findOverlaps(atac[[assay]], subject))
    atac_inds <- queryHits(o_hits)
    DefaultAssay(atac) <- assay
    atac_data <- GetAssayData(atac, assay = assay, slot = "counts")
    atac_subset <- atac_data[atac_inds, ]
    new_names <- GRangesToString(subject[subjectHits(o_hits)])
    if (verbose){
      message("Requantifying query peaks to genes")
    }
  } else{
    stop("Incorrect object type ")
  }
  # Reassign query row names
  row.names(atac_subset) <- new_names
  # Merge duplicates
  row.names <- row.names(atac_subset)
  model.matrix <- sparse.model.matrix(
    object = ~ 0 + row.names
  )  
  colnames(x = model.matrix) <- sapply(
    X = colnames(x = model.matrix),
    FUN = function(name) {
      name <- gsub(pattern = "row.names", replacement = "", x = name)
      return(paste0(rev(x = unlist(x = strsplit(x = name, split = ":"))),
                    collapse = "__"
      ))
    }
  )
  # Multiply matrices to combine counts
  atac_final <- as((Matrix::t(model.matrix) %*% atac_subset), "dgCMatrix")
  ##### code from signac 
  if (inherits(x = subject, what = "GRanges")){
    gene.key <- subject$gene_name
    names(x = gene.key) <- GRangesToString(grange = subject)
    rownames(x = atac_final) <- as.vector(x = gene.key[rownames(x = atac_final)])
    atac_final <- atac_final[rownames(x = atac_final) != "", ]
  }
  return(atac_final)
}


#' Get transcripts modified from Signac::GeneActivity
#'
#' @param object A Seurat object
#' @param assay Name of assay to use. If NULL, use the default assay
#' @param features Genes to include. If NULL, use all protein-coding genes in
#' the annotations stored in the object
#' @param extend.upstream Number of bases to extend upstream of the TSS
#' @param extend.downstream Number of bases to extend downstream of the TTS
#' @param biotypes Gene biotypes to include. If NULL, use all biotypes in the
#' gene annotation.
#' @param max.width Maximum allowed gene width for a gene to be quantified.
#' Setting this parameter can avoid quantifying extremely long transcripts that
#' can add a relatively long amount of time. If NULL, do not filter genes based
#' on width.
#' @param process_n Number of regions to load into memory at a time, per thread.
#' Processing more regions at once can be faster but uses more memory.
#' @param gene.id Record gene IDs in output matrix rather than gene name.
#' @param verbose
#'
#' @importFrom SeuratObject DefaultAssay
#' 
#' @return Transcripts 
#'
GetTranscripts <- function( 
  object,
  assay = NULL,
  features = NULL,
  extend.upstream = 2000,
  extend.downstream = 0,
  biotypes = "protein_coding",
  max.width = 500000,
  process_n = 2000,
  gene.id = FALSE,
  verbose = TRUE
) {
  if (!is.null(x = features)) {
    if (length(x = features) == 0) {
      stop("Empty list of features provided")
    }
  }
  # collapse to longest protein coding transcript
  assay <- Signac:::SetIfNull(x = assay, y = DefaultAssay(object = object))
  if (!inherits(x = object[[assay]], what = "ChromatinAssay")) {
    stop("The requested assay is not a ChromatinAssay.")
  }
  annotation <- Annotation(object = object[[assay]])
  # replace NA names with gene IDD
  annotation$gene_name <- ifelse(
    test = is.na(x = annotation$gene_name) | (annotation$gene_name == ""),
    yes = annotation$gene_id,
    no = annotation$gene_name
  )
  if (length(x = annotation) == 0) {
    stop("No gene annotations present in object")
  }
  if (verbose) {
    message("Extracting gene coordinates")
  }
  transcripts <- Signac:::CollapseToLongestTranscript(ranges = annotation)
  if (gene.id) {
    transcripts$gene_name <- transcripts$gene_id
  }
  if (!is.null(x = biotypes)) {
    transcripts <- transcripts[transcripts$gene_biotype %in% biotypes]
    if (length(x = transcripts) == 0) {
      stop("No genes remaining after filtering for requested biotypes")
    }
  }
  
  # filter genes if provided
  if (!is.null(x = features)) {
    transcripts <- transcripts[transcripts$gene_name %in% features]
    if (length(x = transcripts) == 0) {
      stop("None of the requested genes were found in the gene annotation")
    }
  }
  if (!is.null(x = max.width)) {
    transcript.keep <- which(x = width(x = transcripts) < max.width)
    transcripts <- transcripts[transcript.keep]
    if (length(x = transcripts) == 0) {
      stop("No genes remaining after filtering for max.width")
    }
  }
  
  # extend to include promoters
  transcripts <- Extend(
    x = transcripts,
    upstream = extend.upstream,
    downstream = extend.downstream
  )
  
}
satijalab/azimuth documentation built on Nov. 19, 2023, 8:34 a.m.