R/peak_to_gene.R

Defines functions PeakToGene

Documented in PeakToGene

#' Get peak to gene links
#'
#' This function will link peak to genes based on correlation analysis.
#' We here modified the code from the ArchR package of the
#' \code{\link{addPeak2GeneLinks}} function to make it compatible with matrix input.
#' For more details, please
#' refer to \url{https://www.archrproject.com/reference/addPeak2GeneLinks.html}. \cr
#' Author: Jeffrey Granja
#'
#' @param peak.mat A matrix containing chromatin accessibility data
#' @param gene.mat A matrix containing gene expression data
#' @param genome Which genome to use. Currently available are: hg19, hg38, mm9,
#' and mm10
#' @param method Which method to use to measure the association between peak and gene.
#' Available are: "correlation", "glm"
#' @param max.dist The maximum distance between a peak and a gene. Default: 250000 bp
#'
#' @useDynLib scMEGA, .registration = TRUE
#' @importFrom Rcpp sourceCpp
#' @importFrom GenomicRanges GRanges
#' @importFrom GenomicRanges findOverlaps
#' @importFrom GenomicRanges resize
#' @importFrom IRanges IRanges
#' @importFrom S4Vectors SimpleList
#' @importFrom stats p.adjust
#' @return A data frame containing all peak-to-gene links
#' @export
#'
PeakToGene <- function(peak.mat,
                       gene.mat,
                       genome = "hg19",
                       max.dist = 250000,
                       method = "corrleation") {
  if (!(genome %in% c("hg19", "hg38", "mm9", "mm10"))) {
    stop("Available genome are: hg19, hg38, mm9, and mm10!")
  }

  if (genome == "hg19") {
    gene_anno <- geneAnnoHg19
  } else if (genome == "hg38") {
    gene_anno <- geneAnnoHg38
  } else if (genome == "mm9") {
    gene_anno <- geneAnnoMm9
  } else if (genome == "mm10") {
    gene_anno <- geneAnnoMm10
  }

  ## create object for RNA data
  genes <- gene_anno$genes
  gene.use <-
    intersect(elementMetadata(genes)[, "symbol"], rownames(gene.mat))

  genes <- genes[elementMetadata(genes)[, "symbol"] %in% gene.use]
  gene.mat <- gene.mat[gene.use, ]

  gene_start <- ifelse(genes@strand == "+",
                       genes@ranges@start,
                       genes@ranges@start + genes@ranges@width - 1)

  genes <- GRanges(
    genes@seqnames,
    ranges = IRanges(gene_start,
                     width = 1),
    name = genes$symbol,
    gene_id = genes$gene_id,
    strand = genes@strand
  )

  seRNA <-
    SummarizedExperiment(assays = SimpleList(RNA = gene.mat),
                         rowRanges = genes)

  ## create object for ATAC data
  df_peak <-
    stringr::str_split_fixed(rownames(peak.mat), "-", 3)

  peakSet <- GRanges(df_peak[, 1],
                     IRanges(start = as.numeric(df_peak[, 2]),
                             end = as.numeric(df_peak[, 3])))

  seATAC <-
    SummarizedExperiment(assays = SimpleList(ATAC = peak.mat),
                         rowRanges = peakSet)

  ## find putative peak-to-gene
  o <-
    data.frame(findOverlaps(
      resize(seRNA, 2 * max.dist + 1, "center"),
      resize(rowRanges(seATAC), 1, "center"),
      ignore.strand = TRUE
    ))
  o$distance <- IRanges::distance(rowRanges(seRNA)[o[, 1]],
                                  rowRanges(seATAC)[o[, 2]])
  colnames(o) <- c("gene_idx", "peak_idx", "distance")

  df <- rowRanges(seATAC)[o$peak_idx, ]

  o$gene <- rowData(seRNA)[o$gene_idx, ]$name
  o$peak <- paste0(
    df@seqnames,
    "-",
    as.data.frame(df@ranges)$start,
    "-",
    as.data.frame(df@ranges)$end
  )


  ## compute correlation
  o$Correlation <- rowCorCpp(as.integer(o$peak_idx),
                             as.integer(o$gene_idx),
                             assay(seATAC),
                             assay(seRNA))

  ## compute p-value
  o$TStat <-
    (o$Correlation / sqrt((
      pmax(1 - o$Correlation ^ 2, 0.00000000000000001, na.rm = TRUE)
    ) / (ncol(seATAC) - 2))) #T-statistic P-value

  o$Pval <- 2 * pt(-abs(o$TStat), ncol(seATAC) - 2)
  o$FDR <- p.adjust(o$Pval, method = "fdr")
  o <- o[!is.na(o$FDR), ]

  return(o)

}
CostaLab/scMEGA documentation built on Sept. 25, 2024, 6:11 a.m.