R/link_genes.R

Defines functions FindCausalProb FindLinkedGenes FindCausalPeaks ImportCausalSNPs

Documented in FindCausalPeaks FindCausalProb FindLinkedGenes ImportCausalSNPs

## Functions for linking causal SNPs to genes after gChromVAR analysis

#' Import causal snp bed files
#'
#' @param files Paths of bed files to import
#' @param colidx Index of the posterior probability column
#'
#' @return Causal SNPs as GRanges objects
#'
#' @import GenomicRanges
#' @export
#'
ImportCausalSNPs <- function(files, colidx = 5) {
  lapply(files, function(file) {

    # Import Bed
    hitdf <- read.table(file)
    stopifnot(dim(hitdf)[2] >= colidx)
    stopifnot(3 < colidx)
    gdf <- hitdf[,c(1,2,3,colidx)]

    #Make GRanges
    names(gdf) <- c("chr", "start", "end", "PP")
    makeGRangesFromDataFrame(gdf, keep.extra.columns = T)
  })
}



#' Identify most impactful SNPs for a given trait/cell type combo
#'
#' @param SE Accessibility counts as a Summarized Experiment object
#' @param traits Posterior probability score of each peak for every trait as a Summarized Experiment object
#' @param bg.peaks Matrix of background peaks. Will be computed with getBackgroundPeaks if not specified
#' @param min.z Z-score cutoff for peaks
#' @param min.PP Posterior probability score for peaks
#' @param return.mat Return all z-scores
#'
#' @return List of lists of causal peaks for each trait and cell type. Each trait/cell type pair has a dataframe with the z-score, accessibility deviation, and posterior probability for each causal peak
#'
#' @import GenomicRanges
#' @import Matrix
#' @export
#'
FindCausalPeaks <- function(SE, traits, bg.peaks = NULL, min.z = 1, min.PP = 0.025,
                            return.mat = F) {
  require(chromVAR)
  require(gchromVAR)

  if(is.null(bg.peaks)) {
    bg.peaks <- getBackgroundPeaks(SE)
    rownames(bg.peaks) <- rownames(SE)
  }

  stopifnot(all(rownames(bg.peaks) == rownames(SE)))
  stopifnot(all(rownames(traits) == rownames(SE)))

  ## Get overlap of each peak with SNPs from each trait
  trait.scores <- assay(traits, "weights")
  rownames(trait.scores) <- rownames(traits)

  ## Find peaks overlapping with causal SNPs for each trait
  causal.peaks <- lapply(colnames(trait.scores), function(trait) {
    w <- trait.scores[,trait]
    w[w > 0]
  })
  names(causal.peaks) <- colnames(trait.scores)
  causal.peaks <- causal.peaks[sapply(causal.peaks, length) > 0]

  ## Compute expected counts
  counts <- assay(SE, "counts")
  cluster.sums <- colSums(counts)
  peak.frac <- computeExpectations(SE)
  expected.counts <- sapply(cluster.sums, function(x) x*peak.frac)

  ## Compute deviation from expected counts
  dev <- (counts - expected.counts)/expected.counts

  ## Compute individual peak-cluster z-scores for each trait using background peaks
  causal.z <- lapply(names(causal.peaks), function(tr) {
    w <- causal.peaks[[tr]]
    peaks.use <- names(w)

    w.exp <- w*as.matrix(expected.counts[peaks.use,])
    w.dev <- (w*as.matrix(counts[peaks.use,]) - w.exp)/w.exp

    bg.dev <- lapply(1:ncol(bg.peaks), function(i) {
      ctrl.peaks <- rownames(counts)[bg.peaks[peaks.use,i]]
      bg.exp <- w*as.matrix(expected.counts[ctrl.peaks,])
      (w*as.matrix(counts[ctrl.peaks,]) - bg.exp)/bg.exp
    })
    bg.dev <- abind::abind(bg.dev, along = 3)

    bg.dev.mean <- apply(bg.dev, c(1,2), mean)
    bg.dev.sd <- apply(bg.dev, c(1,2), sd)

    z <- (w.dev - bg.dev.mean)/bg.dev.sd
    z.list <- lapply(colnames(z), function(i) {
      df <- data.frame(peak = rownames(z), z = z[,i],
                       PP = trait.scores[rownames(z),tr],
                       dev = dev[rownames(z),i])
      subset(df[order(df$z, decreasing = T),], z > min.z & PP > min.PP)
    })
    names(z.list) <- colnames(z)

    if (return.mat) {
      return(z)
    } else {
      return(z.list)
    }
  })
  names(causal.z) <- names(causal.peaks)

  return(causal.z)
}



#' Find genes coaccessible with causal peaks for each cluster/trait pair
#'
#' @param pheno.cl.use Trait/cluster pairs to analyze, as a character vector of "trait: cluster" elements
#' @param causal.z List of lists of causal peaks for each trait and cell type generated by FindCausalPeaks
#' @param peak.gene.weights Matrix of peak - gene coaccessibilities
#' @param min.weight Minimum coaccessibility to consider
#'
#' @return Matrix describing the coaccessibility of specified trait/cluster pairs with genes
#'
#' @export
#'
FindLinkedGenes <- function(pheno.cl.use, causal.z, peak.gene.weights, min.weight = 0.1) {
  ## Find peaks associated with each trait/cluster pair
  pheno.cluster.peaks <- lapply(pheno.cl.use, function(pheno.cl) {
    p <- strsplit(pheno.cl, split = ": ")[[1]][[1]]
    cl.use <- strsplit(pheno.cl, split = ": ")[[1]][[2]]
    peaks <- unique(as.character(causal.z[[p]][[cl.use]]$peak))
    peaks[peaks %in% rownames(peak.gene.weights)]
  })
  names(pheno.cluster.peaks) <- pheno.cl.use

  ## Get coaccessible genes as list
  pheno.cluster.genes <- lapply(pheno.cluster.peaks, function(peaks) {
    if (length(peaks) > 1) {
      w <- colSums(as.matrix(peak.gene.weights[peaks,]))
    } else if (length(peaks) == 1) {
      w <- peak.gene.weights[peaks,]
    } else {
      w <- c()
    }
    w[w >= min.weight]
  })
  all.pheno.genes <- unique(unlist(lapply(pheno.cluster.genes, names), F, F))

  ## Bind coaccessible gene weights into matrix
  pheno.genes.mat <- do.call(cbind, lapply(names(pheno.cluster.genes), function(x) {
    w <- pheno.cluster.genes[[x]]
    w <- w[names(w) %in% all.pheno.genes]
    v <- rep(0, length(all.pheno.genes))
    names(v) <- all.pheno.genes
    v[names(w)] <- w
    v
  }))
  colnames(pheno.genes.mat) <- pheno.cl.use

  return(pheno.genes.mat)
}


#' Find the posterior probability score of causal peaks coaccessible with genes for specified trait/cluster pairs
#'
#' @param pheno.genes.mat Matrix describing the coaccessibility of specified trait/cluster pairs with genes. Generated by FindLinkedGenes.
#' @param causal.z List of lists of causal peaks for each trait and cell type generated by FindCausalPeaks
#' @param peak.gene.weights Matrix of peak - gene coaccessibilities
#'
#' @return Matrix of the sum of posterior probabilities of causal snps driving the coaccessibility of specified trait/cluster pairs with genes
#'
#' @export
#'
FindCausalProb <- function(pheno.genes.mat, causal.z, peak.gene.weights) {
  pheno.pp.mat <- sapply(colnames(pheno.genes.mat), function(pheno.cl) {
    p <- strsplit(pheno.cl, split = ": ")[[1]][[1]]
    cl.use <- strsplit(pheno.cl, split = ": ")[[1]][[2]]
    df <- causal.z[[p]][[cl.use]]

    peaks <- unique(as.character(causal.z[[p]][[cl.use]]$peak))
    peaks[peaks %in% rownames(peak.gene.weights)]
    peaks <- peaks[peaks %in% rownames(peak.gene.weights)]
    if (length(peaks) > 0) {
      w <- sapply(peaks, function(pk) {
        x <- peak.gene.weights[pk, rownames(pheno.genes.mat)]
        x[x > 0] <- df[pk, "PP"]
        x
      })
      if (!is.vector(w)) w <- rowSums(w)
    }
    else {
      w <- rep(0, nrow(pheno.genes.mat))
      names(w) <- rownames(pheno.genes.mat)
    }

    return(w)
  })
  pheno.pp.mat <- pheno.pp.mat[rownames(pheno.genes.mat), colnames(pheno.genes.mat)]
  pheno.pp.mat[pheno.genes.mat == 0] <- 0

  return(pheno.pp.mat)
}
yanwu2014/chromfunks documentation built on Aug. 20, 2023, 9:17 a.m.