B compartment inference from ATAC-seq and methylation array data

Documented in getArrayABsignal

#' @title Estimate A/B compartments from methylation array data
#'
#' @description 
#' \code{getArrayABsignal} returns estimated A/B compartments from methylation array data.
#'
#' @details This function is modified from the minfi::compartments to infer A/B compartments from array data
#' 
#'
#' @param obj Input GenomicRatioSet object 
#' @param res Compartment resolution (in bp)
#' @param parallel Should the inference be done in parallel?
#' @param allchrs Whether all autosomes should be used for A/B inference
#' @param chr Specific chromosomes to analyze
#' @param targets Specify samples to use as shrinkage targets
#' @param ... Additional arguments
#'
#' @return A p x n matrix (samples as columns and compartments as rows) of compartments
#' @import minfi
#' @import Homo.sapiens
#' @export

getArrayABsignal <- function(obj, res=1e6, parallel=FALSE, allchrs=FALSE, chr = NULL, targets = NULL, ...) {
  globalMeanSet <- .getMeanGrSet(obj, targets)
  columns <- colnames(obj)
  names(columns) <- columns
  getComp <- .getPairedArray
  if (allchrs == TRUE) getComp <- .getPairedAllChrsArray
  
  if (parallel) {
    options(mc.cores=detectCores()/2) # RAM blows up otherwise 
    do.call(cbind, 
            mclapply(columns,getComp,grSet=obj,globalMeanSet=globalMeanSet,chr=chr,targets=targets,res=res))
  } else { 
    do.call(cbind, 
            lapply(columns,getComp,grSet=obj,globalMeanSet=globalMeanSet,chr=chr,targets=targets,res=res))
  } 
}

#Helper function to get compartments as in minfi::compartments
.arraycompartments <- function(obj, resolution = 1e6, what="OpenSea",
                         chr = "chr22", method = c("pearson", "spearman"),
                         keep = FALSE) {
  if (length(chr) > 1) stop("Expected a single chromosome internally and got more than one...")
  if (!(chr %in% seqlevels(obj))) stop("The supplied chromosome is not found in the seqlevels of the object...")
  method <- match.arg(method)
  gr <- .createCorMatrix(
    obj = obj,
    resolution = resolution,
    what = what,
    chr = chr,
    method = method)
  gr <- .extractAB(gr, keep = keep)
  gr$compartment <- .extractOpenClosedArray(gr)
  gr
}

#Helper function to perform mean imputation of missing values
.imputeMatrix <- function(matrix) {
    matrix[is.infinite(matrix) & matrix > 0] <- max(matrix[is.finite(matrix)])
    matrix[is.infinite(matrix) & matrix < 0] <- min(matrix[is.finite(matrix)])

    # Imputation of the missing values:
    # Probably should be using knn instead of mean imputation
    # Will leave for now but should be updated later
    missing <- which(is.na(matrix), arr.ind = TRUE)
    if (length(missing) != 0) {
        for (j in seq_len(nrow(missing))) {
            mean <- mean(
                x = matrix[missing[j, 1L], ][
                    is.finite(matrix[missing[j, 1L], ])],
                na.rm = TRUE)
            matrix[missing[j, 1L], missing[j, 2L]] <- mean
        }
    }
    matrix
}

#Helper function to return binned M-values
#this cannot possibly be optimal (two nested loops?!)
.returnBinnedMatrix <- function(gr.unbinnedCor, resolution){

    bin2D <- function(matrix, ids, n) {
        unique.ids <- sort(unique(ids))
        bin.matrix <- matrix(0, n, n)
        m <- length(unique.ids)
        for (i in seq_len(n)) {
            for (j in seq_len(n)) {
                # TODO: We should be able to speed this one up a lot
                indices1 <- which(ids == unique.ids[i])
                indices2 <- which(ids == unique.ids[j])
                bin.matrix[unique.ids[i],unique.ids[j]] <- median(
                    x = matrix[indices1, indices2],
                    na.rm = TRUE)
            }
        }
        # TODO: This line should not be necessary
        bin.matrix[is.na(bin.matrix)] <- 1
        bin.matrix
    }

    # TODO: Use Bioconductor infrastructure to get chromosome lengths and
    #       don't directly call structure() to create object
    # chr.lengths <- structure(
    #     c(249250621L, 243199373L, 198022430L, 191154276L, 180915260L,
    #       171115067L, 159138663L, 146364022L, 141213431L, 135534747L,
    #       135006516L, 133851895L, 115169878L, 107349540L, 102531392L,
    #       90354753L, 81195210L, 78077248L, 59128983L, 63025520L, 48129895L,
    #       51304566L, 155270560L, 59373566L),
    #     .Names = c("chr1", "chr2", "chr3", "chr4", "chr5", "chr6", "chr7",
    #                "chr8", "chr9", "chr10", "chr11", "chr12", "chr13", "chr14",
    #                "chr15", "chr16", "chr17", "chr18", "chr19", "chr20",
    #                "chr21", "chr22", "chrX", "chrY"))
    chr.lengths <- seqlengths(Homo.sapiens)[standardChromosomes(Homo.sapiens)]
    seqlengths(gr.unbinnedCor) <- chr.lengths[seqlevels(gr.unbinnedCor)]
    
    #Split this out and return a more informative error...
    if (length(seqlevels(gr.unbinnedCor)) > 1) stop("The number of seqlevels returned greater than one...")
    if (is.na(seqlengths(gr.unbinnedCor))) stop("The seqlengths returned are NA... is this human data?")
    if (!("cor.matrix" %in% names(mcols(gr.unbinnedCor)))) stop("The correlation matrix was not found in the GRanges object...")

    gr.binnedCor <- tileGenome(
        seqlengths = seqlengths(gr.unbinnedCor),
        tilewidth = resolution,
        cut.last.tile.in.chrom = TRUE)
    ids <- subjectHits(findOverlaps(gr.unbinnedCor, gr.binnedCor))
    gr.binnedCor$cor.matrix <- bin2D(
        matrix = gr.unbinnedCor$cor.matrix,
        ids = ids,
        n = length(gr.binnedCor))
    gr.binnedCor
}

#Helper function to remove empty bins
.removeBadBins <- function(gr) {
    good.bins <- !colAlls(gr$cor.matrix, value = 0)
    if (length(good.bins) < nrow(gr$cor.matrix)) {
        gr <- gr[good.bins]
        gr$cor.matrix <- gr$cor.matrix[, good.bins]
    }
    gr
}

#Helper function to extract whether the compartment is open or closed
.extractOpenClosedArray <- function(gr, cutoff = 0){
    pc <- gr$pc
    ifelse(pc < cutoff, "open", "closed")
}

#Helper function to get the first principal component
.getFirstPCarray <- function(matrix, method){
    # Centre the matrix
    center <- rowMeans2(matrix, na.rm = TRUE)
    matrix <- sweep(matrix, 1L, center, check.margin = FALSE)
    .fsvd(matrix, k = 1, method = method)$u
}

#Helper function to use a moving average smooth
.meanSmootherArray <- function(x, k = 1L, iter = 2L, na.rm = TRUE) {
  meanSmoother.internal <- function(x, k=1, na.rm=TRUE){
    if (k < 1) stop("k needs to be greater than or equal to 1...")
    if (length(x) < k) stop("Cannot smooth. Too few bins...")
    n <- length(x)
    y <- rep(NA,n)
    
    
    window.mean <- function(x, j, k, na.rm=na.rm){
      if (k>=1){
        return(mean(x[(j-(k+1)):(j+k)], na.rm=na.rm))
      } else {
        return(x[j])
      }    
    }
    
    for (i in (k+1):(n-k)){
      y[i] <- window.mean(x,i,k, na.rm)
    }
    for (i in 1:k){
      y[i] <- window.mean(x,i,i-1, na.rm)
    }
    for (i in (n-k+1):n){
      y[i] <- window.mean(x,i,n-i,na.rm)
    }
    y
  }
  
  for (i in 1:iter){
    x <- meanSmoother.internal(x, k=k, na.rm=na.rm)
  }
  x
}

#Helper function to unitarize the A/B estimates
.unitarize <- function(x, medianCenter = TRUE) {
    if (medianCenter) x <- x - median(x, na.rm = TRUE)
    bad <- is.na(x)
    x[!bad] <- x[!bad] / sqrt(sum(x[!bad]^2))
    n.bad <- sum(bad)
    if (n.bad > 0) {
        message(
            sprintf("[.unitarize] %i missing values were ignored.\n", n.bad))
    }
    x
}

#Helper function to perform fsvd instead of the nipals method from mixOmics package
#This needs to be resolved since nipals is required for ATAC but not here
#We are already supporting nipals so why do we need fsvd too - or the inverse?
.fsvd <- function(A, k, i = 1, p = 2, method = c("qr", "svd", "exact")) {
    method <- match.arg(method)
    l <- k + p
    n <- ncol(A)
    m <- nrow(A)
    tall <- TRUE
    # NOTE: In the case a WIDE matrix is provided:
    if (m < n) {
        A <- t(A) # This is potentially expensive
        n <- ncol(A)
        m <- nrow(A)
        tall <- FALSE
    }
    if (l > ncol(A) && method != "exact") {
        stop("(k+p) is greater than the number of columns. Please decrease ",
             "the value of k.")
    }

    # Construct G to be n x l and Gaussian
    G <- matrix(rnorm(n * l, 0, 1), nrow = n, ncol = l)

    if (method == "svd") {
        # Power method:
        H <- A %*% G # m x l matrix
        for (j in seq_len(i)) {
            H <- A %*% (crossprod(A, H))
        }
        # NOTE: We use a SVD to find an othogonal basis Q:
        # TODO: Remove this commented line if not needed
        # H = FF %*% Omega %*% t(S)
        svd <- svd(crossprod(H))
        FF <- svd$u # l x l
        omega <- diag(1/sqrt(svd$d)) # l x l
        S <- H %*% FF %*% omega # m x l
        # Define the orthogonal basis:
        Q <- S[, seq_len(k), drop = FALSE] # m x k
        # TODO: Remove these commented lines if not needed
        # TT <- t(A) %*% Q # n x k
        # TT <- t(TT)
        TT <- crossprod(Q, A)
    } else if (method == "qr") {
        # NOTE: Need to create a list of H matrices
        h.list <- vector("list", i + 1L)
        h.list[[1]] <- A %*% G
        for (j in seq(2, i + 1L)) {
            h.list[[j]] <- A %*% (crossprod(A, h.list[[j - 1L]]))
        }
        H <- do.call("cbind", h.list) # n x [(1+1)l] matrix
        # QR algorithm
        Q <- qr.Q(qr(H, 0))
        # TODO: Remove these commented lines if not needed
        # TT <- t(A)%*%Q # n x [(i+1)l]
        # TT <- t(TT)
        TT <- crossprod(Q, A)
    }
    if (method == "svd" || method == "qr") {
        svd <- svd(TT)
        u <- Q %*% svd$u[,seq_len(k), drop = FALSE]
        v <- svd$v[, seq_len(k), drop = FALSE]
        d <- svd$d[seq_len(k)]
    } else {
        # Exact SVD
        svd <- svd(A)
        u <- svd$u[, seq_len(k), drop = FALSE]
        v <- svd$v[, seq_len(k), drop = FALSE]
        d <- svd$d[seq_len(k)]
    }
    if (!tall) {
        uu <- v
        v <- u
        u <- uu
    }
    list(u = u, v = v, d = d)
}

#Helper function to create a correlation matrix
.createCorMatrix <- function(obj, resolution = 100 * 1000, what = "OpenSea",
                            chr = "chr22", method = c("pearson", "spearman")) {
    if (length(chr) > 1) stop("Expected a single chromosome internally and got more than one...")
    if (!(chr %in% seqlevels(obj))) stop("The supplied chromosome is not found in the seqlevels of the object...")
    method <- match.arg(method)

    if (is(obj, "GenomicMethylSet")) {
        obj <- ratioConvert(obj, what = "M", keepCN = FALSE)
    }
    assay(obj, "M") <- .imputeMatrix(getM(obj))

    # Next we subset to a chromosome, keep OpenSea probes and remove SNPs
    seqlevels(obj, pruning.mode = "coarse") <- chr
    obj <- obj[getIslandStatus(obj) %in% what,]
    obj <- dropLociWithSnps(obj, snps = c("CpG", "SBE"), maf = 0.01)

    gr.unbinnedCor <- granges(obj)
    gr.unbinnedCor$cor.matrix <- suppressWarnings(cor(t(getM(obj)), method = method))
    gr.cor <- .returnBinnedMatrix(gr.unbinnedCor, resolution = resolution)
    gr.cor <- .removeBadBins(gr.cor)
    #Fisher's Z to ensure MVN
    gr.cor$cor.matrix <- fisherZ(gr.cor$cor.matrix)
    gr.cor
}

#Helper function to extract A/B compartments
.extractAB <- function(gr, keep = TRUE, svdMethod = "qr"){
    if (!(is(gr, "GRanges") && "cor.matrix" %in% names(mcols(gr)))) {
        stop("'gr' must be an object created by createCorMatrix")
    }
    pc <- .getFirstPCarray(gr$cor.matrix, method = svdMethod)
    pc <- .meanSmootherArray(pc)
    pc <- .unitarize(pc)
    # Fix sign of eigenvector
    # Also check for NA correlation between colsums and the PC
    # Not sure if we should just dump all of these to NAs if the correlation fails
    # Explicitly implies that many of the eigenvalues are [close to] zero
    # This check ensures positive values are associated with the closed compartment
    # Currently we leave it as is...
    if (!is.na(cor(colSums2(gr$cor.matrix), pc)) & cor(colSums2(gr$cor.matrix), pc) < 0 ) {
        pc <- -pc
    }
    pc <- pc * sqrt(length(pc))
    gr$pc <- pc

    if (!keep) gr$cor.matrix <- NULL
    gr
}

.getMeanGrSet <- function(grSet, targets = NULL) { 
  if (!is.null(targets)) {
    stargets <- .getShrinkageTargets(grSet, targets)
    message("Using ", paste(shQuote(targets), collapse = ", "), " as shrinkage targets...")
    meanBeta <- matrix(rowMeans(getBeta(stargets), na.rm=TRUE), ncol=1)
  }
  else meanBeta <- matrix(rowMeans(getBeta(grSet), na.rm=TRUE), ncol=1) 
  subGrSet <- grSet[,1]
  assays(subGrSet)$Beta <- meanBeta
  sampleNames(subGrSet) <- "globalMean"
  return(subGrSet) 
}

.getPairedArray <- function(column, grSet, globalMeanSet=NULL, res=1e6, targets = NULL, ...) {
  message("Computing shrunken compartment eigenscores for ", column, "...") 
  if(is.null(globalMeanSet)) globalMeanSet <- .getMeanGrSet(grSet, targets)
  res <- .arraycompartments(cbind(grSet[,column], globalMeanSet), keep=FALSE, resolution=res, ...)
  pc <- res$pc
  names(pc) <- as.character(res)
  return(pc)
}

.getPairedAllChrsArray <- function(column, grSet, globalMeanSet=NULL, res=1e6, targets = NULL, ...) {
  if (is.null(globalMeanSet)) globalMeanSet <- .getMeanGrSet(grSet, targets)
  chrs <- intersect(paste0("chr", seq_along(1:22)), seqlevels(grSet))
  names(chrs) <- chrs
  getPairedChr <- function(chr) { 
    message("Computing shrunken eigenscores for ", column, " on ", chr, "...") 
    res <- .arraycompartments(cbind(grSet[,column],globalMeanSet),keep=FALSE,resolution=res,chr=chr)
    pc <- res$pc
    names(pc) <- as.character(res)
    return(pc)
  }
  unlist(lapply(chrs, getPairedChr))
}

Any scripts or data that you put into this service are public.

compartmap documentation built on Nov. 8, 2020, 5:34 p.m.

rdrr.io home R language documentation Run R code online

CRAN packages Bioconductor packages R-Forge packages GitHub packages

Note that we can't provide technical support on individual packages. You should contact the package authors for that.

compartmap
A/B compartment inference from ATAC-seq and methylation array data

R/getArrayABsignal.R
In compartmap: A/B compartment inference from ATAC-seq and methylation array data

Defines functions .getPairedAllChrsArray .getPairedArray .getMeanGrSet .extractAB .createCorMatrix .fsvd .unitarize .meanSmootherArray .getFirstPCarray .extractOpenClosedArray .removeBadBins .returnBinnedMatrix .imputeMatrix .arraycompartments getArrayABsignal

Documented in getArrayABsignal

Try the compartmap package in your browser

R Package Documentation

Browse R Packages

We want your feedback!

compartmap A/B compartment inference from ATAC-seq and methylation array data

R/getArrayABsignal.R In compartmap: A/B compartment inference from ATAC-seq and methylation array data

Defines functions .getPairedAllChrsArray .getPairedArray .getMeanGrSet .extractAB .createCorMatrix .fsvd .unitarize .meanSmootherArray .getFirstPCarray .extractOpenClosedArray .removeBadBins .returnBinnedMatrix .imputeMatrix .arraycompartments getArrayABsignal

Documented in getArrayABsignal

Try the compartmap package in your browser

R Package Documentation

Browse R Packages

We want your feedback!

compartmap
A/B compartment inference from ATAC-seq and methylation array data

R/getArrayABsignal.R
In compartmap: A/B compartment inference from ATAC-seq and methylation array data