R/MethylSeekRGR-utilities.R

# Copyright (C) 2015 Peter Hickey
#
# This file is part of methsim.
#
# methsim is free software: you can redistribute it and/or modify it
# under the terms of the GNU General Public License as published by
# the Free Software Foundation, either version 2 of the License, or
# (at your option) any later version.
#
# methsim is distributed in the hope that it will be useful, but
# WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with methsim  If not, see <http://www.gnu.org/licenses/>.

### =========================================================================
### MethylSeekRGR utility functions. These functions are not exported.
### -------------------------------------------------------------------------
###

# The functions in this file are modified versions of functions available in
# the MethylSeekR Bioconductor package version 1.6.0
# (http://www.bioconductor.org/packages/3.0/bioc/html/MethylSeekR.html).
# MethylSeekR is released under a GPL (>= 2) license.
#
# TODO (long term):I'm going to need a way to keep these functions up to date
# with what is available in MethylSeekR. It may be easier/better to try to get
# these changes folder back into MethylSeekR.
#
# TODO (long term): Ensure that all GRanges-like objects returned by these
# functions preserve the seqinfo of the input.

# A copy of MethylSeekR::plotAlphaDistributionOneChr that can be safely run in
# parallel because it does not create the plot but returns it as an object.
# TODO: Document and robust-ify (long-term)
plotAlphaDistributionOneChr <- function(m, chr.sel, nCGbin = 101,
                                        plot = FALSE) {
  message("determining alpha distribution for chromosome: ",
          chr.sel)
  indx <- as.character(seqnames(m)) == chr.sel
  if (sum(indx) < nCGbin)
    stop(sprintf("Error: less than %d covered CpGs on chromosome %s",
                 nCGbin, chr.sel))
  T <- as.numeric(values(m[indx])[, 1])
  M <- as.numeric(values(m[indx])[, 2])
  score <- MethylSeekR:::calculateAlphaDistr(M, T, nCGbin, num.cores = 1L)
  hist(score, probability = TRUE, breaks = 30,
       xlab = sprintf("posterior mean of alpha (%s)", chr.sel), plot)
}

# A copy of MethylSeekR::segmentPMDs that can be safely run in
# parallel because it does not create the plot but returns it as an object.
# TODO: Document and robust-ify (long-term)
segmentPMDs <- function(m, chr.sel, seqLengths, nCGbin = 101, plot = FALSE) {
  hmm.model <- trainPMDHMM(m, chr.sel, nCGbin, plot = FALSE)
  y.list <- MethylSeekR:::PMDviterbiSegmentation(m, hmm.model$startval,
                                                 nCGbin, num.cores = 1L)
  segments <- MethylSeekR:::createGRangesObjectPMDSegmentation(m, y.list,
                                                               num.cores = 1L,
                                                               seqLengths)
  list(segments = segments, hist = hmm.model$hist, x = hmm.model$x,
       lines1 = hmm.model$lines1, lines2 = hmm.model$lines2)
}

# A copy of MethylSeekR::trainPMDHMM that can be safely run in
# parallel because it does not create the plot but returns it as an object.
# TODO: Document and robust-ify (long-term)
trainPMDHMM <- function(m, chr.sel, nCGbin, plot = FALSE) {
  message("training PMD-HMM on chromosome ", chr.sel)
  indx <- as.character(seqnames(m)) == chr.sel
  if (sum(indx) < nCGbin) {
    stop(sprintf("Error: less than %d covered CpGs on chromosome %s",
                 nCGbin, chr.sel))
  }
  T <- as.numeric(values(m[indx])[, 1])
  M <- as.numeric(values(m[indx])[, 2])
  score <- MethylSeekR:::calculateAlphaDistr(M, T, nCGbin, num.cores = 1L)
  J = 2
  init0 <- c(0, 1)
  P0 <- t(matrix(c(0.998297563, 0.001702437, 0.002393931, 0.997606069),
                 nrow = J, ncol = J))
  b0 <- list(mu = c(0.3867895, 1.1690474), sigma = c(0.01649962,
                                                     0.1437864))
  startval <- mhsmm::hmmspec(init = init0, trans = P0, parms.emission = b0,
                             dens.emission = dnorm.hsmm)
  train <- list(x = score, N = length(score))
  startval <- mhsmm::hmmfit(train, startval, mstep = mstep.norm)$model
  x <- seq(0, 3, by = 0.01)
  hist <- hist(score, probability = TRUE, breaks = 30,
               xlab = sprintf("posterior mean of alpha (%s)", chr.sel),
               plot = plot)
  lines1 <- dnorm(x, mean = startval$parms.emission$mu[1],
                  sd = sqrt(startval$parms.emission$sigma[1]))
  lines2 <- dnorm(x, mean = startval$parms.emission$mu[2],
                  sd = sqrt(startval$parms.emission$sigma[2]))
  if (plot) {
    plot(hist, freq = FALSE)
    lines(x, lines1, type = "l", col = "red")
    lines(x, lines2, type = "l", col = "green")
    return(list(startval = startval, hist = hist, x = x, lines1 = lines1,
                lines2 = lines2))
  } else {
    list(startval = startval, hist = hist, x = x, lines1 = lines1,
         lines2 = lines2)
  }
}

# A copy of MethylSeekR::calculateFDRs that adds the sample name to the title
# of each plot.
# TODO: Document and robust-ify (long-term)
calculateFDRs <- function(m, sn, CGIs, PMDs = NA, pdfFilename = NULL, num.cores = 1,
                          nCpG.smoothing = 3,
                          meth.cutoffs = seq(0.3, 0.7, by = 0.1),
                          nCpG.cutoffs = seq(1, 6, by = 1), minCover = 5) {
  m = m[values(m)[, 1] >= minCover]
  message("calculating false discovery rate")
  n <- length(meth.cutoffs) * length(nCpG.cutoffs)
  parameters <- vector("list", n)
  count = 1
  for (meth.cutoff in meth.cutoffs) {
    for (k in nCpG.cutoffs) {
      parameters[[count]] <- c(meth.cutoff, k)
      count <- count + 1
    }
  }
  meth.notrand <- m
  if (class(PMDs) == "GRanges") {
    message("removing PMDs for randomization")
    meth.notrand <- subsetByOverlaps(m, PMDs[values(PMDs)$type ==
                                               "notPMD"])
  }
  meth.rand <- meth.notrand
  ov <- findOverlaps(meth.rand, CGIs)
  meth.rand <- meth.rand[-unique(queryHits(ov))]
  values(meth.rand) <- values(meth.rand)[sample(length(meth.rand)),
                                         ]
  res <- mclapply(parameters, function(params) {
    meth.cutoff <- params[1]
    k <- params[2]
    mean.meth <- runmean(Rle(values(meth.notrand)[, 2]/values(meth.notrand)[,
                                                                            1]), k = nCpG.smoothing, endrule = "constant")
    indx <- mean.meth < meth.cutoff
    nSeg <- sum(runValue(indx) == TRUE & runLength(indx) >=
                  k)
    mean.meth = runmean(Rle(values(meth.rand)[, 2]/values(meth.rand)[,
                                                                     1]), k = nCpG.smoothing, endrule = "constant")
    indx <- mean.meth < meth.cutoff
    nSeg.rand <- sum(runValue(indx) == TRUE & runLength(indx) >=
                       k)
    c(nSeg, nSeg.rand)
  }, mc.cores = num.cores)
  FDRs = 100 * sapply(res, function(x) {
    x[2]/x[1]
  })
  tmp = matrix(NA, nrow = length(meth.cutoffs), ncol = length(nCpG.cutoffs))
  rownames(tmp) = meth.cutoffs
  colnames(tmp) = nCpG.cutoffs
  count = 1
  for (meth.cutoff in meth.cutoffs) {
    for (k in nCpG.cutoffs) {
      tmp[as.character(meth.cutoff), as.character(k)] = FDRs[count]
      count = count + 1
    }
  }
  FDRs = tmp
  numSegments = sapply(res, function(x) {
    x[1]
  })
  tmp = matrix(NA, nrow = length(meth.cutoffs), ncol = length(nCpG.cutoffs))
  rownames(tmp) = meth.cutoffs
  colnames(tmp) = nCpG.cutoffs
  count = 1
  for (meth.cutoff in meth.cutoffs) {
    for (k in nCpG.cutoffs) {
      tmp[as.character(meth.cutoff), as.character(k)] = numSegments[count]
      count = count + 1
    }
  }
  numSegments = tmp
  rownames(FDRs) = as.character(100 * as.numeric(rownames(FDRs)))
  rownames(numSegments) = as.character(100 * as.numeric(rownames(numSegments)))
  if (!is.null(pdfFilename)) {
    pdf(pdfFilename, width = 9, height = 4.5)
  }
  par(mfrow = c(1, 2))
  barplot(pmin((t(FDRs)), 20), beside = TRUE, ylab = "FDR (%)",
          ylim = c(0, 20), xlab = "methylation cut-off (%)", main = sn)
  barplot(t(numSegments), beside = TRUE, ylab = "number of segments",
          xlab = "methylation cut-off (%)", main = sn)
  legend("topleft",
         legend = paste(colnames(FDRs),
                        c("CpG", rep("CpGs", ncol(FDRs) - 1)), sep = " "),
         fill = grey.colors(ncol(FDRs)), bty = "n")
  if (!is.null(pdfFilename))
    dev.off()
  rownames(FDRs) = as.character(as.numeric(rownames(FDRs))/100)
  rownames(numSegments) = as.character(as.numeric(rownames(numSegments))/100)
  list(FDRs = FDRs, numSegments = numSegments)
}

# A copy of MethylSeekR::segmentUMRsLMRs that adds the sample name to the title
# of each plot. Actually, it doesn't create the plot but simply returns the
# data necessary for creating it.
# TODO: Document and robust-ify (long-term)
segmentUMRsLMRs <- function(m, meth.cutoff = 0.5, nCpG.cutoff = 3, PMDs = NA,
                            myGenomeSeq, seqLengths, nCpG.smoothing = 3,
                            minCover = 5) {
  nCG.classification <- 30
  message("identifying UMRs and LMRs")
  m = m[values(m)[, 1] >= minCover]
  # TODO: Not sure why I have to explicitly coerce to a character vector but
  # the Rle,table-method isn't working as expected.
  nCGsPerChr = table(as.character(seqnames(m)))
  chrs = names(nCGsPerChr)[nCGsPerChr >= nCpG.smoothing]
  res <- mclapply(chrs, function(chr) {
    sel <- which(as.character(seqnames(m)) == chr)
    mean.meth <- runmean(Rle(values(m)[sel, 2] /
                               values(m)[sel, 1]), k = nCpG.smoothing,
                         endrule = "constant")
    indx <- mean.meth < meth.cutoff
    runValue(indx)[runLength(indx) < nCpG.cutoff & runValue(indx) ==
                     TRUE] = FALSE
    runValue(indx)[runLength(indx) < nCpG.cutoff & runValue(indx) ==
                     FALSE] = TRUE
    tmp.ids <- rep(1:length(runLength(indx)), runLength(indx))
    tmp <- split(1:length(sel), tmp.ids)
    tmp <- tmp[runValue(indx) == TRUE]
    if (length(tmp) > 0) {
      coords <- cbind(sapply(tmp, min), sapply(tmp, max))
      starts <- round((start(m)[sel[pmax(1, coords[, 1] - 1)]] +
                         start(m)[sel[coords[, 1]]]) / 2)
      ends <- round((start(m)[sel[coords[, 2]]] +
                       start(m)[sel[pmin(length(sel), coords[, 2] + 1)]]) / 2)
      hmr.gr = GRanges(seqnames = unique(seqnames(m[sel])),
                       strand = "*", ranges = IRanges(starts, ends),
                       seqlengths = seqLengths)
    }
    else {
      hmr.gr = GRanges(, seqlengths = seqLengths)
    }
    hmr.gr
  }, mc.cores = 1L)
  segments.gr = do.call(c, unname(res))
  if (class(PMDs) == "GRanges") {
    segments.gr = subsetByOverlaps(segments.gr,
                                   PMDs[values(PMDs)$type == "notPMD"])
  }
  nCG = vcountPattern("CG", getSeq(myGenomeSeq,
                                   resize(segments.gr, width(segments.gr),
                                          fix = "start"), as.character = FALSE))
  ov <- findOverlaps(m, segments.gr)
  T = tapply(values(m[queryHits(ov)])[, 1], subjectHits(ov), sum)
  M = tapply(values(m[queryHits(ov)])[, 2], subjectHits(ov), sum)
  nCG.segmentation = tapply(values(m[queryHits(ov)])[, 1], subjectHits(ov),
                            length)
  median.meth = tapply(as.vector(runmean(Rle(values(m[queryHits(ov)])[, 2] /
                                               values(m[queryHits(ov)])[, 1]),
                                         nCpG.smoothing, endrule = "constant")),
                       subjectHits(ov), median)
  median.meth = pmax(0, median.meth)
  if (!all.equal(as.numeric(names(T)), 1:length(segments.gr))) {
    message("error in calculating methylation levels for PMDs")
  }
  type = c("UMR", "LMR")[as.integer(nCG < nCG.classification) + 1]
  values(segments.gr) = DataFrame(nCG.segmentation, nCG, T, M, pmeth = M / T,
                                  median.meth = median.meth, type)
  jet.colors <- colorRampPalette(c("#00007F", "blue", "#007FFF", "cyan",
                                   "#7FFF7F", "yellow", "#FF7F00", "red",
                                   "#7F0000"))
  list(x = log2(values(segments.gr)$nCG),
       y = 100 * values(segments.gr)$median.meth,
       colramp = jet.colors,
       xlab = "log2 number of CpGs in segment",
       ylab = "median methylation (%)",
       v = log2(nCG.classification),
       segments.gr = segments.gr)
}

# A copy of MethylSeekR::plotFinalSegmentation that can be safely run in
# parallel because it does not create the plot but returns it as an object.
# TODO: Document and robust-ify (long-term)
plotFinalSegmentation <- function(m, sn, segs, PMDs = NA, meth.cutoff,
                                  numRegions = 1, minCover = 5,
                                  nCpG.smoothing = 3, plot = FALSE) {
  m = m[values(m)[, 1] >= minCover]
  cols.seg = c("#377EB8", "#E41A1C")
  names(cols.seg) = c("UMR", "LMR")
  col.PMD = "#4DAF4A"
  pch.seg <- c(LMR = 24, UMR = 22)
  chrs = unique(as.character(seqnames(m)))
  height = 1
  len = 5000
  nCGperChr = table(as.character(seqnames(m)))
  indx = which(nCGperChr[chrs] < 3 * len)
  if (length(indx) > 0) {
    chrs = chrs[-indx]
  }
  exist.PMDs <- FALSE
  if (class(PMDs) == "GRanges") {
    exist.PMDs <- TRUE
  }
  for (ii in 1:numRegions) {
    chr = sample(chrs, 1)
    i = seqnames(m) == chr
    s = sample(1:(length(m[i]) - 3 * len), 1)
    par(mfrow = c(6, 1))
    for (gg in 1:3) {
      indx = (s + (gg - 1) * len):(s + (gg) * len)
      seg.sel = subsetByOverlaps(segs, m[i][indx])
      if (length(seg.sel) > 0) {
        mysegcol <- cols.seg[as.character(values(seg.sel)$type)]
        mysegpch <- pch.seg[as.character(values(seg.sel)$type)]
      }
      PMD.sel = NA
      if (exist.PMDs) {
        PMD.sel = subsetByOverlaps(PMDs[values(PMDs)$type == "PMD"], m[i][indx])
      }
      add.mar <- 2
      par(mar = c(3 - add.mar, 4, 1 + add.mar, 2))
      plot(start(m[i])[indx], 100 * values(m[i])[indx, 2] /
             values(m[i])[indx, 1], pch = "*", ylim = c(-18, 100), cex = 0.9,
           ylab = "methylation", axes = FALSE, xlab = "", main = sn)
      axis(2, at = c(0, 50, 100))
      box()
      text(par("usr")[1] + par("cxy")[1] * 0.2, par("usr")[3] +
             par("cxy")[2] * 0.3, adj = c(0, 0), label = "raw")
      if (exist.PMDs & length(PMD.sel) > 0) {
        rect(start(PMD.sel), -18 - height / 4, end(PMD.sel),
             -18 + height / 4, lwd = 2, col = col.PMD, border = col.PMD)
      }
      if (length(seg.sel) > 0) {
        points(x = mid(ranges(seg.sel)),
               y = rep(-18, length(seg.sel)), pch = mysegpch, cex = 1.3,
               col = "black", lwd = 0.5, bg = mysegcol)
      }
      par(mar = c(2 + add.mar, 4, 2 - add.mar, 2))
      plot(start(m[i])[indx], 100 *
             as.vector(runmean(Rle(values(m[i])[indx, 2] /
                                     values(m[i])[indx, 1]),
                               nCpG.smoothing, endrule = "constant")),
           pch = "*", ylim = c(-18, 100), cex = 0.9, ylab = "methylation",
           xlab = sprintf("position on %s", chr), axes = FALSE)
      axis(1)
      axis(2, at = c(0, 50, 100))
      box()
      text(par("usr")[1] + par("cxy")[1] * 0.2, par("usr")[3] +
             par("cxy")[2] * 0.3, adj = c(0, 0), label = "smoothed")
      if (exist.PMDs & length(PMD.sel) > 0) {
        rect(start(PMD.sel), -18 - height / 4, end(PMD.sel),
             -18 + height / 4, lwd = 2, col = col.PMD, border = col.PMD)
      }
      if (length(seg.sel) > 0) {
        points(x = mid(ranges(seg.sel)),
               y = rep(-18, length(seg.sel)), pch = mysegpch, cex = 1.3,
               col = "black", lwd = 0.5, bg = mysegcol)
      }
      abline(h = 100 * meth.cutoff, lty = 5, col = "darkgrey")
    }
  }
}
PeteHaitch/methsim documentation built on May 8, 2019, 1:32 a.m.