R/segmentation.R
In HMMcopy: Copy number prediction with correction for GC and mappability bias for HTS data

Documented in HMMsegment normalize plotParam plotSegments stateCols

stateCols <- function() {
  return(c("#74C476", "#238B45", "#00008B", "#A50F15", "#DE2D26", "#FB6A4A"))
}

plotSegments <- function(correctOutput, segmentOutput,
    chr = correctOutput$chr[1], ...){
  if (is.null(segmentOutput$segs)) {
    warning("Processed segments now found, automatically processing")
    segmentOutput$segs <- processSegments(segments$segs,
      correctOutput$chr, correctOutput$start, correctOutput$end,
      correctOutput$copy)
  }

  segs <- segmentOutput$segs
  correctOutput$state <- segmentOutput$state
  cols <- stateCols()
  range <- quantile(correctOutput$copy, na.rm = TRUE, prob = c(0.01, 0.99))

  a <- subset(correctOutput, chr == chr)
  b <- subset(segs, chr == chr)
  plot(a$start, a$copy,
    col = cols[as.numeric(as.character(a$state))], ylim = range, ...)
  for (k in 1:nrow(b)){
    lines(c(b$start[k], b$end[k]), rep(b$median[k], 2), lwd = 3,
    col = "green")
  }
}

plotParam <- function(segmentOutput, param, ...) {
  cols <- stateCols()

  domain <- max(param$mu) - min(param$mu) + 1
  left <- min(param$mu) - domain
  right <- max(param$mu) + domain
  x = seq(left * 10, right * 10, by = 0.01);
  height = 0

  last <- ncol(segmentOutput$mus)
  lastmu <- segmentOutput$mus[, last]
  lastlambda <- segmentOutput$lambdas[, last]

  for(state in 1:nrow(param)) {
    y = tdistPDF(x, param$mu[state], param$lambda[state], param$nu[state]);
    z = tdistPDF(x, lastmu[state], lastlambda[state], param$nu[state]);
    height <- max(height, y, z)
  }

  plot(c(left, right), c(0, height), type = "n", xlab = "Log Ratio Intensities",
  ylab = "Student-t Density", ...);

  for(state in 1:nrow(param)) {
    y = tdistPDF(x, param$mu[state], param$lambda[state], param$nu[state]);
    z = tdistPDF(x, lastmu[state], lastlambda[state], param$nu[state]);
    lines(x, y, col = cols[state], lwd = 2, lty = 3);
    lines(x, z, col = cols[state], lwd = 3);
  }
}

HMMsegment <- function(correctOut, param = NULL, autosomes = NULL, maxiter = 50,
    getparam = FALSE, verbose = TRUE) {
  chr <- correctOut$chr

  if (!is.factor(chr)) {
    warning("chr is not a factor, converting to factor")
    chr <- as.factor(chr)
  }
  
  if (is.null(autosomes)) {
    autosomes <- (chr != "X" & chr != "Y" & chr != "23" & chr != "24" &
      chr != "chrX" & chr != "chrY" & chr != "M" & chr != "MT" & chr != "chrM")
  }

  if (is.null(param)) {
    param <- data.frame(
      strength = 10000000,
      e = 0.9999999,
      mu = quantile(correctOut$copy, na.rm = TRUE,
        prob = c(0.1, 0.25, 0.5, 0.75, 0.9, 0.99)),
      lambda = 20,
      nu = 2.1,
      kappa = c(0.05, 0.05, 0.7, 0.1, 0.05, 0.05) * 1000,
      m = 0,
      eta = c(5, 5, 50, 5, 5, 5) * 10000,
      gamma = 3,
      S = 0
    )
    param$m <- param$mu
    param$S <-
      ((sd(2 ^ correctOut$copy[autosomes], na.rm = TRUE)/sqrt(nrow(param))) ^ 2)
    rownames(param) <- seq(1, 6)
  }

  if (getparam) {
    return(param)
  }

  output <- manualSegment(correctOut$copy, chr, autosomes, param, maxiter,
    verbose)
  output$segs <- processSegments(output$segs, chr, correctOut$start,
    correctOut$end, correctOut$copy)
  return(output)
}

processSegments <- function(seg, chr, start, end, copy) {
  segment <- data.frame()

  if (!is.factor(chr)) {
    warning("chr is not a factor, converting to factor")
    chr <- as.factor(chr)
  }
  
  chromosomes <- levels(chr)
  for (i in 1:length(chromosomes)) {
    seg_length = dim(seg[[i]])[1]
    chr_name <- rep(chromosomes[i], seg_length)
    chr_index <- which(chr == chromosomes[i])
    chr_start <- start[chr_index][seg[[i]][, 1]]
    chr_stop <- end[chr_index][seg[[i]][, 2]]
    chr_state <- seg[[i]][, 3]
    chr_median <- rep(0, seg_length)
    for(j in 1:seg_length) {
      chr_median[j] <-
        median(na.rm = TRUE, copy[chr_index][seg[[i]][j, 1]:seg[[i]][j, 2]])
    }
    segment <- rbind(segment, cbind(chr = chr_name,
      start = as.numeric(chr_start), end = chr_stop, state = chr_state,
      median = chr_median))
  }
  segment <- transform(segment, start = as.numeric(as.character(start)),
    end = as.numeric(as.character(end)),
    median = as.numeric(as.character(median)))
  return(segment)
}

manualSegment <- function(copy, chr, autosomes, param, maxiter,
    verbose = TRUE) {

  if (!is.factor(chr)) {
    warning("chr column is not a factor, converting to factor")
    chr <- as.factor(chr)
  }
  
  if (length(copy) != length(chr) || length (copy) != length(autosomes)) {
    stop("Length of inputs do not match for one of: copy, chr, autosomes")
  }

  if (is.null(param$mu) || is.null(param$lambda) || is.null(param$nu) ||
    is.null(param$kappa) || is.null(param$eta) || is.null(param$gamma)) {
    stop(paste("Parameter missing, ensure all parameters exist as columns in",
      "data frame: mu, lambda, nu, kappa, eta, gamma"))
  }

  K <- dim(param)[1]
  N <- length(copy)
  rho <- matrix(0, K, N)
  py <- matrix(0, K, N)            # Local evidence
  mus <- matrix(0, K, maxiter)     # State means
  lambdas <- matrix(0, K, maxiter) # State Variances
  kappa <- param$kappa             # Initial state distribution
  converged <- FALSE               # Flag for convergence
  Z <- rep(0, N)
  Zcounts <- matrix(0, K, K)
  loglik <- rep(0, maxiter)

  # SET UP
  # Set up the chromosome indices and make cell array of chromosome indicies
  chrs <- levels(chr)              # WARNING, gets resorted here...
  chrsI <- vector('list', length(chrs))

  # initialise the chromosome index and the init state distributions
  for(i in 1:length(chrs)) {
    chrsI[i] <- list(which(chr == chrs[i]))
  }

  # INITIALIZATION
  if (verbose) { message("Initialization") }
  i <- 1
  mus[, 1] <- param$mu # First column
  lambdas[, 1] <- param$lambda
  # Recalculate the likelihood
  for (k in 1:K) {
    py[k, ] <- tdistPDF(copy, mus[k, i],lambdas[k, i], param$nu[k]);
  }
  # Initialize transition matrix to the prior
  A <- matrix(0, K, K)
  for (j in 1:K) {
    A[j, ] <- (1 - param$e[1]) / (K - 1)
    A[j, j] <- param$e[1]
  }
  A_prior <- A
  dirPrior <- A * param$strength[1]
  loglik[i] <- -Inf

  while(!converged && (i < maxiter)) {
    if (verbose) { message(paste("EM iteration:", i,
      "Log likelihood:", loglik[i])) }
    i <- i + 1

    # E-step
    ############################################################################
    if (verbose) { message("Expectation") }
    Zcounts <- matrix(0, K, K)
    for (j in 1:length(chrsI)) {
      I <- chrsI[[j]]
      output <- .Call("forward_backward", kappa, A, py[, I],PACKAGE = "HMMcopy")
      rho[, I] <- output$rho
      loglik[i] <- loglik[i] + output$loglik
      Zcounts <- Zcounts + t(colSums(aperm(output$xi, c(3, 2, 1))))
    }

    # M-step
    ############################################################################
    # Update the noise hyperparams
    if (verbose) { message("Maximization") }
    mu_i <- mus[, i - 1]
    lambda_i <- lambdas[, i - 1]
    output <- estimateTNoiseParamsMap(copy[autosomes], mu_i, lambda_i, param$nu,
      rho[, autosomes], param$eta, param$m, param$gamma, param$S, param$kappa)
    mus[, i] <- output$mu_N
    lambdas[, i] <- output$lambda_N
    kappa <- output$pi

    # Recalculate the likelihood
    for (k in 1:K) {
      py[k, ] <- tdistPDF(copy, mus[k, i], lambdas[k, i], param$nu[k])
    }

    # Update transition matrix A
    priorA <- 0
    for (k in 1:K) {
      A[k, ] <- Zcounts[k, ] + dirPrior[k, ]
      A[k, ] <- normalize(A[k, ])
      priorA <- priorA + log(dirichletpdf(A_prior[k, ], A[k, ]))
    }

    # Compute log likelihood and check convergence
    priorMu <- c()
    for(k in 1:K) {
      priorMu[k] <- log(dnorm(mus[k, i], param$mu[k], 1))
    }
    loglik[i] <- loglik[i] + priorA + sum(priorMu)
    if (abs(loglik[i] - loglik[i - 1]) < 1e-1 || loglik[i] < loglik[i - 1]) {
      converged = 1
    }
  }

  if (converged) {
    i = i - 1
	 # Perform one last round of E-step to get latest responsibilities
     #E-step
    #%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
	 if (verbose) { message("Re-calculating latest responsibilties for output") }   
    for (j in 1:length(chrsI)) {
      I <- chrsI[[j]]
      output <- .Call("forward_backward", kappa, A, py[, I],
        PACKAGE = "HMMcopy")
      rho[, I] <- output$rho
    }

  }

  if (verbose) {
    message("Optimal parameters found, segmenting and classifying")
  }

  segs <- vector('list', length(chrs))
  for(c in 1:length(chrsI)) {
    I <- chrsI[[c]]
    output <- .Call("viterbi", log(kappa), log(A), log(py[, I]),PACKAGE = "HMMcopy")
    Z[I] <- output$path
    segs[[c]] <- output$seg
  }

  mus = mus[, 1:i]
  lambdas = lambdas[, 1:i]
  loglik = loglik[1:i]

  output <- vector('list', 0);
  output$state <- Z
  output$segs <- segs
  output$mus <- mus
  output$lambdas <- lambdas
  output$pi <- kappa
  output$loglik <- loglik
  output$rho <- rho

  return(output)
}

# Normalize a given array to sum to 1
normalize <- function(A) {
  M <- A / (sum(A) + (sum(A) == 0))
  return(M);
}

# Dirichlet probability density function, returns the probability of vector
# x under the Dirichlet distribution with parameter vector alpha
# Author: David Ross http://www.cs.toronto.edu/~dross/code/dirichletpdf.m
dirichletpdf <- function(x, alpha) {
  if (any(x < 0)) {
    return(0);
  }
  if (abs(sum(x) - 1) > 1e-3) {
    stop("Dirichlet PDF: probabilities do not sum to 1")
    return(0);
  }
  p <- exp(lgamma(sum(alpha)) - sum(lgamma(alpha))) * prod(x ^ (alpha - 1))
  return(p)
}

tdistPDF <- function(x, mu, lambda, nu) {
  p <- (gamma(nu / 2 + 0.5)/gamma(nu / 2)) * ((lambda / (pi * nu)) ^ (0.5)) *
    (1 + (lambda * (x - mu) ^ 2) / nu) ^ (-0.5 * nu - 0.5)
  p[is.na(p)] <- 1
  return(p)
}

estimateTNoiseParamsMap <- function(
    y, mu, lambda, nu, rho, eta, m, gamma, S, kappa) {

  yr <- t(matrix(y, length(y), length(mu)))        # Vectorize parameter
  u <- (1 + nu) / (((yr - mu) ^ 2) * lambda + nu); # scale parameter

  # Calculate the mean
  mu_N = (rowSums(rho * u * yr, na.rm = TRUE) + (eta * m)) /
    (rowSums(rho * u, na.rm = TRUE) + eta)

  # Calculate the precision
  lambda_N = (rowSums(rho, na.rm = TRUE) + gamma + 1) /
    (rowSums(rho * u * ((yr - mu_N) ^ 2), na.rm = TRUE) +
    (eta * (mu_N - m) ^ 2 + S))

  # Calculate the stationary distribution
  pi <- (rowSums(rho, na.rm = TRUE) + kappa - 1) /
    (length(y) + sum(kappa, na.rm = TRUE) + length(mu))

  out <- vector('list', 0)
  out$mu_N <- mu_N
  out$lambda_N <- lambda_N
  out$pi <- pi
  return(out)
}

Any scripts or data that you put into this service are public.

HMMcopy documentation built on Nov. 8, 2020, 8:09 p.m.

rdrr.io home R language documentation Run R code online

CRAN packages Bioconductor packages R-Forge packages GitHub packages

Note that we can't provide technical support on individual packages. You should contact the package authors for that.

HMMcopy
Copy number prediction with correction for GC and mappability bias for HTS data

R/segmentation.R
In HMMcopy: Copy number prediction with correction for GC and mappability bias for HTS data

Defines functions estimateTNoiseParamsMap tdistPDF dirichletpdf normalize manualSegment processSegments HMMsegment plotParam plotSegments stateCols

Documented in HMMsegment normalize plotParam plotSegments stateCols

Try the HMMcopy package in your browser

R Package Documentation

Browse R Packages

We want your feedback!

HMMcopy Copy number prediction with correction for GC and mappability bias for HTS data

R/segmentation.R In HMMcopy: Copy number prediction with correction for GC and mappability bias for HTS data

Defines functions estimateTNoiseParamsMap tdistPDF dirichletpdf normalize manualSegment processSegments HMMsegment plotParam plotSegments stateCols

Documented in HMMsegment normalize plotParam plotSegments stateCols

Try the HMMcopy package in your browser

R Package Documentation

Browse R Packages

We want your feedback!

HMMcopy
Copy number prediction with correction for GC and mappability bias for HTS data

R/segmentation.R
In HMMcopy: Copy number prediction with correction for GC and mappability bias for HTS data