marimba2: Gibbs sampler for genotyping CNPs

### y is an nx3 matrix, where n is the number of trios
### 1st column = M, 2nd column = F, 3rd column = Offspring
### S is the number of gibbs sampler scans
### mu is the mean of the normal prior put on each class mean
### tau is the standard deviation (not variance) of the same distribution
### Put a Gamma(nu/2, nu/2 * lambda^2) on the class precisions 
### Assume prior of dirichlet(a,b,c) for the joint prior on the 
###   probabilities of class membership
### This model assumes no independence of offspring copy number state
###   to parent copy number state
#' A Gibbs Sampler for genotyping copy number in trios
#' 
#' This function allows you to obtain a posterior probability distribution for copy number genotyping on Illumina arrays for an fixed genetic model where tau1=1, tau2=0.5 and tau3=0
#' @param y Data$response where data is as generated by the cn.sim.data function.
#' @param S The number of iterations for the Gibbs Sampler (suggest default 10000).
#' @param mu Defaults to -.02.
#' @param xi Defaults to sqrt(200).
#' @param nu Defaults to 3.
#' @param lambda Defaults to 0.05.
#' @param alpha Alpha1 and alpha2 and psi refer to HWE parameters as specified in Cardin et al (alpha=w, psi=lambda).
#' @keywords marimba
#'

initialize_chains <- function(gmodel, params) {
  N <- params$N
  K <- length(gmodel$theta)
  S <- params$iter

  C <- list()
  
  for (k in 1:K){
    cn.k <- attributes(table(gmodel$cn))$dimnames[[1]][k]
    label <- as.vector(paste0("C",cn.k))
    c <- matrix(0, N, 3)
    colnames(c) <-  c("m", "f", "o")
    C[[length(C)+1]] <- c
    names(C)[k] <- label
    }
  
  theta <- matrix(0, S, K)
  sigma <- matrix(0, S, K)
  tau <- matrix(0, S, 1)
  logll <- rep(0, S)
  ##tau <- rep(0, S)
  ## each row of p corresponds to scan, each column is posterior probability of 
  ##   copy number 0, 1, and 2 respectively
  pi <- matrix(0, nrow = S, ncol = K)
  pi.child <- matrix(0, nrow = S, ncol = K)
  colnames(pi) <- names(C)
  colnames(pi.child) <- names(C)
  list(C=C,
       theta=theta,
       sigma=sigma,
       logll=logll, 
       tau=tau,
       pi=pi,
       pi.child=pi.child)
}

gMendelian <- function(tau.one, tau.two, tau.three, err=0){
  tau1 <- tau.one-err/2
  tau2 <- tau.two-err/2
  tau3 <- tau.three
  mendelian.probs <- array(dim=c(3, 3, 3))
  genotypes <- c("BB", "AB", "AA")
  dimnames(mendelian.probs) <- list(paste0("O_", genotypes),
                                    paste0("F_", genotypes),
                                    paste0("M_", genotypes))
  mendelian.probs[, 1, 1] <- c((1 - tau3)^2, 2 * tau3 * (1 - tau3), tau3^2)
  mendelian.probs[, 2, 1] <- c((1 - tau2) * (1 - tau3), tau2 * (1 - tau3) + (1 - tau2) * tau3, tau2 * tau3)
  mendelian.probs[, 3, 1] <- c((1 - tau1) * (1 - tau3), tau1 * (1 - tau3) + tau3 * (1 - tau1), tau1 * tau3)
  mendelian.probs[, 1, 2] <- c((1 - tau2) * (1 - tau3), tau2 * (1 - tau3) + (1 - tau2) * tau3, tau2 * tau3)
  mendelian.probs[, 2, 2] <- c((1 - tau2)^2, 2 * tau2 * (1 - tau2), tau2^2)
  mendelian.probs[, 3, 2] <- c((1 - tau1) * (1 - tau2), tau1 * (1 - tau2) + tau2 * (1 - tau1), tau1 * tau2)
  mendelian.probs[, 1, 3] <- c((1 - tau1) * (1 - tau3), tau1 * (1 - tau3) + tau3 * (1 - tau1), tau1 * tau3)
  mendelian.probs[, 2, 3] <- c((1 - tau1) * (1 - tau2), tau1 * (1 - tau2) + tau2 * (1 - tau1), tau1 * tau2)
  mendelian.probs[, 3, 3] <- c((1 - tau1)^2, 2 * tau1 * (1 - tau1), tau1^2)
  mendelian.probs
}

kmean_clusters <- function(dat, params){
  K <- params$K
  y.mf <- dat[, 1:2]
  y.mfdf <- melt(y.mf)
  y.mfresponse <- y.mfdf$value
  kmeans.result <- kmeans(y.mfresponse, K)
  kmeans.center <- kmeans.result$centers
  kmeans.order <- order(kmeans.center)
  kmeans.clust <- kmeans.result$cluster
  kmeans.clust <- sapply(kmeans.clust, function(x) which(kmeans.order==x))
  y.mfdf$cn <- kmeans.clust - 1
  y.mfdf
}

kmean_clusters.add <- function(kmean.df){
  # use y.mfdf as input from kmean_clusters
  kmean.clust<-kmean.df$cn
  kmean.df$cn<-kmean.clust+1
  kmean.df
}

p_offspring <- function(cn, mendelian.probs, theta){
  t(apply(cn, 1, lookup_prob, mendelian.probs=mendelian.probs, theta=theta))
}

lookup_prob <- function(cn, mendelian.probs, theta) {
  att <- names(theta)[1]
  
  if((as.numeric(att))==0) {
    mendelian.probs[, cn[[2]] + 1, cn[[1]] + 1]
  } else if((as.numeric(att))==1) {
    mendelian.probs[, cn[[2]], cn[[1]]]
  } else {
    mendelian.probs[, cn[[2]] - 1, cn[[1]] - 1]
  }
}

cnProb <- function(p, y, theta, sigma){
  K <- length(theta)
  N <- length(y)
  ##
  ## Repeat each y K times so that each y is evaluated for the K different values of theta and sigma
  ##
  y.tmp <- rep(y, each=K)
  d.y <- matrix(dnorm(y.tmp, mean=theta, sd=sigma), N, K, byrow=TRUE)
  if(nrow(p) != nrow(d.y)) stop()
  cn.prob <- p * d.y
  cn.denom <- rowSums(cn.prob, na.rm=TRUE)
  if(any(cn.denom==0)){
    ##
    ## we simulated parental states that are incompatible with mendelian transmission
    ##
    any_incomp <- TRUE
    i <- which(cn.denom==0)
    cn.denom[i] <- 1
  } else any_incomp <- FALSE
  cn.prob <- cn.prob/cn.denom
  if(any_incomp){
    ## let data drive (?)
    cn.prob[i, ] <- d.y[i, ]/(rowSums(d.y[i, , drop=FALSE]))
  }
  colnames(cn.prob) <- NULL
  cn.prob <- as.matrix(cn.prob)
  cn.prob
}

cnProb2 <- function(p, y, theta, sigma){
  N <- length(y)
  K <- length(theta)
  p <- matrix(p, N, K, byrow=TRUE)
  cnProb(p, y, theta, sigma)
}

multi_K <- function(dat, params){
  K <- 3
  params$K <- K
  y.mf.k3 <- initialize_gmodel_setup(dat, params)
  
  # k2a has cn components 0,1
  # k2b has cn components 1,2
  K <- 2
  params$K <- K
  y.mf.k2a <- initialize_gmodel_setup(dat, params)
  y.mf.k2b <- initialize_gmodel_setup.alt(dat, params)
  
  #k1a has cn component 1
  #k1b has cn component 2
  K <- 1
  params$K <- K
  y.mf.k1a <- initialize_gmodel_setup.alt(dat, params)
  y.mf.k1b <- initialize_gmodel_setup.alt2(dat, params)
  return(list(y.mf.k3=y.mf.k3,
        y.mf.k2a=y.mf.k2a,
        y.mf.k2b=y.mf.k2b,
        y.mf.k1a=y.mf.k1a,
        y.mf.k1b=y.mf.k1b))
}

initialize_gmodel_setup <- function(dat, params){
  K <- params$K
  N <- params$N
  y.mf <- kmean_clusters(dat[, c("m", "f")], params)
}

initialize_gmodel_setup.alt <- function(dat, params){
  K <- params$K
  N <- params$N
  y.mf <- kmean_clusters(dat[, c("m", "f")], params)
  y.mf <- kmean_clusters.add(y.mf)
}

initialize_gmodel_setup.alt2 <- function(dat, params){
  K <- params$K
  N <- params$N
  y.mf <- kmean_clusters(dat[, c("m", "f")], params)
  y.mf <- kmean_clusters.add(y.mf)
  y.mf <- kmean_clusters.add(y.mf)
}

dim.reduct <- function(offspring.probs, comp){
  if (comp==2){
    offspring.probs <- offspring.probs[,1:2]
  } else if (comp==3){
    offspring.probs <- offspring.probs[,2:3]
  } else if (comp==4) {
    offspring.probs <- as.matrix(offspring.probs[,2])
  } else if (comp==5) {
    offspring.probs <- as.matrix(offspring.probs[,3])
  } else offspring.probs <- offspring.probs
  return (offspring.probs)
}

init.offspring.cn <- function(cn.prob, cn.mf, comp, y.odf){
  if (comp < 3){
    cn.o <- rMultinom(cn.prob, 1)[, 1]
    y.odf$cn <- cn.o - 1
  } else {
    if (comp==3){
      cn.o <- rMultinom(cn.prob, 1)[, 1]
      y.odf$cn <- cn.o
    } else {
      cn.o <- cn.mf$m
      y.odf$cn <- cn.o
    }
  }
  return (y.odf)
}

compute_loglik <- function(gmodel){
  y <- as.numeric(gmodel$y)
  att <- attributes(gmodel$theta)
  if(as.numeric(att$names[1])==0) {
    z <- gmodel$cn + 1
  } else if (as.numeric(att$names[1])==1) {
    z <- gmodel$cn
  } else {
    z <- gmodel$cn - 1
  }
  theta <- gmodel$theta[z]
  sigma <- gmodel$sigma[z]
  sum(dnorm(y, theta, sigma, log=TRUE))
}

componentStats <- function(y, cn){
  yy <- as.numeric(y)
  cn2 <- as.numeric(cn)
  ylist <- split(yy, cn2)
  means <- sapply(ylist, mean, na.rm=TRUE)
  sds <- sapply(ylist, sd, na.rm=TRUE)

  ## record just the parents for the mix probs
  y.p <- as.numeric(y[, 1:2])
  cn.p <- as.numeric(cn[, 1:2])
  y.p.list <- split(y.p, cn.p)
  ns <- sapply(y.p.list, length)
  list(means=means, Ns=ns, sds=sds)
}

update_cn <- function(gmodel, params){
  N <- params$N
  pi <- gmodel$pi
  gmodel <- update_mendel(gmodel)
  mendel.prob <- gmodel$m.probs
  y <- gmodel$y
  sigma <- gmodel$sigma
  theta <- gmodel$theta
  K <- length(theta)
  ## parents
  cn.prob.m <- cnProb2(pi, y[, "m"], theta, sigma)
  cn.prob.f <- cnProb2(pi, y[, "f"], theta, sigma)
  cn.mf <- update_cn.parents(cn.prob.m, cn.prob.f, gmodel, params)
  # cn.mf <- replaceIfMissing2(cn.mf)
  
  ##
  ## offspring
  ##
  cn.o <- update_cn.child(gmodel, params, cn.mf, mendel.prob)
  cn <- cbind(cn.mf, cn.o)
  colnames(cn) <- c("m", "f", "o")
  cn
}

update_cn.parents <- function(probs.m, probs.f, gmodel, params){
  N <- params$N
  att <- attributes(gmodel$theta)
  cn.prob.m <- probs.m
  cn.prob.f <- probs.f
  cn.mf <- cbind(rMultinom(cn.prob.m, 1)[, 1],
                 rMultinom(cn.prob.f, 1)[, 1])
  cn.mf <- if((att$names[1])==1) cn.mf else cn.mf-1
  cn.mf <- if((att$names[1])==2) cn.mf+1 else cn.mf
  cn.mf
}

update_cn.child <- function (gmodel, params, cn.parents, probs) {
  N <- params$N
  y <- gmodel$y
  theta <- gmodel$theta
  sigma <- gmodel$sigma
  # note for future, if mendel.prob only one prob - nulll dimension, may throw error here.
    p <- p_offspring(cn.parents, probs, theta)
    p.star <- cnProb(p, y[, "o"], theta, sigma)
    if(FALSE){
      delta <- abs(p - p.star)
      index <- which(rowSums(delta > 0.5) > 0)
      p[index, ]
      p.star[index, ]
    }
    cn.o <- update_cn.child.cno(gmodel, p.star)
    cn.o
}

update_cn.child.cno <- function (gmodel, probs) {
  p.star <- probs
  cn.o <- tryCatch(rMultinom(p.star, 1)[, 1] , error=function(e) NULL)
  if(is.null(cn.o)) browser()
  att<-attributes(gmodel$theta)
  cn.o <- if((att$names[1])==1) cn.o else cn.o-1
  cn.o <- if((att$names[1])==2) cn.o+1 else cn.o
  cn.o
}

missingCnState <- function(cn.states, expected=as.character(c(0, 1, 2))){
  observed.cnstates <- names(table(cn.states))
  not.observed <- expected[ !expected %in% observed.cnstates ]
  not.two <- names(table(cn.states)[table(cn.states)==1])
  not.observed <- c(not.observed, not.two)
  not.observed <- unique(not.observed)
}

replaceIfMissing <- function(cn.states, expected=as.character(c(0, 1, 2))){
  missing.states <- missingCnState(cn.states)
  if(length(missing.states) > 0){
    cn.states[sample(seq_along(cn.states), 4)] <- as.numeric(missing.states)
  }
  cn.states
}

replaceIfMissing2 <- function(cn.states, expected=as.character(c(0, 1, 2))){
  cn.states <- as.matrix(cn.states)
  missing.states <- missingCnState(cn.states)
  if(length(missing.states) > 0){
    cn.states[sample(seq_along(cn.states), 2)] <- as.numeric(missing.states)
  }
  cn.states
}

balance.cn.all <- function(gmodel, params){
  params <- params
  N <- params$N
  cn.states <- gmodel$cn

  result <- replaceIfMissing(cn.states)
  result
}

update_theta <- function(ymns, sigma, ns, params){
  mu.0 <- params$mu
  xi <- params$xi ## prior variance
  ## let phi denote inverse variance
  data.precision <- ns * 1/sigma^2
  prior.precision <- 1/xi
  post.prec <- prior.precision + data.precision
  mu.n <- (mu.0 * prior.precision + ymns * data.precision) / post.prec
  K <- length(ymns)
  rnorm(K, mu.n, sqrt(1/post.prec))
}

ybar <- function(gmodel){
  yy <- as.numeric(gmodel$y)
  cn <- factor(as.numeric(gmodel$cn))
  mns <- tapply(yy, cn, mean, na.rm=TRUE)
  if(any(is.na(mns))) stop("NAs in variances")
  mns
}

yvar <- function(gmodel){
  yy <- as.numeric(gmodel$y)
  cn <- factor(as.numeric(gmodel$cn))
  vars <- tapply(yy, cn, var, na.rm=TRUE)
  if(any(is.na(vars))) stop("NAs in variances")
  vars
}

n <- function(gmodel){
  cn <- as.numeric(table(as.numeric(gmodel$cn)))
  cn
}

n_parents <- function(gmodel){
  yy <- as.numeric(gmodel$y[, c("m", "f")])
  cn <- as.numeric(gmodel$cn[, 1:2])
  tab <- tapply(yy, cn, length)
  freq <- setNames(rep(0L, length(gmodel$theta)),
                   attributes(tab)$dimnames[[1]])
  freq[names(tab)] <- tab
  freq
}

n_child <- function(gmodel){
  yy <- as.numeric(gmodel$y[, c("o")])
  cn <- as.numeric(gmodel$cn[, 3])
  tab <- tapply(yy, cn, length)
  freq <- setNames(rep(0L, length(gmodel$theta)),
                   attributes(tab)$dimnames[[1]])
  freq[names(tab)] <- tab
  freq
}

nu <- function(params) params$nu

## mean: shape/rate
## var:  shape/rate^2
update_sigma <- function(params, ns, theta, ymns, yvars){
  nu.0 <- params$nu
  s2.0 <- params$sigma2.0
  nu.n <- nu.0 + ns
  s2.n <- 1/nu.n * (nu.0*s2.0 + (ns-1)*yvars + ns*(ymns-theta)^2)
  K <- length(ns)
  prec <- rgamma(K, nu.n/2, nu.n*s2.n/2)
  sqrt(1/prec)
}

# key module for different models

gmodel_oneiter <- function(gmodel, params){
  N <- params$N
  y <- gmodel$y
  pi <- gmodel$pi
  pi.child <- gmodel$pi.child
  theta <- gmodel$theta
  sigma <- gmodel$sigma
  K <- length(theta)
  ##
  ## update component labels
  ##
  if(TRUE){
    cn <- update_cn(gmodel, params)
  } else cn <- gmodel$cn
  gmodel$cn <- cn
  if(TRUE){
    gmodel$cn <- balance.cn.all(gmodel, params)
  }
  ##
  ## update theta
  ##
  ymeans <- ybar(gmodel)
  yvars <- yvar(gmodel)
  ns <- n(gmodel)
  if(TRUE){
    theta <- update_theta(params=params,
                          ymns=ymeans,
                          sigma=sigma,
                          ns=ns)
  } else theta <- gmodel$theta
  gmodel$theta[] <- theta
  ##
  ## update sigma
  ##
  if(TRUE){
    sigma <- update_sigma(params=params,
                          ns=ns,
                          theta=theta,
                          ymns=ymeans,
                          yvars=yvars)
  } else sigma <- gmodel$sigma
  gmodel$sigma[] <- sigma

  ## update mixture probabilities
  ## -- only for parents
  if(TRUE){
    pi <- update_pi(gmodel, params)
  } else pi <- gmodel$pi
  gmodel$pi[] <- pi
  
  if(TRUE){
    pi.child <- update_pi.child(gmodel, params)
  } else pi.child <- gmodel$pi.child
  gmodel$pi.child[] <- pi.child
  
  # update transmission matrix
  if(TRUE){
    gmodel <- updateTransmissionProb(gmodel, params)
  }
  # update log likelihood
  gmodel$logll <- compute_loglik(gmodel)
  gmodel
}

update_pi <- function(gmodel, params){
  a <- params$a
  ns <- n_parents(gmodel)
  d <- length(ns)
  a <- a[1:d]
  alpha.n <- ns + a
  p <- tryCatch(rdirichlet(1, alpha.n), error=function(e) NULL)
  if(is.null(p)) browser()
  p
}

update_pi.child <- function(gmodel, params){
  a <- params$a
  ns <- n_child(gmodel)
  d <- length(ns)
  a <- a[1:d]
  alpha.n <- ns + a
  p <- tryCatch(rdirichlet(1, alpha.n), error=function(e) NULL)
  if(is.null(p)) browser()
  p
}

update_tau <- function(gmodel, params){
  a <- params$a
  N <- params$N
  ns <- n_parents(gmodel)
  d <- length(ns) - 1
  bvec0 <- a[1:d]
  nvec <- as.numeric(n_child(gmodel))
  allele.freq <- DirichSampHWE(nvec,bvec0, 1)
  pvec <- allele.freq$pvec[2]
}

update_tau.env <- function(gmodel, params){
  a <- params$a
  N <- params$N
  ns <- n_parents(gmodel)
  d <- length(ns) - 1
  bvec0 <- a[1:d]
  nvec <- as.numeric(n_child(gmodel))
  # the line below will throw an error if balance.cn.all is not working properly to balance 3 components
  # bvec0 should always be c(1,1) and nvec a length 3 vector for 3 component model
  allele.freq <- DirichSampHWE(nvec,bvec0, 1)
  # to generalise to multi-components, the line below will need to be revised.
  pvec <- allele.freq$pvec[2]
  denom <- 2 * ns[[3]] + ns[[2]]
  numer <- pvec * 2* N
  tau.env <- numer / denom
  gmodel$tau[] <- tau.env
  tau <- gmodel$tau
  tau
}

update_tau.intermed <- function(gmodel, params){
  # see notes in update_tau.env
  a <- params$a
  N <- params$N
  ns <- n_parents(gmodel)
  d <- length(ns) - 1
  bvec0 <- a[1:d]
  nvec <- as.numeric(n_child(gmodel))
  allele.freq <- DirichSampHWE(nvec,bvec0, 1)
  pvec <- allele.freq$pvec[2]
  denom <- ns[[2]]
  # numerator here is just for heterozygotes in kids so calculated accordingly
  numer <- pvec * 2 * N * (nvec[2] / (nvec[2] + 2 * nvec[3]))
  tau.intermed <- numer / denom
  gmodel$tau[] <- tau.intermed
  tau <- gmodel$tau
  tau
}

update_mendel <- function(gmodel){
  m.probs <- gmodel$m.probs
  att <- attributes(gmodel$theta)
  dim <- as.numeric(att$names) + 1
  dim <- sort(dim)
  #dim.l <- length(dim)
  dim.max <- max(dim)
  dim.min <- min(dim)
  subset <- c(dim.min:dim.max)
  m.probs <- m.probs[subset, subset, subset]
  gmodel$m.probs <- m.probs
  gmodel
 }

# this module updates both the Mendelian transmission matrix as well as the taus where relecant
updateTransmissionProb <- function(gmodel, params){

  #ncp <- params$ncp

if(params$model == "Genetic") {
  m.probs <- gMendelian(tau.one<-params$tau.one, tau.two<-params$tau.two, tau.three<-params$tau.three, err<-params$error)
  gmodel$m.probs <- m.probs
  gmodel
} else if(params$model == "Environmental") {
tau <- update_tau.env(gmodel, params)
gmodel$tau <- tau
m.probs <- gMendelian(tau.one<-gmodel$tau, tau.two<-gmodel$tau, tau.three<-gmodel$tau, err<-params$error)
gmodel$m.probs <- m.probs
gmodel
} else {
  tau <- update_tau.intermed(gmodel, params)
  gmodel$tau <- tau
  m.probs <- gMendelian(tau.one<-params$tau.one, tau.two<-gmodel$tau, tau.three<-params$tau.three, err<-params$error)
  gmodel$m.probs <- m.probs
  gmodel
}
  return (gmodel)
  }

# move_chains <- function(chains, gmodel, i){
  
#generalised chain movement
 #K <- length(gmodel$theta)
  
  #chains$C$C0 <- chains$C$C0 + 1L * (gmodel$cn == 0)
  #chains$C$C1 <- chains$C$C1 + 1L * (gmodel$cn == 1)
  #chains$C$C2 <- chains$C$C2 + 1L * (gmodel$cn == 2)
  ##
  
  ##
  #chains$theta[i, ] <- gmodel$theta
  #chains$sigma[i, ] <- gmodel$sigma
  #chains$logll[i] <- compute_loglik(gmodel)
  #chains$pi[i, ] <- gmodel$pi
  #chains$tau[i] <- gmodel$tau
  #chains
#}

move_chains <- function(chains, gmodel, i){
  ##generalised chain movement
  K <- length(gmodel$theta)
  for (k in 1:K){
    cn.k <- attributes(table(gmodel$cn))$dimnames[[1]][k]
    cn.k <- as.numeric(cn.k)
    chains$C[[k]] <- chains$C[[k]] + 1L * (gmodel$cn == cn.k)
  }
  ##
  chains$theta[i, ] <- gmodel$theta
  chains$sigma[i, ] <- gmodel$sigma
  chains$logll[i] <- compute_loglik(gmodel)
  chains$pi[i, ] <- gmodel$pi
  chains$pi.child[i,] <- gmodel$pi.child
  chains$tau[i] <- gmodel$tau
  chains
}

.gibbs_burnin <- function(S, gmod, params){
  for(i in seq_len(S)){
    gmod <- gmodel_oneiter(gmod, params)
  }
  gmod
}

##############################
###the customised functions
#############################

###########################
###original working well###
###########################
# wrapper <- _genetic <- mcmc <- oneiter (make changes to oneiter)

gmodel_mcmc <- function(gmod, params, chains){
  S <- params$iter
  Th <- params$thin
  for(i in 2:S){
    for(j in seq_len(Th)){
      gmod <- gmodel_oneiter(gmod, params)
    }
    chains <- move_chains(chains, gmod, i)
  }
  list(gmodel=gmod, chains=chains)
}


# model comparison metrics
# calculation of BIC and DIC

model.comparison <- function (fit.model) {
  BIC <- calc.BIC(fit.model)
  DIC <- calc.DIC(fit.model)
  list = c(model.BIC = BIC,
           model.DIC = DIC)
}

calc.BIC <- function (fit.model) {
  n.sample <- length(fit.model$gmodel$y)
  npar <- length(fit.model$chains)
  logll <- fit.model$chains$logll
  bic.calc <- log(n.sample)*npar - 2*logll
  bic.calc <- mean(bic.calc)
  bic.calc
}

calc.DIC <- function (fit.model) {
  S <- nrow(fit.model$chains$theta)
  logll <- fit.model$chains$logll
  logll.sum <- sum(fit.model$chains$logll)
  npar <- 2*(logll - (1/S * logll.sum))
  dic.calc <- -2*(logll-npar)
  dic.calc <- mean(dic.calc)
  dic.calc
}

# output results summary
gibbs.results <- function(sim.truth, gibbs.cnv){
  N <- nrow(gibbs.cnv$trio.cn)
  
  # compare the truth set with call set
  truth <- sim.truth["cn"]
  truth <- truth$cn
  map.cn <- gibbs.cnv$trio.cn
  pr.cn <- gibbs.cnv$trio.cn.probs
  contig <- table(map.cn, truth)
  index <- which(rowSums(map.cn != truth) > 0)
  mistake.true.cn.trio <- truth[index, ]
  mistake.cn.called.trio <- map.cn[index, ]
  mistake.cn.probs.trio <- pr.cn[index, ]
  index.2 <- map.cn != truth
  mistake.type <- index.2 * 1
  mistake.type <- data.frame(mistake.type)
  mistake.type$num <- mistake.type$m * 1 + mistake.type$f * 10 + mistake.type$o * 100
  mistake.typed <- table(mistake.type$num)
  mistake.cn.true <- truth[index.2]
  mistake.y.true <- sim.truth$y[index.2]
  mistake.cn.called <- map.cn[index.2]
  mistake.spec <- cbind(mistake.cn.true, mistake.cn.called, mistake.y.true)
  colnames(mistake.spec) <- c("CN.true", "CN.estimated", "LRR.true")
  
  # calculate $ misclassification splot into parents and children
  map.cn.parents <- map.cn[,1:2]
  map.cn.offspring <- map.cn[,3]
  truth.parents <- truth[,1:2]
  truth.offspring <- truth[,3]
  contig.par <- table(map.cn.parents, truth.parents)
  contig.off <- table(map.cn.offspring, truth.offspring)
  parent.misclass.perc <- 1 - ((sum(map.cn.parents==truth.parents))/1000)
  parent.misclass.perc.cn0 <- (contig.par[2]+contig.par[3])/sum(contig.par[,1])
  parent.misclass.perc.cn1 <- (contig.par[4]+contig.par[6])/sum(contig.par[,2])
  parent.misclass.perc.cn2 <- (contig.par[7]+contig.par[8])/sum(contig.par[,3])
  parent.misclass.perc.cn <- c(parent.misclass.perc.cn0, parent.misclass.perc.cn1, parent.misclass.perc.cn2)
  offspring.misclass.perc <- 1 - ((sum(map.cn.offspring==truth.offspring))/500)
  child.misclass.perc.cn0 <- (contig.off[2]+contig.off[3])/sum(contig.off[,1])
  child.misclass.perc.cn1 <- (contig.off[4]+contig.off[6])/sum(contig.off[,2])
  child.misclass.perc.cn2 <- (contig.off[7]+contig.off[8])/sum(contig.off[,3])
  child.misclass.perc.cn <- c(child.misclass.perc.cn0, child.misclass.perc.cn1, child.misclass.perc.cn2)
  
  # effective size
  p <- gibbs.cnv$post.pi
  eff.size.pi <- effectiveSize(p)
  p.mns.parents <- colMeans(p)
  p <- gibbs.cnv$post.pi.child
  eff.size.pi.child <- effectiveSize(p)
  p.mns.child <- colMeans(p)
  p <- gibbs.cnv$post.thetas
  eff.size.thetas <- effectiveSize(p)
  p <- gibbs.cnv$post.sigmas
  eff.size.sigmas <- effectiveSize(p)
  
  # compare posterior probabilities for different components
  truth.p <- sim.truth$p
  empirical.p.overall <- table(truth)/(N*3)
  empirical.p.parents <- table(truth[,1:2]) / (N*2)
  empirical.p.child <- table(truth[,3]) / N
  attributes(p.mns.parents)$names <- c("0","1","2")
  attributes(p.mns.child)$names <- c("0","1","2")
  
  # calculate empirical and estimated thetas and sigmas
  empirical.thetas <- ybar(sim.truth)
  empirical.sigmas <- yvar(sim.truth)
  estimated.thetas <- apply(gibbs.cnv$post.thetas, 2, mean)
  estimated.sigmas <- apply(gibbs.cnv$post.sigmas, 2, mean)
  
  # taus
  posterior.tau <- median(gibbs.cnv$taus)
  
  # taus from truth set
  parents.count <- n_parents(sim.truth)
  child.count <- n_child(sim.truth)
  
  numer.env <- child.count[[2]] + 2 * child.count[[3]]
  denom.env <- parents.count[[2]] + 2 * parents.count[[3]]
  tau.env <- numer.env / denom.env
  
  numer.intermed <- child.count[[2]]
  denom.intermed <- parents.count[[2]]
  tau.intermed <- numer.intermed / denom.intermed
  
  # truth table
  truth.tb <- smartbind(truth.p, empirical.p.parents, empirical.p.child, p.mns.parents, p.mns.child)
  truth.tb[is.na(truth.tb)] <- 0
  row.names(truth.tb) <- c("truth", "empirical.parents", "empirical.offspring", "estimated.parents", "estimated.offspring")
  
  truth.tb.2 <- data.frame(cn=c("C0", "C1", "C2"),
                        truth=as.numeric(truth.p),
                        empirical=as.numeric(empirical.p.overall),
                        empirical.parents=as.numeric(empirical.p.parents),
                        empirical.offspring=as.numeric(empirical.p.child),
                        empirical.thetas=as.numeric(empirical.thetas),
                        empirical.sigmas=as.numeric((empirical.sigmas)^0.5),
                        estimated.parents = as.numeric(p.mns.parents),
                        estimated.offspring = as.numeric(p.mns.child),
                        estimated.thetas = as.numeric(estimated.thetas),
                        estimated.sigmas = as.numeric(estimated.sigmas))
  
  return(list(truth.table = contig,
              parent.misclassification.perc = parent.misclass.perc,
              child.misclassification.perc = offspring.misclass.perc,
              parent.misclassification.by.cn = parent.misclass.perc.cn,
              child.misclassification.by.cn = child.misclass.perc.cn,
              missed.true.cn.trio = mistake.true.cn.trio,
              missed.called.cn.trio = mistake.cn.called.trio,
              missed.called.cn.probs.trio = mistake.cn.probs.trio,
              mistake.typed = mistake.typed,
              missed.call.values = mistake.spec,
              taus = posterior.tau,
              true.tau.env = tau.env,
              true.tau.intermed = tau.intermed,
              effective.size.pi = eff.size.pi,
              effective.size.pi.child = eff.size.pi.child,
              effective.size.thetas = eff.size.thetas,
              effective.size.sigmas = eff.size.sigmas,
              mixture.prob.tb1 = truth.tb,
              mixture.prob.tb2 = truth.tb.2))
}

## are the mixing probs are in agreement with the empirical values estimated from
## the simulated data?
gg_prob.comparison <- function (gibbs.cnv, gibbs.results) {
  p.df <- melt(gibbs.cnv$post.pi)
  colnames(p.df)[2] <- "cn"
  ggplot(p.df, aes(Var1, value)) +
    geom_line(color="gray") +
    geom_hline(data=gibbs.results$mixture.prob.tb2,
               mapping=aes(yintercept=empirical.parents)) +
    facet_wrap(~cn)
}

gg_prob.child.comparison <- function (gibbs.cnv, gibbs.results) {
  p.df <- melt(gibbs.cnv$post.pi.child)
  colnames(p.df)[2] <- "cn"
  ggplot(p.df, aes(Var1, value)) +
    geom_line(color="gray") +
    geom_hline(data=gibbs.results$mixture.prob.tb2,
               mapping=aes(yintercept=empirical.offspring)) +
    facet_wrap(~cn)
}

gg_theta.comparison <- function (gibbs.cnv) {
  p.df <- melt(gibbs.cnv$post.thetas)
  colnames(p.df)[2] <- "cn"
  ggplot(p.df, aes(Var1, value)) +
    geom_line(color="gray") +
    facet_wrap(~cn)
}

gg_sigma.comparison <- function (gibbs.cnv) {
  p.df <- melt(gibbs.cnv$post.sigmas)
  colnames(p.df)[2] <- "cn"
  ggplot(p.df, aes(Var1, value)) +
    geom_line(color="gray") +
    facet_wrap(~cn)
}

gg_inten.comparison1 <- function (sim.truth, gibbs.cnv) {
  
  p <- cbind(as.numeric(gibbs.cnv$post.cn0), as.numeric(gibbs.cnv$post.cn1), as.numeric(gibbs.cnv$post.cn2))
  cn.model <- apply(p, 1, which.max) - 1
  cn.trio <- matrix(cn.model, nrow(gibbs.cnv$trio.cn), ncol(gibbs.cnv$trio.cn))
  colnames(cn.trio) <- c("m", "f", "o")
  df <- data.frame(y=as.numeric(sim.truth$y),
                   cn=factor(cn.model),
                   truth=factor(as.numeric(sim.truth$cn)),
                   concordant=cn.model==as.numeric(sim.truth$cn))
  
  ggplot(df, aes(cn, y)) +
    geom_jitter(width=0.1,
                aes(color=concordant)) +
    geom_boxplot(alpha=0.2, outlier.fill="transparent",
                 outlier.color="transparent")
}

gg_inten.comparison2 <- function (sim.truth, gibbs.cnv) {
  
  p <- cbind(as.numeric(gibbs.cnv$post.cn0), as.numeric(gibbs.cnv$post.cn1), as.numeric(gibbs.cnv$post.cn2))
  cn.model <- apply(p, 1, which.max) - 1
  cn.trio <- matrix(cn.model, nrow(gibbs.cnv$trio.cn), ncol(gibbs.cnv$trio.cn))
  colnames(cn.trio) <- c("m", "f", "o")
  df <- data.frame(y=as.numeric(sim.truth$y),
                   cn=factor(cn.model),
                   truth=factor(as.numeric(sim.truth$cn)),
                   concordant=cn.model==as.numeric(sim.truth$cn))
  
  ggplot(df, aes(truth, y)) + 
                        geom_jitter(width=0.1,
                                    aes(color=concordant)) +
    geom_boxplot(alpha=0.2, outlier.fill="transparent",
                 outlier.color="transparent")
}

## We will use this function to simulate data
## We will simulate using assumed distributions, which is definitely not the 
##   actual case
## Output: a list of two  n x 3 matrices, where each row is a trio, 
##   1st column = M, 2nd column = F, 3rd column = O
##   1st list is of fluorescence values (what we would observe)
##   2nd list is of true CN states (for evaluation)
## Inputs:
##   n -- the number of trios
##   p -- a vector of length three giving the probabilities of having
##        CN 0, 1, 2 respectively (for mother and father)
##   theta -- a vector of length three giving the distribution means for
##        CN 0, 1, 2 respectively 
##   sigma -- a vector of length three giving the distribution SDs for
##        CN 0, 1, 2 respectively 

simulate_data <- function(params, n, mendelian.probs){
  p <- params[, "p"]
  theta <- params[, "theta"]
  sigma <- params[, "sigma"]
  c_m <- sample(1:3, size = n, replace = TRUE, prob = p)
  c_f <- sample(1:3, size = n, replace = TRUE, prob = p)
  c_o <- rep(NA, length = n)
  for(i in 1:n){
    cn_m <- c_m[i]
    cn_f <- c_f[i]
    p.offspring <- mendelian.probs[, cn_m, cn_f]
    c_o[i] <- sample(1:3, size = 1, prob = p.offspring)
  }
  y_m <- rnorm(n, mean = theta[c_m], sd = sigma[c_m])
  y_f <- rnorm(n, mean = theta[c_f], sd = sigma[c_f])
  y_o <- rnorm(n, mean = theta[c_o], sd = sigma[c_o])
  y.mat <- cbind(y_m, y_f, y_o)
  cn.mat <- cbind(c_m, c_f, c_o)
  return(list(response = y.mat, cn = cn.mat))
}

statistics_1000g <- function(region){
  path <- system.file("extdata", package="marimba")
  vcf.del <- read.table(file.path(path, "parameter.p.1000gp.v2.txt"),
                        header=TRUE, sep="\t")
  median.sd <- read.table(file.path(path, "medians.sd.txt"),
                          header=TRUE, sep="\t")
  p <- as.numeric(vcf.del[region, c(9:11)])
  theta <- as.numeric(median.sd[region, c(1:3)])
  sigma <- as.numeric(median.sd[region, c(4:6)])
  params <- cbind(p, theta, sigma)
  colnames(params) <- c("p", "theta", "sigma")
  params
}

#' Simulate data from simple deletion model - Genetic model
#'
#' @param region integer indexing the 1000G data
#' @param N number of trios to simulate
#' @param error probability of non-Mendelian event in offspring
#' @export
simulate_data2 <- function(params, N, error=0){
  ##mendelian.probs <- mendelianProb(epsilon=error)
  mendelian.probs <- gMendelian(tau.one=1, tau.two=0.5, tau.three=0, err=0)
  ##stats <- statistics_1000g(region)
  dat <- simulate_data(params, N, mendelian.probs)
  dat$cn <- dat$cn - 1
  colnames(dat$cn) <- c("m", "f", "o")
  colnames(dat$response) <- gsub("y_", "", colnames(dat$response))
  names(dat) <- c("y", "cn")
  dat$theta <- setNames(params[, "theta"], 0:2)
  dat$sigma <- setNames(params[, "sigma"], 0:2)
  dat$p <- setNames(params[, "p"], 0:2)
  dat$logll <- compute_loglik(dat)
  dat
}

# simulate from environmental model
# set taus to arbitary but the same value
simulate_data.env1 <- function(params, N, error=0){
  ##mendelian.probs <- mendelianProb(epsilon=error)
  mendelian.probs <- gMendelian(tau.one=0.1, tau.two=0.1, tau.three=0.1, err=0)
  ##stats <- statistics_1000g(region)
  dat <- simulate_data(params, N, mendelian.probs)
  dat$cn <- dat$cn - 1
  colnames(dat$cn) <- c("m", "f", "o")
  colnames(dat$response) <- gsub("y_", "", colnames(dat$response))
  names(dat) <- c("y", "cn")
  dat$theta <- setNames(params[, "theta"], 0:2)
  dat$sigma <- setNames(params[, "sigma"], 0:2)
  dat$p <- setNames(params[, "p"], 0:2)
  dat$logll <- compute_loglik(dat)
  dat
}

simulate_data.env2 <- function(params, N, error=0){
  ##mendelian.probs <- mendelianProb(epsilon=error)
  mendelian.probs <- gMendelian(tau.one=0.8, tau.two=0.8, tau.three=0.8, err=0)
  ##stats <- statistics_1000g(region)
  dat <- simulate_data(params, N, mendelian.probs)
  dat$cn <- dat$cn - 1
  colnames(dat$cn) <- c("m", "f", "o")
  colnames(dat$response) <- gsub("y_", "", colnames(dat$response))
  names(dat) <- c("y", "cn")
  dat$theta <- setNames(params[, "theta"], 0:2)
  dat$sigma <- setNames(params[, "sigma"], 0:2)
  dat$p <- setNames(params[, "p"], 0:2)
  dat$logll <- compute_loglik(dat)
  dat
}

simulate_data.int <- function(params, N, error=0){
  ##mendelian.probs <- mendelianProb(epsilon=error)
  mendelian.probs <- gMendelian(tau.one=1, tau.two=0.3, tau.three=0, err=0)
  ##stats <- statistics_1000g(region)
  dat <- simulate_data(params, N, mendelian.probs)
  dat$cn <- dat$cn - 1
  colnames(dat$cn) <- c("m", "f", "o")
  colnames(dat$response) <- gsub("y_", "", colnames(dat$response))
  names(dat) <- c("y", "cn")
  dat$theta <- setNames(params[, "theta"], 0:2)
  dat$sigma <- setNames(params[, "sigma"], 0:2)
  dat$p <- setNames(params[, "p"], 0:2)
  dat$logll <- compute_loglik(dat)
  dat
}

mendelianProb <- function(epsilon=0){
  ## Mendelian probability array for child
  ## for ease of referencing of row/ column names, let (0,0) = BB, (0,1)= AB and (1,1) = AA
  ## here BB, AB, AA represent from mother dim for array creation
  ## the probability array is labelled with .c, .f and .m representing child, father, mother respectively
  ##  BB <- matrix(c(1, 0, 0,
  ##                 0.5, 0.5, 0,
  ##                 0, 1, 0) ,nrow=3, ncol=3,
  ##               dimnames=list(c("BB","AB","AA"),c("BB","AB","AA")))
  ##  AB <- matrix(c(0.5, 0.5, 0,
  ##                 0.25, 0.5, 0.25,
  ##                 0, 0.5, 0.5) ,nrow=3, ncol=3,
  ##               dimnames=list(c("BB","AB","AA"),c("BB","AB","AA")))
  ##  AA <- matrix(c(0, 1, 0,
  ##                 0, 0.5, 0.5,
  ##                 0, 0, 1) ,nrow=3, ncol=3,
  ##               dimnames=list(c("BB","AB","AA"),c("BB","AB","AA")))
  ##  mendelian.probs <- abind(BB, AB, AA, along=3,
  ##                           new.names=list(c("BB.c","AB.c","AA.c"),
  ##                                          c("BB.f","AB.f","AA.f"),
  ##                                          c("BB.m","AB.m","AA.m")))
  .Deprecated("use gMendelian")
  mendelian.probs <- array(dim=c(3, 3, 3))
  mendelian.probs[, 1, 1] <- c(1 - epsilon, epsilon/2, epsilon/2)
  mendelian.probs[, 2, 1] <- c(.5 - epsilon/2, .5 - epsilon/2, epsilon)
  mendelian.probs[, 3, 1] <- c(epsilon/2, 1 - epsilon, epsilon/2)
  mendelian.probs[, 1, 2] <- c(.5 - epsilon/2 , .5 - epsilon/2, epsilon)
  mendelian.probs[, 2, 2] <- c(.25, .5, .25)
  mendelian.probs[, 3, 2] <- c(epsilon, .5 - epsilon/2, .5 - epsilon/2)
  mendelian.probs[, 1, 3] <- c(epsilon/2, 1 - epsilon, epsilon/2)
  mendelian.probs[, 2, 3] <- c(epsilon, .5 - epsilon/2, .5 - epsilon/2)
  mendelian.probs[, 3, 3] <- c(epsilon/2, epsilon/2, 1 - epsilon)
  mendelian.probs
}

gg_cnp <- function(dat){
  response.df <- melt(dat$y)
  cn.df <- melt(dat$cn)
  df <- data.frame(logr=response.df$value,
                   cn=cn.df$value)
  df$cn <- as.factor(df$cn)
  p <- ggplot(df, aes(logr, ..count.., fill = cn)) + 
    geom_density(alpha = .5) + xlab("LRR")
  p
}


gg_chains <- function(gibbs.cnv){
  L <- nrow(gibbs.cnv$post.thetas)
  K <- ncol(gibbs.cnv$post.thetas)
  df <- tibble(post.thetas0=gibbs.cnv$post.thetas[, 1],
               post.thetas1=gibbs.cnv$post.thetas[, 2],
               post.thetas2=gibbs.cnv$post.thetas[, 3],
               post.sigmas0=gibbs.cnv$post.sigmas[, 1],
               post.sigmas1=gibbs.cnv$post.sigmas[, 2],
               post.sigmas2=gibbs.cnv$post.sigmas[, 3],
               post.pi0=gibbs.cnv$post.pi[, 1],
               post.pi1=gibbs.cnv$post.pi[, 2],
               post.pi2=gibbs.cnv$post.pi[, 3],
               post.pi.child0=gibbs.cnv$post.pi.child[, 1],
               post.pi.child1=gibbs.cnv$post.pi.child[, 2],
               post.pi.child2=gibbs.cnv$post.pi.child[, 3],
               # loglik=gibbs.cnv$logll,
               iter=seq_len(L))
  df2 <- gather(df, key=parameter, value=monte_carlo, -iter)
  ##  df <- data.frame(post.thetas=as.numeric(gibbs.cnv$post.thetas),
  ##                   post.sigmas=as.numeric(gibbs.cnv$post.sigmas),
  ##                   post.pi=as.numeric(gibbs.cnv$post.pi),
  ##                   ##loglik=as.numeric(),
  ##                   cn=rep(c("0", "1", "2"), each=L),
  ##                   log_lik=as.numeric(gibbs.cnv$logll),
  ##                   iter=rep(seq_len(L), K))
  ##  df2 <- melt(df, id.vars=c("iter", "cn"))
  ggplot(df2, aes(iter, monte_carlo)) +
    geom_line() +
    facet_wrap(~parameter, scales="free_y")
}

# deprecate
#gg_chains <- function(chains){
 # L <- nrow(chains$theta)
  #K <- ncol(chains$theta)
  #df <- tibble(theta0=chains$theta[, 1],
   #            theta1=chains$theta[, 2],
    #           theta2=chains$theta[, 3],
     #          sigma0=chains$sigma[, 1],
      #         sigma1=chains$sigma[, 2],
       #        sigma2=chains$sigma[, 3],
        #       pi0=chains$pi[, 1],
         #      pi1=chains$pi[, 2],
          #     pi2=chains$pi[, 3],
           #    pi.child0=chains$pi.child[, 1],
            #   pi.child1=chains$pi.child[, 2],
             #  pi.child2=chains$pi.child[, 3],
              # loglik=chains$logll,
               # iter=seq_len(L))
  # df2 <- gather(df, key=parameter, value=monte_carlo, -iter)
##  df <- data.frame(theta=as.numeric(chains$theta),
##                   sigma=as.numeric(chains$sigma),
##                   pi=as.numeric(chains$pi),
##                   ##loglik=as.numeric(),
##                   cn=rep(c("0", "1", "2"), each=L),
##                   log_lik=as.numeric(chains$logll),
##                   iter=rep(seq_len(L), K))
##  df2 <- melt(df, id.vars=c("iter", "cn"))
  #ggplot(df2, aes(iter, monte_carlo)) +
   # geom_line() +
    # facet_wrap(~parameter, scales="free_y")
# }

map_cn <- function(gmodel, ch, params){
  S<-params$iter
  K <- length(gmodel$theta)
  
  pr.list<-list()
  
  for (k in 1:K){
    c <- ch[[k]]/S
    pr.list[[length(pr.list)+1]]<-c
    names(pr.list)[k]<-attributes(gmodel$pi)$names[k]
  }
  
  m <- do.call(cbind, lapply(pr.list, "[", , "m"))
  f <- do.call(cbind, lapply(pr.list, "[", , "f"))
  o <- do.call(cbind, lapply(pr.list, "[", , "o"))
  cn.f <- apply(f, 1, which.max) - 1
  cn.m <- apply(m, 1, which.max) - 1
  cn.o <- apply(o, 1, which.max) - 1
  
  probs <- cbind(rowMaxs(m),
                 rowMaxs(f),
                 rowMaxs(o))
  
  cn <- cbind(cn.m, cn.f, cn.o)
  colnames(probs) <- colnames(cn) <- c("m", "f", "o")
  list(prob=probs,
       cn=cn)
}

#' Return maximum a posterior copy number from a model
#'
#' @param model a fitted model
#' @return a vector of the maximum a posterior copy number
#' @examples
#'   params <- statistics_1000g(region=192)
#'   truth <- simulate_data2(params, N=81, error=0)
#'   gparams <- geneticParams(N=81, K=3,
#'                            iter=50,
#'                            thin=2,
#'                            burnin=0)
#'   set.seed(204)
#'   comp <- 1
#'   gmod <- initialize_gmodel(truth$y, gparams, comp)
#'   fit <- gibbs_genetic(gmod, gparams)
#'   cn <- map_cn2(fit)
map_cn2 <- function(model){
  cn.list <- model$chains$C
  iter <- nrow(model$chains$theta)
  prob0 <- (cn.list$C0/iter) %>% as.tibble %>%
    mutate(trio=paste0("trio", 1:nrow(.))) %>%
    gather(key=family_membership, value=prob0, -trio)
  prob1 <- (cn.list$C1/iter) %>% as.tibble %>%
    mutate(trio=paste0("trio", 1:nrow(.))) %>%
    gather(key=family_membership, value=prob1, -trio)
  prob2 <- (cn.list$C2/iter) %>% as.tibble %>%
    mutate(trio=paste0("trio", 1:nrow(.))) %>%
    gather(key=family_membership, value=prob2, -trio)
  left_join(prob0, prob1) %>%
    left_join(prob2) %>%
    select(c(prob0, prob1, prob2)) %>%
    max.col %>%
    (0:2)[.]
}


dnorm_quantiles <- function(y, mean, sd){
  x <- list()
  for(i in seq_along(mean)){
    x[[i]] <-   qnorm(c(0.001, 0.999), mean=mean[i], sd=sd[i])
  }
  xx <- unlist(x)
  limits <- c(min(y, min(xx)),
              max(y, max(xx)))
  seq(limits[1], limits[2], length.out=500)
}


.dnorm_poly <- function(x, p, mean, sd){
  y <- p*dnorm(x, mean=mean, sd=sd)
  yy <- c(y, rep(0, length(y)))
  xx <- c(x, rev(x))
  tmp <- data.frame(y=yy, x=xx)
}

dnorm_model <- function(qtiles, p, mean, sd){
  df.list <- list()
  for(i in seq_along(mean)){
    dat <- .dnorm_poly(qtiles, p[i], mean[i], sd[i])
    if(i == 1){
      overall <- dat$y
    } else{
      overall <- overall + dat$y
    }
    df.list[[i]] <- dat
  }
  df <- do.call(rbind, df.list)
  L <- sapply(df.list, nrow)
  df$component <- factor(rep(seq_along(mean), L))
  df.overall <- data.frame(y=overall, x=dat$x)
  df.overall$component <- "marginal"
  df <- rbind(df, df.overall)
  df$component <- factor(df$component, levels=c("marginal", seq_along(mean)))
  if(length(mean) == 1){
    df <- df[df$component != "overall", ]
    df$component <- factor(df$component)
  }
  df
}

dnorm_poly <- function(model){
  mixprob <- model$pi
  means <- model$theta
  sds <- model$sigma
  qtiles <- dnorm_quantiles(model$y, means, sds)
  df <- dnorm_model(qtiles, mixprob, means, sds)
  df
}

gg_model <- function(model, bins){
  colors <- c("#999999", "#56B4E9", "#E69F00", "#0072B2",
              "#D55E00", "#CC79A7",  "#009E73")
  df.observed <- melt(gmodel$y)
  colnames(df.observed)[3] <- "y"
  if(missing(bins))
    bins <- nrow(df.observed)/2
  dat <- dnorm_poly(model)
  component <- x <- y <- ..density.. <- NULL
  ggplot(dat, aes(x, y, group=component)) +
    geom_histogram(data=df.observed, aes(y, ..density..),
                   bins=bins,
                   inherit.aes=FALSE) +
    geom_polygon(aes(fill=component, color=component), alpha=0.4) +
    xlab("quantiles") + ylab("density") +
    scale_color_manual(values=colors) +
    scale_y_sqrt() +
    scale_fill_manual(values=colors) +
    guides(fill=guide_legend(""), color=guide_legend(""))
}

startAtTrueValues <- function(truth, gmodel){
  gmodel$theta <- truth$theta
  gmodel$sigma <- truth$sigma
  gmodel$cn <- truth$cn
  gmodel$pi <- truth$p
  gmodel
}

# function to check if the real parameter is captured in the credible interval of the posterior chains
credible.interval <- function (gibbs.model, gibbs.result, real.params) {
  
  ci.mat <- matrix(0, ncol=nrow(real.params)*(ncol(real.params)+1), nrow=1)
  mixture.ci.parents <- data.frame(HPDinterval(mcmc(gibbs.model$post.pi), 0.95))
  mixture.ci.offspring <- data.frame(HPDinterval(mcmc(gibbs.model$post.pi.child), 0.95))
  theta.ci <- data.frame(HPDinterval(mcmc(gibbs.model$post.thetas), 0.95))
  sigma.ci <- data.frame(HPDinterval(mcmc(gibbs.model$post.sigmas), 0.95))
  
    #w <- real.params[,1]
    #x <- real.params[,1]
    #y <- real.params[,2]
    #z <- real.params[,3]
    
    w <- gibbs.result$mixture.prob.tb2$empirical.parents
    x <- gibbs.result$mixture.prob.tb2$empirical.offspring
    y <- gibbs.result$mixture.prob.tb2$empirical.thetas
    z <- (gibbs.result$mixture.prob.tb2$empirical.sigmas)#^0.5

    mix.parents.hit <- mixture.ci.parents %>%
      filter(lower<=w, w<=upper)
    mix.par <- mixture.ci.parents[,1] %in% mix.parents.hit[,1]
    
    mix.offspring.hit <- mixture.ci.offspring %>%
      filter(lower<=x, x<=upper)
    mix.off <- mixture.ci.offspring[,1] %in% mix.offspring.hit[,1]
    
    theta.hit <- theta.ci %>%
      filter(lower<=y, y<=upper)
    theta <- theta.ci[,1] %in% theta.hit[,1]
    
    sigma.hit <- sigma.ci %>%
      filter(lower<=z, z<=upper)
    sigma <- sigma.ci[,1] %in% sigma.hit[,1]

    col <- nrow(real.params)  
    ci.mat[1,1:col] <- mix.par
    ci.mat[1,(col+1) : (2*col)] <- mix.off
    ci.mat[1,(2*col+1) : (3*col)] <- theta
    ci.mat[1,(3*col+1) : (4*col)] <- sigma

  return (ci.mat)
}

credible.interval.quantile <- function (gibbs.model, gibbs.result, real.params) {
  
  ci.mat <- matrix(0, ncol=nrow(real.params)*(ncol(real.params)+1), nrow=1)
  mixture.ci.parents <- apply(gibbs.model$post.pi, 2, quantile, probs = c(0.025, 0.975))
  mixture.ci.offspring <- apply(gibbs.model$post.pi.child, 2, quantile, probs = c(0.025, 0.975))
  theta.ci <- apply(gibbs.model$post.thetas, 2, quantile, probs = c(0.025, 0.975))
  sigma.ci <- apply(gibbs.model$post.sigmas, 2, quantile, probs = c(0.025, 0.975))
  
  # reshape
  mixture.ci.parents <- data.frame(t(mixture.ci.parents))
  colnames(mixture.ci.parents) <- c("lower", "upper")
  mixture.ci.offspring <- data.frame(t(mixture.ci.offspring))
  colnames(mixture.ci.offspring) <- c("lower", "upper")
  theta.ci <- data.frame(t(theta.ci))
  colnames(theta.ci) <- c("lower", "upper")
  sigma.ci <- data.frame(t(sigma.ci))
  colnames(sigma.ci) <- c("lower", "upper")
  
  w <- gibbs.result$mixture.prob.tb2$empirical.parents
  x <- gibbs.result$mixture.prob.tb2$empirical.offspring
  y <- gibbs.result$mixture.prob.tb2$empirical.thetas
  z <- (gibbs.result$mixture.prob.tb2$empirical.sigmas)#^0.5
  
  mix.parents.hit <- mixture.ci.parents %>%
    filter(lower<=w, w<=upper)
  mix.par <- mixture.ci.parents[,1] %in% mix.parents.hit[,1]
  
  mix.offspring.hit <- mixture.ci.offspring %>%
    filter(lower<=x, x<=upper)
  mix.off <- mixture.ci.offspring[,1] %in% mix.offspring.hit[,1]
  
  theta.hit <- theta.ci %>%
    filter(lower<=y, y<=upper)
  theta <- theta.ci[,1] %in% theta.hit[,1]
  
  sigma.hit <- sigma.ci %>%
    filter(lower<=z, z<=upper)
  sigma <- sigma.ci[,1] %in% sigma.hit[,1]
  
  col <- nrow(real.params)  
  ci.mat[1,1:col] <- mix.par
  ci.mat[1,(col+1) : (2*col)] <- mix.off
  ci.mat[1,(2*col+1) : (3*col)] <- theta
  ci.mat[1,(3*col+1) : (4*col)] <- sigma
  
  return (ci.mat)
}

init.pi.child <- function(dat.y, cn.mf, params, theta){
  N.half <- params$N / 2
  yy <- as.numeric(dat.y[, c("o")])
  cn.mf.mat <- as.matrix(cn.mf)
  cn.mf.mat <- cn.mf.mat[1:N.half,]
  cn <- as.numeric(cn.mf.mat[, 1:2])
  tab <- tapply(yy, cn, length)
  freq <- setNames(rep(0L, length(theta)),
                   attributes(tab)$dimnames[[1]])
  freq[names(tab)] <- tab
  freq <- freq / params$N
}

ess.compile <- function(gibbs.result, real.params){
  ess.mat <- matrix(0, ncol=nrow(real.params)*(ncol(real.params)+1), nrow=1)
  
  col <- nrow(real.params)  
  ess.mat[1,1:col] <- gibbs.result$effective.size.pi
  ess.mat[1,(col+1) : (2*col)] <- gibbs.result$effective.size.pi.child
  ess.mat[1,(2*col+1) : (3*col)] <- gibbs.result$effective.size.thetas
  ess.mat[1,(3*col+1) : (4*col)] <- gibbs.result$effective.size.sigmas
  
  return(ess.mat)
}
githubmpc/marimba2 documentation built on May 17, 2019, 9:11 a.m.
rdrr.io home R language documentation Run R code online
CRAN packages Bioconductor packages R-Forge packages GitHub packages
Note that we can't provide technical support on individual packages. You should contact the package authors for that.
githubmpc/marimba2
Gibbs sampler for genotyping CNPs

scripts/functions.cross.R
In githubmpc/marimba2: Gibbs sampler for genotyping CNPs

R Package Documentation

Browse R Packages

We want your feedback!

githubmpc/marimba2 Gibbs sampler for genotyping CNPs

scripts/functions.cross.R In githubmpc/marimba2: Gibbs sampler for genotyping CNPs

R Package Documentation

Browse R Packages

We want your feedback!

githubmpc/marimba2
Gibbs sampler for genotyping CNPs

scripts/functions.cross.R
In githubmpc/marimba2: Gibbs sampler for genotyping CNPs