R/mixed_MCMC.R

#' MCMC function for determing contamination proportions and mixing proportions of a mixture sample
#' 
#' This function runs an MCMC tailored for a situation in which one is sampling from a mixture of individuals
#' from different populaitons.  The MCMC function uses the SNP genotype data and baseline data to estimate the 
#' contamination proportion and the mixing proportions as well as identify contaminated individuals and 
#' preform population assignment.
#' @param data A L x N matrix containing the genotype of data of individuals in the form of 0s,1s, and 2s.
#' N is the number of individuals, and L is the number of loci.
#' @param contam_data A P*P x N matrix containing the likelihood that each individual orginates from each
#' combination of the 2 populations from P, assuming the individual is contaminated.  P is the number of
#' populations and is the number of individuals.
#' @param clean_data A P x N matrix containing the likelihood that each individual originates from each
#' of P populations, assuming the individual is not contaminated.
#' @param alpha The alpha parameter for the prior distribution of rho, the probability
#' of contamination.  The prior of rho is assumed to be a beta distribution.
#' @param beta The beta parameter for the prior distribution of rho, which is assumed to be
#' a beta distribution.
#' @param inters Total number of sweeps used in the MCMC model.
#' @param contamination Logical variable.  If contamination = FALSE, then the MCMC model will only update 
#' the mixing proportions and the population identity, assuming no contamination is present.
#' 
#' @return Returns a list of four named components:
#' \describe{
#'  \item{prob_contam}{A vector containing the rho value, which is the proportion of 
#'  contaminated samples for each interation. The vector is a length of 1 plus the total number of iterations.}
#'  \item{pops}{A matrix the u, which denote the population or populations of origin, for each sweep.
#'  If contamintion=TRUE, u is a 2*inters x N matrix, and each pair of rows represents the population identification for each sweep.
#'  If an individual has u values of (any P, 0), than it is a non contaminated individual and the first value is
#'  its population of origin.  If an individual has two u values, it is contaminated and the two values 
#'  represent the source of contamination.  If contamination=FALSE, u is a intersxN matrix, and each row represents the popultion
#'  identification for each sweep.}
#'  \item{z}{A matrix containing the z value, which denotes contaminated status, for each individual
#'  and every iteration.  The matrix has columns equal to the number of individuals and rows equal to
#'  the total number of iterations.}
#'  \item{mixing}{A matrix containing the mixing proportions of each population for each sweep.  The matrix
#'  has number of rows equal to the total interations plus one and columns equal to the number of populations.}
#' }
#' @export
#' @examples
#' 
#' # Creates data for the mixed_MCMC function
#' zeros <- t(matrix(c(40,160,50,80,100,20,30,15,140),nrow=3))
#' ones <- 200 - zeros
#' genos <- t(matrix(c(1,2,0,2,1,0,1,0,0,1,2,2,1,1,1),nrow=5))
#' data <- matrix(c(1,2,1,0,2,0,2,0,0,0,1,2,0,0,1),nrow=3)
#' clean_data <- P_likelihood(zeros,ones,genos,.5)
#' contam_data <- Pcontam(zeros,ones,genos,.5) 
#' # Runs the MCMC that checks for contamination on the data for 5 interations
#' mixed_MCMC(data, contam_data, clean_data, contamination = TRUE, inters = 5)
#' 
#' # Runs the MCMC on same data, but without evaluating contamination
#' mixed_MCMC(data, contam_data, clean_data, contamination=FALSE, inters = 5)
mixed_MCMC <- function(data, contam_data, clean_data, alpha=.5, beta=.5, contamination = TRUE, inters){
  
  cbs <- nrow(contam_data) # Number of combinations of populations
  P <- nrow(clean_data) # Number of populations
  N <- ncol(data) # Number of individuals
  L <- nrow(data) # Number of loci
  
  # Sets up rho matrix and gets first rho value
  if (contamination){
  rho <- rep(0,inters+1)
  rho[1] <- rbeta(1,alpha,beta)
  }
  
  # Sets up u matrix
  if (contamination){
  u <- matrix(0,N,nrow=inters*2)}else {u <- matrix(0,N,nrow=inters)}
  
  # Sets up pi matrix and gets first pi values
  pii <- matrix(0,inters+1,P)
  gm0 <- rgamma(P,shape=rep(1,P))
  pii[1,] <- gm0/sum(gm0)
  
  # Sets up z matrix
  z <- matrix(0,inters,N)
  
  # Creates list containing all combinations of populations
  combos <- matrix(0,2,P*P)
  pops <- 1:P
  k <- 0
  for (p1 in pops) for (p2 in pops){
    k <- k + 1
    combos[ ,k] <- c(p1,p2)
  }
  
  for(i in 1:inters){
    # propabilities of an individual coming from combinations of populations given the mixture proportions
    if (contamination){
    c_pi <- rep(pii[i,], each=P)*rep(pii[i,], times=P)
  # update z
    probs_c <- contam_data*c_pi # prob of originating from combinations given genotypes and mixture proportions
    probs0 <- clean_data*pii[i,] # prob of originating from each singular population given genotypes and mixture proportions
    pcontam <- colSums(probs_c*rho[i]) #prob of individuals being contaminated
    pclean <- colSums(probs0*(1-rho[i])) # prob of individuals being noncontamined
    prob <- pcontam/(pclean + pcontam) # normalize probability so it adds to one
    z[i,] <- c(runif(N) <= prob)*1 # chooses new z based on prob of being contaminated
    
  # update u
    z_c <- which(z[i,] %in% 1) # indices of contaminated individuals
    z0 <- which(z[i,] %in% 0) # indices of clean individuals
  
    #contaminated
    if (length(z_c) != 0){
      if (length(z_c) == 1){
        uprobs_c <- probs_c[,z_c] # prob of originating from each combination
        pops_c <- sample((1:cbs), size=1, prob = uprobs_c) # samples from combinations based on uprobs_c
        }
else {
        uprobs_c <- apply(probs_c[,z_c],2,list) # prob of orginating from each combo for each individual in z_c
        pops_c <- sapply(uprobs_c, function(x) {sample((1:cbs),size=1, prob = unlist(x))}) # samples for each individual in z_c
        }
      u[(2*i-1):(2*i), z_c] <- combos[pops_c] # updates u for contaminated individuals
    }
    #clean
    if (length(z0) != 0){
      if (length(z0) == 1){
        uprobs0 = probs0[,z0] # prob of orginating from each population
        pops0 <- sample((1:P),size=1, prob = uprobs0) # samples from all populations given uprobs0
      }
      else {
        uprobs0 <- apply(probs0[,z0],2,list) # prob of orginating from each population for each individual in z0
        pops0 <- sapply(uprobs0, function(x) {sample((1:P), size=1, prob = unlist(x))}) # samples population for each individual in z0
      }
      u[(2*i-1),z0] <- pops0 # updates u for clean individuals
    }  
    }  
    if (contamination == FALSE){
      probs0 <- clean_data*pii[i,]
      uprobs0 <- apply(probs0,2,list)
      u[i,] <- sapply(uprobs0, function(x) {sample((1:P), size=1, prob = unlist(x))}) 
    }
      
  # update pi
    if (contamination) {clean_us <- u[i,][u[2,]==0]} else{clean_us <- u[i,]} # the population identification for only clean individuals
    utmp <- factor(clean_us, level=(1:P)) # step before using table so table will include values that have frequency of 0
    freqs <- as.numeric(table(utmp)) # gets frequencies of each population 
    xi <- (freqs + 1) # parameters for the dirichlet distribution for pi
    # samples from dirichlet distribution using gamma distribution and updats pii
    gm <- rgamma(P,shape=(xi)) 
    pii[i+1,] <- gm/sum(gm)
  
  # update rho
  if (contamination){
  sum_z <- sum(z[i,]) # total number of contaminated samples
  # updates rho with derived beta distribution
  p_alpha <- sum_z + alpha # alpha parameter
  p_beta <- N - sum_z + beta # beta parameter
  rho[i+1] <- rbeta(1,p_alpha,p_beta)
  }
  }
  if (contamination){
  list(prob_contam = rho, pops = u, z = z, mixing = pii)
  }else { list(pops = u, mixing = pii)}
}
eriqande/SNPcontam documentation built on May 16, 2019, 8:44 a.m.