# R/contam_MCMC.R In eriqande/SNPcontam: Detecting contaminated samples from SNP genotypes

#### Defines functions contam_MCMC

#' MCMC function for determing contamination porpotion and allele frequencies
#'
#' This function runs an MCMC algorithm on the SNP genotype data of a group of samples to estimate the contamation
#' proportion and the allele frequencies and identify the contaminated samples.
#' @param data A L x N matrix containing the genotype of data of individuals in the form of 0s,1s, and 2s.
#' N is the number of individuals, and L is the number of loci.
#' @param inters  A number representing the total of number of interations for the MCMC model.
#' @param alpha The alpha parameter for the prior distribution of rho, the probability
#' of contamination.  The prior of rho is assumed to be a beta distribution.
#' @param beta The beta parameter for the prior distribution of rho, which is assumed to be
#' a beta distribution.
#' @param lambda The alpha and beta parameter for the prior distribution of the allele
#' frequencies at each locus, which is assumed to be a beta distribution.
#'
#' @return Returns a list of three named components:
#' \describe{
#'  \item{prob_contam}{A vector containing the rho value, which is the proportion of
#'  contaminated samples for each interation. The vector is a length 1 plus the total number of iterations.}
#'  \item{allele_freq}{A matrix containing the allele frequencies at each locus for each
#'  iteration. The matrix has columns equal to the number loci and rows equal to the 1 plus
#'  total number of iterations.}
#'  \item{z}{A matrix containing the z value, which denotes contaminated status, for each individual
#'  and every iteration.  The matrix has columns equal to the number of individuals and rows equal to
#'  the total number of iterations.}
#' }
#' @export
contam_MCMC<-function(data,inters = 1000,alpha=0.5,beta=0.5,lambda=0.5){
full_z <- function(genos,theta,rho){
like <- likelihood(theta,genos) # use likelihood function to get probability of genotype given contamination and clean
#consider taking log and then colSums so loop is avoided
ln_clean <- log(like$clean) # take natural log of data ln_contam <- log(like$contam) # same for contaminated data
clean <- (exp(colSums(ln_clean,na.rm=TRUE)))*(1 -rho) # probability of non contaminated sample
contam <- (exp(colSums(ln_contam,na.rm=TRUE)))*rho # probability of contaminated samples
# full conditional probability distribution of z indicator value of 1 (indicating contamination)
# normalized by (clean + contam) so that total probability is 1
p <- contam/(clean + contam)
return(p) # can just return value without list
}
N <- ncol(data) # Number of individuals
L <- nrow(data) # Number of loci
rho <- rep(0,inters+1) # Creates array for rho values
rho[1] <- rbeta(1,alpha,beta)

allele_f <- matrix(0,inters+1,L) # Matrix for allele frequencies
allele_f[1,] <- rbeta(L,lambda,lambda)
z <- matrix(0,inters,N) # Matrix for z's

for(k in 1:inters){
# update z
prob <- full_z(data,allele_f[k,],rho[k]) # full_z gives the prob of contamination for each individual
z[k,] <- c(runif(N) <= prob)*1 # sets zi's to be 1 or 0 dependent on prob of contamination

# update allele frequency
# only use non-contaminated samples to calculate allele frequency
# gene_x has 1s at indices where z is 0 and the genetype is x
gene_0 <- (1 - matrix(rep(z[k,],L),nrow=L,byrow=TRUE))*(data==0)
gene_1 <- (1 - matrix(rep(z[k,],L),nrow=L,byrow=TRUE))*(data==1)
gene_2 <- (1 - matrix(rep(z[k,],L),nrow=L,byrow=TRUE))*(data==2)
# sum of 1s of rows of gene_x gives total number of total number of genotype x at each locus
x0 <- rowSums(gene_0,na.rm=TRUE) # total 0 genotype
x1 <- rowSums(gene_1,na.rm=TRUE) # total 1 genotype
x2 <- rowSums(gene_2,na.rm=TRUE) # total 2 genotype
# updates allele frequency using derived beta distribution
t_alpha <- 2*x2 + x1 + lambda # alpha parameter
t_beta <- 2*x0 + x1 + lambda # beta paramenter
allele_f[k+1,] <- rbeta(L,t_alpha,t_beta) # allele frequency for 1 allele

# update rho
sum_z <- sum(z[k,]) # total number of contaminated samples
# updates rho with derived beta distribution
p_alpha <- sum_z + alpha # alpha parameter
p_beta <- N - sum_z + beta # beta parameter
rho[k+1] <- rbeta(1,p_alpha,p_beta) # new rho value

}
list(prob_contam = rho, allele_freq = allele_f, z=z)
}

eriqande/SNPcontam documentation built on May 13, 2017, 8:20 p.m.