#' Is this a whole number
#'
#' Function that tets whether input is a whole number and
#' returns T/F. This is used creating the pmf of discrete
#' distributions. This is taken from the example in as.integer
#' documentation included in base.
#'
#'
#' @param x A number
#' @param tol the tolerance used to compare x and round(x)
#' @return Logical is the number an integer
#'
#'
is_wholenumber <-function(x, tol = .Machine$double.eps^0.5){
abs(x - round(x)) < tol
}
#' PMF of a zero truncated Poisson distribution
#'
#' PMF of zero truncated Poisson distribution at a point x.
#'
#'
#' @param x A number or vector of numbers
#' @param lambda The mean of the Poisson distribution that is truncated
#' not the mean of the truncated distribution.
#' @return Logical is the number an integer
#'
#' @examples
#' dzpois(1,3)
#'
#' # If Nb and l are vectors the output is a matrix of the form
#' # |---------Nb_j---------|
#' # | |
#' # | |
#' # l_i l_i
#' # | |
#' # | |
#' # | |
#' dzpois(1:2,1:3)
#' @export
dzpois<-function(x,lambda){ # the pmf of the zero truncated poisson distribution
if(x<=0 | is_wholenumber(x)==F){
return(0)
}else{
(exp(-1*lambda)*lambda^x)/(factorial(x)*(1-exp(-1*lambda)))
}
}
dzpois<-Vectorize(dzpois,vectorize.args = c("x"))
#' Probability of drawing an allele n times in n draws
#'
#' Probability of drawing an allele n times in n draws. This is a helper
#' function used in transmission modeling.
#'
#' @param p The probability (frequency of allele)
#' @param n The number of draws. or a vector of draw values
#' @return The probability of only drawing an allele at the given frequency.
#' Or a vector of such probabilities
p_all<-function(p,n){ # probability all success - only finding the variant at frequency p in n draws
p^n
}
p_all<-Vectorize(p_all,vectorize.args="n")
#' The probability of lambda for a loci
#'
#' The probability of observing the data given lambda and the presence absence
#' model.In this fit we take the minority frequency to be correct
# and set the major frequency to 1-minority. This accounts for
# the fact that frequencies are related by the expression :
# minor allele + major allele + errror =1.
#Here we make the major allele frequency = major allele + error.
#The error is always small. if it exceeds 1% then we through an error here.
#'
#' @param data a data frame of one donor polymorphic loci. It must have
#' columns chr,pos,freq1, and found
#' @param Nb_max The maximum bottleneck size to test
#' @param model The model we are using must be either "PA" or "BetaBin"
#' @param threshold limit of variant calling detection
#' @param acc a data frame with accuracy metrics
#'
#' @return a tibble with columns for each lambda and the probability of
#' observing the data for that site.
#'
#' @examples
#' get_freqs(c("HS1595","HS1563"),small_isnv)->small_dups
#' polish_freq(small_dups,freq1,0.02)->x
#' x$found=c(TRUE,TRUE)
#' math_fit(x,100,"PA")
#' x$found=c(FALSE,TRUE)
#' math_fit(x,100,"PA")
#'
#' @export
#'
math_fit=function(data,Nb_max,model,threshold,acc){
# this calculation is for each position in the genome.
stopifnot(length(unique(data$chr))==1, length(unique(data$pos))==1)
# and each model.
if(model=="PA"){#| model=="PA-straight"){
# In this fit we take the minority frequency to be correct
# and set the major frequency to 1-minority. This accounts for
# the fact that frequencies are related by the expression :
# minor allele + major allele + errror =1.
#Here we make the major allele frequency = major allele + error.
#The error is always small. if it exceeds 1% then we through an error here.
if(1-sum(data$freq1)>0.01){
warning("The sum of the frequencies is less than 99% make sure the assumption major freq = 1-minor freq is valid")
}
data$freq1[data$freq1==max(data$freq1)]<-1-min(data$freq1)
found<-data[data$found==T,] # only alleles that were transmitted
Nb<-1:Nb_max # Here are the bottlenecks
if(nrow(found)==0 | nrow(found)>2){
stop(paste0("No variant transmitted for this site or",
"there are more than 2 variants here"))
}else if(nrow(found)==1){ # one variant found here. All successes
prob<-p_all(p=found$freq1,n=Nb)
# this is a vector of probabilities for each
# n prob[i]= the probability of only getting that
# variant in Nb[i] (i.e. draws)
}else if(nrow(found)==2){
# if at least on of each allele was transmitted
first_var<-p_all(p=found$freq1[1],n=Nb) # all this one
second_var<-p_all(p=found$freq1[2],n=Nb) # all the other one
one_each<-1-(first_var+second_var)
# at least one of each -
# This is a vector as above since R adds and subtracts the
# elements of the vectors as expected
prob<-one_each
}
}else if(model=="BetaBin"){# | model =="BetaBin-straight"){
if(nrow(data[data$freq1>0.5,])>0){
data<-data[df$freq1<0.5,]
warning("The beta binomials model only uses minor alleles. Subsetting the data now.")
}
# 2 is the recipient
v_r = data$freq2
v_d = data$freq1
gc_ul = data$gc_ul2
threshold = 0.02
prob = L.Nb.beta(v_r,v_d,Nb,gc_ul,threshold,acc)
}
return(tibble(Nb=Nb,prob=prob))
}
#' Fit the transmission model
#'
#' This is wrapper around math_fit that fits the presence absence model to
#' each pair present in the data set.
#' @param data a data frame or tibble it will be split by chr pos and pair_id
#' @param Nb_max The maximum bottleneck size to test
#' @param model PA or BetaBin will fit lambda of a zero truncated Poisson. PA-straight, BetaBin-straight will fit 1 Nb.
#' @param threshold limit of variant calling detection
#' @param acc a data frame with accuracy metrics
#' @param ... other columns to group by in final output.
#' @return a tibble with columns pair_id,lambda,LL (log likelihood), and pair_id if desired.
#' @export
trans_fit<-function(data,Nb_max,model,threshold,acc,...){
group <- rlang::quos(...,Nb)
original_group <- rlang::quos(...)
probs<-data %>% dplyr::group_by(chr,pos,pair_id) %>%
dplyr::do(math_fit(.data,Nb_max,model,threshold,acc))
# For each genomic position in question
#control for number of the mutations in each donor.
counts <- data %>% dplyr::group_by(!!!original_group) %>%
dplyr::summarise(donor_mutants = length(which(freq1>0 & freq1<0.5)))
max_mut<-max(counts$donor_mutants)
LL.df<-probs %>% dplyr::left_join(counts,by = "pair_id") %>%
dplyr::group_by(!!!group) %>%
dplyr::summarize(LL=sum(log(prob)),
weighted_LL = (max_mut/unique(donor_mutants)) * LL)
# Get the log likelihood of the each lambda for this pair - we can sum accros pairs later.
return(LL.df)
}
#' Fit a distribution to pair_wise bottlenecks
#'
#' This funciton returns the negative log likelihood for
#' a distribution of bottlenecks given a data frame of
#' fits on a per-pair basis. This can be used with mle from
#' the bbmle package.
#'
#' @param data A tibble or data frame as outputed from trans_fit
#' @param weight A tibble or data frame providing the weighting factors for each
#' pair. It should have columns pair Id and weight_factor
#' @param ddist The pdf or pmf function for the distribution to test.
#' @param ... The parameters to pass to the ddist function
#' @return the negative Log likelihood weighted by the weighted data frame
#' @examples
#' require(magrittr)
#' require(dplyr)
#' fit <-trans_fit(small_trans,100,"PA",threshold = NULL,acc=NULL,pair_id)
#' weights<-small_trans %>% group_by(pair_id) %>%
#' summarize(donor_mutants = length(which(freq1>0 & freq1<0.5))) %>%
#' mutate(weight_factor = max(donor_mutants)/donor_mutants)
#' dist_prob(fit,weights,dzpois,lambda=1)
#' @export
dist_prob <- function(data,weight,ddist,...){
params <- rlang::enquos(...)
ddist <-rlang::enexpr(ddist)
Nb = unique(data$Nb) # probability of data
data<- dplyr::mutate(data,prob_D = exp(LL))
# I don't know the details of quoting ect. But this works.
# prob of Nb
prob_Nb<-rlang::eval_tidy(quo(Nb %>% purrr::map_dbl(.f = !!ddist,!!!params)))
data<-data %>%
dplyr::left_join(dplyr::tibble(Nb=Nb,prob_Nb = prob_Nb),by="Nb") %>%
dplyr::mutate(prob_D_and_Nb = prob_D*prob_Nb)
l_by_pair<- data %>% dplyr::group_by(pair_id) %>%
dplyr::summarize(prob_D_given_dist = sum(prob_D_and_Nb),
LL_D_given_dist = log(prob_D_given_dist)) %>%
dplyr::rowwise()%>%
dplyr::mutate(weighted_total_LL = weight$weight_factor[weight$pair_id==pair_id] * LL_D_given_dist)
return(-sum(l_by_pair$weighted_total_LL))
}
#' Make distribution specific functions to fit distribution
#'
#' This is a wrapper that function that uses a provided distribution
#' to fit the negative log likelihood for
#' a distribution of bottlenecks given a data frame of
#' fits on a per-pair basis. This can be used with mle from
#' the bbmle package.
#'
#' @param ddist The pdf or pmf function for the distribution to test as a string.
#' @param params The parameters to pass to the ddist function as a string with commas
#' @return the negative Log likelihood weighted by the weighted data frame
#' @examples
#' require(magrittr)
#' require(dplyr)
#' fit <-trans_fit(small_trans,100,"PA",threshold = NULL,acc=NULL,pair_id)
#' weights<-small_trans %>% group_by(pair_id) %>%
#' summarize(donor_mutants = length(which(freq1>0 & freq1<0.5))) %>%
#' mutate(weight_factor = max(donor_mutants)/donor_mutants)
#' binom_fit<-dist_prob_wrapper("dbinom","size,prob")
#' binom_fit(fit,weights,100,0.1)
#' @export
dist_prob_wrapper<-function(ddist,params){
f_string<- paste0("function(data,weight,",params,"){dist_prob(data,weight,",ddist,",", params,")}")
eval(parse(text = f_string))
}
#' Summarize model likelihoods
#'
#' This function summarizes the likelihood from the model_fit functions.
#' returning the max lambda and 95% confidence interval. The confidence interval
#' assumes a smooth curve.
#'
#' @param data data frame or tibble of model fit data.
#' @return a tibble with columns
#' lambda : best fit
#' lower_95 : lower 95% bound (lambda)
#' upper_95 : upper 95% bound (lambda)
#' Nb : mean Nb of a zero truncated poisson given the lambda
#'
#' @examples
#' trans_fit(small_trans,100,"PA",threshold=NULL,acc=NULL,pair_id) ->model_fit
#' model_summary(model_fit)
#' @export
model_summary<-function(data){
Nb<-data$Nb[which(data$LL==max(data$LL))] # Get the max lambda
good_range<-subset(data,LL> (max(LL)-1.92)) # get the bottlenecks that fall in this region the 95% confidence intereval
lower<-good_range$Nb[1]
upper <- good_range$Nb[nrow(good_range)]
return(tibble(Nb=Nb,lower_95=lower,
upper_95=upper))
}
#' Simulate transmission Presence absence model
#'
#' Simulate the transmission of alleles using the
#' presence absence model. Essentially this takes
#' the frequency of the variant allele, and bottleneck
#' size and simulates the found column. It also selects a bottleneck size
#' based on lambda and a zero truncated Poisson. If simulatin the whole data
#' set each pair should be run separately as to not use the same bottleneck
#'
#' @param data a data frame with with chr,pos,freq1, and pair_id columns
#' @param lambda The lambda of the zero truncated Poisson
#'
#' @return data frame the same as data but with a simulated found column
#'
#' @examples
#' pa_sim(small_trans,1.3)
#' @export
pa_sim<-function(data,lambda){
pair<-unique(data$pair_id)
if(length(pair)>1){
warning(paste0("Running on ",length(pair)," pairs. All will have the same bottleneck."))
}
Nb<-rzpois(1,lambda)
out<-data %>% dplyr::group_by(chr,pos,pair_id) %>%
dplyr::do(pa_sim_helper(.,Nb))
return(out)
}
#' PA sim helper
#'
#' Helper function to simulate presence absence data. This takes in one loci.
#' the data frame should have 2 rows (one for each allele.)
#' Simulate the transmission of alleles using the
#' presence absence model. Essentially this takes
#' the frequency of the variant allele, and bottleneck
#' size and simulates the found column.
#'
#' @param data a data frame with with freq1 and pair_id columns and 2 rows
#' @param Nb the bottleneck size
#'
#' @return data frame the same as data but with a simulated found column
#'
pa_sim_helper<-function(data,Nb){
# data refers to the 2 mutations at this position, bottlenecks-
# a data frame with the bottle_neck,
#model - the name of the column in the bottlenecks that
# contains the bottleneck size we want to use.
if(nrow(data)!=2){
stop(print("There are not 2 mutations at this point."))
}
pair<-unique(data$pair_id)
if(length(pair)!=1){
stop(print("There should only be one pair id at this point."))
}
freq_success<-1-min(data$freq1)
# This is the major allele frequency - calculated the same as in the model
success<-rbinom(1,Nb,freq_success)
# the number of success in n trials.
# How many of the major alleles in the draw
#
data$found<-F
if(success==Nb){ # only found major variants
data$found[data$freq1==max(data$freq1)]=T
} else if(success==0){ # only the minor
data$found[data$freq1==min(data$freq1)]=T
} else if(success>0 & success<Nb){ # both were found
data$found=T
} else{
stop("Error!")
}
return(data)
}
#' Randomly sample a zero truncated Poisson
#'
#' Randomly sample a zero truncated Poisson distribution given
#' the lambda of the underlying Poisson (that is of course truncated).
#' @param n number samples
#' @param lambda The mean of the untruncated Poisson
#' @return a random sample
#' @examples
#' rzpois(10,2.3)
#' @export
rzpois<-function(n,lambda){
#from http://giocc.com/zero_truncated_poisson_sampling_algorithm.html
out = vector(mode="double",length=n)
for(i in 1:n){
k = 1
t = exp(-lambda) / (1 - exp(-lambda)) * lambda
s = t
u = runif(1)
while(s < u){
k = k+1
t = t*(lambda / k)
s = s+t
}
out[i] <- k
}
return(out)
}
#' Simulate probability of transmission based on frequency
#'
#' @param data data frame of transmission data
#' @param runs how many simulations to run
#' @param lambda What is the lamda of the zero truncated Poisson to use
#' @param FUN what function to use to simulate the found column pa_sim or betabin_sim unquoted
#' @param threshold - optional to pass to betabinomial simulator
#' @param acc optional to pass to betabinomial simulation
#' @param ... columns to group for simulations. These will get the same
#' bottleneck size within each replication. It should probably only ever be
#' pair_id.(ie each person gets the same bottleneck within a run)
#'
#' @return a data frame of simulated logit fits to frequency in donor vs.
#' probability of transmission
#' @examples
#' simulations(small_trans,10,3.2,pa_sim)
#' simulations(small_trans,10,3.2,betabin_sim,0.02,accuracy_stringent)
#'
#'
#' @export
simulations<-function(data,runs,lambda,FUN,threshold=NULL,acc=NULL,...){
# iSNV data, how many iterations,
# what is lambda value and what function is used to simulate the data.
group_by<-rlang::quos(...)
rows_needed<-nrow(data)
model.df<-tibble(freq1=rep(NA,rows_needed*runs),
trial=rep(1:runs,each=rows_needed),
prob=NA)
pairs<-length(unique(data$pair_id))
for (i in 1:runs){
if(is.null(threshold)){
trial.df<-data %>% dplyr::group_by(!!!group_by)%>%
dplyr::do(FUN(.,lambda))
}else{
trial.df<-data %>% dplyr::group_by(!!!group_by)%>%
dplyr::do(FUN(.,lambda,threshold,acc))
}
logit<-glm(formula =found~freq1,family=binomial(logit),data=trial.df) # Fit a logit model to the data
trial.df$prob<-logit$fitted.values
trial.df$trial<-i
ind<-which(model.df$trial==i)
model.df$prob[ind]<-trial.df$prob# add to the final output
model.df$freq1[ind]<-trial.df$freq1
}
return(model.df)
}
#=============================================================================
# BETA BINOMIAL FUNCTIONS
#=============================================================================
#
#' @describeIn L.Nb.beta likelihood the variant is found in recipient
like_found.beta<-function(v_r,v_d,Nb,gc_ul,threshold,acc=accuracy_stringent){
acc_gc=10^(floor(log10(gc_ul)))
if(acc_gc>1e5){
acc_gc <- 1e5
}
# If the frequency is below our threshold then the probability of being found is 0
if(v_r<threshold){
return(0)
}
# This will round the gc down to the nearest log as discussed below.
if(v_r>=0.02 & v_r<0.05){
sense= acc$sensitivity[which(acc$gc_ul==acc_gc & acc$freq==0.02)]
}else if(v_r>=0.05 & v_r<0.1){
sense= acc$sensitivity[which(acc$gc_ul==acc_gc & acc$freq==0.05)]
}
else {
sense=1
}
prob = c()
if(v_r<=(1-threshold)){
for(k in 0:Nb){
prob[k+1]=dbeta(x=v_r,shape1 = k,shape2=Nb-k)*dbinom(x=k,size=Nb,prob = v_d)*sense
}
prob=sum(prob)
}else if(v_r>(1-threshold)){
# it is fixed - whats the probability the other allele was not found
lost_allele_freq = 1-v_d
# The for loop over k and sum is in the like_lost.beta.uncert function
prob = like_lost.beta.uncert(v_r=0,v_d=lost_allele_freq,Nb,gc_ul,threshold,acc=acc)
}
return(prob)
}
#' @describeIn L.Nb.beta likelihood the variant is not found in recipient
like_lost.beta.uncert<-function(v_r,v_d,Nb,gc_ul,threshold,acc=accuracy_stringent){ # sum over k
stopifnot(v_r<threshold) # ensure not found in the recipient
acc_gc=10^(floor(log10(gc_ul)))
if(acc_gc>1e5){
acc_gc <- 1e5
}
# This will round the gc down to the nearest log as discussed below.
prob=c()
for(k in 0:Nb){
uncert_term=c()
f=c(0.02,0.05,0.10)
for(i in 1:(length(f)-1)){
uncert=1-acc$sensitivity[which(acc$gc_ul==acc_gc & acc$freq==f[i])]
# The prob the variant is missed because itis between f[i] and f[i+1] given
# the sample size
uncert_term[i]=(pbeta(q = f[i+1],shape1 = k,shape2 = Nb-k)-
pbeta(q = f[i],shape1 = k,shape2 = Nb-k))*
dbinom(x=k,size=Nb,prob = v_d)*uncert
}
#probability the variant is below the cut off or present but missed
prob[k+1]=pbeta(q = threshold,shape1 = k,shape2 = Nb-k)*dbinom(x=k,size=Nb,prob = v_d)+sum(uncert_term)
}
sum(prob)
}
#' Betabinomial Likelihood functions
#'
#' What is the probability of observing the data given a bottleneck
#' size under the beta binomial model. This is a wrapper around the two likelihood
#' functions
#' @param v_r allele frequency in the recipient
#' @param v_d allele frequency in the donor
#' @param Nb Bottleneck size (may be a vector)
#' @param gc_ul titer in recipeint sample
#' @param threshold detection level threshold
#' @param acc a data frame with accrucacy data must contain, columns
#' freq,sensitivity, and gc_ul
#'
#' @return The likelihood of observing the data given the bottleneck size.
#' @export
L.Nb.beta<-function(v_r,v_d,Nb,gc_ul,threshold,acc=accuracy_stringent){
if(v_r>=threshold){
like=like_found.beta(v_r=v_r,v_d=v_d,Nb=Nb,gc_ul=gc_ul,threshold=threshold,acc=acc)
}else if(v_r<threshold){ # Not found
like=like_lost.beta.uncert(v_r=v_r,v_d=v_d,Nb=Nb,gc_ul=gc_ul,threshold=threshold,acc=acc)
}
return(like)
}
L.Nb.beta<-Vectorize(L.Nb.beta,vectorize.args = "Nb")
#' Simulate transmission Beta binomial model
#'
#' Simulate the transmission of alleles using the
#' betabinomial model. Essentially this takes
#' the frequency of the variant allele, and bottleneck
#' size and simulates the found column. It also selects a bottleneck size
#' based on lambda and a zero truncated Poisson. If simulation the whole data
#' set each pair should be run separately as to not use the same bottleneck
#'
#' @param data a data frame with with chr,pos,freq1, and pair_id columns
#' @param lambda The lambda of the zero truncated Poisson
#' @param threshold limit of variant calling detection
#' @param acc a data frame with accuracy metrics
#'
#' @return data frame the same as data but with a simulated found column
#'
#' @examples
#' betabin_sim(small_trans,1.2,0.02,accuracy_stringent)
#' @export
betabin_sim<-function(data,lambda,threshold,acc=accuracy_stringent){
pair<-unique(data$pair_id)
if(length(pair)>1){
warning(paste0("Running on ",length(pair)," pairs. All will have the same bottleneck."))
}
Nb<-rzpois(1,lambda)
out<-data %>% dplyr::group_by(chr,pos,pair_id) %>%
dplyr::do(betabin_sim_helper(.,Nb,threshold,acc))
return(out)
}
#' beta bin sim helper
#'
#' Helper function to simulate beta binomial data. This takes in one loci.
#' the data frame should have 2 rows (one for each allele.)
#' Essentially this takes
#' the frequency of the variant allele, and bottleneck
#' size and simulates the found column.
#'
#' @param data a data frame with with freq1 and pair_id columns and 2 rows
#' @param Nb the bottleneck size
#' @param threshold limit of variant calling detection
#' @param acc a data frame with accuracy metrics
#'
#' @return data frame the same as data but with a simulated found column
#'
betabin_sim_helper<-function(data,Nb,threshold,acc){
# data refers to the 2 mutations at this position, bottlenecks-
# a data frame with the bottle_neck,
#model - the name of the column in the bottlenecks that
# contains the bottleneck size we want to use.
if(nrow(data)!=2){
stop(print("There are not 2 mutations at this point."))
}
pair<-unique(data$pair_id)
if(length(pair)!=1){
stop(print("There should only be one pair id at this point."))
}
minor<-min(data$freq1) # This is the minor allele frequency
major<-1-min(data$freq1)
# includes uncertainty -
# with v_r = 0 or 1 these are probabilities otherwise they are densities.
only_minor= L.Nb.beta(v_r=1,v_d=minor,Nb=Nb,gc_ul=unique(data$gc_ul2),threshold=threshold,acc)
# includes uncertainty - The likelihood the allele was lost
only_major = L.Nb.beta(v_r=1,v_d=major,Nb=Nb,gc_ul=unique(data$gc_ul2),threshold=threshold,acc) # includes uncertainty - The likelihood the allele was lost
both = 1-(only_minor+only_major)
# flip the coin
pick = runif(1,0,1)
data$found=F
# 0 only_minor only_major both 1
# |------------------|---------------|---------------------------------|
if(pick<=only_minor){
# onlny the minor was found
data$found[data$freq1==min(data$freq1)]=T
}else if(pick>only_minor & pick<(only_minor+only_major)){
# only the major was found
data$found[data$freq1==max(data$freq1)]=T
}else if(pick>=(only_minor+only_major)){
# both were found
data$found=T
}
return(data)
}
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.