knitr::opts_chunk$set( dev = c('png'), fig.align = 'center', fig.height = 7, fig.width = 8.5, collapse = TRUE, comment = "#>", message = F, error = F )
TODO: Check final references to paper.
In this vignette we check illustrate the results presented in Asger [-@asgertemp2020] using the \texttt{phasty}-package.
set.seed(0) library(phasty)
On p. 55 of Durrett [-@durrett2008probability] the numerical values of the covariances between the entries of the site frequency spectrum for $n=8$ are computed and from eqn (eq:covmatdecomp) in Asger [-@asgertemp2020] we know that the off-diagonal entries of the covariance matrix of the site frequency spectrum are proportional to the covariance matrix of the underlying phase-type distribution.
n <- 8 ph_rew_obj <- block_counting_process(n) # Create the phasetype distribution with rewards associated the block counted process with n = 8 Covmat <- var(ph_rew_obj) # Compute the covariance matrix round(0.25*Covmat,4)
In Asger [-@asgertemp2020], the distribution of the site frequency spectrum is characterized as by conditioning on the underlying block counting process. This implies a method for simulating site frequecy spectra by first generating an outcome of the block counting process, and then adding mutations according to a Poisson process. Since the block counting process is a phase-type distribtibution, we can use \texttt{phasty} for the first part.
R = 1e4 ph_mv_sim_obj = sim_rew_phase_type(R,ph_rew_obj) lambda = 1 #lambda = theta/2 ph_counts = matrix(0,dim(ph_mv_sim_obj)[2],dim(ph_mv_sim_obj)[1]) for(i in 1:R) { ph_counts[i,] = rpois(n-1,lambda*ph_mv_sim_obj[,i]) }
An alternative is to simulate the site frequency spectrum using the coala package
library(coala) loc_length = 1e4; theta = lambda*2 model <- coal_model(sample_size = n, loci_number = R,loc_length) + feat_mutation(theta) + sumstat_trees() + sumstat_seg_sites() stats <- simulate(model, seed = 15) out = rep(0,R) for(i in 1:R) {out[i] = length(stats$seg_sites[[i]]$position)} N=min(max(out),n) out = matrix(0,R,N) for(i in 1:R) { curcolsums = colSums(stats$seg_sites[[i]]$snps) for(j in 1:N) { out[i,j]=sum(curcolsums==j) } } out = out[,-n] sfs = out
We can compare the two samples by converting each of the observed triples to a three-character string and compare the counts of each string.
all_obs = unique(rbind(ph_counts,sfs)+1) myletters = c(letters,LETTERS) collapse_to_string = function(lst) paste(myletters[lst],collapse="") all_obs = apply(all_obs,1,collapse_to_string) table_ph_counts = rep(0,length(all_obs)) table_sfs = rep(0,length(all_obs)) for(i in 1:R) { table_ph_counts[which(collapse_to_string(ph_counts[i,]+1)==all_obs)] = table_ph_counts[which(collapse_to_string(ph_counts[i,]+1)==all_obs)] + 1 table_sfs[which(collapse_to_string(sfs[i,]+1)==all_obs)] = table_sfs[which(collapse_to_string(sfs[i,]+1)==all_obs)] + 1 } max_count = max(c(table_ph_counts,table_sfs)) plot(table_ph_counts,table_sfs,pch=16,xlim=c(0,max_count),ylim=c(0,max_count),las=1) abline(a=0,b=1)
We can perform a naive statistical test of equality of the two underlying multinomial distribution by using an approximate likelihood ratio test, where we collapse strings with less than five observations in either sample:
small_counts = union(which(table_ph_counts <= 5),which(table_sfs <= 5)) table_sfs_collapsed=table_sfs[-small_counts] table_ph_counts_collapsed=table_ph_counts[-small_counts] table_sfs_collapsed = c(table_sfs_collapsed,sum(table_sfs[small_counts])) table_ph_counts_collapsed = c(table_ph_counts_collapsed,sum(table_ph_counts[small_counts])) table_tot = table_sfs_collapsed + table_ph_counts_collapsed val1 = -2*sum(table_sfs_collapsed*log(table_tot/(2*R))+table_ph_counts_collapsed*log(table_tot/(2*R))) val2 = -2*(sum(table_sfs_collapsed*log(table_sfs_collapsed/R),na.rm=TRUE)+sum(table_ph_counts_collapsed*log(table_ph_counts_collapsed/R),na.rm=TRUE)) 1-pchisq(val1-val2,length(table_tot))
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.