In rivasiker/phasty: General-purpose phase-type functions

knitr::opts_chunk$set(
  dev = c('png'),
  fig.align = 'center', 
  fig.height = 7, 
  fig.width = 8.5,
  collapse = TRUE,
  comment = "#>",
  message = F,
  error = F
)

1. Introduction

TODO: Check final references to paper.

In this vignette we check illustrate the results presented in Asger [-@asgertemp2020] using the \texttt{phasty}-package.

set.seed(0)
library(phasty)

On p. 55 of Durrett [-@durrett2008probability] the numerical values of the covariances between the entries of the site frequency spectrum for $n=8$ are computed and from eqn (eq:covmatdecomp) in Asger [-@asgertemp2020] we know that the off-diagonal entries of the covariance matrix of the site frequency spectrum are proportional to the covariance matrix of the underlying phase-type distribution.

n <- 8
ph_rew_obj <- block_counting_process(n) # Create the phasetype distribution with rewards associated the block counted process with n = 8
Covmat <- var(ph_rew_obj) # Compute the covariance matrix 
round(0.25*Covmat,4)

In Asger [-@asgertemp2020], the distribution of the site frequency spectrum is characterized as by conditioning on the underlying block counting process. This implies a method for simulating site frequecy spectra by first generating an outcome of the block counting process, and then adding mutations according to a Poisson process. Since the block counting process is a phase-type distribtibution, we can use \texttt{phasty} for the first part.

R = 1e4
ph_mv_sim_obj = sim_rew_phase_type(R,ph_rew_obj)
lambda = 1 #lambda = theta/2
ph_counts = matrix(0,dim(ph_mv_sim_obj)[2],dim(ph_mv_sim_obj)[1])
for(i in 1:R) {
  ph_counts[i,] = rpois(n-1,lambda*ph_mv_sim_obj[,i])
}

An alternative is to simulate the site frequency spectrum using the coala package

library(coala)
loc_length = 1e4; theta = lambda*2
model <- coal_model(sample_size = n, loci_number = R,loc_length) + feat_mutation(theta) + sumstat_trees() + sumstat_seg_sites()
stats <- simulate(model, seed = 15)

out = rep(0,R)
for(i in 1:R) {out[i] = length(stats$seg_sites[[i]]$position)}
N=min(max(out),n)

out = matrix(0,R,N)
for(i in 1:R) {
  curcolsums = colSums(stats$seg_sites[[i]]$snps)
  for(j in 1:N) {
    out[i,j]=sum(curcolsums==j)
  }
}
out = out[,-n]
sfs = out

We can compare the two samples by converting each of the observed triples to a three-character string and compare the counts of each string.

all_obs = unique(rbind(ph_counts,sfs)+1)
myletters = c(letters,LETTERS)
collapse_to_string = function(lst) paste(myletters[lst],collapse="")
all_obs = apply(all_obs,1,collapse_to_string)
table_ph_counts = rep(0,length(all_obs))
table_sfs = rep(0,length(all_obs))
for(i in 1:R) {
  table_ph_counts[which(collapse_to_string(ph_counts[i,]+1)==all_obs)] = table_ph_counts[which(collapse_to_string(ph_counts[i,]+1)==all_obs)] + 1
  table_sfs[which(collapse_to_string(sfs[i,]+1)==all_obs)] = table_sfs[which(collapse_to_string(sfs[i,]+1)==all_obs)] + 1
}
max_count = max(c(table_ph_counts,table_sfs))
plot(table_ph_counts,table_sfs,pch=16,xlim=c(0,max_count),ylim=c(0,max_count),las=1)
abline(a=0,b=1)

We can perform a naive statistical test of equality of the two underlying multinomial distribution by using an approximate likelihood ratio test, where we collapse strings with less than five observations in either sample:

small_counts = union(which(table_ph_counts <= 5),which(table_sfs <= 5))
table_sfs_collapsed=table_sfs[-small_counts]
table_ph_counts_collapsed=table_ph_counts[-small_counts]
table_sfs_collapsed = c(table_sfs_collapsed,sum(table_sfs[small_counts]))
table_ph_counts_collapsed = c(table_ph_counts_collapsed,sum(table_ph_counts[small_counts]))


table_tot = table_sfs_collapsed + table_ph_counts_collapsed
val1 = -2*sum(table_sfs_collapsed*log(table_tot/(2*R))+table_ph_counts_collapsed*log(table_tot/(2*R)))
val2 = -2*(sum(table_sfs_collapsed*log(table_sfs_collapsed/R),na.rm=TRUE)+sum(table_ph_counts_collapsed*log(table_ph_counts_collapsed/R),na.rm=TRUE))
1-pchisq(val1-val2,length(table_tot))