#~ library( BayesianTools )
#' Estimate rate that clusters increase or decrease in frequency and transform into selection coefficients for different genotypes
#'
#' @param s data frame with sequence_name(character), del_introduction(character), sample_time(numeric), genotype (wt or mutant)
#' @param mint lower sample time bound. Clusters must have sample before this time
#' @param maxt upper sample time bound. Clusters must have sample after this time
#' @param minClusterSize exclude clusters with fewer samples than this
#' @param r specifies generation time (1/r) when transforming to selection coefficients. Default 73 corresponds to five days.
#' @return list with matrix of selection coefficients for different clusters and a t.test for difference in 'mutant' selection coefficients
#' @export
clusterwise_logistic <- function(s, mint = 2020.20, maxt = 2020.35, minClusterSize = 25 , r = 73 , quasi=F)
{
# filter missing
s <- s[with( s, !is.na(sample_time) & !is.na(del_introduction) & !is.na(genotype) ) , ]
# filter by cluster size
x = table( s$del_introduction ) ; x = x[ x >= minClusterSize ]; keep<- names(x)
s <- s[ s$del_introduction %in% keep , ]
# compute spans and make sure they cover period ; exclude others
s_clusts <- split( s, s$del_introduction )
clusts <- names( s_clusts )
clustspans <- sapply( s_clusts , function(ss){
range( ss$sample_time )
})# time range of cluster
colnames( clustspans ) <- clusts
keep <- colnames( clustspans )[ clustspans[1,] <= mint & clustspans[2,] >= maxt ]
s <- s[ s$del_introduction %in% keep , ]
# remove sample times outside of mint and maxt
s <- s[ s$sample_time >= mint , ] #& s$sample_time <= maxt
if ( nrow(s) < 100 )
stop('mint and maxt too restrictive' )
s <- s[ , colnames(s)%in%c('del_introduction', 'sample_time', 'genotype','weight') ] ## remove identifiers
rownames(s) <- NULL
# recompute clust spans
s_clusts <- split( s, s$del_introduction )
clusts <- names( s_clusts )
clustspans <- sapply( s_clusts , function(ss){
range( ss$sample_time )
})# time range of cluster
colnames( clustspans ) <- clusts
# majority rule for genotype
clust_genotypes <- sapply( s_clusts, function(x) {
tt = table(x$genotype)
names( tt[ which.max(tt) ] )[1]
})
s_genotype = split( s, s$genotype )
# analysis 1, individual cluster level
ms <- list()
for(i in 1:length(clusts)){
cc <- clusts[i]
ss = s_clusts[[ cc ]]
ccg = ss$genotype[1]
notccg = setdiff( c('wt', 'mutant') , ccg )
X <- rbind( ss , s_genotype[[ notccg ]] )
X <- X[ with(X, sample_time>=clustspans[1,cc] & sample_time <=clustspans[2,cc]) ,]
X$y <- X$genotype == ccg
if(quasi==F){
m = glm( y ~ sample_time , data = X, family = binomial(link='logit' ))
}else{
m = glm( y ~ sample_time , data = X, family = quasibinomial(link='logit' ),weight=weight)
}
# store to plot
ms[[i]] <- m
}
rs_ests <- sapply(ms,function(m) c( coef(m)[2] , tryCatch({confint(m)[2,]}, error = function(e) { return(c(NA,NA))}) ))
rs_ests_wt <- rs_ests[ , clust_genotypes == 'wt' ]
rs_ests_mutant <- rs_ests[ , clust_genotypes == 'mutant' ]
#sinaplot::sinaplot( list( rs_ests_wt[1,], rs_ests_mutant[1, ] ) )
#kruskal.test( list( rs_ests_wt[1,], rs_ests_mutant[1, ] ) )
s_mutant = rs_ests_mutant/r
smut = s_mutant[1, ]
w = apply( s_mutant , 2, function(x) 1/abs( diff( tail(x,2)) ))
m = lm( smut ~ 1, weight = w );
list( s_mutant = s_mutant
, s_wt = rs_ests_wt/r
, selcoef = c( coef(m) , confint( m ) )
, t.test = t.test ( rs_ests_mutant[1, ]/r )
, data = s
, data_clusts = s_clusts
, clust_genotypes = clust_genotypes
, ms = ms
)
}
#' Hierarchical Bayesian inference of cluster-wise selection coefficients with a random effect corresponding to a dichotomous genotype
#'
#' @param s data frame that describes each sample, must have columns: del_introduction, sample_time, genotype
#' @param MINT minimum sample time for including cluster
#' @param MAXT exclude samples after this date
#' @param MU sets time scale of process and generation time. Generation time is 1/MU years
#' @param minClusterSize minimum size for cluster to be included in analysis
#' @return Bayesian sampler output produced by BayesianTools::runMCMC
#' @export
hier_bayes_exponentialGrowth_frequency <- function(s, iterations = 8000000, thin = 200 , ncpu = 4, MINT = -Inf, MAXT = 2020.25, MU = 73, minClusterSize = 10)
{
library( BayesianTools )
s <- s[ s$sample_time > MINT & s$sample_time <= MAXT , ]
s <- s[ order( s$sample_time ) , ]
x = table( s$del_introduction ) ; x = x[ x >= minClusterSize]; keep_lineages <- names(x)
s <- s [ s$del_introduction %in% keep_lineages , ]
# make d: data frame that describes each cluster, must have columns: lineage (matching del_introduction), first_sample (time of first sample in cluster), genotype ( 'wt' or 'mutant' )
Xs <- split( s, s$del_introduction )
# majority rule for genotype
clust_genotypes <- sapply( Xs, function(x) {
tt = table(x$genotype)
names( tt[ which.max(tt) ] )[1]
})
d = data.frame(
lineage = sapply( Xs, function(x) x$del_introduction[1] )
,genotype = clust_genotypes
,first_sample = sapply( Xs, function(x) min( x$sample_time) )
, stringsAsFactors=FALSE
)
# remove any clusters without samples
d <- d[ d$lineage %in% unique( s$del_introduction ) , ]
M <- nrow(d)
# define index of each lineage
d$ilineage <- 1:nrow(d)
s$ilineage <- d$ilineage[ match( s$del_introduction, d$lineage )]
# indicator for which lineages are extant at each sample
EXTANT <- matrix (0, nrow = nrow(s), ncol = nrow(d) )
for ( k in 1:nrow(d)){
EXTANT[ s$sample_time >= d$first_sample[k] , k ] <- 1
}
# t minus start of cluster
TminusTi <- matrix (-Inf, nrow = nrow(s), ncol = nrow(d) )
for ( k in 1:nrow(d)){
TminusTi[ ,k] <- s$sample_time - d$first_sample[k]
}
# prior
## initial parameters and bounds
pnames <- c( paste(sep='.', 'logn', 1:M) , paste( sep='.', 'r', 1:M), 'cv' )
theta0 <- c( rep(0, M)
, rep( 0, M )
, 1
)
theta0.ub <- c( rep(2, M)
, rep( MU, M )
, 5
)
theta0.lb <- c( rep(-2, M)
, rep(-MU, M )
, .01
)
names(theta0) = names(theta0.lb) = names(theta0.ub) = pnames
## density
dprior0 <- function(theta){
theta <- pmax( theta0.lb, pmin( theta0.ub, theta ))
logns <- theta[1:M]
rs <- theta[ (M+1):(M+M) ]
theta1 <- tail( theta, 1 )
cv <- theta1[1]
sigma <- cv * MU
drs <- rs[ d$genotype=='wt' ]
grs <- rs[ d$genotype=='mutant' ]
dmu = mean( drs )
gmu = mean( grs )
.selcoef <- ( gmu - dmu ) / MU
pr0 <- sum( dnorm( drs , dmu, sigma, log=TRUE ) ) + sum( dnorm(grs, gmu, sigma, log=TRUE) )
pr1 <- sum( dnorm( logns, 0, 1 , log=TRUE ) )
pr2 <- dnorm( .selcoef, 0, 1, log =TRUE) + dgamma(cv , shape=1, scale=1 , log=TRUE)
pr3 <- dnorm( gmu / MU, 0, 1, log=TRUE) + dnorm( dmu/MU, 0, 1, log=TRUE )
unname( pr0 + pr1 + pr2 + pr3)
}
## sampler for starting conditions
rprior0 <- function(n=1)
{
if ( n > 1){
return(
sapply( 1:n, function(k)
pmax( theta0.lb, pmin( theta0.ub,
c( rnorm(M,0,.25)
, rnorm( M, 0, MU/5)
, runif(1, .5, 2)
)
))
)
)
}
pmax( theta0.lb, pmin( theta0.ub,
c( rnorm(M,0,.25)
, rnorm( M, 0, MU/5 )
, runif(1, .5, 2)
)
))
}
# objective functions
ll0 <- function( theta )
{
theta <- pmax( theta0.lb, pmin( theta0.ub, theta ))
logns <- theta[1:M]
rs <- theta[ (M+1):(M+M) ]
denmat <- t(EXTANT) * exp( logns ) * exp(t(TminusTi) * rs )
den <- log( colSums( denmat ) )
num <- ( logns[ s$ilineage ] + TminusTi[ cbind(1:nrow(s), s$ilineage) ] * rs[s$ilineage] )
ll <- sum(num) - sum(den)
ll
}
ll0.1 <- function( theta ){
if ( is.matrix( theta )){
return( apply( theta, MAR=2, ll0 ) )
}
ll0(theta )
}
# fit the model
pr0 <- createPrior( density = dprior0, sampler = rprior0, lower = theta0.lb, upper = theta0.ub, best = theta0 )
bs0 <- createBayesianSetup(ll0.1, prior = pr0, names = pnames, parallel=ncpu)
f0 <- runMCMC(bs0, settings = list(iterations = iterations, thin = thin, startValues = bs0$prior$sampler()))
i <- Sys.getpid()
saveRDS( f0, file=paste0('a5f0_', i, '.rds' ) )
f0$clusterdata <- d
f0$data = s
f0$clust_genotypes = clust_genotypes
f0
}
#' too slow
hier_bayes_pairwiseLogistic_frequency <- function(s, d
, iterations = 8000000, thin = 200
, ncpu = 4
, r = 73 # 5 days
, minclustsize = 10
)
{
library( BayesianTools )
MINT <- -Inf
MAXT <- 2020.25
s <- s[ s$sample_time > MINT & s$sample_time <= MAXT , ]
s <- s[ order( s$sample_time ) , ]
x = table( s$del_introduction ) ; x = x[ x >= minclustsize]; keep_lineages <- names(x)
s <- s [ s$del_introduction %in% keep_lineages , ]
# remove any clusters without samples
d <- d[ d$lineage %in% unique( s$del_introduction ) , ]
M <- nrow(d)
# define index of each lineage
d$ilineage <- 1:nrow(d)
s$ilineage <- d$ilineage[ match( s$del_introduction, d$lineage )]
nc = nrow(d)
n = nrow(s)
# indicator for which lineages are extant at each sample
EXTANT <- matrix (0, nrow = nrow(s), ncol = nrow(d) )
for ( k in 1:nrow(d)){
EXTANT[ s$sample_time >= d$first_sample[k] , k ] <- 1
}
# t minus start of cluster ( lineages X cluster )
TminusTi <- matrix (-Inf, nrow = nrow(s), ncol = nrow(d) )
for ( k in 1:nrow(d)){
TminusTi[ ,k] <- s$sample_time - d$first_sample[k]
}
ti.c.c <- array( NA, c( nrow(d), nrow(d), nrow(s) ))
for ( i in 1:n )
{
x = TminusTi[i, ]
ti.c.c[,,i] <- sapply( 1:nc, function(k) sapply( 1:nc, function(l) min( x[k], x[l] ) ))
}
# compute sel coef between clusters
.smat <- function ( fs ) {
fs %*% t(1/fs) - 1
}
# prior
dprior0 <- function(theta){
theta <- pmax( theta0.lb, pmin( theta0.ub, theta ))
p0s <- theta[1:M]
fs <- theta[ (M+1):(M+M) ]
theta1 <- tail( theta, 1 )
logsd <- theta1[1]
wtfs = fs[ d$genotype == 'wt' ]
mutfs = fs [ d$genotype == 'mutant']
wtmu = mean( wtfs )
mutmu = mean( mutfs )
.selcoef <- mutmu / wtmu - 1
pr0 <- sum( dnorm( log(wtfs), log(wtmu), logsd, log=TRUE ) ) + sum( dnorm( log(mutfs), log(mutmu), logsd, log=TRUE ) )
pr1 <- sum( dbeta( p0s, 1, 4 , log=TRUE ) )
pr2 <- dnorm( .selcoef, 0, 1, log =TRUE) + dexp( logsd , rate = 1/5, log=TRUE )
unname( pr0 + pr1 + pr2 )
}
# initial parameters and bounds
pnames <- c( paste(sep='.', 'p0', 1:M) , paste( sep='.', 'f', 1:M), 'logsd' )
theta0 <- c( rep(1/M, M)
, rep(1, M )
, .5
)
theta0.lb <- c( rep(1e-6, M)
, rep(.1, M )
, .01
)
theta0.ub <- c( rep(1, M)
, rep( 10, M )
, 1
)
names(theta0) = names(theta0.lb) = names(theta0.ub) = pnames
# sampler for starting conditions
rprior0 <- function(n=1)
{
if ( n > 1){
return(
sapply( 1:n, function(k)
pmax( theta0.lb, pmin( theta0.ub,
c( rbeta(M,1,4)
, rlnorm( M, 0, .25 )
, runif(1, .25, .75)
)
))
)
)
}
pmax( theta0.lb, pmin( theta0.ub,
c( rbeta(M,1,4)
, rlnorm( M, 0, .25 )
, runif(1, .25, .75)
)
))
}
# objective functions
ll0 <- function( theta )
{
theta <- pmax( theta0.lb, pmin( theta0.ub, theta ))
p0s <- theta[1:M]
fs <- theta[ (M+1):(M+M) ]
smat <- .smat ( fs )
x0kl <- matrix( 1, nrow = nc , ncol = nc )
nextant <- 1
isextant <- rep(FALSE, nc )
ki = s$ilineage[1]
isextant[ki] <- TRUE
p <- rep(0, nc ); p[ki] <- p0s[ki]
ll = 0
for( i in 2:nrow(s)){
ki = s$ilineage[i]
ti = s$sample_time[i]
iextant <- which(isextant)
if ( nextant > 1 ){
A <- matrix( 0, ncol = nc , nrow = choose( nextant, 2 ) +1 )
A[1, iextant] <- 1
#apply( t( combn(iextant,2) ) , 1, function(kl) {
# k = kl[1]
# l = kl[2]
ai = 2
for ( .k in 1:(length( iextant )-1)) for ( .l in (.k+1):length(iextant)) {
k <- iextant[.k]
l <- iextant[.l]
Arow = rep(0, ncol (A) )
Arow[k] <- 1
Arow[l] <- -x0kl[k,l] * exp( r * smat[k,l] * ti.c.c[ k, l, i] )
A[ai,] <- Arow
ai <- ai + 1
}
b <- rep(0, nrow(A)); b[1] <- 1
p = coef( lm( b ~ A-1 ) )
p [is.na(p)] <- 0
p <- p / sum(p)
}
p0i = p[ki]
if ( ti == d$first_sample[ki] ) {
p0i <- p0s[ ki ]
p <- (1-p0i) * p / sum(p)
p[ki] <- p0i
nextant <- nextant +1
isextant [ ki] <- TRUE
for ( .l in 1:(length( iextant ))) {
l <- iextant[.l]
x0kl[ki, l] <- p0i / p[l]
x0kl[ l, ki] <- p[l] / p0i
}
#~ browser()
}
if ( nextant > 1 ){
ll <- ll + log( p0i) - log( 1-p0i)
}
}
ll
}
#~ ll0( rprior0() )
#~ stop()
ll0.1 <- function( theta ){
if ( is.matrix( theta )){
return( apply( theta, MAR=2, ll0 ) )
}
ll0(theta )
}
# fit the model
pr0 <- createPrior( density = dprior0, sampler = rprior0, lower = theta0.lb, upper = theta0.ub, best = theta0 )
bs0 <- createBayesianSetup(ll0.1, prior = pr0, names = pnames, parallel=ncpu)
f0 <- runMCMC(bs0, settings = list(iterations = iterations, thin = thin, startValues = bs0$prior$sampler()))
i <- Sys.getpid()
saveRDS( f0, file=paste0('hier_bayes_pairwiseLogistic_frequency', '_', i, '.rds' ) )
# TODO include d in output, indicator for genotype
f0
}
#' Compute cluster sizes only counting clusters with first sample occuring within given range and arrange by genotype
#'
#' @param s data frame with sequence_name(character), del_introduction(character), sample_time(numeric), genotype (wt or mutant)
#' @param mint lower sample time bound. Clusters must have sample before this time
#' @param maxt upper sample time bound. Clusters must have sample after this time
#' @return data frame with cluster sizes, origin time, and genotype
#' @export
cluster_sizes <- function(s, mint = 2020.20, maxt = 2020.35)
{
#~ mint = decimal_date( as.Date('2020-08-01'))
#~ maxt = decimal_date( as.Date('2020-10-07'))
# filter missing
s <- s[with( s, !is.na(sample_time) & !is.na(del_introduction) & !is.na(genotype) ) , ]
# compute spans and make sure they **begin** within period ; exclude others
s_clusts <- split( s, s$del_introduction )
clusts <- names( s_clusts )
clust_starts <- sapply( s_clusts , function(ss){
min( ss$sample_time )
})
names( clust_starts ) <- clusts
keep <- names( clust_starts )[ clust_starts >= mint & clust_starts <= maxt ]
s <- s[ s$del_introduction %in% keep , ]
# remove sample times outside of mint and maxt
s <- s[ s$sample_time >= mint & s$sample_time <= maxt , ]
s <- s[ , c('del_introduction', 'sample_time', 'genotype') ] ## remove identifiers
rownames(s) <- NULL
# recompute clust spans
s_clusts <- split( s, s$del_introduction )
clusts <- names( s_clusts )
clustspans <- sapply( s_clusts , function(ss){
range( ss$sample_time )
})# time range of cluster
colnames( clustspans ) <- clusts
clust_starts <- sapply( s_clusts , function(ss){
min( ss$sample_time )
})
names( clust_starts ) <- clusts
clust_genotypes <- sapply( s_clusts, function(x) {
# majority rule
tx = table( x$genotype )
names(tx)[which.max(tx)]
})
s_genotype = split( s, s$genotype )
X= data.frame( cluster = clusts
, origin = clust_starts
, size = sapply( s_clusts, nrow )
, genotype = clust_genotypes
)
X
}
#' @export
plot_cluster_sizes <- function(X, mincs = 2 ){
library( ggplot2 )
X1 <- X [ X$size > mincs , ]
p = ggplot( X1 , aes( x = origin, y = size, colour = genotype ) ) + geom_point() + scale_y_log10() + stat_smooth( method = lm )
p
}
#' Performs a simple logistic model fit and make a frequency plot
#'
#' genotype has two values: 'ancestral'(or 'wt') and mutant
#'
#' @param s1 data frame with columns genotype and sample_time
#' @param cols Colours for plotting
#' @param bw bin width for aggregating data
#' @export
plpg <- function( s1, bw = 1 , cols = c(ancestral='white', mutant='black')
, g = 365 / 6.5 # recovery rate
, RD = seq( 1.1,1.5, length = 100) #hypothetical rep numbers for ancestral
, gentime = 6.5/365
, timeScaleMethod = c( 'gentime', 'growth')
)
{
library( ggplot2 )
library( lubridate )
library( cowplot )
x = table( s1$genotype )
if ( 'wt' %in% names (x ) )
s1$genotype [ s1$genotype == 'wt'] <- 'ancestral'
m = glm( (genotype=='mutant') ~ sample_time, family = binomial(link='logit'), data = s1 )
print( summary( m ))
timeScaleMethod = timeScaleMethod[1]
rs = c( coef( m )[2] , confint( m ) [2, ] )
if (timeScaleMethod=='growth') {
rD <- RD * g - g
#~ selcoef <- rs / rD
selcoef <- (rs) %*% t( 1/rD )
print(median( selcoef ))
print(range( selcoef ))
selcoef = c( median( selcoef ), range( selcoef ) )
} else if ( timeScaleMethod=='gentime' ){
selcoef = rs * gentime
}
s1 <- s1[ !is.na( s1$sample_time ), ]
.cols = cols
.time <- seq( min( s1$sample_time )-bw/365, max( s1$sample_time )+bw/365, by = bw / 365 )
h = hist( s1$sample_time, breaks = .time , plot=FALSE)
s1dg = split( s1, s1$genotype )
hd = hist( s1dg$ancestral$sample_time, breaks = .time , plot=F)
hg = hist( s1dg$mutant$sample_time, breaks = .time , plot=F)
sweek = data.frame( time = hd$mids, nD = hd$counts, nG = hg$counts, date = date_decimal( hd$mids ) )
sweek$n = with( sweek, nD + nG )
sweek$pG <- with( sweek, nG / n )
#.time <- seq( 2020, 2020.25, length = 100 )
theta2pg <- function( theta ){
t50 = -theta[1] / theta[2]
logodds50 = theta[1]+t50 * theta[2]
#pg50 = exp( logodds50 ) / ( 1 + exp( logodds50) ) # = 50%, checking the math
pg.time <- exp( theta[2] * (sweek$time - t50 ) ) / ( 1 + exp( theta[2] * (sweek$time - t50 ) ) )
pg.time
}
sweek$fitted <- theta2pg( coef(m) )
prof = profile( m, which =1 )[[1]]
thetaub = tail( unlist( tail( prof[ prof$z < -1.96 , ], 1) ), 2 )
thetalb = tail( unlist( head( prof[ prof$z > 1.96 , ], 1) ), 2 )
sweek$fittedub = theta2pg( thetaub )
sweek$fittedlb = theta2pg( thetalb )
sweek$nglb = with( sweek , qbinom( .025, size = n, prob = fittedub) )
sweek$ngub = with( sweek, qbinom( .975, size = n, prob = fittedlb ) )
sweek$pglb <- with( sweek, nglb / n )
sweek$pgub <- with( sweek, ngub / n )
k = with( sweek , pmin(nD,nG)==0 )
sweek$pgub[ k ] <- 1
sweek$pglb[ k ] <- 0
p0 = ggplot( data = sweek, aes( x = date, y = fitted , ymax = pgub, ymin=pglb ) ) + geom_path() + geom_point( aes( x = date, y = pG, size =n ), colour = 'black' ) + geom_ribbon(alpha = .2 ) + theme_classic() + theme( legend.position = 'none' ) + xlab('') + ylab('Frequency of sampling mutant')
X = with (sweek, data.frame( date = date, n = nD , genotype='ancestral' ) )
X <- rbind( X, with (sweek, data.frame( date = date, n = nG , genotype='mutant' ) ) )
p1 = ggplot( data= X , aes( x = date, y = n, fill = genotype)) + geom_bar( position = 'stack', stat='identity', colour = 'grey' , alpha=.5) + scale_fill_manual( values = .cols) + xlab('') + ylab('Samples') + theme_classic() + theme( legend.position='top')
p = plot_grid( plotlist= list( p0 , p1 ), nrow = 2) # ggtitle( paste('sel coef', selcoef))
p$m = m
list( selcoef = selcoef , plot1=p0, plot2=p1, plot3=p, modelfit=m )
}
#' Compute cumulative proportion of samples descended from clusters originating within a specified time window
#'
#' @param s data frame with del_introduction(character), sample_time(numeric)
#' @param mint lower sample time bound. Clusters must have sample before this time
#' @param maxt upper sample time bound. Clusters must have sample after this time
#' @param res integer resolution of time axis
#' @return list with 1) result with columns: cumulative fraction of samples descended from new clusters, number descended from old clusters, number descended from new clusters 2) data with cluster subset
#' @export
cluster_origin_comparison <- function(s, mint = decimal_date( as.Date('2020-08-01')) , maxt = Inf , res = 50 )
{
if ( 'del_lineage' %in% colnames(s) & !('del_introduction' %in% colnames(s) ) )
s$del_introduction <- s$del_lineage
# filter missing
s <- s[with( s, !is.na(sample_time) & !is.na(del_introduction) ) , ]
# compute spans and make sure they cover period ; exclude others
s_clusts <- split( s, s$del_introduction )
clusts <- names( s_clusts )
clustspans <- sapply( s_clusts , function(ss){
range( ss$sample_time )
})# time range of cluster
colnames( clustspans ) <- clusts
newClusts <- clusts[ clustspans[1,] >= mint & clustspans[2, ] <= maxt ]
s$isnew <- s$del_introduction %in% newClusts
# remove sample times outside of mint and maxt
s <- s[ s$sample_time >= mint & s$sample_time <= maxt , ]
rownames(s) <- NULL
s <- s[ order(s$sample_time ) , ]
ss <- split( s, s$isnew )
taxis <- seq( mint, max( s$sample_time ), length = res )
X = t( sapply( taxis , function(tt){
c( sum( ss[[1]]$sample_time <= tt ), sum( ss[[2]]$sample_time <= tt ) )
}) )
colnames(X) <- names( ss )
pnew = X[,'TRUE'] / rowSums( X )
res = cbind( taxis, pnew, X )
list( result = res, data = s )
}
#' Compute cumulative proportion of samples descended from clusters originating within a specified time window or uk lineage
#'
#' This returns a data frame with lieange log odds broken down by epi week
#'
#' @param s data frame with del_introduction(character), sample_time(numeric), and epi_week (integer)
#' @param uk_lineage optional character vector of uk_lineage
#' @param mint lower sample time bound. Clusters must have sample before this time
#' @param maxt upper sample time bound. Clusters must have sample after this time
#' @return data frame with log odds of new lineages or uk_lineage
#' @export
cluster_origin_comparison2 <- function(s, uk_lineage = NULL, genotype = NULL, mint = decimal_date( as.Date('2020-07-01')) , maxt = Inf, detailed=TRUE )
{
if ( 'del_lineage' %in% colnames(s) & !('del_introduction' %in% colnames(s) ) )
s$del_introduction <- s$del_lineage
# filter missing
s <- s[with( s, !is.na(sample_time) & !is.na(del_introduction) ) , ]
# compute spans and make sure they cover period ; exclude others
s_clusts <- split( s, s$del_introduction )
clusts <- names( s_clusts )
clustspans <- sapply( s_clusts , function(ss){
range( ss$sample_time )
})# time range of cluster
colnames( clustspans ) <- clusts
newClusts <- clusts[ clustspans[1,] >= mint & clustspans[2, ] <= maxt ]
s$isnew <- s$del_introduction %in% newClusts
# remove sample times outside of mint and maxt
s <- s[ s$sample_time >= mint & s$sample_time <= maxt , ]
rownames(s) <- NULL
s <- s[ order(s$sample_time ) , ]
if (!is.null( uk_lineage) )
ss = split( s, s$uk_lineage==uk_lineage )
else if ( !is.null( genotype ))
ss = split( s, s$genotype == genotype )
else
ss <- split( s, s$isnew )
weeks = seq( min ( s$epi_week ) , max( s$epi_week ))
#~ browser()
weights = sapply( weeks, function(w){
n1 <- sum( ss[['TRUE']]$epi_week == w )
n2 = sum( ss[['FALSE']]$epi_week == w )
n <- n1 + n2
if ( n1 == 0 )
return(0)
if ( n2 == 0 )
return(0)
p = sum( ss[['TRUE']]$epi_week == w ) / n
v = p * (1 - p) / n
1 / sqrt( v )
})
weights = weights / median( weights )
logodds = sapply( weeks, function(w){
n1 <- sum( ss[['TRUE']]$epi_week == w )
n2 = sum( ss[['FALSE']]$epi_week == w )
if ( n1 == 0 | n2 == 0 )
return(NA)
log( n1 ) - log(n2 )
})
#list( result=cbind( weeks = weeks, weights = weights, logodds = logodds ) , data = s )
res = data.frame( weeks = weeks, weights = weights, logodds = logodds )
if ( detailed)
res = list(
result = res
, data = s
)
return(res)
}
#' Simple log odds freqency plot for given variable. Does not partition by cluster.
#'
#' If the column 'weight' is in the input data frame, this will use the values to weight each observation.
#' If computing for genotypes you will want to remove gaps and 'X' before passing to this function; also deduplicate by patient id.
#'
#' @param s data frame which must contain sample_time and 'variable' for each sequence
#' @param variable Column of s which contains variable to split the data
#' @param value The value in conlumn 'variable' for which we will compute frequency
#' @param mint Minimum sample time to include
#' @param maxt Max time to include
#' @param form formula passed to mgcv::gam for estimating frequencies
#' @return list with data, plots, and estimated frequencies
#' @export
variable_frequency_epiweek <- function(s, variable='genotype', value='mutant', mint = -Inf , maxt = Inf, detailed=TRUE , form = y~s(sample_time, bs = 'gp', k = 5) )
{
stop( 'Not implemented; needs revisions')
library( lubridate )
# remove sample times outside of mint and maxt
if ( ( 'sample_date' %in% colnames(s) ) & !('sample_time' %in% colnames(s) ) )
{
s$sample_time <- decimal_date( as.Date( ymd( s$sample_date ) ) )
}
stopifnot( is.numeric( s$sample_time ) )
s <- s[ s$sample_time >= mint & s$sample_time <= maxt , ]
rownames(s) <- NULL
s <- s[ order(s$sample_time ) , ]
if ( !('epi_week' %in% colnames(s))){
s$epi_week <- lubridate::epiweek( s$sample_time )
i <- (year( s$sample_date ) < 2021) & (s$epi_week < 53)
s$epi_week[ i ] <- s$epi_week[ i ] + 53
}
weeks = seq( min ( s$epi_week ) , max( s$epi_week ))
weekstarts = sapply( weeks, function(x) min( na.omit( s$sample_time[ s$epi_week==x ] ) ) )
weekends = sapply( weeks, function(x) max( na.omit( s$sample_time[ s$epi_week==x ] ) ) )
if ( 'weight' %in% colnames(s))
s$weight <- s$weight / mean( s$weight )
ss <- split( s, s[[variable]]==value )
weights = sapply( weeks, function(w){
if ( 'weight' %in% colnames(s) ) {
n1 <- sum( ss[['TRUE']]$weight[ ss[['TRUE']]$epi_week==w ] )
n2 <- sum( ss[['FALSE']]$weight[ ss[['FALSE']]$epi_week==w ] )
} else{
n1 <- sum( ss[['TRUE']]$epi_week == w )
n2 = sum( ss[['FALSE']]$epi_week == w )
}
n <- n1 + n2
if ( n1 == 0 )
return(0)
if ( n2 == 0 )
return(0)
p = sum( ss[['TRUE']]$epi_week == w ) / n
v = p * (1 - p) / n
1 / sqrt( v )
})
weights = weights / median( weights )
logodds = sapply( weeks, function(w){
if ( 'weight' %in% colnames(s) ) {
n1 <- sum( ss[['TRUE']]$weight[ ss[['TRUE']]$epi_week==w ] )
n2 <- sum( ss[['FALSE']]$weight[ ss[['FALSE']]$epi_week==w ] )
} else{
n1 <- sum( ss[['TRUE']]$epi_week == w )
n2 = sum( ss[['FALSE']]$epi_week == w )
}
if ( n1 == 0 | n2 == 0 )
return(NA)
log( n1 ) - log(n2 )
})
library( mgcv )
.s1 <- s[ order( s$sample_time ) , ]
.s1 <- .s1[ .s1$sample_time >= (min(ss[['TRUE']]$sample_time)-1/365), ]
.s1$y <- .s1[[variable]]==value
m = mgcv::gam( form , family = binomial(link='logit') , data = .s1 )
.s1$estimated = predict( m )
estdf = data.frame( time = .s1$sample_time, estimated_logodds = .s1$estimated )
estdf <- estdf[ !duplicated(estdf$time), ]
estdf <- estdf[ order( estdf$time ) , ]
estdf$epiweek = lubridate::epiweek( date_decimal( estdf$time ) )
week2bnds = t( sapply( weeks, function(w){
if ( 'weight' %in% colnames(s) ) {
n1 <- round( sum( ss[['TRUE']]$weight[ ss[['TRUE']]$epi_week==w ] ) )
n2 <- round( sum( ss[['FALSE']]$weight[ ss[['FALSE']]$epi_week==w ] ) )
} else{
n1 <- sum( ss[['TRUE']]$epi_week == w )
n2 = sum( ss[['FALSE']]$epi_week == w )
}
lo= median( estdf[estdf$epiweek == w , ]$estimated_logodds )
f = exp( lo) / ( 1 + exp( lo ))
ub = qbinom( .975, size = n1 + n2, prob = f ) / (n1+n2 )
lb = qbinom( .025, size = n1 + n2, prob = f ) / (n1+n2 )
c( log(lb/(1-lb)), log(ub/(1-ub)) )
}))
estdf$lb = week2bnds[ match( estdf$epiweek, weeks), 1]
estdf$ub = week2bnds[ match( estdf$epiweek, weeks), 2]
res = data.frame( time = weekends, weeks = weeks, weights = weights, logodds = logodds )
if ( detailed) {
library( ggplot2 )
res = list(
result = res
, estimated = estdf
, data = .s1
#~ , plot = ggplot( aes( x = as.Date( date_decimal( time ) ), y = logodds, size=weights, weight=weights ) , data= res ) + geom_point () + theme_minimal() + theme( legend.pos='') + geom_smooth(method=stats::loess) + xlab('') + ggtitle( paste0('Frequency of ', variable, '=', value) ) #, method.args=list(span = 1)
, plot = ggplot() +
geom_point ( aes( x = as.Date( date_decimal( time ) ), y = logodds, size=weights, weight=weights ) , data= res) +
theme_minimal() + theme( legend.pos='') +
geom_path( aes( x = as.Date( date_decimal( time ) ), y = estimated_logodds), data = estdf , color = 'blue') +
ggtitle( paste0('Frequency of ', variable, '=', value) )
)
}
return(res)
}
#' @export
variable_frequency_day <- function(s, variable='genotype', value='mutant', mint = -Inf , maxt = Inf, detailed=TRUE , form = y~s(sample_time, bs = 'gp', k = 5) )
{
library( ggplot2 )
library( lubridate )
# remove sample times outside of mint and maxt
stopifnot( ('sample_date' %in% colnames(s)) | ('sample_time' %in% colnames(s)) )
if ( ( 'sample_date' %in% colnames(s) ) & !('sample_time' %in% colnames(s) ) )
{
s$sample_time <- decimal_date( as.Date( ymd( s$sample_date ) ) )
}
stopifnot( is.numeric( s$sample_time ) )
s <- s[ s$sample_time >= mint & s$sample_time <= maxt , ]
rownames(s) <- NULL
s <- s[ order(s$sample_time ) , ]
if ( 'weight' %in% colnames(s)){
s$weight <- s$weight / mean( s$weight )
} else{
s$weight <- 1
}
ss <- split( s, s[[variable]]==value )
library( mgcv )
.s1 <- s[ order( s$sample_time ) , ]
.s1 <- .s1[ .s1$sample_time >= (min(ss[['TRUE']]$sample_time)-1/365), ]
.s1$y <- .s1[[variable]]==value
m = mgcv::gam( form , family = binomial(link='logit') , data = .s1 )
.s1$estimated = predict( m )
estdf = data.frame( time = .s1$sample_time, estimated_logodds = .s1$estimated )
estdf <- estdf[ !duplicated(estdf$time), ]
estdf <- estdf[ order( estdf$time ) , ]
estdf <- cbind( estdf, t( sapply( 1:nrow(estdf), function(k){
.s2 <- .s1[ .s1$sample_time == estdf$time[k] , ]
n <- nrow( .s2 )
lo= estdf$estimated_logodds[k]
f = exp( lo) / ( 1 + exp( lo ))
ub = qbinom( .975, size = n, prob = f ) / n
lb = qbinom( .025, size = n, prob = f ) /n
n1 <- sum( .s2[[variable]]==value)
n2 <- n - n1
ef = n1 / n
c(lb= log(lb/(1-lb)), ub = log(ub/(1-ub)) , n = n
, weights = 1 / sqrt( f*(1-f)/n )
, logodds = log(ef / ( 1 - ef )) )
})) )
#, weight=weights
pl = ggplot() +
geom_point( aes( x = as.Date( date_decimal( time ) ), y = logodds, size=n ), data= estdf) +
theme_minimal() +
theme( legend.pos='' ) +
xlab('') +
ggtitle( paste0('Frequency of ', variable, '=', value) ) +
geom_path( aes(x=as.Date(date_decimal( time )), y=estimated_logodds), data = estdf, color = 'blue' , size = 1) +
geom_ribbon( aes(x = as.Date(date_decimal( time )), ymin = lb, ymax = ub ), data = estdf , alpha = .25 )
if ( detailed) {
library( ggplot2 )
res = list(
result = estdf
, data = .s1
, plot = pl
)
}
return(res)
}
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.