#' Produce a random sample of given global lineage and sample weights
#'
#' Sampling is repeated multiple times and over multiple sample sizes. Can confine sampling to a particular time frame. A proportion of samples can be stratified through time rather than being drawn completely at random (weighted smapling is still used within each week)
#'
#' @param lineage Global lineage
#' @param weightsfn output produced by coverage_weights
#' @param mindate minimum date
#' @param maxdate maximum date; note it takes about 12 days for P2 sampling to stabilize
#' @param nreps replicates
#' @param ns sample sizes to use
#' @param prop_stratified proportion of sample to reserve for stratified sampling
#' @param deduplicate if TRUE, duplicate samples will be removed
#' @export
sample_lineage <- function(
lineage = 'B.1.1.7'
, weightsfn = '/cephfs/covid/bham/climb-covid19-volze/b0-weightsdf-2021-01-04.csv'
, mindate = as.Date( '2020-10-15' )
, maxdate = as.Date( Sys.Date() - 12 )
, nreps = 100
, ns = c( 250, 500, 750, 1000 )
, prop_stratified = .25
, deduplicate = TRUE
){
library( lubridate )
library( ape )
library( glue )
mintime <- decimal_date( mindate )
maxtime <- decimal_date( maxdate )
wdf = na.omit( read.csv( weightsfn , stringsAs=FALSE ) )
civetfn = list.files( '../phylolatest/civet/' , patt = 'cog_global_[0-9\\-]+_metadata.csv', full.names=TRUE) #'../phylolatest/civet/cog_global_2020-12-01_metadata.csv'
civmd = read.csv( civetfn , stringsAs=FALSE , header=TRUE )
civmd$central_sample_id <- sapply( strsplit( civmd$sequence_name , split='/' ) , '[', 2 ) # for linkage
civmd$sample_date <- as.Date( civmd$sample_date )
civmd$sample_time <- decimal_date( civmd$sample_date )
# filter dates
civmd <- civmd[ civmd$sample_time >= mintime , ]
civmd <- civmd[ civmd$sample_time <= maxtime , ]
# filter lineage
civmd <- civmd[ civmd$lineage == lineage , ]
# combine
s <- merge( civmd, wdf, by = 'central_sample_id' )
# exclude p1
lhls <- c( 'MILK', 'ALDP', 'QEUH', 'CAMC')
s$lighthouse = FALSE
for (lh in lhls ){
s$lighthouse[ grepl(s$central_sample_id, patt = lh) ] <- TRUE
}
s <- s [ s$lighthouse , ]
s$epiweek <- epiweek( s$sample_date )
# load tree and deduplicate
tr0 = read.tree( list.files( '../phylolatest/trees', patt = '.*newick', full=T ))
tr0$tip.label <- sapply( strsplit( tr0$tip.label, split='/' ), '[', 2 )
tr1 <- keep.tip( tr0, s$central_sample_id )
if ( deduplicate )
{
if ( Ntip (tr1) > 6e3 ){
stop('Sample size is large. Current dedup code cannot handle this size: ', Ntip(tr1 ))
}
D <- as.matrix( cophenetic.phylo( tr1 ) ) # TODO not sustainable
diag(D) <- 0
clusters = sapply(rownames(D), function(x) {
paste( sort( colnames(D)[ D[x,] < 1e-6 ] ), collapse = '.' )
})
s$cluster = NA
s$cluster <- clusters[ match( rownames(D), s$central_sample_id ) ]
s <- s[ order( s$sample_time ) , ]
.s <- s[ !duplicated( s$cluster ), ]
}
# main sampling func
.sample <- function( n )
{
nstrat <- floor ( n * prop_stratified )
nn <- n - nstrat
npw <- floor( nstrat / length( unique( s$epiweek )) )
## make strat sample
stratsample <- do.call( c, lapply( split( s, s$epiweek ), function(ss){
sample( ss$central_sample_id , prob = ss$coverage_weight, replace=FALSE, size = min(nrow(ss),npw) )
}))
## make remainder
sids <- setdiff( s$central_sample_id , stratsample )
s1 <- s[ !( s$central_sample_id %in% stratsample ), ]
osample <- with( s1, sample( central_sample_id , prob = coverage_weight, replace=FALSE, size = min( nrow(s1), n - length(stratsample)) ))
c( stratsample, osample )
}
#~ .sample( 100 )
# sample over reps, ns;
Z <- list()
for ( n in ns ){
X = do.call( rbind, lapply(1:nreps, function(k){
y = data.frame( central_sample_id = .sample( n ), replicate = k , sample_size = n )
}))
Z[[ as.character(n) ]] <- X
write.csv( X, file = glue('sampler1_{lineage}_{Sys.Date()}_n={X$sample_size[1]}{ifelse(deduplicate,"-deduped","")}.csv') )
}
#make tres
for (X in Z ){
Xs <- split( X, X$replicate )
tres <- lapply( Xs, function(y){
keep.tip( tr1, intersect( y$central_sample_id, tr1$tip.label ) )
})
class( tres ) <- 'multiPhylo'
write.tree( tres, file = glue('sampler1_{lineage}_{Sys.Date()}_n={X$sample_size[1]}{ifelse(deduplicate,"-deduped","")}.nwk') )
}
list(
trees = tres
, tables = Z
)
}
#' Matched sample by time (week) and UTLA
#'
#' @param csids central_sample_id to be matched
#' @param lineage If not NULL, sample will only include this lineage
#' @param not_lineage The sample will *not* include this lineage
#' @param nreps replicates
#' @param n_multiplier Optionally can draw multiple (integer) matches per element in csids
#' @param deduplicate not implemented
#' @export
matched_sample <- function(
csids
, lineage = NULL
, not_lineage = 'B.1.1.7'
, nreps = 1
, n_multiplier = 1
, deduplicate = FALSE
) {
stopifnot( !deduplicate ) #not implemented
#~ csids <- read.tree('sampler1_B.1.1.7_2021-01-04_n=500-deduped.nwk')[[1]]$tip.label
#~ lineage = NULL
#~ not_lineage = 'B.1.1.7'
#~ weightsfn = '/cephfs/covid/bham/climb-covid19-volze/b0-weightsdf-2021-01-04.csv'
#~ nreps = 100
#~ n_multiplier = 1
library( lubridate )
library( ape )
library( glue )
civetfn = list.files( '../phylolatest/civet/' , patt = 'cog_global_[0-9\\-]+_metadata.csv', full.names=TRUE) #'../phylolatest/civet/cog_global_2020-12-01_metadata.csv'
civmd = read.csv( civetfn , stringsAs=FALSE , header=TRUE )
civmd$central_sample_id <- sapply( strsplit( civmd$sequence_name , split='/' ) , '[', 2 ) # for linkage
civmd$sample_date <- as.Date( civmd$sample_date )
civmd$sample_time <- decimal_date( civmd$sample_date )
# load majora '../latest/majora.20201204.metadata.matched.tsv'
jdf <- read.csv( list.files( '../latest/' , patt = 'majora.[0-9]+.metadata.matched.tsv', full.names=TRUE)
, stringsAs=FALSE, sep = '\t' )
# combine
s <- merge( civmd, jdf[ , c('central_sample_id', 'adm2') ], by = 'central_sample_id' )
# exclude p1
lhls <- c( 'MILK', 'ALDP', 'QEUH', 'CAMC')
s$lighthouse = FALSE
for (lh in lhls ){
s$lighthouse[ grepl(s$central_sample_id, patt = lh) ] <- TRUE
}
s <- s [ s$lighthouse , ]
s$epiweek <- epiweek( s$sample_date )
s$year <- year ( s$sample_date )
# load tree and deduplicate
tr0 = read.tree( list.files( '../phylolatest/trees', patt = '.*newick', full=T ))
tr0$tip.label <- sapply( strsplit( tr0$tip.label, split='/' ), '[', 2 )
tr1 <- keep.tip( tr0, s$central_sample_id )
#~ StatMatch package, NND.hotdeck
## find controls
### filter lineage
s <- s[ , c('year' , 'epiweek', 'adm2', 'central_sample_id', 'lineage') ]
s <- na.omit( s)
s$key <- with( s, paste( sep = '.' , year, epiweek, adm2 ) )
srec <- s[ s$central_sample_id %in% csids , ]
sdon <- s[ !( s$central_sample_id %in% csids ) & (s$lineage!=not_lineage) , ]
.sample <- function()
{
csid_control = do.call(c, lapply( srec$key , function (key ){
.s <- sdon[ sdon$key == key , ]
n <- min( nrow( .s ) , n_multiplier )
if ( n == 0 )
return( NA )
sample ( .s$central_sample_id, size = n , replace=FALSE )
}) )
csid_control <- na.omit( csid_control )
csid_control
}
# sample over reps, ns;
{
X = do.call( rbind, lapply(1:nreps, function(k){
csids_control = .sample( )
y = data.frame( central_sample_id = csids_control, replicate = k , sample_size = length( csids_control) )
y
}))
write.csv( X, file = glue('matchSample_not{not_lineage}_{Sys.Date()}.csv') )
}
#make tres
{
Xs <- split( X, X$replicate )
tres <- lapply( Xs, function(y){
keep.tip( tr1, intersect( y$central_sample_id, tr1$tip.label ) )
})
class( tres ) <- 'multiPhylo'
write.tree( tres, file = glue('matchSample_not{not_lineage}_{Sys.Date()}.nwk') )
}
list(
trees = tres
, tables = X
)
}
#' Matched sample by time (week) and UTLA for each sampled tree output from sample_lineage function
#'
#' @param sample_lineage_output output from sample_lineage
#' @param lineage If not NULL, will interpret as regular expression; only matches will be sampled
#' @param lineage_name incorporated into output file name
#' @param not_lineage The sample will *not* include this lineage
#' @param n_multiplier Optionally can draw multiple (integer) matches per element in csids
#' @param deduplicate not implemented
#' @export
matched_sample2 <- function(
sample_lineage_output_table
, lineage = NULL # 'B\\.1\\.177.*' # pattern
, lineage_name = NA # for output file name
, not_lineage = 'B.1.1.7' # dont sample these
, n_multiplier = 1
, deduplicate = FALSE
) {
stopifnot( !deduplicate ) #not implemented
library( lubridate )
library( ape )
library( glue )
csids_list = lapply( split(sample_lineage_output_table, sample_lineage_output_table$replicate) , function(y) y$central_sample_id )
nreps = length( csids_list )
civetfn = list.files( '/cephfs/covid/bham/climb-covid19-volze/phylolatest/civet/' , patt = 'cog_global_[0-9\\-]+_metadata.csv', full.names=TRUE) #'../phylolatest/civet/cog_global_2020-12-01_metadata.csv'
civmd = read.csv( civetfn , stringsAs=FALSE , header=TRUE )
civmd$central_sample_id <- sapply( strsplit( civmd$sequence_name , split='/' ) , '[', 2 ) # for linkage
civmd$sample_date <- as.Date( civmd$sample_date )
civmd$sample_time <- decimal_date( civmd$sample_date )
# load majora '../latest/majora.20201204.metadata.matched.tsv'
jdf <- read.csv( list.files( '/cephfs/covid/bham/climb-covid19-volze/latest/' , patt = 'majora.metadata.matched.tsv', full.names=TRUE)
, stringsAs=FALSE, sep = '\t' )
# combine
s <- merge( civmd, jdf[ , c('central_sample_id', 'adm2') ], by = 'central_sample_id' )
# exclude p1
lhls <- c( 'MILK', 'ALDP', 'QEUH', 'CAMC')
s$lighthouse = FALSE
for (lh in lhls ){
s$lighthouse[ grepl(s$central_sample_id, patt = lh) ] <- TRUE
}
s <- s [ s$lighthouse , ]
s$epiweek <- epiweek( s$sample_date )
s$year <- year ( s$sample_date )
# load tree and deduplicate
tr0 = read.tree( list.files( '/cephfs/covid/bham/climb-covid19-volze/phylolatest/trees', patt = '.*newick', full=T ))
tr0$tip.label <- sapply( strsplit( tr0$tip.label, split='/' ), '[', 2 )
tr1 <- keep.tip( tr0, s$central_sample_id )
#~ StatMatch package, NND.hotdeck
## find controls
### filter lineage
s <- s[ , c('year' , 'epiweek', 'adm2', 'central_sample_id', 'lineage') ]
s <- na.omit( s)
s$key <- with( s, paste( sep = '.' , year, epiweek, adm2 ) )
.sample <- function(k)
{
csids = csids_list[[k]]
srec <- s[ s$central_sample_id %in% csids , ]
sdon <- s[ !( s$central_sample_id %in% csids ) & (s$lineage!=not_lineage) , ]
if ( !is.null( lineage ) ){
sdon <- sdon [ grepl(sdon$lineage, patt = lineage) , ]
}
csid_control = do.call(c, lapply( srec$key , function (key ){
.s <- sdon[ sdon$key == key , ]
n <- min( nrow( .s ) , n_multiplier )
if ( n == 0 )
return( NA )
sample ( .s$central_sample_id, size = n , replace=FALSE )
}) )
csid_control <- na.omit( csid_control )
print(paste('made sample', k))
csid_control
}
# sample over reps, ns;
{
Xs <- lapply(1:nreps, function(k){
csids_control = .sample( k )
y = data.frame( central_sample_id = csids_control, replicate = k , sample_size = length( csids_control) )
y
})
saveRDS( Xs, file = glue('matchSample_not{not_lineage}_lineage{lineage_name}_{Sys.Date()}.rds') )
}
#make tres
{
tres <- lapply( Xs, function(y){
keep.tip( tr1, intersect( y$central_sample_id, tr1$tip.label ) )
})
class( tres ) <- 'multiPhylo'
write.tree( tres, file = glue('matchSample_not{not_lineage}_lineage{lineage_name}_{Sys.Date()}.nwk') )
}
list(
trees = tres
, tables_list = Xs
)
}
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.