In emvolz-phylodynamics/variantAnalysis: sarscov2 variant analysis

library("knitr")
options(knitr.duplicate.label = "allow")

library( ape )
library( lubridate )
library( xtable )
library( variantAnalysis )
library( glue )
library( stringr ) 
library( kableExtra )
library( sjmisc )
library( Hmisc )
library( dplyr )
library( tidyr )

#lineages = c("A.23.1", "B.1.351", "B.1.525")
#mutations =  NULL #c("E494K")
#cut_off = NULL #c("Logistic growth rate > 0.2")

#data paths

scanner_fn = glue('{path_to_scanner}scanner-{max_date}-{min_date}.rds')
scanner_env_fn = glue('{path_to_scanner}scanner-env-{max_date}_{min_date}.rds')
data_fn = glue('{path_to_data}')

#load data

ress = readRDS(scanner_fn)
envs = readRDS(scanner_env_fn)
Ygr1 = readRDS(glue('{path_to_scanner}Ygr1_{max_date}_{min_date}.rds')) 
s = readRDS(glue('{path_to_scanner}sequence_data_{max_date}_{min_date}.rds'))

mutfn = list.files( paste0( path_to_data) ,   
                    patt = 'cog_global_[0-9\\-]+_mutations.csv' , 
                    full.name = TRUE)

afn = list.files( paste0( path_to_data) ,   
                  patt = 'cog_[0-9\\-]+_metadata.csv' , 
                  full.name = TRUE)
cfn = list.files( paste0( path_to_data) ,   
                  patt = 'cog_global_[0-9\\-]+_metadata.csv' , 
                  full.name = TRUE)
gfn = list.files( paste0( path_to_data) , 
                  patt = 'cog_global_[0-9\\-]+_geography.csv', 
                  full.names=TRUE) 
wfn = list.files( paste0( path_to_data) , 
                  patt = 'a0-weightsdf-[0-9\\-]+.csv',
                  full.name = TRUE)

mdf = read.csv( mutfn , stringsAs=FALSE )
mdf$lineage_muts = mdf$lineage
mdf = mdf %>% select(lineage_muts, variants, sequence_name)

gdf = read.csv(gfn , stringsAs = FALSE)
gdf = gdf %>% select(longitude, latitude, sequence_name)

amd = read.csv( afn , stringsAs = FALSE ) 
amd$sample_time = decimal_date(ymd(amd$sample_date))

cmd = read.csv(cfn, stringsAs = FALSE)
cmd = cmd %>% select(sequence_name, central_sample_id)

wdf = read.csv(wfn, stringsAs = FALSE )

amd = merge(amd, cmd, by = "sequence_name")
amd = merge(amd, mdf, by = "sequence_name")
amd = merge(amd, gdf, by = "sequence_name")
amd = merge(amd, wdf, by = "central_sample_id")


#parameters setup

max_time = decimal_date(as.Date( max_date ))
min_time = decimal_date(as.Date( min_date ))

amd$sample_date = ymd( amd$sample_date )

  # exclude global 
amd <- amd[ amd$country == 'UK', ]
  # exclude p1 
if ( !include_pillar1 )
  {
    lhls <- paste0( '.*/', c( 'MILK', 'ALDP', 'QEUH', 'CAMC'), '.*')
    lhpatt = paste( lhls, collapse = '|' )
    amd <- amd[ grepl(amd$sequence_name, patt = lhpatt ) , ] 
  }

  # sample time 
amd$sts <- decimal_date ( as.Date( amd$sample_date ) )
amd <- amd [ (amd$sts >= min_time) & (amd$sts <= max_time) , ] 
sts <- setNames( amd$sts , amd$sequence_name )
amd <- amd[ !is.na( amd$sequence_name ) , ]


if(!is.null(mutations)) {

  mts_patt = paste0('.*', c(mutations), '.*')

}

least_recent_tip_date = min(amd$sample_date)
most_recent_tip_date = max(amd$sample_date)
num_sequences = length(amd$sequence_name)
generation_time = generation_time 
scanner_run_date = as.Date(ymd(scanner_run_date))

if(is.null(mutations) & is.null(cut_off) & !is.null(lineages)){ 

  selection = toString(paste0(lineages))

}

if(!is.null(mutations) & is.null(lineages) & is.null(cut_off)){ 

  selection = toString(paste0(mutations))

}

if(!is.null(cut_off) & !is.null(lineages) & is.null(mutations)) { 

  selection = toString(paste0(cut_off))

  }

if(!is.null(mutations) & !is.null(lineages) & is.null(cut_off)) { 

  selection = paste0(toString(paste0(lineages)), " : ", toString(paste0(mutations)))
}

if(is.null(mutations) & !is.null(lineages) & !is.null(cut_off)) { 

  selection = paste0(toString(paste0(lineages)), " : ", toString(paste0(cut_off)))
}

if(!is.null(mutations) & is.null(lineages) & !is.null(cut_off)) { 

  selection = paste0(toString(paste0(mutations)), " : ", toString(paste0(cut_off)))
  }

if(!is.null(cut_off) & !is.null(lineages) & !is.null(mutations)) { 

  selection = paste0(toString(paste0(lineages)), " : ", toString(paste0(mutations)), " : ", toString(paste0(cut_off)))

}

title: "Scanner analyses: r selection" date: r format(Sys.Date(), "%B %d, %Y") author: - Olivia Boyd, o.boyd@imperial.ac.uk - Erik Volz, e.volz@imperial.ac.uk

This analysis is based on outputs from our scanning tool run on r scanner_run_date. We use COGUK sequence data collected between r least_recent_tip_date and r most_recent_tip_date with:

r as.character(num_sequences) whole genomes sampled from COGUK overall

if(!is.null(mutations) | !is.null(lineages)){ 

  seq_num = NA

  if(is.null(mutations) & !is.null(lineages)) { 


    input = lineages
    type = rep("lineage", length(input))

    for(i in 1:length(input)){ 

      seq_num[[i]] = length(amd$lineage_muts[amd$lineage_muts == lineages[[i]]])

      }
    }


  if(!is.null(mutations) & is.null(lineages)){ 

    input = mutations
    type = rep("mutation", length(input))

    for(i in 1:length(input)){ 

      seq_num[[i]] = length(amd$lineage_muts[grepl(amd$variants, patt = mts_patt[[i]])])

      }
    }

  if(!is.null(mutations) & !is.null(lineages)){ 

    input = c(lineages,mutations)
    type = c(rep("lineage", length(lineages)) ,rep("mutation", length(mutations)))

    for(i in 1:length(input)){ 

      if(type[[i]] == "lineage"){ 

      seq_num[[i]] = length(amd$lineage_muts[amd$lineage_muts == lineages[[i]]])

      } else { 

        seq_num[[i]] = length(amd$lineage_muts[grepl(amd$variants, patt = mts_patt[[i - length(lineages)]])])

        }

      }
    }

  seq_data = data.frame(seq_num, type, input)

  template <- 
  "* **%s whole genomes** sampled with %s **%s**

"

  for (i in seq(nrow(seq_data))) {
    current <- seq_data[i,]
    cat(sprintf(template, current$seq_num, current$type, current$input))

    }
  }

We conduct variant scanning in real-time on lineages or variants of interest to identify potential clusters or variants of concern. We identify clusters with a minimum of r as.character(min_size) genome sequences and a maximum size of r as.character(max_size) genome sequences. We use a generalised linear (GLM) model to compare growth rates between clusters of interest. To do this, we select a cluster of interest and select a matched control sample of sequences based on sampling time, admin2, and prevalence over time. We run a GLM to calculate the log odds of a sample being from our cluster of interest vs. our control sample over time. We multiply this by generation time to calculate a relative growth rate per generation for each cluster of interest, using a mean generation time of r generation_time days.

if(is.null(mutations) & is.null(cut_off)){ 

  inclusion_criteria = paste0("lineages ", toString(paste0(lineages)))
  cluster_interest = "lineages"
  cluster_interest2 = "(% lineage)"

}

if(is.null(lineages) & is.null(cut_off)){ 

  inclusion_criteria = paste0("mutations ", toString(paste0(mutations)))
  cluster_interest = "mutations"
  cluster_interest2 = "(% mutant)"

}

if(is.null(lineages) & is.null(mutations)){ 

  inclusion_criteria = paste0(cut_off)
  cluster_interest = "statistical cut off"
  cluster_interest2 = ""

}

if(!is.null(lineages) & !is.null(mutations) & is.null(cut_off)){ 

  inclusion_criteria = paste0("lineages ",toString(paste0(lineages)), " and mutations ",
                              toString(paste0(mutations)))
  cluster_interest = "lineages and mutations"
  cluster_interest2 = "(% mutant)"
}

if(is.null(lineages) & !is.null(mutations) & !is.null(cut_off)){ 

  inclusion_criteria = paste0("mutations",toString(paste0(mutations)), " and ", cut_off)
  cluster_interest = "variants and statistical cut off"
  cluster_interest2 = "(% mutant)"
}


if(!is.null(lineages) & is.null(mutations) & !is.null(cut_off)){ 

  inclusion_criteria = paste0("lineages ",toString(paste0(lineages)), " and ", cut_off)
  cluster_interest = "lineages and statistical cut off"
  cluster_interest2 = "(% lineage)"
}


if(!is.null(lineages) & !is.null(mutations) & !is.null(cut_off)){ 

  inclusion_criteria = paste0("lineages ",toString(paste0(lineages)), ", mutations",
                              toString(paste0(mutations)), ", and ", cut_off)
  cluster_interest = "lineages, mutations, and statistical cut off"
  cluster_interest2 = "(% mutant)"
  }





## run #####
Ygr2_out = report_table(csd = s, log_growth_rate_cut_off = log_growth_rate_cut_off)
names(Ygr2_out) = c("Ygr2_output", "csd_select", "csd_cluster", "kp_nodes", 
                    "growth_rank", "cmts1", "cmts2", "Ygr2", "growth_rank2")
#store outputs#### 
Ygr2_output = Ygr2_out[[1]]
csd_select = Ygr2_out[[2]]
csd_cluster = Ygr2_out[[3]]
kp_nodes = Ygr2_out[[4]]
growth_rank = Ygr2_out[[5]]
cmts1 = Ygr2_out[[6]]
cmts2 = Ygr2_out[[7]]
Ygr2 = Ygr2_out[[8]]
growth_rank2 = Ygr2_out[[9]]

growth_rank = growth_rank2

n_clusters = length(Ygr1$log_rank[Ygr1$logistic_growth_rate > log_growth_rate_cut_off])
Ygr1$logistic_growth_rate = sprintf(as.numeric(Ygr1$logistic_growth_rate), fmt = '%#.2f')  
max_growth_rate = max(Ygr1$logistic_growth_rate)
n_clusters_included = length(Ygr2_output$`Log growth rate`)
min_growth_rate = min(Ygr2_output$`Log growth rate`)
max_growth_rate2 = max(Ygr2_output$`Log growth rate`)

Table 1 and Figure 1 present some summary data of the sequence data and logistic growth rate analyses for the clusters of interest. We use a logsitic growth rate cut-off of r log_growth_rate_cut_off to exclude clusters which show an equivalent or reduced growth rate compared to matched control samples from the general population prior to applying inclusion criteria, selecting only clusters with r inclusion_criteria. We identify r n_clusters clusters with a growth rate > r log_growth_rate_cut_off, with growth rates ranging from r log_growth_rate_cut_off to r max_growth_rate. Following the application of inclusion criteria to outputs, we identify r n_clusters_included cluster with growth rates equal to r max_growth_rate2 for our r cluster_interest of interest. Defining mutations are identified as mutations that are present in >= r defining_mutations_cut_off*%* of sequences within a given cluster of interest and are not present in >= r defining_mutations_cut_off% of sequences in a matched sample from the general population.

\newpage

Table 1. Summary information on clusters of interest including logistic growth rate rank, cluster sizer cluster_interest2, most and least recent tips, growth rates, growth rate rank (out of N = r n_clusters clusters) and defining mutations (mutations present in > r defining_mutations_cut_off% of sequences in a cluster but not in > r defining_mutations_cut_off% of sequences in the matched sequences).

if(!is.null(lineages) & !is.null(mutations)){ 

    kbl(Ygr2_output, booktabs = T, align = c(rep('c', 8))) %>% 
  kable_styling(full_width = T) %>%
  #column_spec(1, width = "10em" )%>% 
  #column_spec(2, width = "" )%>% 
  #column_spec(3, width = "" )%>% 
  #column_spec(4, width = "" )%>% 
  #column_spec(5, width = "" )%>% 
  column_spec(6, width = "1cm" )%>% 
  column_spec(7, width = "1cm" )%>% 
  column_spec(8, width = "1cm")%>% 
  column_spec(9, width = "1cm")%>% 
  column_spec(10, width = "2cm")

} else { 

kbl(Ygr2_output, booktabs = T, align = c(rep('c', 8))) %>% 
  kable_styling(full_width = T) %>%
  #column_spec(1, width = "10em" )%>% 
  #column_spec(2, width = "" )%>% 
  #column_spec(3, width = "" )%>% 
  column_spec(4, width = "2cm" )%>% 
  column_spec(5, width = "2cm" )%>% 
  column_spec(6, width = "1cm" )%>% 
  column_spec(7, width = "1cm" )%>% 
  column_spec(8, width = "1cm")%>% 
  column_spec(9, width = "2cm")

  }

Cluster defining mutations exclude synSNPs, as these are unlikely to be driving higher growth rates, although they are observed in the original analyses and outputs. A figure (Figure 7) with proportions of sequences with defining mutations by cluster is included further in the report. Mutation status is measured as defining (>90% of sequences in cluster of interest and not >90% of sequences in cluster of comparison sample), present or absent.

``` {r echo=FALSE, results="asis", message=F, warning=F}

sdf = csd_select freq_plots(kp_nodes = kp_nodes, growth_rank = growth_rank2)

```r
 if(length(kp_nodes) > 1){ 

knitr::include_graphics(glue('{path_to_scanner}cum_samples_{max_date}_{min_date}.png'))

   }

```r knitr::include_graphics(glue('{path_to_scanner}cum_samples_clusters_{max_date}_{min_date}.png'))

*Figure 1. Cumulative sum of sampled sequences from `r cluster_interest` of interest (top) and clusters of interest (bottom) by sample date.*


Figure 2 shows the frequency of log odds over time of our clusters of interest compared to a matched sample from the general population using a generalised additive model (GAM). We match clusters of interest to comaparison samples based on adm2 and sample time, using weights derived from PHE metadata. 

``` {r echo=FALSE, results="asis", message=F, warning=F}

amd = merge(amd, wdf, by = "central_sample_id")
gam_plots(e1 = envs, kp_nodes = kp_nodes, growth_rank = growth_rank2)

\newpage

```r knitr::include_graphics(glue('{path_to_scanner}logodds_{max_date}_{min_date}.png'))

*Figure 2. Frequency of log odds of cluster of interest over time. Points represent actual log odds (size shows weight of samples), line represents estimated log odds and shading represents lower and upper bound of GAM estimates (95% CI).* 


Figure 3 and 4 shows the coordinates and logistic growth rates of samples per **cluster** by location in the UK, using longitude and latitude data linked to COGUK sequence data. **29 sequences** were excluded from regional plots due to missing data on longitude, latitude or outer postcode. Figure 5 shows the sample time ditribution for sequences from clusters of interest. Only the  genomes sequences from the clusters of interest are plotted on the maps.


``` {r echo=FALSE, results="asis", message=F, warning=F}

plot_maps(sdf = csd_cluster, path_to_data = path_to_data)

```r knitr::include_graphics(glue('{path_to_scanner}region_map_{max_date}_{min_date}.png'))

*Figure 3. Locations of sequence samples by region and lineage.*


\newpage

  ```r
knitr::include_graphics(glue('{path_to_scanner}log_growth_map_{max_date}_{min_date}.png'))

Figure 4. Logistic growth rates (GLM) results from Table 1 by region.

```r knitr::include_graphics(glue('{path_to_scanner}sample_time_map_{max_date}_{min_date}.png'))

*Figure 5. Sampling dates of sequence samples from clusters of interest plotted by region.* 

We compare the present growth rates to those from previous runs to identify changes occuring over time in lineages or mutations of interest. We select the clusters for `r cluster_interest` of interest with the highest growth rates across all variant scanner runs to identify potential clusters or variants of interest. Figure 6 shows the clusters identified for `r cluster_interest`. 

\newpage

``` {r echo=FALSE, results="asis", message=F, warning=F}


grtp(s = sdf_all)

```r knitr::include_graphics(glue('{path_to_scanner}growth_rate_over_time_{max_date}_{min_date}.png'))

*Figure 6. Growth rates over time for lineages of interest. Date represents the most recent sample date and size represents the size of the cluster. Only clusters which contain >50% of lineages of interest are selected for comparison. Shape represents minimum clade size chosen for scanner runs (min = {10,25}). * 

We report the proportion of sequences (**>`r prop_mutations` %**) per clusters with mutations of interest present. We exclude synSNPs, as these are unlikely to be driving higher growth rates. These figures include all mutations present within clusters of interest, unlike Table 1 which includes only mutations present in > **`r defining_mutations_cut_off`****%** of sequences within a cluster of interest and not present in > **`r defining_mutations_cut_off`****%** of sequences in the matched sample sequences. 

\newpage 

``` {r echo=FALSE, results="asis", message=F, warning=F}


mutations_plots(,cmst2 = cmst2, prop_mutations = prop_mutations, growth_rank = growth_rank2)

r knitr::include_graphics(glue('{path_to_scanner}mutations_{max_date}_{min_date}.png'))

Figure 7. Proportion of sequences (> r prop_mutations%) with mutations of interest across selected clusters.

Acknowledgements

This work was supported by the MRC Centre for Global Infectious Disease Analysis at Imperial College London, Wellcome Trust and COGUK. Sequence data were provided by COGUK. Additional sample metadata were provided by PHE.

emvolz-phylodynamics/variantAnalysis documentation built on Nov. 13, 2021, 7:16 p.m.

rdrr.io home R language documentation Run R code online

CRAN packages Bioconductor packages R-Forge packages GitHub packages

Note that we can't provide technical support on individual packages. You should contact the package authors for that.

Tweet to @rdrrHQ

GitHub issue tracker

ian@mutexlabs.com