# Load raw data and save it as .rda files for blantyreESBL package
# the load_phd_data scripts are from the Thesis repo https://github.com/joelewis101/thesis
#library(tidyverse)
source("/Users/joelewis/Documents/PhD/Thesis/bookdown/final_cleaning_scripts/load_PhD_data.R")
source("/Users/joelewis/Documents/PhD/Thesis/bookdown/final_cleaning_scripts/load_and_clean_lims.R")
library(tidyverse)
library(lubridate)
library(phytools)
library(devtools)
library(PopGenome)
library(here)
library(ape)
library(readxl)
source(here("data-raw/load_metadata_fn.R"))
# baseline characteristics -------------------------------------------
enroll %>%
filter(arm != 4) %>%
mutate(
art_time = as.numeric(
(data_date - hivartstart) / (365.25 / 12)
),
hivart = if_else(hivart != "5A", "ART regimen: other", hivart),
animalskept = if_else(animalskept == "Other", "Elsewhere", animalskept),
arm = as.character(arm),
enroll_date = data_date,
) %>% select(pid,
arm,
enroll_date,
calc_age,
ptsex,
hivstatus,
recieved_prehosp_ab,
pmhxrechospital,
tbstatus,
tbongoing,
hivonart,
hivart,
art_time,
hivcpt,
tobaccoyn,
alcoholyn,
highestedu,
job,
housholdadultsno,
householdchildno,
toilet ,
watersource,
watertreated,
electricityyn,
fuel,
keepanim,
keep.poultry,
keep.goats,
keep.dogs,
keep.cattle,
keep.sheep,
keep.mules
) -> btESBL_participants
use_data(btESBL_participants, overwrite = TRUE)
# longitudinal exposure (covariate) data ---------------------------------
read_csv(
"~/Documents/PhD/Thesis/bookdown/data/longit_covariate_data.csv") ->
btESBL_exposures
# add in cotrim
btESBL_exposures %>%
left_join(select(enroll, pid, hivcpt) %>%
unique()) %>%
mutate(cotri = if_else(hivcpt == "Yes" & !is.na(hivcpt)
, 1, cotri)) %>%
select(-hivcpt) -> btESBL_exposures
use_data(btESBL_exposures, overwrite = TRUE)
# ESBL carriage data -------------------------------------------------
lims_dates %>%
rename("t" = "assess_type") ->
btESBL_stoolESBL
use_data(btESBL_stoolESBL, overwrite = TRUE)
lims_orgs %>%
select(lab_id,organism, ESBL) %>%
# recode - negatives were coded as all sorts
mutate(ESBL = if_else(ESBL == "Positive",
"Positive",
"Negative")) %>%
unique() %>%
filter(!is.na(organism)) -> btESBL_stoolorgs
use_data(btESBL_stoolorgs, overwrite = TRUE)
# stan_data -------------------------------
readRDS("data-raw/stan_data_m2.rds") ->
btESBL_stanmodeldata
use_data(btESBL_stanmodeldata , overwrite = TRUE)
read_csv("/Users/joelewis/Documents/PhD/Thesis/bookdown/data/stan_df.csv") %>%
select(-`...1`) ->
btESBL_modeldata
use_data(btESBL_modeldata, overwrite = TRUE)
readRDS("/Users/joelewis/Documents/PhD/Thesis/bookdown/chapter_9/stan_models/model_2/stanfit_m2.rds") ->
btESBL_model2posterior
use_data(btESBL_model2posterior, overwrite = TRUE)
readRDS("/Users/joelewis/Documents/PhD/Thesis/bookdown/chapter_9/stan_models/model_1/stanfit_m1.rds") ->
btESBL_model1posterior
use_data(btESBL_model1posterior, overwrite = TRUE)
# model3 -
list.files(here("data-raw/review_comment_work/models/"),
pattern = "csv") -> csvfiles
rstan::read_stan_csv(
paste0(here("data-raw/review_comment_work/models/"),
csvfiles)
) -> btESBL_model3posterior
use_data(btESBL_model3posterior, overwrite = TRUE)
# simulations from posterior
btESBL_model2simulations <-
readRDS(here("data-raw/btESBL_model2simulations.rda"))
use_data(btESBL_model2simulations, overwrite = TRUE)
# amr genes ---------------------------------------------------------
esco1 <- read_csv(here("data-raw/amr_genes/esco-amr-ariba-report-part1.csv"))
esco2 <- read_csv(here("data-raw/amr_genes/esco-amr-ariba-report-part2.csv"))
klebs <- read_csv(here("data-raw/amr_genes/kleb-amr-ariba-report.csv"))
bind_rows(
esco1 %>%
select(contains("name") |
contains("match") |
contains("assembled") |
contains("ref_seq")) %>%
pivot_longer(-name,
names_to = c("cluster_name", ".value"),
names_sep = "\\.") %>%
filter(match == "yes") %>%
select(name, ref_seq) %>%
mutate(ref_seq = sapply(strsplit(ref_seq, split = "__"), function(x)
x[3]),
genus = "E. coli"),
esco2 %>%
select(contains("name") |
contains("match") |
contains("assembled") |
contains("ref_seq")) %>%
pivot_longer(-name,
names_to = c("cluster_name", ".value"),
names_sep = "\\.") %>%
filter(match == "yes") %>%
select(name, ref_seq) %>%
mutate(ref_seq = sapply(strsplit(ref_seq, split = "__"), function(x)
x[3]),
genus = "E. coli"),
klebs %>%
mutate(name = gsub("\\./","", name),
name = gsub("/report\\.tsv","",name)) %>%
select(contains("name") |
contains("match") |
contains("assembled") |
contains("ref_seq")) %>%
pivot_longer(-name,
names_to = c("cluster_name", ".value"),
names_sep = "\\.") %>%
filter(match == "yes") %>%
select(name, ref_seq) %>%
mutate(ref_seq = sapply(strsplit(ref_seq, split = "__"), function(x)
x[3]),
genus = "K. pneumoniae complex"
)
) %>%
mutate(name = gsub("\\./","", name),
name = gsub("/report\\.tsv","",name),
name = gsub("_1\\.fastq\\.gz", "", name)
) %>%
rename("lane" = "name") %>%
mutate(lane = gsub("#", "_", lane)) %>%
filter(lane %in% btESBL_sequence_sample_metadata$lane) %>%
mutate(ref_seq = if_else(ref_seq == "TEM_95",
"TEM_1",
ref_seq)) ->
btESBL_amrgenes
use_data(btESBL_amrgenes, overwrite = TRUE)
# QRDR resistance determinants
bind_rows(
read_tsv(
here("data-raw/ecoli-genomics-paper/QRDR_ariba_output.tsv")
) %>%
rename(sample = "26141_1#222") %>%
filter(ref_name != "ref_name") %>%
mutate(sample = gsub("#", "_", sample)) %>%
filter(sample %in% btESBL_sequence_sample_metadata$lane) %>%
mutate(
codon_posn = str_extract(ref_ctg_change, "[0-9]+"),
codon_posn = as.numeric(codon_posn)
) %>%
filter(ref_ctg_effect == "NONSYN") %>%
filter(
(ref_name == "GyrA" &
codon_posn >= 67 & codon_posn <= 106) |
(ref_name == "GyrB" &
codon_posn >= 426 & codon_posn <= 464) |
(ref_name == "ParC" &
codon_posn >= 56 & codon_posn <= 108) |
(ref_name == "ParE" &
codon_posn >= 365 & codon_posn <= 525)
) %>%
mutate(ref_name = gsub("(^.{1})", '\\L\\1',
ref_name,
perl = TRUE)) %>%
transmute(
gene = ref_name,
variant = ref_ctg_change,
lane = sample,
genus = "E. coli"
),
# kleb
read_tsv(
here(
"data-raw/kleb-genomics-paper/DASIM3_ariba_qrdr_snpcalls.tsv"
)
) %>%
rename(sample = `34154_7#184`) %>%
filter(ref_name != "ref_name") %>%
mutate(sample = gsub("#", "_", sample)) %>%
filter(sample %in% btESBL_sequence_sample_metadata$lane) %>%
mutate(
codon_posn = str_extract(ref_ctg_change, "[0-9]+"),
codon_posn = as.numeric(codon_posn)
) %>%
filter(ref_ctg_effect == "NONSYN") %>%
filter(
(ref_name == "GyrA" &
codon_posn >= 67 & codon_posn <= 106) |
(ref_name == "GyrB" &
codon_posn >= 426 & codon_posn <= 464) |
(ref_name == "ParC" &
codon_posn >= 56 & codon_posn <= 108) |
(ref_name == "ParE" &
codon_posn >= 365 & codon_posn <= 525)
) %>%
mutate(ref_name = gsub("(^.{1})", '\\L\\1',
ref_name,
perl = TRUE)) %>%
transmute(
gene = ref_name,
variant = ref_ctg_change,
lane = sample,
genus = "K. pneumoniae complex"
)
) ->
btESBL_qrdr_mutations
use_data(btESBL_qrdr_mutations, overwrite = TRUE)
# CARD described QRDR mutations -----------------------
btESBL_CARD_qrdr_mutations <-
read_csv(
here("data-raw/ecoli-genomics-paper/2021-09-06_CARD-QRDR-mutations.csv"))
use_data(btESBL_CARD_qrdr_mutations, overwrite = TRUE)
# NCBI beta-lactamase definition
btESBL_NCBI_phenotypic_bl <-
read_tsv("https://ftp.ncbi.nlm.nih.gov/pathogen/betalactamases/Allele.tab") %>%
rename_with( ~ tolower(gsub(" ", "_", .x))) %>%
rename_with( ~ gsub("#", "", .x)) %>%
mutate(
allele_name = gsub("-", "_", allele_name),
class = case_when(
grepl("carbapenem-hydrolyzing",
curated_gene_product_name) ~ "Carbapenemase",
grepl("metallo-beta-lactamase",
curated_gene_product_name) ~ "Carbapenemase",
grepl(
"extended-spectrum beta-lactamase",
curated_gene_product_name
) ~ "ESBL",
grepl("class C",
curated_gene_product_name) ~ "AmpC",
grepl("beta-lactamase",
curated_gene_product_name) ~ "Penicillinase"
)
) %>%
select(allele_name, protein_accession_, nucleotide_accession_,
gene_name, curated_gene_product_name, class)
use_data(btESBL_NCBI_phenotypic_bl, overwrite = TRUE)
# core gene trees --------------------------------------------------------
read.tree(here("data-raw/core_gene_trees/esco_core_gene_tree.treefile")) ->
btESBL_coregene_tree_esco
read.tree(here("data-raw/core_gene_trees/kleb_core_gene_tree.treefile")) ->
btESBL_coregene_tree_kleb
btESBL_coregene_tree_esco <- midpoint.root(btESBL_coregene_tree_esco)
btESBL_coregene_tree_kleb <- midpoint.root(btESBL_coregene_tree_kleb)
use_data(btESBL_coregene_tree_esco, overwrite = TRUE)
use_data(btESBL_coregene_tree_kleb, overwrite = TRUE)
# non ASC trees
read.tree(here(
"data-raw/core_gene_trees/kleb_core_gene_tree_nonASC.treefile")) ->
btESBL_coregene_tree_kleb_nonASC
read.tree(here(
"data-raw/core_gene_trees/esco_core_gene_tree_nonASC.treefile")) ->
btESBL_coregene_tree_esco_nonASC
btESBL_coregene_tree_kleb_nonASC <-
midpoint.root(btESBL_coregene_tree_kleb_nonASC)
btESBL_coregene_tree_esco_nonASC <-
midpoint.root(btESBL_coregene_tree_esco_nonASC)
#use_data(btESBL_coregene_tree_kleb_nonASC, overwrite = TRUE)
# global kleb tree -----------------------------------------------
btESBL_kleb_globaltree <-
read.tree(
here("data-raw/GLOBAL_core_gene_alignment_snp_sites.fa.treefile"))
midpoint.root(btESBL_kleb_globaltree) -> btESBL_kleb_globaltree
# global e coli tree
btESBL_ecoli_globaltree_noASC <-
read.tree(
here("data-raw/ecoli-genomics-paper/horesh_full_diversity/core_gene_alignment_snpsites.fasta.treefile"
))
midpoint.root(btESBL_ecoli_globaltree_noASC) ->
btESBL_ecoli_globaltree_noASC
use_data(btESBL_ecoli_globaltree_noASC, overwrite = TRUE)
# use_data(btESBL_kleb_globaltree, overwrite = TRUE)
# non ASC treee
btESBL_kleb_globaltree_noASC <-
read.tree(
here("data-raw/kleb-genomics-paper/GLOBAL_core_gene_alignment_snp_sites_noASC.fa.treefile"))
midpoint.root(btESBL_kleb_globaltree_noASC) ->
btESBL_kleb_globaltree_noASC
# malawi kleb tree -----------------------------
btESBL_kleb_malawi_allisolate_core_gene_tree <-
read.tree(
here("data-raw/MALAWI_core_gene_alignment_snp_sites.fa.treefile")
)
midpoint.root(btESBL_kleb_malawi_allisolate_core_gene_tree) ->
btESBL_kleb_malawi_allisolate_core_gene_tree
use_data(btESBL_kleb_malawi_allisolate_core_gene_tree, overwrite = TRUE)
# no ASC
btESBL_kleb_malawi_allisolate_core_gene_tree_noASC <-
read.tree(
here("data-raw/kleb-genomics-paper/MALAWI_core_gene_alignment_snp_sites_noASC.fa.treefile")
)
midpoint.root(btESBL_kleb_malawi_allisolate_core_gene_tree_noASC) ->
btESBL_kleb_malawi_allisolate_core_gene_tree_noASC
# use_data(btESBL_kleb_malawi_allisolate_core_gene_tree_noASC, overwrite = TRUE)
# overwrite ASC trees withnon ASC - kleb
btESBL_coregene_tree_kleb <- btESBL_coregene_tree_kleb_nonASC
use_data(btESBL_coregene_tree_kleb, overwrite = TRUE)
btESBL_kleb_globaltree <- btESBL_kleb_globaltree_noASC
use_data(btESBL_kleb_globaltree, overwrite = TRUE)
btESBL_kleb_malawi_allisolate_core_gene_tree <-
btESBL_kleb_malawi_allisolate_core_gene_tree_noASC
use_data(btESBL_kleb_malawi_allisolate_core_gene_tree, overwrite = TRUE)
# and esco
btESBL_coregene_tree_esco <- btESBL_coregene_tree_esco_nonASC
use_data(btESBL_coregene_tree_esco, overwrite = TRUE)
btESBL_ecoli_globaltree_noASC -> btESBL_ecoli_globaltree
use_data(btESBL_ecoli_globaltree, overwrite = TRUE)
# sequence sample metadata ----------------------------------
# If there is no lane accession this will use sample accession
# Need to update once klebs go into the wild!
samp_metadata <- load_DASSIM3_metadata(
location_of_phd_loading_script =
"/Users/joelewis/Documents/PhD/Thesis/bookdown/final_cleaning_scripts/load_PhD_data.R",
location_of_lims_loading_script =
"/Users/joelewis/Documents/PhD/Thesis/bookdown/final_cleaning_scripts/load_and_clean_lims.R",
lanes_to_include =
"/Users/joelewis/Documents/Sanger/DASSIM3/data_raw/metadata/all_DASSIM_kleb_and_esco_lanes.txt",
sanger_metadata_file = "/Users/joelewis/Documents/Sanger/DASSIM3/data_raw/metadata/all_DASSIM_kleb_and_esco_metadata.csv",
recent_dc_def = 120 )
outcome %>%
group_by(pid) %>%
arrange(pid,hospoutcome,hospoutcomedate) %>%
slice(n = 1) -> outcome
samp_metadata %>%
rename_with(~ tolower(gsub(" |\\.","_", .x))) %>%
mutate(lane = gsub("#","_", lane)) %>%
left_join(outcome,
by = "pid") %>%
select(lane, supplier_name,pid, arm, visit, data_date, enroll_date, assess_type,
hosp_assoc, hospoutcomedate) -> btESBL_sequence_sample_metadata
left_join(
btESBL_sequence_sample_metadata,
read_csv("data-raw/all_dassim_kleb_and_esco_accession.csv") %>%
mutate(
`Lane name` = gsub("#", "_", `Lane name`),
`Lane accession` = if_else(`Lane accession` == "not found",
`Sample accession`,
`Lane accession`)
) %>%
transmute(lane = `Lane name`,
accession = `Lane accession`),
by = "lane"
) %>%
relocate(accession, .before = everything()) ->
btESBL_sequence_sample_metadata
# E coli virulence determinants --------------------------------------
vf <- read_csv(
here("data-raw/ecoli-genomics-paper/all_dassim_esco_vf_ariba.csv"))
vf %>%
mutate(name = gsub("\\./", "", name),
name = gsub("/report.tsv", "", name)) %>%
pivot_longer(-name,
names_to = c( "cluster", ".value"),
names_sep = "\\.") %>%
filter(match == "yes") %>%
select(name, ref_seq) ->
btESBL_ecoli_virulence
use_data(btESBL_ecoli_virulence, overwrite = TRUE)
# add poppunk clusters --------------------
bind_rows(
read_csv(here(
"data-raw/poppunk/strain_db_clustersKLEB.csv"
)) %>%
mutate(
Taxon = gsub("#", "_", Taxon),
Cluster = paste0("K", Cluster)
),
read_csv(here(
"data-raw/poppunk/strain_db_clusters.csv"
)) %>%
mutate(
Taxon = gsub("#", "_", Taxon),
Cluster = paste0("E", Cluster)
)
) -> btESBL_popPUNK
left_join(
btESBL_sequence_sample_metadata,
btESBL_popPUNK,
by = c("lane" = "Taxon")
) -> btESBL_sequence_sample_metadata
# add assembly statistics --------------------------
btESBL_sequence_sample_metadata %>%
left_join(
# e coli
rbind(read_tsv(#"~/Documents/PhD/Thesis/bookdown/chapter_7/
# checkm_quast/D1/transposed_report.tsv"
here(
"data-raw/ecoli-genomics-paper/QUAST_report1.tsv"
)) ,
read_tsv(#"~/Documents/PhD/Thesis/bookdown/chapter_7/
#checkm_quast/D220190318/transposed_report.tsv"
here(
"data-raw/ecoli-genomics-paper/QUAST_report2.tsv"
)),
read_tsv(# "~/Documents/PhD/Thesis/bookdown/chapter_7/
# checkm_quast/D220190503/transposed_report.tsv"
here(
"data-raw/ecoli-genomics-paper/QUAST_report3.tsv"
))) %>%
transmute(
lane = gsub("\\.contigs_spades", "", Assembly),
number_of_contigs = `# contigs`,
N50 = N50
) %>%
bind_rows(
read_tsv(
here(
"data-raw/kleb-genomics-paper/DASSIM3_QUAST_transposed_report.tsv")
) %>%
transmute(
lane = gsub("\\.contigs_spades", "", Assembly),
number_of_contigs = `# contigs`,
N50 = N50
)
),
by = "lane"
) %>%
# add MLST calls -----------------------------------------------
#- e coli
left_join(
bind_rows(
read_csv(here("data-raw/ecoli-genomics-paper/mlst.csv")) %>%
mutate(ST = case_when(
lane %in% c(
# update those that were novel but now not
"28099_1_144",
"28099_1_18",
"28099_1_159",
"28099_1_249",
"28099_1_23",
"28099_1_249",
"28099_1_330",
"26141_1_141"
) ~ "9847",
TRUE ~ ST
)),
# klebs
read_tsv(
here("data-raw/kleb-genomics-paper/DASSIM3_ariba_mlst_summary.tsv")
) %>%
filter(ST != "ST") %>%
mutate(ST = gsub("\\*", "", ST),
lane = gsub("#","_", lane)) %>%
select(ST, lane)
),
by = "lane"
) %>%
# add e coli phylogroup ------------------------------------------
left_join(
read_csv(here("data-raw/ecoli-genomics-paper/phylogroups.csv")) %>%
transmute(lane = Lane,
ecoli_phylogroup = Phylogroup),
by = "lane") %>%
# e coli pathotype
left_join(
btESBL_ecoli_virulence %>%
filter(grepl("stx|ltcA|sta|eae|aatA|aggR|aaiC|ipaH|ipaD", ref_seq)) %>%
mutate(ref_seq = gsub("_[0-9|A-Z|a-z]*$","", ref_seq)) %>%
mutate(Pathotype = case_when(
grepl("stx", ref_seq) & grepl("eae", ref_seq) ~ "EHEC",
grepl("stx", ref_seq) ~ "STEC",
grepl("eae", ref_seq) ~ "aEPEC/EPEC",
grepl("aatA|aggR|aaiC", ref_seq) ~ "EAEC",
grepl("ltcA|sta", ref_seq) ~ "ETEC",
grepl("ipaH|ipaD", ref_seq) ~ "EIEC",
TRUE ~ NA_character_ )) %>%
transmute(lane = name,
ecoli_pathotype = Pathotype) %>%
unique() %>%
mutate(lane = gsub("#", "_", lane)),
by = "lane"
) %>%
# add kleb k, o locus, virulence
left_join(
read_tsv(here("data-raw/kleb-genomics-paper/DASSIM3_kleborate.all.txt")) %>%
mutate(strain = gsub("\\.contigs_spades","",strain),
strain = gsub("#", "_", strain)) %>%
filter(ST != "ST") %>%
transmute(
lane = strain,
species = species,
kleb_k_locus = K_locus,
kleb_k_locus_confidence = K_locus_confidence ,
kleb_o_locus = O_locus,
kleb_o_locus_confidence = O_locus_confidence,
kleb_YbST = YbST,
kleb_CbST = CbST,
kleb_AbST = AbST,
kleb_SmST = SmST,
kleb_rmpA = rmpA,
kleb_rmpA2 = rmpA2),
by = "lane") %>%
mutate(species =
if_else(is.na(species),
"E. coli",
species)
) ->
btESBL_sequence_sample_metadata
use_data(btESBL_sequence_sample_metadata, overwrite = TRUE)
# contig clusters --------------------------------
#make summary df
filez <- list.files(here("data-raw/contig_clusters/"))
out <- list()
for (i in 1:length(filez)) {
dftemp <-
read_tsv(paste0(
here("data-raw/contig_clusters/", filez[i])
))
dftemp$gene <-
str_extract(filez[i], "(?<=contigs-).*(?=\\.clust\\.)")
out[[i]] <- dftemp
}
do.call(rbind, out) -> clusters
clusters %>%
mutate(clstr_name = paste0(gene, ".", clstr),
lane = gsub("^\\.", "", id),
lane = gsub("\\..*$", "", lane),
clstr_iden = as.numeric(gsub("%", "", clstr_iden)),
clstr_cov = as.numeric(gsub("%", "", clstr_cov))) %>%
select(-clstr) %>%
mutate(species = case_when(
lane %in% btESBL_coregene_tree_kleb$tip.label ~ "K. pneumoniae",
lane %in% btESBL_coregene_tree_esco$tip.label ~ "E. coli",
TRUE ~ NA_character_)
) -> btESBL_contigclusters
use_data(btESBL_contigclusters, overwrite = TRUE)
# snpdist matrices ----------------------------------------------------
snpdists.e <- read_csv(here("data-raw/snpdists/esco_clean.full.filtered_polymorphic_sites-snp-dists.csv"))
names(snpdists.e)[1] <- "sample"
snpdists.e %>%
filter(sample != "Reference") %>%
select(-Reference) %>%
rename_with(~ gsub("#", "_", .x)) %>%
mutate(sample = gsub("#", "_", sample)) ->
btESBL_snpdists_esco
# kleb
snpdists.k <- read_csv(here("data-raw/snpdists/clean.full.filtered_polymorphic_sites_kleb_snp-dists.csv"))
names(snpdists.k)[1] <- "sample"
snpdists.k %>%
filter(sample != "Reference") %>%
select(-Reference) %>%
rename_with(~ gsub("#", "_", .x)) %>%
mutate(sample = gsub("#", "_", sample)) ->
btESBL_snpdists_kleb
use_data(btESBL_snpdists_esco, overwrite = TRUE)
use_data(btESBL_snpdists_kleb, overwrite = TRUE)
# plasmid replicons --------------------------------
esco.plasm1 <-
read_csv(here("data-raw/plasmids/D1ariba_plasmidfinder_clustsmall.csv"))
esco.plasm2 <-
read_csv(here("data-raw/plasmids/D2ariba_plasmidfinder_summary.csv"))
klebs.plasm <- read_csv("data-raw/plasmids/klebs-plasmidfinder-summary.csv")
dassim.klebs <- read_lines(here("data-raw/included_lanes/kleb_lanes_retained_following_qc.txt"))
bind_rows(
esco.plasm1 %>%
mutate(name = gsub("\\./","", name),
name = gsub("_1\\..+$","", name),
name = gsub("#","_", name)) %>%
pivot_longer(-name,
names_to = c("cluster_name", ".value"),
names_sep = "\\.") %>%
filter(match == "yes") %>%
select(name, ref_seq) %>%
mutate(species = "E. coli") %>%
filter(name %in% btESBL_coregene_tree_esco$tip.label),
esco.plasm2 %>%
mutate(name = gsub("\\./","", name),
name = gsub("/report.tsv","", name),
name = gsub("#","_", name)) %>%
pivot_longer(-name,
names_to = c("cluster_name", ".value"),
names_sep = "\\.") %>%
filter(match == "yes") %>%
select(name, ref_seq) %>%
mutate(species = "E. coli") %>%
filter(name %in% btESBL_coregene_tree_esco$tip.label),
klebs.plasm %>%
mutate(name = gsub("\\./","", name),
name = gsub("/report\\.tsv","",name),
name = gsub("#","_", name)) %>%
filter(name %in% gsub("#","_",dassim.klebs)) %>%
pivot_longer(-name,
names_to = c("cluster_name", ".value"),
names_sep = "\\.") %>%
filter(match == "yes") %>%
select(name, ref_seq) %>%
mutate(species = "K. pneumoniae")
) %>%
rename("lane" = "name") -> btESBL_plasmidreplicons
use_data(btESBL_plasmidreplicons, overwrite = TRUE)
# ST410 data ----------------------------
btESBL_ecoli_st410_metadata <-
read_xlsx(here(
"data-raw/ecoli-genomics-paper/st410-new/42003_2019_569_MOESM4_ESM.xlsx"
)) %>%
filter(Notes != "excluded" | is.na(Notes)) %>%
transmute(
accession = SRA,
host = Host,
source = Source,
Country = Location,
`Collection Year` = if_else(
Year == "-", NA_real_,
as.numeric(str_extract(Year, "^.{4}"))
)) %>%
mutate(across(where(is.character), ~ if_else(is.na(.x),
NA_character_,
.x)))
use_data(btESBL_ecoli_st410_metadata, overwrite = TRUE)
# st410 plasmids -----------------
st410_plasm <-
read_csv(
here("data-raw/ecoli-genomics-paper/st410-new/st410-ariba-plasmidfinder-summary.csv"))
st410_plasm %>%
mutate(name = gsub("\\./", "", name),
name = gsub("/report.tsv", "", name),
name = gsub("_filtered","", name),
name = gsub("#","_", name)) %>%
pivot_longer(-name,
names_to= c( "cluster", ".value"),
names_sep = "\\.") %>%
filter(match == "yes") %>%
select(name, ref_seq) %>%
mutate(ref_seq = gsub("\\..*$", "", ref_seq),
ref_seq = gsub("_.*$","", ref_seq),
ref_seq = gsub("^FIA", "IncFIA", ref_seq)) ->
btESBL_ecoli_st410_plasmids
use_data(btESBL_ecoli_st410_plasmids, overwrite = TRUE)
# st410 tree -----------------------------------------------
read.tree(
here(
"data-raw/ecoli-genomics-paper/st410-new/clean.full.filtered_polymorphic_sites.fasta.treefile"
)
) ->
st410_tree
midpoint.root(st410_tree) -> btESBL_ecoli_globalst410_tree
btESBL_ecoli_globalst410_tree$tip.label <-
gsub("_filtered", "", btESBL_ecoli_globalst410_tree$tip.label)
use_data(btESBL_ecoli_globalst410_tree, overwrite = TRUE)
# st410 amr ---------------------------------------------------
amr.ariba410 <-
read_csv(
here(
"data-raw/ecoli-genomics-paper/st410-new/st410-ariba-srst2-summary.csv"
)
)
amr.ariba410 %>%
mutate(
name = gsub("\\./", "", name),
name = gsub("/report.tsv", "", name),
name = gsub("_filtered", "", name),
name = gsub("#", "_", name)
) %>%
pivot_longer(-name,
names_to = c("cluster", ".value"),
names_sep = "\\."
) %>%
mutate(gene = sapply(str_split(ref_seq, "__"), function(x) x[3])) %>%
filter(match == "yes") %>%
mutate(gene = case_when(
gene == "TEM_95" ~ "TEM_1",
TRUE ~ gene
)) %>%
select(name, gene) ->
btESBL_ecoli_st410_amr
use_data(btESBL_ecoli_st410_amr, overwrite = TRUE)
### st167
st167_metadata_new <-
read_xls(
here("data-raw/ecoli-genomics-paper/st167-new/13073_2019_699_MOESM2_ESM.xls")
) %>%
filter(ST == 167) %>%
filter(!grepl("^G", Assembly) & !grepl("_", Assembly)) %>%
janitor::clean_names()
st167_metadata <-
read_tsv(here("data-raw/ecoli-genomics-paper/st167/st167.tsv"))
st167_metadata %>%
select(
Uberstrain,
Name,
`Data Source(Accession No.;Sequencing Platform;Sequencing Library;Insert Size;Experiment;Status)`,
`Source Niche`,
`Source Details`,
Country,
`Collection Year`,
ST
) %>%
separate_rows(
`Data Source(Accession No.;Sequencing Platform;Sequencing Library;Insert Size;Experiment;Status)`,
sep = ","
) %>%
separate(
`Data Source(Accession No.;Sequencing Platform;Sequencing Library;Insert Size;Experiment;Status)`,
into = c(
"accession",
"platform",
"library",
"insert_size",
"experiment"
),
sep = ";"
) ->
btESBL_ecoli_st167_metadata
left_join(select(st167_metadata_new, assembly, origin),
btESBL_ecoli_st167_metadata,
by = c("assembly" = "accession")
) %>%
mutate(County = if_else(
is.na(Country) & origin != "unknown",
origin,
Country
)) %>%
select(-origin) %>%
rename(accession = assembly)-> btESBL_ecoli_st167_metadata
use_data(btESBL_ecoli_st167_metadata, overwrite = TRUE)
# st167 plasmids -----------------
st167_plasm <-
read_csv(
here("data-raw/ecoli-genomics-paper/st167-new/st167-ariba-plasmidfinder-summary.csv"))
st167_plasm %>%
mutate(name = gsub("\\./", "", name),
name = gsub("/report.tsv", "", name),
name = gsub("_filtered","", name),
name = gsub("#","_", name)) %>%
pivot_longer(-name,
names_to= c( "cluster", ".value"),
names_sep = "\\.") %>%
filter(match == "yes") %>%
select(name, ref_seq) %>%
mutate(ref_seq = gsub("\\..*$", "", ref_seq),
ref_seq = gsub("_.*$","", ref_seq),
ref_seq = gsub("^FIA", "IncFIA", ref_seq)) ->
btESBL_ecoli_st167_plasmids
use_data(btESBL_ecoli_st167_plasmids, overwrite = TRUE)
# st167 tree -----------------------------------------------
read.tree(
"data-raw/ecoli-genomics-paper/st167-new/clean.full.filtered_polymorphic_sites.fasta.treefile"
) ->
st167_tree
st167_tree$tip.label <- gsub("_filtered","", st167_tree$tip.label)
midpoint.root(st167_tree) -> btESBL_ecoli_globalst167_tree
use_data(btESBL_ecoli_globalst167_tree, overwrite = TRUE)
# non ASC tree
# st167 amr ---------------------------------------------------
amr.ariba167 <-
read_csv(
here(
"data-raw/ecoli-genomics-paper/st167-new/st167-ariba-srst2-summary.csv"
)
)
amr.ariba167 %>%
mutate(
name = gsub("\\./", "", name),
name = gsub("/report.tsv", "", name),
name = gsub("_filtered", "", name),
name = gsub("#", "_", name)
) %>%
pivot_longer(-name,
names_to = c("cluster", ".value"),
names_sep = "\\."
) %>%
mutate(gene = sapply(str_split(ref_seq, "__"), function(x) x[3])) %>%
filter(match == "yes") %>%
mutate(gene = case_when(
gene == "TEM_95" ~ "TEM_1",
TRUE ~ gene
)) %>%
select(name, gene) ->
btESBL_ecoli_st167_amr
use_data(btESBL_ecoli_st167_amr, overwrite = TRUE)
# st131 amr and tree---------------------
read.tree(here("data-raw/ecoli-genomics-paper/st131/non_ASC_trees/gub_base.filtered_polymorphic_sites.fasta.treefile")) ->
btESBL_ecoli_globalst131_tree
phytools::midpoint.root(btESBL_ecoli_globalst131_tree) -> btESBL_ecoli_globalst131_tree
btESBL_ecoli_globalst131_tree$tip.label <-
gsub("_filtered", "", btESBL_ecoli_globalst131_tree$tip.label)
use_data(btESBL_ecoli_globalst131_tree, overwrite = TRUE)
bind_rows(
read_csv(
here("data-raw/ecoli-genomics-paper/st131/mbio.00644-19-st001.csv")
) %>%
mutate(clade = case_when(
`BAPS-1` %in% c(3) ~ "A",
`BAPS-1` %in% c(2,4,5) ~ "B",
`BAPS-1` %in% c(1) ~ "C",
TRUE ~ "other")) %>%
select(Accession_number, Year,Country, clade),
btESBL_sequence_sample_metadata %>%
transmute(Accession_number = lane,
Year = year(data_date),
Country = "Malawi") %>%
filter(Accession_number %in% btESBL_ecoli_globalst131_tree$tip.label)
) %>%
bind_rows(btESBL_ecoli_musicha_metadata %>%
transmute(Accession_number = lane,
Year = Year,
Country = "Malawi")) %>%
filter(Accession_number %in% btESBL_ecoli_globalst131_tree$tip.label) %>%
as.data.frame() -> btESBL_ecoli_st131_metadata
use_data(btESBL_ecoli_st131_metadata, overwrite = TRUE)
# amr
read_csv(
here(
"data-raw/ecoli-genomics-paper/st131/non_ASC_trees/st131_ariba_srst2_summary.csv"
)) %>%
mutate(name = gsub("\\./", "", name),
name = gsub("/report.tsv", "", name),
name = gsub("_filtered", "", name),
name = gsub("#","_", name)) %>%
pivot_longer(-name,
names_to = c( "cluster", ".value"),
names_sep = "\\.") %>%
mutate(gene = sapply(str_split(ref_seq, "__"), function(x) x[3])) %>%
filter(match == "yes") %>%
mutate(gene = case_when(
gene == "TEM_95" ~ "TEM_1",
TRUE ~ gene
)) %>%
select(name, gene) ->
btESBL_ecoli_st131_amr
use_data(btESBL_ecoli_st131_amr, overwrite = TRUE)
# plasmids
st167_plasm <-
read_csv(
here("data-raw/ecoli-genomics-paper/st167/st167_pf_ariba_summary.csv"))
read_csv(
here("data-raw/ecoli-genomics-paper/st131/non_ASC_trees/st131-ariba-plasm-summary.csv")
) %>%
mutate(name = gsub("\\./", "", name),
name = gsub("/report.tsv", "", name),
name = gsub("_filtered","", name),
name = gsub("#","_", name)) %>%
pivot_longer(-name,
names_to= c( "cluster", ".value"),
names_sep = "\\.") %>%
filter(match == "yes") %>%
select(name, ref_seq) %>%
mutate(ref_seq = gsub("\\..*$", "", ref_seq),
ref_seq = gsub("_.*$","", ref_seq),
ref_seq = gsub("^FIA", "IncFIA", ref_seq)) ->
btESBL_ecoli_st131_plasmids
use_data(btESBL_ecoli_st131_plasmids, overwrite = TRUE)
# merge in clermonTyping phylogroups
ct <- read_tsv(
here("data-raw/ecoli-genomics-paper/horesh_full_diversity/analysis_2021-11-03_110200_phylogroups.txt")
, col_names = FALSE) %>%
select(X1,X5) %>%
rename(lane = X1,
ct_phylogroup = X5)
# to horesh
btESBL_ecoli_horesh_metadata %>%
mutate(matcher =
gsub("\\.|fasta|fa|trimmed|spades|velvet|contigs_|_trimmed","",
Assembly_name)) %>% pull(matcher)
left_join(
btESBL_ecoli_horesh_metadata %>%
mutate(matcher =
gsub("\\.|fasta|fa|trimmed|spades|velvet|contigs_|_trimmed","",
Assembly_name)),
ct %>%
mutate(lane =
gsub("\\.|fasta|fa|trimmed|spades|velvet|contigs_|_trimmed","",
lane)),
by = c("matcher" = "lane")) %>%
mutate(Phylogroup =
case_when(Phylogroup =="Not Determined" &
!is.na(ct_phylogroup) ~ ct_phylogroup,
TRUE ~ Phylogroup)) %>%
select(-c(ct_phylogroup, matcher)) -> btESBL_ecoli_horesh_metadata
# dassim samples
btESBL_sequence_sample_metadata %>%
left_join(
ct %>%
mutate(lane =
gsub("\\.|fasta|fa|trimmed|spades|velvet|contigs_|_trimmed","",
lane),
lane = gsub("#","_", lane))
) %>%
mutate(ecoli_phylogroup = ct_phylogroup) %>%
select(-ct_phylogroup) -> btESBL_sequence_sample_metadata
btESBL_ecoli_musicha_metadata %>%
left_join(
ct %>%
mutate(lane =
gsub("\\.|fasta|fa|trimmed|spades|velvet|contigs_|_trimmed","",
lane),
lane = gsub("#","_", lane)) %>%
mutate(phylogroup = ct_phylogroup) %>%
select(-ct_phylogroup)
) -> btESBL_ecoli_musicha_metadata
use_data(btESBL_ecoli_horesh_metadata, overwrite = TRUE)
use_data(btESBL_sequence_sample_metadata, overwrite = TRUE)
use_data(btESBL_ecoli_musicha_metadata, overwrite = TRUE)
# add horesh AMR
source(here("data-raw/ecoli-genomics-paper/parse_horesh_amr.R"))
# load vfdb klebsiella output -------------------------------
vfdb <- read_csv("data-raw/kleb-genomics-paper/vfdb_core_ariba_summary.csv")
vfdb_genes <- read_lines("data-raw/kleb-genomics-paper/vfdb_core_gene_names.txt")
listout <- list()
for (i in 1:length(vfdb_genes)) {
# print(i)
strsplit(vfdb_genes[i], " |\\(|//)|>") -> split
vfg <- grep("VFG", split[[1]], value = TRUE)
vf <- grep("VF[0-9]", split[[1]], value = TRUE)
desc <- str_extract(vfdb_genes[i], "(?<=\\) )[A-Za-z].*(?= \\[)")
desc <- gsub("\\[.*$","", desc)
if (length(vfg) == 0) {
vfg <- NA_character_
}
if (length(vf) == 0) {
vf <- NA_character_
}
listout[[i]] <-
data.frame(
vfg = vfg,
vf = vf,
description = desc
)
}
do.call(rbind,listout) %>%
mutate(vf = gsub("\\)\\]", "", vf)) %>%
unique() ->
vfdb_vf_lookup
vfdb %>%
select(contains("name") |
contains("match") |
contains("assembled") |
contains("ref_seq")) %>%
pivot_longer(-name,
names_to = c("cluster_name", ".value"),
names_sep = "\\.") %>%
filter(match == "yes") %>%
mutate(ref_seq2 = sapply(strsplit(ref_seq, split = "\\."), function(x)
x[1]),
vfg = sapply(strsplit(ref_seq, split = "\\."), function(x)
x[2])) %>%
mutate(name = gsub("\\./|/report\\.tsv", "", name),
name = gsub("#","_", name),
vfg = gsub("_.*$","", vfg)) %>%
select(name, ref_seq2, vfg) %>%
mutate(vfg = if_else(is.na(vfg), ref_seq2, vfg)) -> vfdb_results
left_join(vfdb_results, vfdb_vf_lookup, by = "vfg") -> vfdb_results
vfdb_results %>%
mutate(ref_seq2 = gsub("afaE_I","afaE-I", ref_seq2)) %>%
select(name,ref_seq2, vf, vfg) %>%
# filter(
# vf != "VF0560", #capsule
# vf != "VF0561", #O type
# vf != "VF0564", # yersinabactin
# vf != "VF0562" , # enterobactin
# vf != "VF0228", # enterobactin
# vf != "VF0564", # enterobactin
# !grepl("rmpA|rmpA2|ybt|iuc|iro|clb", ref_seq2)) %>%
# select(name, vf, ref_seq2) %>%
unique() %>%
left_join(
select(vfid_descriptions,
VFID,
VF_Name,
VF_FullName),
by = c("vf" = "VFID")) %>%
rename(
gene = ref_seq2) ->
# mutate(VF_Name = if_else(
# !is.na(VF_FullName), VF_FullName,
# VF_Name)) %>%
btESBL_kleb_malawi_vfdb_output
use_data(btESBL_kleb_malawi_vfdb_output, overwrite = TRUE)
# kleb global metadata ------------------
# load and clean global AMR - aim: ESBL vs not
amr.global <- read_csv(here("data-raw/kleb-genomics-paper/GLOBAL_ariba_amr.csv"))
amr.global %>%
mutate(name = gsub("\\./", "", name),
name = gsub("/report.tsv", "", name)) %>%
pivot_longer(-name,
names_to= c( "cluster", ".value"),
names_sep = "\\.") %>%
mutate(gene = sapply(str_split(ref_seq, "__"), function(x) x[3])) %>%
filter(match == "yes") %>%
mutate(gene = case_when(
gene == "TEM_95" ~ "TEM_1",
TRUE ~ gene
)) %>%
select(name, gene) %>%
pivot_wider(names_from = gene,
values_from = gene,
values_fn = length,
values_fill = 0) ->
amr.global
amr.global %>%
pivot_longer(-name, names_to = "gene") %>%
filter(value == 1) %>%
left_join(select(btESBL_NCBI_phenotypic_bl, allele_name, class),
by = c("gene" = "allele_name")) %>%
mutate(class = case_when(
gene == "SHV_12" ~ "ESBL",
TRUE ~ class)
) %>%
group_by(name) %>%
summarise(ESBL =
case_when(
any(class == "ESBL") ~ "ESBL",
TRUE ~ "0"
)
) %>%
# add accesssion
left_join(
bind_rows(
read_csv(here("data-raw/kleb-genomics-paper/context_genomes/global_accession.csv")),
read_csv(here("data-raw/kleb-genomics-paper/context_genomes/malawi_accession.csv"))
) %>%
select(`Lane name`,
`Sample accession`,
`Lane accession`),
by = c("name" = "Lane name")
) -> metadata_global
# merge in other data
musicha <- read_lines(here("data-raw/kleb-genomics-paper/context_genomes/musicha_klebs_list.txt"))
cornick <- read_lines(here("data-raw/kleb-genomics-paper/context_genomes/chathinka_kleb_lanes.txt"))
global <- read_lines(here("data-raw/kleb-genomics-paper/context_genomes/global_context_lanes.txt"))
kenya <- read_lines(here("data-raw/kleb-genomics-paper/context_genomes/kenya_lanes.txt"))
metadata_global %>%
mutate(study =
case_when(
name %in% musicha ~ "musciha",
name %in% cornick ~ "cornick",
name %in% kenya ~ "kenya",
name %in% global ~ "global",
TRUE ~ "DASSIM"
)) -> metadata_global
holt_metadata <-
read_csv(here("data-raw/kleb-genomics-paper/context_genomes/holt_global_kleb_metadata.csv"))
holt_metadata %>%
filter(!grepl("Kleb", File_ID)) %>%
mutate(
Year = if_else(
Year_isolated == "Unknown",
NA_real_,
as.numeric(Year_isolated)),
Infection_status =
case_when(Source_Host == "Human" &
is.na(Infection_status) &
Sample_note %in% c(
"blood",
"sputum",
"urine",
"pus",
"bronchial alveolar lavage") ~ "Human_invasive",
Source_Host == "Human" &
is.na(Infection_status) &
Clinical_note == "Carriage" ~ "Human_carriage",
Source_Host == "Human" &
is.na(Infection_status) ~ "Unknown",
TRUE ~ Infection_status),
sample_source = case_when(
Source_Host == "Environmental" ~ "Environmental",
Source_Host != "Human" ~ "Animal",
TRUE ~ "Human"),
isolate_type = case_when(
Source_Host == "Human" &
Infection_status == "Human_carriage" ~ "Carriage",
Source_Host == "Human" &
Infection_status %in% c(
"Human_infection",
"Human_invasive") ~ "Infection",
Source_Host == "Human" ~ NA_character_,
TRUE ~ NA_character_)) -> holt_metadata
musicha_metadata <-
read_csv(here("data-raw/kleb-genomics-paper/context_genomes/musciha_sample_metadata.csv"))
musicha_metadata %>%
mutate(
Year = Year,
sample_source = "Human",
isolate_type = case_when(
Source == "RS" ~ "Carriage",
TRUE~ "Infection")) -> musicha_metadata
left_join(
metadata_global %>%
mutate(name = gsub("#","_", name)),
bind_rows(
select(holt_metadata, File_ID,sample_source,isolate_type, Year),
select(musicha_metadata, File_ID,sample_source,isolate_type, Year)
) %>%
mutate(File_ID = gsub("#","_", File_ID)),
by = c("name" = "File_ID")) %>%
mutate(location =
case_when(study %in% c(
"musciha",
"cornick",
"DASSIM") ~ "Malawi",
study == "kenya" ~ "Kenya",
TRUE ~ "Global"),
isolate_type =
case_when(
study %in% c("musicha","cornick", "kenya") ~ "Infection",
study == "DASSIM" ~ "Carriage",
TRUE ~ isolate_type),
sample_source = case_when(
study %in% c("musicha","cornick", "DASSIM","kenya") ~ "Human",
TRUE ~ sample_source)
) %>%
rename("Sample Source" = sample_source,
"Isolate Type" = isolate_type) ->
metadata_global
metadata_global %>%
left_join(
bind_rows(
read_tsv(
here("data-raw/kleb-genomics-paper/DASSIM3_kleborate.all.txt")) %>%
mutate(strain = gsub("\\.contigs_spades","",strain)) %>%
mutate(strain = gsub("\\.contigs_velvet", "", strain),
ST = if_else(is.na(ST), "Unknown", ST)) %>%
filter(ST != "ST") %>%
select(
strain,
ST,
YbST,
CbST,
AbST,
SmST,
rmpA,
rmpA2,
O_locus,
K_locus,
K_locus_confidence,
O_locus_confidence
) %>%
mutate(strain = gsub("#", "_", strain),
ST = paste0("ST",ST)),
read_tsv(
here(
"data-raw/kleb-genomics-paper/MALAWI_all_malawi_context_assemblies_kleborate_output.txt"
)
) %>%
mutate(strain = gsub("\\.contigs_velvet", "", strain),
ST = if_else(is.na(ST), "Unknown", ST)) %>%
filter(ST != "ST") %>%
select(
strain,
ST,
K_locus,
K_locus_confidence,
O_locus,
O_locus_confidence,
YbST,
CbST,
AbST,
SmST,
rmpA,
rmpA2
) %>%
mutate(ST = gsub("-.*$","", ST))
) %>%
mutate(
strain = gsub("#", "_", strain),
ybt = if_else(YbST != "0", "1", "0"),
clb = if_else(CbST != "0", "1", "0"),
iuc = if_else(AbST != "0", "1", "0"),
iro = if_else(SmST != "0", "1", "0"),
rmpA = if_else(rmpA != "-", "1", "0"),
rmpA2 = if_else(rmpA2 != "-", "1", "0")),
by = c("name" = "strain")
) -> metadata_global
# add in malawi accessions
left_join(
metadata_global,
read_csv(
here("data-raw/kleb-genomics-paper/DASSIM3_accessions_with_ERR.csv")
) %>%
transmute(
name = gsub("#", "_", `Lane name`),
s_accession = `Sample name`,
l_accession = `Lane accession`
),
by = c("name" = "name")
) %>%
left_join(
btESBL_sequence_sample_metadata %>%
transmute(
lane = lane,
year_dassim = lubridate::year(data_date),
ST_ariba = if_else(ST == "Novel", "Novel",paste0("ST", ST))),
by = c("name" = "lane")) %>%
mutate(
`Lane accession` = if_else(is.na(`Lane accession`),
l_accession,
`Lane accession`),
`Sample accession` = if_else(is.na(`Sample accession`),
s_accession,
`Sample accession`),
Year = if_else(
study == "DASSIM",
year_dassim,
Year),
ST = if_else(
study == "DASSIM",
ST_ariba,
ST)) %>%
select(-c(l_accession,s_accession, ST_ariba, -year_dassim)) ->
metadata_global
metadata_global <- as.data.frame(metadata_global)
rownames(metadata_global) <- metadata_global$name
metadata_global$Malawi <- ifelse(metadata_global$location == "Malawi","1","0")
metadata_global$ST[metadata_global$ST == "0"] <- "Unknown"
metadata_global$ESBL <- as.character(metadata_global$ESBL)
btESBL_kleb_global_metadata <- metadata_global
use_data(btESBL_kleb_global_metadata, overwrite = TRUE)
# contig sens ax ----------------------------------
# function to load and prep data
source(here("data-raw/review_comment_work/contig_sens_ax/load_contig_sens_ax_data_fn.R"))
btESBL_contigclusters_sensax <- load_contig_sens_ax_data()
use_data(btESBL_contigclusters_sensax, overwrite = TRUE)
# contig msa ----------------------------------------------------
list.files(here("data-raw/review_comment_work/contig_msa"),
pattern = "paf$",
full.names = TRUE) -> paffiles
list.files(here("data-raw/review_comment_work/contig_msa"),
pattern = "paf$") -> paffile_names
purrr::map(paffiles, read_tsv, col_select = 1:12,
col_names =
c("qname",
"qlen",
"qstart",
"qend",
"strand",
"tname",
"tlen",
"tstart",
"tend",
"nmatch",
"alen",
"mapq")
) -> paf_file_list
names(paf_file_list) <- gsub("\\.paf", "", paffile_names)
btESBL_contigclusters_msa_paf_files <- paf_file_list
list.dirs(here("data-raw/review_comment_work/contig_msa/alignments/"),
recursive = FALSE) -> msa_dirs
purrr::map(msa_dirs,
PopGenome::readData,
format = "fasta",
include.unknown = TRUE) -> msa_list
names(msa_list) <- gsub("\\.paf", "", paffile_names)
btESBL_contigclusters_msa_alignments <- msa_list
use_data(btESBL_contigclusters_msa_paf_files, overwrite = TRUE)
use_data(btESBL_contigclusters_msa_alignments, overwrite = TRUE)
# BLAST results for msa ---------------------------------
blast_colnames <- c(
"qseqid",
"sseqid",
"pident",
"slen",
"length",
"mismatch",
"gapopen",
"qstart",
"qend",
"sstart",
"send",
"evalue",
"bitscore"
)
bind_rows(
read_csv(
here(paste0(
"data-raw/review_comment_work/plot_contigs/",
"cluster_reps_srst2_blast.csv"
)),
col_names = blast_colnames
) %>%
separate(sseqid,
sep = "__",
into = c(NA, "sseqid_group", "sseqid_gene", NA),
remove = FALSE
) %>%
mutate(type = "amr"),
read_csv(
here(paste0(
"data-raw/review_comment_work/plot_contigs/",
"cluster_reps_plasmidfinder_blast.csv"
)),
col_names = blast_colnames
) %>%
mutate(sseqid = gsub("_.+$", "", sseqid)) %>%
separate(sseqid,
sep = "\\(",
into = c("sseqid_group", "sseqid_gene"),
remove = FALSE
) %>%
mutate(sseqid_gene = gsub("\\)", "", sseqid_gene)) %>%
mutate(type = "plasmid"),
read_csv(
here(paste0(
"data-raw/review_comment_work/plot_contigs/",
"cluster_reps_isfinder_blast.csv"
)),
col_names = blast_colnames
) %>%
separate(sseqid,
sep = "_",
into = c("sseqid_gene", "sseqid_group", NA),
remove = FALSE
) %>%
mutate(type = "is")
) -> btESBL_contigclusters_msa_blastoutput
use_data(btESBL_contigclusters_msa_blastoutput, overwrite = TRUE)
# phenotypic AST data ------------------------------------------
btESBL_AST <-
read_csv(
"data-raw/review_comment_work/phenotypic_sens/ESBL_orgs.csv"
)
btESBL_AST %>%
filter(profile_name == "DASSIM Culture") %>%
transmute(
supplier_name = sample_number,
organism = organism,
amikacin = `Amikacin 30`,
chloramphenicol = `Chloramphenicol 30`,
ciprofloxacin = `Ciprofloxacin 1`,
cotrimoxazole = `Cotrimoxazole 25`,
gentamicin = `Gentamicin 10`,
meropenem = Meropenam) %>%
unique() %>%
filter(grepl("Escheric|ella pneum", organism)) %>%
semi_join(
btESBL_sequence_sample_metadata %>%
mutate(
organism = if_else(
grepl("coli", species),
"Escherichia coli",
"Klebsiella pneumoniae")),
by = c("supplier_name", "organism")
) %>%
mutate(
organism =
case_when(
organism == "Escherichia coli" ~ "E. coli",
organism == "Klebsiella pneumoniae" ~ "KpSC")) %>%
filter(!(is.na(amikacin) &
is.na(chloramphenicol) &
is.na(ciprofloxacin) &
is.na(cotrimoxazole) &
is.na(gentamicin) &
is.na(meropenem))) -> btESBL_AST
use_data(btESBL_AST, overwrite = TRUE)
## Further simulations to compare antimicrobials vs hospitalisation ---------
# This file is not included in the repo - too large
readRDS(here("data-raw/btESBL_model2simulations_2.rda")) ->
btESBL_model2simulations_2
bind_rows(
btESBL_model2simulations_2 %>%
group_by(time, abx_days, hosp_days) %>%
summarise(
med = median(pr_esbl_pos),
lq = quantile(pr_esbl_pos, 0.025)[[1]],
uq = quantile(pr_esbl_pos, 0.975)[[1]]
) %>%
filter(hosp_days == 0) %>%
ungroup() %>%
transmute(
time = time,
days = abx_days,
exposure = "Antimicrobials",
median = med,
lq = lq,
uq = uq),
btESBL_model2simulation_2 %>%
group_by(time, abx_days, hosp_days) %>%
summarise(
med = median(pr_esbl_pos),
lq = quantile(pr_esbl_pos, 0.025)[[1]],
uq = quantile(pr_esbl_pos, 0.975)[[1]]
) %>%
filter(abx_days == 0) %>%
ungroup() %>%
transmute(
time = time,
days = hosp_days,
exposure = "Hospitalisation",
median = med,
lq = lq,
uq = uq)
) -> btESBL_model2simulations_2
use_data(btESBL_model2simulations_2, overwrite = TRUE)
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.