library(knitr) library(rmdformats) library(tidyverse) library(jakR) library(outbreakinfo) library(MappgeneSummary) library(purrr) library(DT) library(patchwork) library(ggVennDiagram) #library(jsonlite) library(Biostrings) library(msa) library(ggmsa) library(reticulate) conda_install("r-reticulate", "biopython") conda_install("r-reticulate", "pandas") options(max.print="100", scipen = 999) opts_chunk$set(echo=params$echo_type, cache=FALSE, prompt=FALSE, tidy=FALSE, comment=NA, message=FALSE, warning=FALSE, fig.height = 5, fig.width = 8)
# global options (set from yaml header) if (params$mappgene_file_path == 'test') { mappgene_file_path = system.file("test_files", "", package = "MappgeneSummary") } else { mappgene_file_path = params$mappgene_file_path } summary_output_file_path = params$summary_output_file_path overwrite_Rds_files = params$overwrite_Rds_files api_top_count = params$api_top_count samples_to_exclude = c("CADPH171B", "CADPH220B") # these will be printed below where they are actually removed # Parse python code reticulate::source_python(system.file("python/mappygene.py", package = "MappgeneSummary"))
The summary is looking for files in the r mappgene_file_path
directory, and will write the following files there. Creation of the .Rds
files are typically the slowest part of this script, so they are saved to speed up testing and creation of the html summary.
File Name | File Type | Notes --- | --- | --- _summary_lofreq.Rds | R data | The lofreq data (tibble) _summary_ivar.Rds | R data | The ivar data (tibble) _summary_DF.Rds | R data | The "final" data use for all analysis (tibble) _summary_DF.txt | tab-delimited | The "final" data use for all analysis _lofreq_top_outbreak_results.txt | tab-delimited | The Outbreak API results from the top lofreq iSNVs _ivar_top_outbreak_results.txt | tab-delimited | The Outbreak API results from the top ivar iSNVs _amplicon_summary.txt | tab-delimited | Attempt at re-creating a specific report for Excel analysis _amplicon_results.txt | tab-delimited | Attempt at re-creating a specific report for Excel analysis
This is the "iVar -> LoFreq -> snpEFF/snpSIFT" workflow, and these results will have the "lofreq" PIPELINE
string.
files = list.files(path = mappgene_file_path, pattern = "*.ivar.lofreq.snpSIFT.txt$", recursive = TRUE, full.names = T) df = sapply(files, read_delim, delim = "\t", comment = "##", simplify = FALSE, col_types = "ficcdifffcciic") %>% bind_rows(.id = "id") df = df %>% mutate(FILE = basename(id)) %>% select(-id) %>% mutate(SAMPLE = str_replace(FILE, ".ivar.lofreq.snpSIFT.txt", "")) %>% mutate(SAMPLE = str_replace(SAMPLE, "LLNL_", "")) names(df) = gsub(x = names(df), pattern = "ANN\\[\\*\\]\\.", replacement = "") df = df %>% select("SAMPLE", "GENE", "POS", "REF", "ALT", "AF", "DP", "EFFECT", "HGVS_C", "HGVS_P", "CDNA_POS", "AA_POS", "FILE") %>% unique()%>% mutate(PIPELINE = "lofreq", ALT_DP = as.integer(DP * AF)) saveRDS(df, file = file.path(summary_output_file_path, "_summary_lofreq.Rds")) rm(df)
The ALT_DP
column has been added as as.integer(DP * AF)
.
Also, converting the HGVS_P
column to REF_AA
and ALT_AA
, and removing any synonymous mutations.
keeper_effects = c('stop_gained', 'missense_variant', 'conservative_inframe_deletion', 'conservative_inframe_insertion', 'disruptive_inframe_deletion', 'disruptive_inframe_insertion') df_lofreq = readRDS(file = file.path(summary_output_file_path, "_summary_lofreq.Rds")) %>% filter(EFFECT %in% keeper_effects) #%>% #filter(AF <= 0.01) df_lofreq = df_lofreq %>% parse_ref_alt() %>% filter(REF_AA != ALT_AA) # Use mappygene to populate hit NSPs (or hit ALL_GENES if FALSE) and hit ORFs columns df_lofreq <- mappgene_summary_populate_df_cols(df_lofreq, nsp=TRUE, out_type='str')
lofreq_with_B = length(unique(df_lofreq$SAMPLE)) df_lofreq = df_lofreq %>% filter(!SAMPLE %in% samples_to_exclude) lofreq_without_B = length(unique(df_lofreq$SAMPLE))
Finally, the samples below were removed bringing the total LoFreq samples from r lofreq_with_B
to r lofreq_without_B
.
tibble("Excluded Samples" = samples_to_exclude) %>% knitr::kable()
This is the "iVar -> snpEFF/snpSIFT" workflow, and these results will have the "ivar" PIPELINE
string.
iVar output was processed according to Jimmy's pipeline
files = list.files(path = mappgene_file_path, pattern = "*.ivar.snpSIFT.txt$", recursive = TRUE, full.names = T) df = sapply(files, read_delim, delim = "\t", comment = "##", simplify = FALSE, col_types = "ficcdifffcciic") %>% bind_rows(.id = "id") df = df %>% mutate(FILE = basename(id)) %>% select(-id) %>% mutate(SAMPLE = str_replace(FILE, ".ivar.snpSIFT.txt", "")) %>% mutate(SAMPLE = str_replace(SAMPLE, "LLNL_", "")) %>% filter(ALT_QUAL > 20) %>% filter(FILTER == 'PASS') names(df) = gsub(x = names(df), pattern = "ANN\\[\\*\\]\\.", replacement = "") df = df %>% select("SAMPLE", "GENE", "POS", "REF", "ALT", "AF", "DP", "EFFECT", "HGVS_C", "HGVS_P", "CDNA_POS", "AA_POS", "FILE") %>% unique()%>% mutate(PIPELINE = "ivar", ALT_DP = as.integer(DP * AF)) saveRDS(df, file = file.path(summary_output_file_path, "_summary_ivar.Rds")) rm(df)
The ALT_DP
column has been added as as.integer(DP * AF)
.
Also, converting the HGVS_P
column to REF_AA
and ALT_AA
, and removing any synonymous mutations.
# keeper_effects, replacement and patterns already defined above df_ivar = readRDS(file = file.path(summary_output_file_path, "_summary_ivar.Rds")) %>% filter(EFFECT %in% keeper_effects) #%>% #filter(AF >= 0.01) df_ivar = df_ivar %>% parse_ref_alt() %>% filter(REF_AA != ALT_AA) # Use mappygene to populate hit NSPs (or hit ALL_GENES if FALSE) and hit ORFs columns df_ivar <- mappgene_summary_populate_df_cols(df_ivar, nsp=TRUE, out_type='str')
ivar_with_B = length(unique(df_ivar$SAMPLE)) df_ivar = df_ivar %>% filter(!SAMPLE %in% samples_to_exclude) ivar_without_B = length(unique(df_ivar$SAMPLE))
Finally, the samples below were removed bringing the total iVar samples from r ivar_with_B
to r ivar_without_B
.
tibble("Excluded Samples" = samples_to_exclude) %>% knitr::kable()
df_ivar = df_ivar %>% mutate(SNV = paste(GENE, POS, REF, ALT, sep = ".")) df_lofreq = df_lofreq %>% mutate(SNV = paste(GENE, POS, REF, ALT, sep = ".")) df = bind_rows(df_ivar, df_lofreq) %>% arrange(POS) saveRDS(df, file = file.path(summary_output_file_path, "_summary_DF.Rds")) df %>% write_delim(file.path(summary_output_file_path, "_summary_DF.txt"), delim = "\t")
lofreq = df %>% filter(PIPELINE == 'lofreq') %>% pull(SAMPLE) %>% unique() ivar = df %>% filter(PIPELINE == 'ivar') %>% pull(SAMPLE) %>% unique() sample_count = df %>% pull(SAMPLE) %>% unique() x1 = list(lofreq = unique(df_lofreq$SNV), ivar = unique(df_ivar$SNV) ) ggVennDiagram(x1) + scale_fill_gradient(low = "#F4FAFE", high = "#4981BF") + scale_color_manual(values = c("black", "black")) + theme(legend.position = "none") + labs(title = "SNVs")
SAMPLEs: There are r length(sample_count)
total samples with data so far, r length(setdiff(lofreq, ivar))
done with just lofreq, r length(setdiff(ivar, lofreq))
done with just ivar, and r length(intersect(lofreq, ivar))
with both.
lofreq = df %>% filter(PIPELINE == 'lofreq') %>% pull(SNV) %>% unique() ivar = df %>% filter(PIPELINE == 'ivar') %>% pull(SNV) %>% unique() snv_count = df %>% pull(SNV) %>% unique() x2 = list(lofreq = unique(paste0(df_lofreq$SNV, df_lofreq$SAMPLE)), ivar = unique(paste0(df_ivar$SNV, df_ivar$SAMPLE))) ggVennDiagram(x2) + scale_fill_gradient(low = "#F4FAFE", high = "#4981BF") + scale_color_manual(values = c("black", "black")) + theme(legend.position = "none") + labs(title = "Sample + SNVs")
SNVs: There are r length(snv_count)
total SNVs identified so far by 'lofreq' or 'ivar' in at least one sample, r length(setdiff(lofreq, ivar))
found with just lofreq, r length(setdiff(ivar, lofreq))
found with just ivar, and r length(intersect(lofreq, ivar))
found with both.
df %>% mutate(C = 1) %>% group_by(SAMPLE, SNV, PIPELINE) %>% summarise(S = sum(C)) %>% arrange(SNV) %>% pull(S) %>% table() %>% knitr::kable()
snpSIFT classifies the SNV effect into several categories. Here are the categories and their total frequencies in all samples. Only the "blue" categories are used in this analysis. Most the of "ivar" effect categories are removed with the ALT_QUAL > 20
filtering.
lofreq_effects = readRDS(file = file.path(summary_output_file_path, "_summary_lofreq.Rds")) #%>% #filter(AF <= 0.01) ivar_effects = readRDS(file = file.path(summary_output_file_path, "_summary_ivar.Rds")) #%>% #filter(AF >= 0.01) rbind(lofreq_effects, ivar_effects) %>% group_by(EFFECT, PIPELINE) %>% count() %>% ggplot(aes(x = n, y = EFFECT)) + jak_theme() + geom_col(position = "dodge", aes(fill = ifelse(EFFECT %in% keeper_effects, "#6495ed", "gray30"))) + geom_text(aes(label = n, x = n), color = "#AAAAAA", hjust = 1) + scale_fill_identity() + facet_wrap(~PIPELINE) + theme(legend.position = "none") + scale_x_continuous(labels=function(x) format(x, big.mark = ",", decimal.mark = ".", scientific = FALSE), trans = "log10")
EFFECT | SO | Description --- | --- | --- missense_variant | SO:0001583 | A sequence variant, that changes one or more bases, resulting in a different amino acid sequence but where the length is preserved. stop_gained | SO:0001587 | A sequence variant whereby at least one base of a codon is changed, resulting in a premature stop codon, leading to a shortened polypeptide conservative_inframe_deletion | SO:0001825 | An inframe decrease in cds length that deletes one or more entire codons from the coding sequence but does not change any remaining codons disruptive_inframe_deletion | SO:0001826 | An inframe decrease in cds length that deletes bases from the coding sequence starting within an existing codon conservative_inframe_insertion | SO:0001823 | An inframe increase in cds length that inserts one or more codons into the coding sequence between existing codons disruptive_inframe_insertion | SO:0001824 | An inframe increase in cds length that inserts one or more codons into the coding sequence within an existing codon.
There are 9 different ways to parse the HGVS_P
based on the EFFECT
and whether there is an insertion, deletion or duplication. The table below has all 11 possibilities.
TYPE | GENE | POS | REF | ALT | AF | DP | EFFECT | HGVS_C | HGVS_P | REF_AA | ALT_AA | CDNA_POS | AA_POS --- | --- | --- | --- | --- | --- | --- | --- | --- | --- | --- | --- | --- | --- 1 | S | 21575 | C | G | 0.012994 | 106739 | missense_variant | c.13C>G | p.Leu5Val | L | V | 13 | 5 2 | S | 23560 | TGGTGCAGGTATA | T | 0.01875 | 160 | conservative_inframe_deletion | c.1999_2010delGGTGCAGGTATA | p.Gly667_Ile670del | G667_I670 | del | 1999 | 667 3 | S | 21566 | TTTGTTTTTC | T | 5.15E-04 | 101017 | disruptive_inframe_deletion | c.12_20delTCTTGTTTT | p.Phe4_Val6del | F4_V6 | del | 12 | 4 4 | S | 21650 | AATTCTT | A | 0.00122 | 4919 | disruptive_inframe_deletion | c.89_94delATTCTT | p.Asn30_Phe32delinsIle | N30_F32 | insI | 89 | 30 5 | ORF1ab | 20188 | C | CCTGAAA | 2.87E-04 | 45358 | conservative_inframe_insertion | c.571_576dupGAAACT | p.Glu191_Thr192dup | E191_T192 | dup | 577 | 193 6 | M | 26720 | G | GCTTTTTAGC | 9.07E-04 | 36366 | conservative_inframe_insertion | c.201_202insTTTAGCCTT | p.Leu67_Ala68insPheSerLeu | L67_A68 | insFSL | 202 | 68 7 | ORF1ab | 3705 | C | CATT | 1.31E-04 | 258674 | disruptive_inframe_insertion | c.992_994dupTAT | p.Leu331dup | L331 | dup | 995 | 332 8 | ORF1ab | 9395 | T | TTAGTAAGTAAATTTA | 1.99E-04 | 75341 | disruptive_inframe_insertion | c.9130_9131insTAGTAAGTAAATTTA | p.Ser3044delinsLeuValSerLysPheThr | S3044 | insLVSKFT | 9131 | 3044 9 | ORF1ab | 386 | G | T | 0.002922 | 60925 | stop_gained | c.121G>T | p.Glu41* | E | * | 121 | 41
MappgeneSummary::view_HGVS_C("c.13C>G")
table(df$ALT_AA, df$EFFECT, df$PIPELINE) %>% as.data.frame() %>% dplyr::rename("EFFECT" = "Var2", "ALT_AA" = "Var1", "PIPELINE" = "Var3") %>% filter(Freq > 0) %>% format_dt(file = "SNV_count")
Each SNV was given a unique identifier with the gene name, the chromosome position, and the REF and ALT nucleotide, all separated by .
. So, S.23367.C.A
is in the S gene, chromosome position 23,367, and results in the reference "C" converted to a "A".
Below is the Sample count of all unique SNVs in both the lofreq and ivar pipelines.
df %>% group_by(GENE, SNV, PIPELINE) %>% count() %>% pivot_wider(names_from = PIPELINE, values_from = n, values_fill = list(n = 0)) %>% format_dt()
df %>% group_by(GENE, SNV, PIPELINE) %>% count() %>% pivot_wider(names_from = PIPELINE, values_from = n, values_fill = list(n = 0)) %>% ggplot(aes(x = lofreq, y = ivar)) + jak_theme() + geom_hex()
Just some examples of different plots that can be generated. These are typically static images, although some interaction could potentially be added later.
df %>% mutate(SAMPLE = paste(SAMPLE, PIPELINE, sep = "_")) %>% filter(GENE == 'S') %>% select(SAMPLE, SNV, AF) %>% pivot_wider(names_from = SAMPLE, values_from = AF, values_fill = list(AF = -1)) %>% column_to_rownames('SNV') %>% as.matrix() %>% pheatmap::pheatmap(show_rownames = T, border_color = NA, color = viridis::viridis(100))
df.v1 = df %>% mutate(SAMPLE = paste(SAMPLE, PIPELINE, sep = "_")) %>% select(SAMPLE, SNV, AF) %>% unique() %>% pivot_wider(names_from = SAMPLE, values_from = AF, values_fill = list(AF = 0)) %>% column_to_rownames('SNV') %>% as.matrix() %>% t() %>% vegan::metaMDS(distance = "bray", trace = 0) SAMPLEs = as.data.frame(df.v1$points) %>% rownames_to_column("SAMPLE") %>% separate(SAMPLE, into = c('CADPH', "PIPELINE"), sep = "_", remove = F) SNVs = as.data.frame(df.v1$species) SNVs$SNV = rownames(SNVs) ggplot(SAMPLEs, aes(x = MDS1, y = MDS2)) + jak_theme() + ggrepel::geom_text_repel(aes(label = CADPH), size = 3) + geom_point(data = SNVs, size = 1, alpha = 0.3) + geom_point(size = 3, pch = 24, alpha = 0.8, aes(fill = PIPELINE)) + scale_fill_manual(values = palette_jak$bay(2)) + #facet_wrap(~PHYLUM) + theme(legend.position = "right") + labs(title = "Bray Curtis Dissimilarity")
The stress of the above plot is r df.v1$stress * 100
(values > 10-15% are generally "high stress").
If a SNV was found in a sample by both ivar and lofreq, the frequency is averaged.
df.v2 = df %>% select(SAMPLE, SNV, AF) %>% group_by(SAMPLE, SNV) %>% summarize(AF_MEAN = mean(AF)) %>% unique() %>% pivot_wider(names_from = SAMPLE, values_from = AF_MEAN, values_fill = list(AF_MEAN = 0)) %>% column_to_rownames('SNV') %>% as.matrix() %>% t() %>% vegan::metaMDS(distance = "bray", trace = 0) SAMPLEs = as.data.frame(df.v2$points) %>% rownames_to_column("SAMPLE") SNVs = as.data.frame(df.v2$species) SNVs$SNV = rownames(SNVs) SNVs = SNVs %>% mutate(PIPELINE = case_when( SNV %in% df_lofreq$SNV & SNV %in% df_ivar$SNV ~ "BOTH", SNV %in% df_lofreq$SNV ~ "lofreq", SNV %in% df_ivar$SNV ~ "ivar" )) ggplot(SAMPLEs, aes(x = MDS1, y = MDS2)) + jak_theme() + ggrepel::geom_text_repel(aes(label = SAMPLE), size = 3) + geom_point(size = 5, pch = 24, alpha = 0.7, fill = "#6495ed") + geom_point(data = SNVs, size = 2,pch = 21, alpha = 0.5, aes(fill = PIPELINE)) + scale_fill_manual(values = palette_jak$bay(3)) + theme(legend.position = "right")
The stress of the above plot is r df.v2$stress * 100
(values > 10-15% are generally "high stress").
This table is too big to display here. It is saved as a tab-delimited file here: r file.path(summary_output_file_path, "_amplicon_summary.txt")
.
# https://martinctc.github.io/blog/vignette-downloadable-tables-in-rmarkdown-with-the-dt-package/ as = df %>% #filter(EFFECT %in% keeper_effects | PIPELINE == 'ivar') %>% filter(EFFECT %in% keeper_effects) %>% select(SAMPLE, PIPELINE, POS, SAMPLE, REF, ALT, AF, GENE, NSP, ORF, REF_AA, ALT_AA, AA_POS, SNV) %>% group_by(PIPELINE, POS, REF, ALT, GENE, NSP, ORF, REF_AA, ALT_AA, AA_POS, SNV) %>% summarize(SAMPLE_COUNT = n(), AF_MEAN = mean(AF), AF_SD = sd(AF)) as %>% write_delim(file.path(summary_output_file_path, "_amplicon_summary.txt"), delim = "\t")
This table is too big to display here. It is saved as a tab-delimited file here: r file.path(summary_output_file_path, "_amplicon_result.txt")
.
ar = df %>% filter(EFFECT %in% keeper_effects) %>% select(POS, SAMPLE, PIPELINE, REF, ALT, DP, AF, GENE, NSP, ORF, REF_AA, ALT_AA, AA_POS, SNV) ar %>% write_delim(file.path(summary_output_file_path, "_amplicon_result.txt"), delim = "\t")
The outbreak.info API is currently undergoing changes and is not available.
location_palette = c("California" = "#00496F", "United States" = "#EDD746", "Worldwide" = "#DD4124") #Note, only mutations with the 'missense_variant' `EFFECT` are searchable in the API. All others are not included below.
df %>% filter(EFFECT == 'missense_variant') %>% mutate(API_CODE = paste0(GENE, ':', REF_AA, AA_POS, ALT_AA)) %>% arrange(desc(AF)) %>% select(SAMPLE, EFFECT, API_CODE, SNV, PIPELINE, GENE, NSP, ORF, POS, REF, ALT, AF, REF_AA, ALT_AA) %>% format_dt()
top_lofreq = df %>% filter(PIPELINE == "lofreq", GENE == "S") %>% filter(EFFECT == 'missense_variant') %>% mutate(API_CODE = paste0(GENE, ':', REF_AA, AA_POS, ALT_AA)) %>% arrange(desc(AF)) %>% select(API_CODE) %>% unique() %>% head(api_top_count) %>% pull(API_CODE) top_lofreq_results = get_outbreak_info(top_lofreq) write_delim(top_lofreq_results, file = file.path(summary_output_file_path, "_lofreq_top_outbreak_results.txt"), delim = "\t") top_ivar = df %>% filter(PIPELINE == "ivar", GENE == "S") %>% filter(EFFECT == 'missense_variant') %>% mutate(API_CODE = paste0(GENE, ':', REF_AA, AA_POS, ALT_AA)) %>% arrange(desc(AF)) %>% select(API_CODE) %>% unique() %>% head(api_top_count) %>% pull(API_CODE) top_ivar_results = get_outbreak_info(top_ivar) write_delim(top_ivar_results, file = file.path(summary_output_file_path, "_ivar_top_outbreak_results.txt"), delim = "\t")
top_lofreq_results = read_delim(file = file.path(summary_output_file_path, "_lofreq_top_outbreak_results.txt"), delim = "\t") top_ivar_results = read_delim(file = file.path(summary_output_file_path, "_ivar_top_outbreak_results.txt"), delim = "\t") a = tryCatch( expr = {plot_outbreak_trends(top_lofreq_results, title = "Lofreq Proportions (free y-axis scale)") + jak_theme() + scale_fill_manual(values = location_palette) + scale_color_manual(values = location_palette)}, error = function(e){ print("uh oh") } ) b = plot_outbreak_trends(top_ivar_results, title = "iVar Proportions (free y-axis scale)") + jak_theme() + scale_fill_manual(values = location_palette) + scale_color_manual(values = location_palette) a + b
m = bind_rows(top_lofreq_results, top_ivar_results) %>% get_outbreak_peak() m_light = m %>% ungroup() %>% select(CODE, PERCENT) f = bind_rows(top_lofreq_results, top_ivar_results) %>% get_outbreak_start() l = bind_rows(top_lofreq_results, top_ivar_results) %>% get_outbreak_latest() bind_rows(f, l, m) %>% select(lineage, location, DATE, TYPE, CODE) %>% pivot_wider(values_from = DATE, names_from = TYPE) %>% left_join(m_light, by = "CODE") %>% select(-CODE) %>% dplyr::rename("PEAK_PERCENT" = "PERCENT") %>% format_dt() bind_rows(m, f, l) %>% mutate(location = as.factor(location), location = fct_relevel(location, c("CA", "USA", "GLOBAL"))) %>% mutate(lineage = as.factor(lineage), lineage = fct_relevel(lineage, rev(levels(lineage)))) %>% ggplot(aes(x = DATE, y = location, fill = TYPE)) + jak_theme() + geom_point(pch = 21, size = 3, alpha = 0.7) + scale_fill_manual(values = palette_jak$bay(3)) + facet_wrap(~lineage, ncol = 3) + labs(title = "First, last and peak occurrence", x = "Month-Year") + scale_x_date(date_breaks = "months" , date_labels = "%b-%y")
bind_rows(m, f, l) %>% mutate(location = as.factor(location), location = fct_relevel(location, c("Worldwide", "United States", "California"))) %>% mutate(lineage = as.factor(lineage), lineage = fct_relevel(lineage, rev(levels(lineage)))) %>% mutate(TYPE = fct_relevel(TYPE, 'FIRST', 'PEAK', 'LAST')) %>% arrange(TYPE) %>% # some wonkiness to get the line to plot in the right order when two TYPEs have the same data ggplot(aes(x = DATE, y = TYPE, color = location)) + jak_theme() + geom_path(aes(group = location), size = 1, alpha = 0.7) + # some wonkiness to get the line to plot in the right order when two TYPEs have the same data geom_point(size = 2, alpha = 0.7) + scale_color_manual(values = location_palette) + facet_wrap(~lineage, ncol = 3) + labs(title = "First, peak and last occurrence", x = "Month-Year") + scale_x_date(date_breaks = "months" , date_labels = "%b-%y")
a = readxl::read_excel(system.file("extdata", "mutation list for VOC and VOI 11.16.21.xlsx", package = "MappgeneSummary"), col_names = FALSE) b1 = readxl::read_excel(system.file("extdata", "mutation list for VOC and VOI 12.02.21v2.xlsx", package = "MappgeneSummary"), sheet = 1, col_names = FALSE) b2 = readxl::read_excel(system.file("extdata", "mutation list for VOC and VOI 12.02.21v2.xlsx", package = "MappgeneSummary"), sheet = 2, col_names = FALSE) c = readxl::read_excel(system.file("extdata", "updated omicron and sublineages mutations 05.04.2022.xlsx", package = "MappgeneSummary")) colnames(a) = c("GENE", "VOI_VOC", "AA_RANGE") colnames(b1) = c("GENE", "VOI_VOC", "AA_RANGE") colnames(b2) = c("GENE", "VOI_VOC", "AA_RANGE") colnames(c) = c("GENE", "VOI_VOC", "AA_RANGE") VOI_VOC = rbind(a[,1:3], b1[,1:3], b2[,1:3], c[,1:3]) %>% unique() %>% separate(AA_RANGE, sep = "/", into = c("A", "B"), remove = F) %>% mutate(B = ifelse(is.na(B), A, B)) %>% transmute(GENE, VOI_VOC, AA_RANGE, AA_POS = map2(A, B, seq, by = 1)) %>% unnest(cols = AA_POS) %>% mutate(CODE = paste(GENE, AA_POS, sep = "_")) %>% select(-GENE, -AA_POS) %>% arrange(CODE)
df_VOI_VOC = readRDS(file = file.path(summary_output_file_path, "_summary_DF.Rds")) %>% mutate(GENE = case_when( GENE == "ORF1ab" & AA_POS >= 4401 ~ "ORF1b", GENE == "ORF1ab" & AA_POS < 4401 ~ "ORF1a", TRUE ~ GENE )) %>% mutate(AA_POS = ifelse(GENE == "ORF1b", AA_POS - 4401, AA_POS)) %>% mutate(CODE = paste(GENE, AA_POS, sep = "_")) df_VOI_VOC = df_VOI_VOC %>% left_join(VOI_VOC, by = "CODE") %>% select(-FILE, -SNV, -REF, -ALT, -CDNA_POS) %>% filter(VOI_VOC != "NA") %>% arrange(GENE, AA_RANGE) %>% group_by(GENE, NSP, ORF, VOI_VOC, AA_RANGE, AA_POS, REF_AA, ALT_AA, HGVS_C, HGVS_P, PIPELINE) %>% summarize(SAMPLE_LIST = list(SAMPLE)) %>% mutate(SAMPLE_COUNT = lengths(SAMPLE_LIST), SAMPLES = mapply(paste0, SAMPLE_LIST, collapse = ",")) %>% select(-SAMPLE_LIST) df_VOI_VOC %>% format_dt(row_count = 20, file = "VOI_VOC_list") df_VOI_VOC %>% write_delim(file.path(summary_output_file_path, "_VOI_VOC.txt"), delim = "\t")
df_sess = sessioninfo::platform_info() %>% unlist() %>% as.data.frame() %>% rownames_to_column() colnames(df_sess) = c("name", "value") df_sess%>% knitr::kable() sessioninfo::package_info() %>% as_tibble() %>% filter(attached == TRUE) %>% #arrange(desc(attached)) %>% select(package, loadedversion, source, date, attached) %>% format_dt(row_count = 20)
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.