library(knitr)
library(rmdformats)
library(tidyverse)
library(jakR)
library(outbreakinfo)
library(MappgeneSummary)
library(purrr)
library(DT)
library(patchwork)
library(ggVennDiagram)
#library(jsonlite)
library(Biostrings)
library(msa)
library(ggmsa)
library(reticulate)

# python packages used by the reticulate code sourced below
conda_install("r-reticulate", "biopython")
conda_install("r-reticulate", "pandas")

options(max.print="100", scipen = 999)
opts_chunk$set(echo = params$echo_type,
               cache = FALSE,
               prompt = FALSE,
               tidy = FALSE,
               comment = NA,
               message = FALSE,
               warning = FALSE,
               fig.height = 5,
               fig.width = 8)
# global options (set from yaml header)
if (params$mappgene_file_path == 'test') {
  mappgene_file_path = system.file("test_files", "", package = "MappgeneSummary")
} else {
  mappgene_file_path = params$mappgene_file_path
}

summary_output_file_path = params$summary_output_file_path
overwrite_Rds_files = params$overwrite_Rds_files
api_top_count = params$api_top_count

samples_to_exclude = c("CADPH171B", "CADPH220B") # these will be printed below where they are actually removed

# Source the python helper functions from the MappgeneSummary package
reticulate::source_python(system.file("python/mappygene.py", package = "MappgeneSummary"))

Background

Load and Process Files

The summary looks for files in the r mappgene_file_path directory and will write the following files there. Creating the .Rds files is typically the slowest part of this script, so they are saved to speed up testing and creation of the html summary.

File Name | File Type | Notes
--- | --- | ---
_summary_lofreq.Rds | R data | The lofreq data (tibble)
_summary_ivar.Rds | R data | The ivar data (tibble)
_summary_DF.Rds | R data | The "final" data used for all analysis (tibble)
_summary_DF.txt | tab-delimited | The "final" data used for all analysis
_lofreq_top_outbreak_results.txt | tab-delimited | The Outbreak API results from the top lofreq iSNVs
_ivar_top_outbreak_results.txt | tab-delimited | The Outbreak API results from the top ivar iSNVs
_amplicon_summary.txt | tab-delimited | Attempt at re-creating a specific report for Excel analysis
_amplicon_result.txt | tab-delimited | Attempt at re-creating a specific report for Excel analysis
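
The overwrite_Rds_files parameter from the yaml header suggests a simple caching pattern. Below is a minimal, illustrative sketch of that guard; it is not one of the chunks in this report, which always rebuild and save the .Rds files.

lofreq_rds = file.path(summary_output_file_path, "_summary_lofreq.Rds")

if (overwrite_Rds_files || !file.exists(lofreq_rds)) {
  # slow path: re-read the snpSIFT files and rebuild the tibble (see the Lofreq chunk below)
  # df = ...; saveRDS(df, file = lofreq_rds)
} else {
  df = readRDS(lofreq_rds) # fast path: reuse the cached tibble
}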

Lofreq

This is the "iVar -> LoFreq -> snpEFF/snpSIFT" workflow, and these results will have the "lofreq" PIPELINE string.

files = list.files(path = mappgene_file_path, pattern = "*.ivar.lofreq.snpSIFT.txt$", recursive = TRUE, full.names = T)

df = sapply(files, 
            read_delim, 
            delim = "\t", 
            comment = "##", 
            simplify = FALSE,
            col_types = "ficcdifffcciic") %>% 
  bind_rows(.id = "id")

df = df %>%
  mutate(FILE = basename(id)) %>% 
  select(-id) %>%
  mutate(SAMPLE = str_replace(FILE, ".ivar.lofreq.snpSIFT.txt", "")) %>%
  mutate(SAMPLE = str_replace(SAMPLE, "LLNL_", ""))

names(df) = gsub(x = names(df), pattern = "ANN\\[\\*\\]\\.", replacement = "")

df = df %>%
  select("SAMPLE", "GENE", "POS", "REF", "ALT", "AF", "DP", "EFFECT", 
         "HGVS_C", "HGVS_P", "CDNA_POS", "AA_POS", "FILE") %>%
  unique() %>%
  mutate(PIPELINE = "lofreq", ALT_DP = as.integer(DP * AF))

saveRDS(df, file = file.path(summary_output_file_path, "_summary_lofreq.Rds"))
rm(df)

The ALT_DP column has been added as as.integer(DP * AF).

Also, the HGVS_P column is converted to REF_AA and ALT_AA columns, and any synonymous mutations are removed.
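
As a quick worked check using the values from the first row of the parsing table later in this document (DP = 106739, AF = 0.012994), note that as.integer() truncates rather than rounds:

as.integer(106739 * 0.012994) # 1386.97 is truncated, not rounded
#> [1] 1386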

keeper_effects = c('stop_gained', 
                   'missense_variant',
                   'conservative_inframe_deletion',
                   'conservative_inframe_insertion',
                   'disruptive_inframe_deletion',
                   'disruptive_inframe_insertion') 

df_lofreq = readRDS(file = file.path(summary_output_file_path, "_summary_lofreq.Rds"))  %>% 
  filter(EFFECT %in% keeper_effects) #%>%
  #filter(AF <= 0.01)

df_lofreq = df_lofreq %>%
  parse_ref_alt() %>%
  filter(REF_AA != ALT_AA)

# Use mappygene to add the NSP column (or ALL_GENES if nsp = FALSE) and the ORF column
df_lofreq <- mappgene_summary_populate_df_cols(df_lofreq, nsp=TRUE, out_type='str')
lofreq_with_B = length(unique(df_lofreq$SAMPLE))

df_lofreq = df_lofreq %>%
  filter(!SAMPLE %in% samples_to_exclude)

lofreq_without_B = length(unique(df_lofreq$SAMPLE))

Finally, the samples below were removed, bringing the total LoFreq sample count from r lofreq_with_B to r lofreq_without_B.

tibble("Excluded Samples" = samples_to_exclude) %>%
  knitr::kable()

iVar

This is the "iVar -> snpEFF/snpSIFT" workflow, and these results will have the "ivar" PIPELINE string.

iVar output was processed according to Jimmy's pipeline:

  1. removing variant positions with an ALT_QUAL of 20 or less (these are almost entirely single nt insertions),
  2. removing positions that did not pass the Fisher test (PASS = FALSE), and
  3. removing synonymous mutations (REF_AA == ALT_AA).

files = list.files(path = mappgene_file_path, pattern = "*.ivar.snpSIFT.txt$", recursive = TRUE, full.names = T)

df = sapply(files, 
            read_delim, 
            delim = "\t", 
            comment = "##", 
            simplify = FALSE,
            col_types = "ficcdifffcciic") %>% 
  bind_rows(.id = "id")

df = df %>%
  mutate(FILE = basename(id)) %>% 
  select(-id) %>%
  mutate(SAMPLE = str_replace(FILE, ".ivar.snpSIFT.txt", "")) %>%
  mutate(SAMPLE = str_replace(SAMPLE, "LLNL_", "")) %>%
  filter(ALT_QUAL > 20) %>%
  filter(FILTER == 'PASS')

names(df) = gsub(x = names(df), pattern = "ANN\\[\\*\\]\\.", replacement = "")

df = df %>%
  select("SAMPLE", "GENE", "POS", "REF", "ALT", "AF", "DP", "EFFECT", 
         "HGVS_C", "HGVS_P", "CDNA_POS", "AA_POS", "FILE") %>%
  unique() %>%
  mutate(PIPELINE = "ivar", ALT_DP = as.integer(DP * AF))

saveRDS(df, file = file.path(summary_output_file_path, "_summary_ivar.Rds"))
rm(df)

The ALT_DP column has been added as as.integer(DP * AF).

Also, the HGVS_P column is converted to REF_AA and ALT_AA columns, and any synonymous mutations are removed.

# keeper_effects, replacement and patterns already defined above

df_ivar = readRDS(file = file.path(summary_output_file_path, "_summary_ivar.Rds")) %>% 
  filter(EFFECT %in% keeper_effects) #%>%
  #filter(AF >= 0.01)

df_ivar = df_ivar %>%
  parse_ref_alt() %>%
  filter(REF_AA != ALT_AA)

# Use mappygene to add the NSP column (or ALL_GENES if nsp = FALSE) and the ORF column
df_ivar <- mappgene_summary_populate_df_cols(df_ivar, nsp=TRUE, out_type='str')
ivar_with_B = length(unique(df_ivar$SAMPLE))

df_ivar = df_ivar %>%
  filter(!SAMPLE %in% samples_to_exclude)

ivar_without_B = length(unique(df_ivar$SAMPLE))

Finally, the samples below were removed, bringing the total iVar sample count from r ivar_with_B to r ivar_without_B.

tibble("Excluded Samples" = samples_to_exclude) %>%
  knitr::kable()

Pipeline Summary and Merging

df_ivar = df_ivar  %>%
  mutate(SNV = paste(GENE, POS, REF, ALT, sep = "."))

df_lofreq = df_lofreq  %>%
  mutate(SNV = paste(GENE, POS, REF, ALT, sep = "."))

df = bind_rows(df_ivar, df_lofreq) %>%
  arrange(POS)

saveRDS(df, file = file.path(summary_output_file_path, "_summary_DF.Rds"))

df %>%
  write_delim(file.path(summary_output_file_path, "_summary_DF.txt"), delim = "\t")
lofreq = df %>% 
  filter(PIPELINE == 'lofreq') %>%
  pull(SAMPLE) %>% unique()

ivar = df %>% 
  filter(PIPELINE == 'ivar') %>%
  pull(SAMPLE) %>% unique()

sample_count = df %>%
  pull(SAMPLE) %>% unique()

x1 = list(lofreq = unique(df_lofreq$SNV),
         ivar = unique(df_ivar$SNV)  
         )

ggVennDiagram(x1) +
  scale_fill_gradient(low = "#F4FAFE", high = "#4981BF") +
  scale_color_manual(values = c("black", "black")) +
  theme(legend.position = "none") +
  labs(title = "SNVs")

SAMPLEs: There are r length(sample_count) total samples with data so far: r length(setdiff(lofreq, ivar)) processed with only lofreq, r length(setdiff(ivar, lofreq)) processed with only ivar, and r length(intersect(lofreq, ivar)) with both.

lofreq = df %>% 
  filter(PIPELINE == 'lofreq') %>%
  pull(SNV) %>% unique()

ivar = df %>% 
  filter(PIPELINE == 'ivar') %>%
  pull(SNV) %>% unique()

snv_count = df %>%
  pull(SNV) %>% unique()

x2 = list(lofreq = unique(paste0(df_lofreq$SNV, df_lofreq$SAMPLE)),
         ivar = unique(paste0(df_ivar$SNV, df_ivar$SAMPLE)))

ggVennDiagram(x2) +
  scale_fill_gradient(low = "#F4FAFE", high = "#4981BF") +
  scale_color_manual(values = c("black", "black")) +
  theme(legend.position = "none") +
  labs(title = "Sample + SNVs")

SNVs: There are r length(snv_count) total SNVs identified so far by 'lofreq' or 'ivar' in at least one sample, r length(setdiff(lofreq, ivar)) found with just lofreq, r length(setdiff(ivar, lofreq)) found with just ivar, and r length(intersect(lofreq, ivar)) found with both.

df %>%
  mutate(C = 1) %>%
  group_by(SAMPLE, SNV, PIPELINE) %>%
  summarise(S = sum(C)) %>%
  arrange(SNV) %>%
  pull(S) %>% table() %>% knitr::kable()

Effects

snpSIFT classifies each SNV effect into several categories. Here are the categories and their total frequencies across all samples. Only the "blue" categories are used in this analysis. Most of the "ivar" effect categories are removed by the ALT_QUAL > 20 filtering.

lofreq_effects = readRDS(file = file.path(summary_output_file_path, "_summary_lofreq.Rds")) #%>% 
  #filter(AF <= 0.01)

ivar_effects = readRDS(file = file.path(summary_output_file_path, "_summary_ivar.Rds")) #%>% 
  #filter(AF >= 0.01)

rbind(lofreq_effects, ivar_effects) %>%
  group_by(EFFECT, PIPELINE) %>%
  count() %>%
  ggplot(aes(x = n, y = EFFECT)) +
    jak_theme() +
    geom_col(position = "dodge", aes(fill = ifelse(EFFECT %in% keeper_effects, "#6495ed", "gray30"))) +
    geom_text(aes(label = n, x = n), color = "#AAAAAA", hjust = 1) +
    scale_fill_identity() +
    facet_wrap(~PIPELINE) +
    theme(legend.position = "none") + 
    scale_x_continuous(labels=function(x) format(x, big.mark = ",", decimal.mark = ".", scientific = FALSE), trans = "log10") 

EFFECT Codes

EFFECT | SO | Description
--- | --- | ---
missense_variant | SO:0001583 | A sequence variant, that changes one or more bases, resulting in a different amino acid sequence but where the length is preserved.
stop_gained | SO:0001587 | A sequence variant whereby at least one base of a codon is changed, resulting in a premature stop codon, leading to a shortened polypeptide
conservative_inframe_deletion | SO:0001825 | An inframe decrease in cds length that deletes one or more entire codons from the coding sequence but does not change any remaining codons
disruptive_inframe_deletion | SO:0001826 | An inframe decrease in cds length that deletes bases from the coding sequence starting within an existing codon
conservative_inframe_insertion | SO:0001823 | An inframe increase in cds length that inserts one or more codons into the coding sequence between existing codons
disruptive_inframe_insertion | SO:0001824 | An inframe increase in cds length that inserts one or more codons into the coding sequence within an existing codon.

Parsing of EFFECTs and HGVS_P

There are 9 different ways to parse the HGVS_P based on the EFFECT and whether there is an insertion, deletion or duplication. The table below has an example of each of the 9 possibilities.

TYPE | GENE | POS | REF | ALT | AF | DP | EFFECT | HGVS_C | HGVS_P | REF_AA | ALT_AA | CDNA_POS | AA_POS
--- | --- | --- | --- | --- | --- | --- | --- | --- | --- | --- | --- | --- | ---
1 | S | 21575 | C | G | 0.012994 | 106739 | missense_variant | c.13C>G | p.Leu5Val | L | V | 13 | 5
2 | S | 23560 | TGGTGCAGGTATA | T | 0.01875 | 160 | conservative_inframe_deletion | c.1999_2010delGGTGCAGGTATA | p.Gly667_Ile670del | G667_I670 | del | 1999 | 667
3 | S | 21566 | TTTGTTTTTC | T | 5.15E-04 | 101017 | disruptive_inframe_deletion | c.12_20delTCTTGTTTT | p.Phe4_Val6del | F4_V6 | del | 12 | 4
4 | S | 21650 | AATTCTT | A | 0.00122 | 4919 | disruptive_inframe_deletion | c.89_94delATTCTT | p.Asn30_Phe32delinsIle | N30_F32 | insI | 89 | 30
5 | ORF1ab | 20188 | C | CCTGAAA | 2.87E-04 | 45358 | conservative_inframe_insertion | c.571_576dupGAAACT | p.Glu191_Thr192dup | E191_T192 | dup | 577 | 193
6 | M | 26720 | G | GCTTTTTAGC | 9.07E-04 | 36366 | conservative_inframe_insertion | c.201_202insTTTAGCCTT | p.Leu67_Ala68insPheSerLeu | L67_A68 | insFSL | 202 | 68
7 | ORF1ab | 3705 | C | CATT | 1.31E-04 | 258674 | disruptive_inframe_insertion | c.992_994dupTAT | p.Leu331dup | L331 | dup | 995 | 332
8 | ORF1ab | 9395 | T | TTAGTAAGTAAATTTA | 1.99E-04 | 75341 | disruptive_inframe_insertion | c.9130_9131insTAGTAAGTAAATTTA | p.Ser3044delinsLeuValSerLysPheThr | S3044 | insLVSKFT | 9131 | 3044
9 | ORF1ab | 386 | G | T | 0.002922 | 60925 | stop_gained | c.121G>T | p.Glu41* | E | * | 121 | 41
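
As an illustration of the simplest case only (TYPE 1; this is not the package's parse_ref_alt() function), a missense HGVS_P string such as p.Leu5Val can be split into the one-letter REF_AA and ALT_AA codes with a regular expression and a three-letter-to-one-letter lookup:

aa3to1 = c(Ala = "A", Arg = "R", Asn = "N", Asp = "D", Cys = "C",
           Gln = "Q", Glu = "E", Gly = "G", His = "H", Ile = "I",
           Leu = "L", Lys = "K", Met = "M", Phe = "F", Pro = "P",
           Ser = "S", Thr = "T", Trp = "W", Tyr = "Y", Val = "V")

parse_missense_example = function(hgvs_p) {
  m = str_match(hgvs_p, "^p\\.([A-Za-z]{3})(\\d+)([A-Za-z]{3}|\\*)$")
  tibble(REF_AA = unname(aa3to1[m[, 2]]),
         AA_POS = as.integer(m[, 3]),
         ALT_AA = ifelse(m[, 4] == "*", "*", unname(aa3to1[m[, 4]])))
}

parse_missense_example(c("p.Leu5Val", "p.Glu41*"))
#> REF_AA "L" and "E", AA_POS 5 and 41, ALT_AA "V" and "*"

The deletion, insertion and duplication forms in the rest of the table each need their own pattern, which is why the full parsing is handled by parse_ref_alt().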

MappgeneSummary::view_HGVS_C("c.13C>G")

table(df$ALT_AA, df$EFFECT, df$PIPELINE) %>%
  as.data.frame() %>%
  dplyr::rename("EFFECT" = "Var2", "ALT_AA" = "Var1", "PIPELINE" = "Var3") %>%
  filter(Freq > 0) %>%
  format_dt(file = "SNV_count")

SNV Sample Count per Pipeline

Each SNV was given a unique identifier made from the gene name, the chromosome position, and the REF and ALT nucleotides, all separated by a period. So, S.23367.C.A is in the S gene, at chromosome position 23,367, and results in the reference "C" converted to an "A".
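
The identifier is just a paste() of those four fields:

paste("S", 23367, "C", "A", sep = ".")
#> [1] "S.23367.C.A"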

Below is the Sample count of all unique SNVs in both the lofreq and ivar pipelines.

df %>%
  group_by(GENE, SNV, PIPELINE) %>%
  count() %>%
  pivot_wider(names_from = PIPELINE, values_from = n, values_fill = list(n = 0)) %>%
  format_dt()

df %>%
  group_by(GENE, SNV, PIPELINE) %>%
  count() %>%
  pivot_wider(names_from = PIPELINE, values_from = n, values_fill = list(n = 0)) %>%
  ggplot(aes(x = lofreq, y = ivar)) +
    jak_theme() +
    geom_hex()

Visualization

Just some examples of different plots that can be generated. These are typically static images, although some interaction could potentially be added later.

Heatmap

Showing only the S protein.

df %>%
  mutate(SAMPLE = paste(SAMPLE, PIPELINE, sep = "_")) %>%
  filter(GENE == 'S') %>%
  select(SAMPLE, SNV, AF) %>% 
  pivot_wider(names_from = SAMPLE, values_from = AF, values_fill = list(AF = -1)) %>%
  column_to_rownames('SNV') %>%
  as.matrix() %>%
  pheatmap::pheatmap(show_rownames = T,
          border_color = NA,
          color = viridis::viridis(100))

NMDS

Between Pipelines

df.v1 =  df %>%
  mutate(SAMPLE = paste(SAMPLE, PIPELINE, sep = "_")) %>%
  select(SAMPLE, SNV, AF) %>% 
  unique() %>%
  pivot_wider(names_from = SAMPLE, values_from = AF, values_fill = list(AF = 0)) %>%
  column_to_rownames('SNV') %>%
  as.matrix() %>% 
  t() %>%
  vegan::metaMDS(distance = "bray", trace = 0)

SAMPLEs = as.data.frame(df.v1$points) %>%
  rownames_to_column("SAMPLE") %>%
  separate(SAMPLE, into = c('CADPH', "PIPELINE"), sep = "_", remove = F)

SNVs = as.data.frame(df.v1$species)
SNVs$SNV = rownames(SNVs)

ggplot(SAMPLEs, aes(x = MDS1, y = MDS2)) +   
  jak_theme() +
  ggrepel::geom_text_repel(aes(label = CADPH), size = 3) +  
  geom_point(data = SNVs, size = 1, alpha = 0.3) +  
  geom_point(size = 3, pch = 24, alpha = 0.8, aes(fill = PIPELINE)) +
  scale_fill_manual(values = palette_jak$bay(2)) +
  #facet_wrap(~PHYLUM) +
  theme(legend.position = "right") + 
  labs(title = "Bray Curtis Dissimilarity")

The stress of the above plot is r df.v1$stress * 100 (values > 10-15% are generally "high stress").

Between Samples

If a SNV was found in a sample by both ivar and lofreq, the frequency is averaged.

df.v2 = df %>%
  select(SAMPLE, SNV, AF) %>% 
  group_by(SAMPLE, SNV) %>%
  summarize(AF_MEAN = mean(AF)) %>%
  unique() %>%
  pivot_wider(names_from = SAMPLE, values_from = AF_MEAN, values_fill = list(AF_MEAN = 0)) %>%
  column_to_rownames('SNV') %>%
  as.matrix() %>% 
  t() %>%
  vegan::metaMDS(distance = "bray", trace = 0)

SAMPLEs = as.data.frame(df.v2$points) %>%
  rownames_to_column("SAMPLE")

SNVs = as.data.frame(df.v2$species)
SNVs$SNV = rownames(SNVs)

SNVs = SNVs %>%
  mutate(PIPELINE = case_when(
    SNV %in% df_lofreq$SNV & SNV %in% df_ivar$SNV ~ "BOTH",
    SNV %in% df_lofreq$SNV ~ "lofreq",
    SNV %in% df_ivar$SNV ~ "ivar"
  ))

ggplot(SAMPLEs, aes(x = MDS1, y = MDS2)) +   
  jak_theme() +
  ggrepel::geom_text_repel(aes(label = SAMPLE), size = 3) +  
  geom_point(size = 5, pch = 24, alpha = 0.7, fill = "#6495ed") +
  geom_point(data = SNVs, size = 2,pch = 21,  alpha = 0.5, aes(fill = PIPELINE)) +  
  scale_fill_manual(values = palette_jak$bay(3)) +
  theme(legend.position = "right")

The stress of the above plot is r df.v2$stress * 100 (values > 10-15% are generally "high stress").

Summary Tables

Amplicon Summary

This table is too big to display here. It is saved as a tab-delimited file here: r file.path(summary_output_file_path, "_amplicon_summary.txt").
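
To explore it outside of this report, the file can be read back in with the same readr function used throughout this summary:

amplicon_summary = read_delim(file.path(summary_output_file_path, "_amplicon_summary.txt"), delim = "\t")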

# https://martinctc.github.io/blog/vignette-downloadable-tables-in-rmarkdown-with-the-dt-package/
as = df %>%
  #filter(EFFECT %in% keeper_effects | PIPELINE == 'ivar') %>%
  filter(EFFECT %in% keeper_effects) %>%
  select(SAMPLE, PIPELINE, POS, SAMPLE, REF, ALT, AF, GENE, NSP, ORF, REF_AA, ALT_AA, AA_POS, SNV) %>%
  group_by(PIPELINE, POS, REF, ALT, GENE, NSP, ORF, REF_AA, ALT_AA, AA_POS, SNV) %>%
  summarize(SAMPLE_COUNT = n(), AF_MEAN = mean(AF), AF_SD = sd(AF))

as %>%
  write_delim(file.path(summary_output_file_path, "_amplicon_summary.txt"), delim = "\t")

Amplicon Result

This table is too big to display here. It is saved as a tab-delimited file here: r file.path(summary_output_file_path, "_amplicon_result.txt").

ar = df %>%
  filter(EFFECT %in% keeper_effects) %>%
  select(POS, SAMPLE, PIPELINE, REF, ALT, DP, AF, GENE, NSP, ORF, REF_AA, ALT_AA, AA_POS, SNV)

ar %>%
  write_delim(file.path(summary_output_file_path, "_amplicon_result.txt"), delim = "\t")

outbreak.info API

The outbreak.info API is currently undergoing changes and is not available.

location_palette = c("California" = "#00496F", "United States" = "#EDD746", "Worldwide" = "#DD4124")

#Note, only mutations with the 'missense_variant' `EFFECT` are searchable in the API. All others are not included below.
df %>%
  filter(EFFECT == 'missense_variant') %>%
  mutate(API_CODE = paste0(GENE, ':', REF_AA, AA_POS, ALT_AA)) %>% 
  arrange(desc(AF)) %>%
  select(SAMPLE, EFFECT, API_CODE, SNV, PIPELINE, GENE, NSP, ORF, POS, REF, ALT, AF, REF_AA, ALT_AA) %>%
  format_dt()
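
The API_CODE column follows the gene:mutation convention used by outbreak.info queries, built with paste0(); for example (hypothetical values):

paste0("S", ":", "D", 614, "G")
#> [1] "S:D614G"
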
top_lofreq = df %>%
  filter(PIPELINE == "lofreq", GENE == "S") %>%
  filter(EFFECT == 'missense_variant') %>%
  mutate(API_CODE = paste0(GENE, ':', REF_AA, AA_POS, ALT_AA)) %>% 
  arrange(desc(AF)) %>%
  select(API_CODE) %>%
  unique() %>%
  head(api_top_count) %>%
  pull(API_CODE)

top_lofreq_results = get_outbreak_info(top_lofreq)

write_delim(top_lofreq_results, file = file.path(summary_output_file_path, "_lofreq_top_outbreak_results.txt"), delim = "\t")

top_ivar = df %>%
  filter(PIPELINE == "ivar", GENE == "S") %>%
  filter(EFFECT == 'missense_variant') %>%
  mutate(API_CODE = paste0(GENE, ':', REF_AA, AA_POS, ALT_AA)) %>% 
  arrange(desc(AF)) %>%
  select(API_CODE) %>%
  unique() %>%
  head(api_top_count) %>%
  pull(API_CODE)

top_ivar_results = get_outbreak_info(top_ivar)

write_delim(top_ivar_results, file = file.path(summary_output_file_path, "_ivar_top_outbreak_results.txt"), delim = "\t")

top_lofreq_results = read_delim(file = file.path(summary_output_file_path, "_lofreq_top_outbreak_results.txt"), delim = "\t")

top_ivar_results = read_delim(file = file.path(summary_output_file_path, "_ivar_top_outbreak_results.txt"), delim = "\t")

a = tryCatch(
  expr = {plot_outbreak_trends(top_lofreq_results, title = "Lofreq Proportions (free y-axis scale)") +
      jak_theme() +
      scale_fill_manual(values = location_palette) +
      scale_color_manual(values = location_palette)},
  error = function(e){ 
        print("uh oh")
    }
)

b = plot_outbreak_trends(top_ivar_results, title = "iVar Proportions (free y-axis scale)") +
      jak_theme() +
      scale_fill_manual(values = location_palette) +
      scale_color_manual(values = location_palette)

a + b

m = bind_rows(top_lofreq_results, top_ivar_results) %>%
      get_outbreak_peak()

m_light = m %>%
  ungroup() %>%
  select(CODE, PERCENT)

f = bind_rows(top_lofreq_results, top_ivar_results) %>%
  get_outbreak_start()

l = bind_rows(top_lofreq_results, top_ivar_results) %>%
  get_outbreak_latest()

bind_rows(f, l, m) %>%
  select(lineage, location, DATE, TYPE, CODE) %>%
  pivot_wider(values_from = DATE, names_from = TYPE) %>%
  left_join(m_light, by = "CODE") %>%
  select(-CODE) %>%
  dplyr::rename("PEAK_PERCENT" = "PERCENT") %>%
  format_dt()

bind_rows(m, f, l) %>%
  mutate(location = as.factor(location), location = fct_relevel(location, c("CA", "USA", "GLOBAL"))) %>%
  mutate(lineage = as.factor(lineage), lineage = fct_relevel(lineage, rev(levels(lineage)))) %>%
  ggplot(aes(x = DATE, y = location, fill = TYPE)) +
    jak_theme() +
    geom_point(pch = 21, size = 3, alpha = 0.7) +
    scale_fill_manual(values = palette_jak$bay(3)) +
    facet_wrap(~lineage, ncol = 3) +
    labs(title = "First, last and peak occurrence", x = "Month-Year") + 
    scale_x_date(date_breaks = "months", date_labels = "%b-%y")

bind_rows(m, f, l) %>%
  mutate(location = as.factor(location), location = fct_relevel(location, c("Worldwide", "United States", "California"))) %>%
  mutate(lineage = as.factor(lineage), lineage = fct_relevel(lineage, rev(levels(lineage)))) %>%
  mutate(TYPE = fct_relevel(TYPE, 'FIRST', 'PEAK', 'LAST')) %>%
  arrange(TYPE) %>% # some wonkiness to get the line to plot in the right order when two TYPEs have the same data
  ggplot(aes(x = DATE, y = TYPE, color = location)) +
    jak_theme() +
    geom_path(aes(group = location), size = 1, alpha = 0.7) + # some wonkiness to get the line to plot in the right order when two TYPEs have the same data
    geom_point(size = 2, alpha = 0.7) +
    scale_color_manual(values = location_palette) +
    facet_wrap(~lineage, ncol = 3) +
    labs(title = "First, peak and last occurrence", x = "Month-Year") + 
    scale_x_date(date_breaks = "months" , date_labels = "%b-%y")

VOI & VOC

a = readxl::read_excel(system.file("extdata", "mutation list for VOC and VOI 11.16.21.xlsx", package = "MappgeneSummary"), col_names = FALSE)
b1 = readxl::read_excel(system.file("extdata", "mutation list for VOC and VOI 12.02.21v2.xlsx", package = "MappgeneSummary"), sheet = 1, col_names = FALSE)
b2 = readxl::read_excel(system.file("extdata", "mutation list for VOC and VOI 12.02.21v2.xlsx", package = "MappgeneSummary"), sheet = 2, col_names = FALSE)
c = readxl::read_excel(system.file("extdata", "updated omicron and sublineages mutations 05.04.2022.xlsx", package = "MappgeneSummary"))

colnames(a) = c("GENE", "VOI_VOC", "AA_RANGE")
colnames(b1) = c("GENE", "VOI_VOC", "AA_RANGE")
colnames(b2) = c("GENE", "VOI_VOC", "AA_RANGE")
colnames(c) = c("GENE", "VOI_VOC", "AA_RANGE")

VOI_VOC = rbind(a[,1:3], b1[,1:3], b2[,1:3], c[,1:3]) %>%
  unique() %>%
  separate(AA_RANGE, sep = "/", into = c("A", "B"), remove = F) %>% # split "start/end" ranges
  mutate(B = ifelse(is.na(B), A, B)) %>% # single positions have no "/", so the end is the start
  transmute(GENE, VOI_VOC, AA_RANGE,
            AA_POS = map2(as.numeric(A), as.numeric(B), seq, by = 1)) %>% # expand each range into every AA position it covers
  unnest(cols = AA_POS) %>%
  mutate(CODE = paste(GENE, AA_POS, sep = "_")) %>%
  select(-GENE, -AA_POS) %>%
  arrange(CODE)

df_VOI_VOC = readRDS(file = file.path(summary_output_file_path, "_summary_DF.Rds")) %>%
  mutate(GENE = case_when(
    GENE == "ORF1ab" & AA_POS >= 4401 ~ "ORF1b",
    GENE == "ORF1ab" & AA_POS < 4401 ~ "ORF1a",
    TRUE ~ GENE
  )) %>%
  mutate(AA_POS = ifelse(GENE == "ORF1b", AA_POS - 4401, AA_POS)) %>% 
  mutate(CODE = paste(GENE, AA_POS, sep = "_"))

df_VOI_VOC = df_VOI_VOC %>%
  left_join(VOI_VOC, by = "CODE") %>%
  select(-FILE, -SNV, -REF, -ALT, -CDNA_POS) %>%
  filter(VOI_VOC != "NA") %>% 
  arrange(GENE, AA_RANGE) %>%
  group_by(GENE, NSP, ORF, VOI_VOC, AA_RANGE, AA_POS, REF_AA,  ALT_AA, HGVS_C, HGVS_P, PIPELINE) %>%
  summarize(SAMPLE_LIST = list(SAMPLE)) %>%
  mutate(SAMPLE_COUNT = lengths(SAMPLE_LIST), SAMPLES = mapply(paste0, SAMPLE_LIST, collapse = ",")) %>%
  select(-SAMPLE_LIST)

df_VOI_VOC %>%
  format_dt(row_count = 20, file = "VOI_VOC_list")

df_VOI_VOC %>% write_delim(file.path(summary_output_file_path, "_VOI_VOC.txt"), delim = "\t")

Session Info

df_sess = sessioninfo::platform_info() %>%
  unlist() %>%
  as.data.frame() %>%
  rownames_to_column()

colnames(df_sess) = c("name", "value")

df_sess %>%
  knitr::kable()

sessioninfo::package_info() %>%
  as_tibble() %>%
  filter(attached == TRUE) %>%
  #arrange(desc(attached)) %>%
  select(package, loadedversion, source, date, attached) %>%
  format_dt(row_count = 20)
