In saketkc/covmuller: A tool to model COVID19 variant prevalence using data from GISAID

# Default knitr chunk options applied to every code chunk in this document.
knitr::opts_chunk$set(
  collapse = TRUE,
  comment = "#>",
  gganimate = list(nframes = 0),
  out.width = "100%",
  echo = TRUE,
  warning = FALSE,
  message = FALSE,
  fig.dim = c(12, 5)
)

suppressPackageStartupMessages({
  library(covmuller)
  library(tidyverse)
})

Get cases data

We use covid19bharat.org to get a tally of daily confirmed cases and then summarize it at the monthly level.

# Monthly confirmed-case counts in long format.
indian_state_cases <- GetIndiaConfirmedCasesMonthlyLong()

# Keep only the rows labelled "India" whose monthly value exceeds 1.
india_cases <- filter(indian_state_cases, State == "India", value > 1)
head(india_cases)

Plot cases for India

It is easy to visualize the monthly case counts on a bar plot:

# Bar plot of the monthly case counts held in `india_cases`.
p1 <- BarPlot(
  india_cases,
  ylabel = "Cases per month",
  label_si = TRUE,
  title = "Total cases per month - India",
  caption = "**Source: covid19bharat.org**<br>"
)
p1

Read variant data from GISAID

We use GISAID data to look at the prevalence of variants. To access this data, GISAID requires registration.

# Date stamp embedded in the names of the GISAID metadata files.
current_date <- "2024_04_11"
fpath.tar <- paste0("~/data/epicov/metadata_tsv_", current_date, ".tar.xz")
fpath.qs <- paste0("~/data/epicov/metadata_tsv_", current_date, ".qs")

# Read the tarball only when no serialized `.qs` copy exists yet, and save one
# for later runs; otherwise load the serialized copy directly.
if (!file.exists(fpath.qs)) {
  gisaid_metadata <- ReadGISAIDMetada(path = fpath.tar)
  qs::qsave(gisaid_metadata, fpath.qs)
} else {
  gisaid_metadata <- qs::qread(file = fpath.qs)
}

Plot total sequenced cases

We can look at the absolute number of cases that have been sequenced in a country by filtering the metadata made available by GISAID (which covers all countries) down to that country. Here, we visualize the total sequenced cases coming from India:

# Restrict the global GISAID metadata to the entries from India.
gisaid_india <- FilterGISAIDIndia(gisaid_metadata_all = gisaid_metadata)

# Monthly sequence counts for the whole country.
country_seq_stats <- TotalSequencesPerMonthCountrywise(
  gisaid_india,
  rename_country_as_state = TRUE
)

# The caption uses markdown bold like the other plots in this document; the
# closing `**` must not be preceded by a space or it is not treated as a
# closing emphasis delimiter.
p2 <- BarPlot(
  country_seq_stats,
  ylabel = "Sequenced per month",
  color = "slateblue1",
  label_si = TRUE,
  title = "Total sequences deposited to GISAID from India",
  caption = "**Source: gisaid.org**<br>"
)
p2

Overall, how much has India sequenced over months?

While the absolute numbers are informative, a more useful metric is the proportion of cases (cases sequenced over total cases) that are getting sequenced. Here we look at the proportion of cases that have been sequenced in India over the course of the pandemic:

# india_cases_long <- GetIndiaConfirmedCasesMonthlyLong() %>% filter(State == "India")
# Download the Our World in Data daily new-cases table and aggregate the
# `India` column to one row per month.
#
# Takes no arguments. Returns a data frame with one row per `MonthYear`
# (as produced by `GetMonthYear()` from the `date` column) and a `value`
# column holding the sum of that month's daily cases, sorted by `MonthYear`.
GetIndiaCases <- function() {
  data <- read.csv("https://raw.githubusercontent.com/owid/covid-19-data/master/public/data/cases_deaths/new_cases.csv")
  confirmed <- data %>% select(date, India)
  colnames(confirmed)[2] <- "cases"
  confirmed$MonthYear <- GetMonthYear(confirmed$date)

  # Missing daily counts are ignored rather than turning the monthly sum into
  # NA. The pipeline is the last expression so the result is returned visibly.
  confirmed %>%
    group_by(MonthYear) %>%
    summarise(value = sum(cases, na.rm = TRUE)) %>%
    arrange(MonthYear)
}
# Tag the monthly totals with constant `State` and `type` columns before
# combining them with the sequencing counts.
india_cases_long <- GetIndiaCases() %>%
  mutate(State = "India", type = "Confirmed")

# Combine the monthly sequence counts with the monthly confirmed cases.
india_sequencing_proportion <- CombineSequencedCases(
  cases_sequenced = country_seq_stats,
  confirmed_long = india_cases_long
)

p3 <- BarPlot(
  india_sequencing_proportion,
  yaxis = "percent_sequenced_collected",
  ylabel = "%  deposited to GISAID",
  color = "yellowgreen",
  title = "Proportion of cases deposited to GISAID from India",
  caption = "**Source: gisaid.org and ourworldindata.org/coronavirus**<br>"
)
p3

Plot proportion of cases that have been deposited from each state

We can further break down the proportion of sequenced cases at the state level:

# Monthly sequence counts per state, stacked under the country-level counts.
state_seq_stats <- TotalSequencesPerMonthStatewise(
  gisaid_india,
  drop_country = TRUE
)
seq_stats <- rbind(country_seq_stats, state_seq_stats)

# Monthly confirmed cases in long format.
state_cases_long <- GetIndiaConfirmedCasesMonthlyLong()

# Combine sequenced and confirmed counts for the chosen month window.
india_sequencing_proportion <- CombineSequencedCases(
  cases_sequenced = seq_stats,
  confirmed_long = state_cases_long,
  month.min = "Jan 2022",
  month.max = "Feb 2023",
  max.percent = 5
)

# Use the ordering returned by `GetIndianStates()` as the factor levels.
india_sequencing_proportion[["State"]] <- factor(
  x = india_sequencing_proportion[["State"]],
  levels = as.character(GetIndianStates())
)
p4 <- PlotSequencedPropHeatmap(india_sequencing_proportion)
# p4

# In terms of absolute numbers sequenced:

# Work on a copy so that `seq_stats` keeps its original column types.
seq_stats2 <- seq_stats
# seq_stats2 <- seq_stats2 %>% filter(MonthYear >= "July 2022")

# Build the factor from `seq_stats2` itself (not `seq_stats`) so the column
# stays aligned with the rows of `seq_stats2` if the filter above is enabled.
seq_stats2$MonthYear <- factor(seq_stats2$MonthYear)

# Use the ordering returned by `GetIndianStates()` as the factor levels.
seq_stats2$State <- factor(
  x = seq_stats2$State,
  levels = as.character(GetIndianStates())
)

p4.total <- PlotTotalHeatmap(df = seq_stats2)
p4.total

Plot Prevalence

Finally, we look at the prevalence of variants and variants of concern (VOCs):

# Month-wise variant counts for India, converted to prevalence.
india_month_counts <- SummarizeVariantsMonthwise(gisaid_india)
india_month_counts$State <- "India"
india_month_prevalence <- CountsToPrevalence(india_month_counts)

vocs <- GetVOCs()

# Extra lineage-pattern -> label mapping passed alongside `vocs`;
# the names are lineage patterns, the values are the labels they collapse to.
custom_voc_mapping <- list(
  "JN.1" = "JN.1",
  "JN.1.*" = "JN.1",
  "HV.1" = "HV.1",
  "HV.1.*" = "HV.1",
  "B.1" = "B.1",
  "B.1.1.306" = "B.1",
  "B.1.1.306.*" = "B.1",
  "B.1.1.326" = "B.1",
  "B.1.36.29" = "B.1",
  "B.1.560" = "B.1",
  "B.1.1" = "B.1",
  "B.1.210" = "B.1",
  "B.1.36.8" = "B.1",
  "B.1.36" = "B.1",
  "B.1.36.*" = "B.1"
)

india_month_prevalence <- CollapseLineageToVOCs(
  variant_df = india_month_prevalence,
  vocs = vocs,
  custom_voc_mapping = custom_voc_mapping,
  summarize = FALSE
)

p5 <- StackedBarPlotPrevalence(india_month_prevalence)
p5

For an animated version of the prevalence plot, check out VariantAnimation.

saketkc/covmuller documentation built on April 19, 2024, 10:14 a.m.

rdrr.io home R language documentation Run R code online

CRAN packages Bioconductor packages R-Forge packages GitHub packages

Note that we can't provide technical support on individual packages. You should contact the package authors for that.

saketkc/covmuller
A tool to model COVID19 variant prevalence using data from GISAID

In saketkc/covmuller: A tool to model COVID19 variant prevalence using data from GISAID

Get cases data

Plot cases for India

Read variant data from GISAID

Plot total sequenced cases

Overall, how much has India sequenced over months?

Plot proportion of cases that have been deposited from each state

Plot Prevalence

R Package Documentation

Browse R Packages

We want your feedback!

saketkc/covmuller A tool to model COVID19 variant prevalence using data from GISAID

In saketkc/covmuller: A tool to model COVID19 variant prevalence using data from GISAID

Get cases data

Plot cases for India

Read variant data from GISAID

Plot total sequenced cases

Overall, how much has India sequenced over months?

Plot proportion of cases that have been deposited from each state

Plot Prevalence

R Package Documentation

Browse R Packages

We want your feedback!

saketkc/covmuller
A tool to model COVID19 variant prevalence using data from GISAID