# Chunk option default for this document: echo = TRUE
knitr::opts_chunk$set(echo = TRUE)
# Packages used by the code in this document
library(SaviR)
library(dplyr)
library(zoo)
library(tidyr)
library(purrr)
library(ggplot2)

Overview

There are two main sources for testing data that we use: Our World in Data (OWID) and FIND. Both are compiling data from various sources to put together their datasets. Briefly, information on tests can be used in several different ways: tests per 1000 persons in a population (relevant for assessing whether the surveillance is sufficient) and test positivity rate (positive samples divided by number of tests for some time period), and variations on these, such as tests per case.

Note that Our World in Data will no longer be updating after June 23, 2022!

Therefore, these methods apply to historical data from both these data sources, but for prospective data, these functions will return data from FIND.

The purpose of this document is to give some explanation of the two datasets and some of the approaches we have taken in extracting relevant metrics from each source for various projects.

Briefly, for each country we look back 14 days to find the most recent 7-day average of new daily tests per 1K persons and the recent test positivity rate (7-day number of positives or cases divided by 7-day number of tests). We document which countries do not have recent data; we also document which countries have recent data that has a quality flag due to FIND or OWID reporting irregularities in some of the numbers (e.g., total number of tests decreasing or suddenly jumping by a large amount).

Getting started with the testing data

To directly get a dataset with all countries and data from the preferred source based on the flags described in detail below:

# One call: all countries, with data taken from the preferred source
# (selected using the quality flags described later in this document).
testing_processed <- get_testing()
testing_processed

get_testing() uses the SaviR functions get_testing_long() and get_preferred_testpos7() under the hood. This means that the preferred source is selected based on flags applied to test numbers and test positivity.

In addition, you can pull data in from both sources directly:

# Pull the data from both sources (OWID and FIND) directly, before any
# preferred-source selection is applied.
testing_long <- get_testing_long()
testing_long

To get the preferred data source for each country by date directly, using both test numbers and test positivity lagged seven days from the observation date:

# Preferred source per country, judged on both test numbers and test
# positivity (lagged seven days from the observation date).
preferred_source <- get_preferred_testpos7(testing_long)
preferred_source

Or just using test numbers over the past 14 days:

# Alternative selection that only considers test numbers over the past 14 days.
preferred_source_14 <- get_preferred_tests14(testing_long)

You can also change the date lookback over which the flags are applied using the argument last_X_days and change the maximum date for which to return data using analysis_date:

# last_X_days: lookback window (in days) over which the flags are applied.
# analysis_date: maximum date for which data are returned.
preferred_source_custom <-
  get_preferred_testpos7(testing_long, last_X_days = 10, analysis_date = as.Date("2021-03-01"))

To filter testing_long to the data from the preferred source for the most recent 7 days of data for each country:

# Attach each country's preferred source to the long data, then keep only
# the rows that come from that source. Rows whose country has no preferred
# source (NA) are dropped by the filter.
preferred_testing <-
  left_join(
    testing_long,
    select(preferred_source, id, preferred_source),
    by = "id"
  ) %>%
  filter(data_source == preferred_source)
preferred_testing

To generate the same metrics as get_testing(), given preferred-source information produced by either get_preferred_testpos7() or get_preferred_tests14():

# Carry the last observed value forward within each country so that short
# reporting gaps do not blank out the metrics. `maxgap = 14` limits the fill
# to runs of at most 14 consecutive NAs; longer gaps are left as NA.
testing_cleaned <-
  preferred_testing %>%
  group_by(id) %>%
  # Sort by date so the forward fill runs chronologically within each country.
  arrange(date) %>%
  mutate(
    # `na.rm = FALSE` keeps leading NAs so each result has the same length as
    # its input. `FALSE` is spelled out because `F` is a reassignable variable.
    new_cases_daily7 = zoo::na.locf(new_cases_daily7, na.rm = FALSE, maxgap = 14),
    new_tests_daily7 = zoo::na.locf(new_tests_daily7, na.rm = FALSE, maxgap = 14),
    new_tests_smoothed_per_thousand = zoo::na.locf(new_tests_daily7_per_1k, na.rm = FALSE, maxgap = 14),
    positive_rate = zoo::na.locf(positive_rate_7day, na.rm = FALSE, maxgap = 14)
  ) %>%
  ungroup()

Quality and availability of data by source and country

Looking at which data source is preferred over time and by country historically:

# Daily analysis dates covering a slice of historical data
ytd <- seq(as.Date("2022-03-01"), as.Date("2022-06-09"), by = 1)

# Re-run the preferred-source selection as of every analysis date, stack the
# results (tagging each row with the date it was computed for), then attach
# WHO region and country name from the SaviR lookup table.
preferred_timeseries <-
  purrr::map_dfr(
    ytd,
    function(snapshot_date) {
      mutate(
        get_preferred_testpos7(testing_long, analysis_date = snapshot_date),
        analysis_date = snapshot_date
      )
    }
  ) %>%
  left_join(select(SaviR::onetable, id, who_region, who_country))

# One tile per country and analysis date, coloured by the preferred source;
# rows without a WHO region are left out and NA sources are drawn in dark grey.
ggplot(
  filter(preferred_timeseries, !is.na(who_region)),
  aes(y = who_country, x = analysis_date, fill = preferred_source)
) +
  geom_tile() +
  scale_fill_brewer(
    palette = "Set1",
    na.value = "darkgrey",
    name = "Preferred\nsource"
  ) +
  theme(axis.text.y = element_text(size = 6)) +
  facet_grid(who_region ~ ., scales = "free_y", space = "free_y")

There are some countries with switching between data sets and some countries where data either is missing or both data sources failed the quality flags.

Looking at flags over time (focusing on the flags used in get_preferred_testpos7):

# Long table of quality flags since the start of 2022: one row per
# data source, country, date and flag (columns whose names start with "FLAG").
test_flags_recent <-
  testing_long %>%
    left_join(select(SaviR::onetable, id, who_region, who_country)) %>%
    filter(date > "2022-01-01") %>%
    select(data_source, id, who_region, who_country, date, starts_with("FLAG")) %>%
    tidyr::pivot_longer(starts_with("FLAG"))

# Number of countries flagged per day, source, flag and WHO region.
test_flags_recent %>%
  filter(name %in% c("FLAG_increase_tests_7day", "FLAG_negative_tests_7day", "FLAG_negative_cases_7day")) %>%
  group_by(date, data_source, name, who_region) %>%
  # `value %in% 1` counts NA flags as "not flagged" instead of propagating NA.
  # `.groups = "drop"` returns an ungrouped table and silences the
  # regrouping message that summarize() otherwise prints.
  summarize(n_countries_flagged = sum(value %in% 1), .groups = "drop") %>%
  ggplot() +
  geom_col(aes(x = date, y = n_countries_flagged, fill = who_region), width = 0.2,
           position = "stack") +
  facet_grid(data_source ~ name)

Overall, few days and countries fail the quality flags for either dataset (and there are no failures for the flag on 7-day cases).

Looking at flags over time & by country (in these plots white indicates missing and grey indicates an NA value):

# One tile plot per flag: countries on the y axis, dates on the x axis,
# faceted by WHO region (rows) and data source (columns). The result is a
# list of plots named after the flags.
c("FLAG_increase_tests_7day", "FLAG_negative_tests_7day") %>%
  purrr::set_names() %>%
  purrr::map(
      ~test_flags_recent %>%
        filter(name %in% .x, !is.na(who_region)) %>%
        ggplot(aes(x = date, y = who_country, fill = factor(value))) +
        geom_tile() +
        # NOTE(review): the two labels assume both flag values occur in the
        # data being plotted -- confirm this holds for every flag.
        scale_fill_brewer(palette = "Dark2", labels = c("Passed", "Failed"),
                          na.value = "darkgrey", name = "Quality flag") +
        # Axis titles match the aesthetics: x is the date, y is the country
        # (the original labels were swapped).
        labs(x = "Date", y = "Country",
             title = .x) +
        facet_grid(who_region ~ data_source, scales = "free_y", space = "free_y") +
        theme(axis.text.y = element_text(size = 8))
  )

Summary of testing data processing

We report the recent values for each country/area along with the preferred data source. Note that the preferred data source for a country can plausibly shift between FIND and OWID from one week to the next.

More details on flags

As mentioned, we attempt to flag the following instances for implausibility to prevent bad quality data from being reported:

More details on OWID and FIND sources

OWID

A large benefit of OWID data is the detailed methodology listed on both their website and in published research.

Key features of this data:

FIND

FIND data has less detailed documentation on their methods -- it is essentially all relegated to their code, which can be difficult to follow. Their data can be accessed from the following sources:

Key features of this data:

To summarize, there are 3 basic series provided by FIND:

SaviR functions work from the original values that FIND reports, and we use the same quality flags, summarized test metrics, and linear interpolation as with OWID (using a maximum gap of 31 days).

Appendix: checking functions to make sure they return the data even when OWID switches off

Checking functions to make sure they return data even when OWID data stops being updated:

# Same as the get_testing() function in the package, except that
# `testing_long` is passed in instead of being fetched inside the function.
# That lets the pipeline be run on a modified copy of the data.
#
# Arguments:
#   testing_long   Long testing data with (at least) the columns used below:
#                  id, date, data_source, new_tests_daily7_per_1k,
#                  new_tests_daily14_per_1k and positive_rate_7day.
#   analysis_date  Passed on to get_preferred_testpos7(); defaults to yesterday.
#
# Returns one row per country and date from that country's preferred source,
# with columns id, date, new_tests_smoothed_per_thousand,
# new_tests_smoothed_per_thousand_14, positive_rate and data_source.
get_testing_mod <- function(testing_long,
                            analysis_date = Sys.Date() - 1L) {
  # Pick the preferred source per country using a 14-day lookback.
  preferred <- get_preferred_testpos7(
    testing_long,
    last_X_days = 14,
    analysis_date = analysis_date
  )

  testing_long %>%
    # Keep only the rows from each country's preferred source; countries
    # with no preferred source (NA) drop out here.
    left_join(select(preferred, id, preferred_source), by = "id") %>%
    filter(data_source == preferred_source) %>%
    group_by(id) %>%
    arrange(date) %>%
    # Carry the last observation forward over gaps of at most 14 NAs.
    # `FALSE` is spelled out because `F` is a reassignable variable.
    mutate(
      new_tests_smoothed_per_thousand = zoo::na.locf(
        new_tests_daily7_per_1k,
        na.rm = FALSE, maxgap = 14
      ),
      new_tests_smoothed_per_thousand_14 = zoo::na.locf(
        new_tests_daily14_per_1k,
        na.rm = FALSE, maxgap = 14
      ),
      positive_rate = zoo::na.locf(
        positive_rate_7day,
        na.rm = FALSE, maxgap = 14
      )
    ) %>%
    ungroup() %>%
    select(
      id,
      date, new_tests_smoothed_per_thousand, new_tests_smoothed_per_thousand_14,
      positive_rate, data_source
    )
}

# Simulate OWID no longer updating: keep every FIND row but only the OWID
# rows dated on or before 2022-06-01, then run the modified pipeline on it.
check_test <- get_testing_mod(
  filter(
    testing_long,
    data_source == "FIND" | (data_source == "OWID" & date <= "2022-06-01")
  )
)

# Tests per thousand by country and date
ggplot(
  check_test,
  aes(x = date, y = id, fill = new_tests_smoothed_per_thousand)
) +
  geom_tile()

# Which source each country/date value was taken from
ggplot(check_test, aes(x = date, y = id, fill = data_source)) +
  geom_tile()

And with other sequence of functions:

# Preferred source per country when OWID rows stop at 2022-06-01
# (all FIND rows are kept).
preferred_source <-
  testing_long %>%
  filter(data_source == "FIND" | (data_source == "OWID" & date <= "2022-06-01")) %>%
  get_preferred_testpos7()
preferred_source

Or just using test numbers over the past 14 days:

# Selection based on test numbers only, on the same OWID-truncated data
testing_long %>%
  filter(data_source == "FIND" | (data_source == "OWID" & date <= "2022-06-01")) %>%
  get_preferred_tests14()

# Keep only the rows from each country's preferred source, using the
# `preferred_source` table computed for the OWID-truncated data.
left_join(
  filter(
    testing_long,
    data_source == "FIND" | (data_source == "OWID" & date <= "2022-06-01")
  ),
  select(preferred_source, id, preferred_source),
  by = "id"
) %>%
  filter(data_source == preferred_source)


CDCgov/SaviR documentation built on April 14, 2025, 7:46 a.m.