rmarkdown::render('_README.Rmd', output_file = 'README.md')
rmarkdown::clean_site()
rmarkdown::render_site(quiet = TRUE)
knitr::opts_chunk$set(
  collapse = TRUE,
  comment = "#>"
)
devtools::load_all()
library(pdftools)
library(rvest)
library(dplyr)
library(tibble)
library(DT)
library(anytime)
library(glue)
library(reshape2)
library(zoo)
library(xml2)
library(ggplot2)
library(ggrepel)
library(plotly)

Sys.setlocale('LC_TIME', 'en_GB.UTF-8')
dat <- download.updated.pt()

dgs.pt.new     <- dat$dgs.pt 

dgs.pt <- tibble()
tryCatch(dgs.pt <- covid19.pt.data::dgs.pt, error = function(err) { })

# DGS PT
if (digest::digest(dgs.pt.new, algo = "sha256") != digest::digest(dgs.pt, algo = "sha256")) {
  send.discord.msg(dgs.pt.new, dgs.pt)
  dgs.pt <- dgs.pt.new 
  usethis::use_data(dgs.pt, overwrite = TRUE)
  readr::write_csv(dgs.pt, file = '../data/dgs_pt.csv')
}

R package with latest data scrapped from official sources (last data from r format(max(dgs.pt.new$date), "%A, %B %d, %Y"))

It downloads the daily report from DGS and stores this in data-friendly format under /data directory. It also retrieves age data from the DGS' ESRI dashboard

If you are here just for the data, this is what you want:

note: The EU CDC data has been removed as the daily reports from EU have been discontinued in favor of weekly counts.

A mortality analysis of Portugal is available here

Check for new reports

download.updated.pt()
dgs.pt <- dgs.pt.new 
age.data <- get.age.data(dgs.pt)

Data for Portugal

Last 5 days (how each column is changing over these days)

dgs.pt %>% arrange(desc(date)) %>% top_n(6, date) %>%
  dplyr::relocate(second_vaccine, .after = in.icu) %>% 
  dplyr::relocate(first_vaccine, .after = in.icu) %>% 
  dplyr::mutate_if(is.integer, function(my.input) {return(zoo::rollapply(data = my.input, width = 2, FUN = function(a) {return(a[1] - mean(a[-1]))}, align = 'left', partial = TRUE))}) %>% 
  dplyr::filter(!is.nan(confirmed)) %>% 
  dplyr::select(-tests, -country) %>% 
  dplyr::rename_with(~gsub("_", " ", .x, fixed = TRUE)) %>% 
  knitr::kable()
dgs.pt %>% 
  reshape2::melt(
    id.vars = c('country', 'date'), 
    variable.name = 'type', 
    measure.vars = c(
      'deaths', 'confirmed', 'hospitalized', 'in.icu', 'recovered', 'first_vaccine', 'second_vaccine'
    )
  ) %>% 
  mutate(type = 
           gsub('confirmed', 'Confirmed cases', type) %>% 
           gsub('first_vaccine', 'First vaccine', .) %>% 
           gsub('second_vaccine', 'Second vaccine', .) %>% 
           gsub('deaths', 'Deaths', .) %>% 
           gsub('hospitalized', 'Hospitalized', .) %>% 
           gsub('recovered', 'Recovered', .) %>% 
           gsub('tests', 'Suspected individuals', .) %>% 
           gsub('in.icu', 'In ICU', .)) %>% 
  group_by(country, type) %>% 
  arrange(date) %>% 
  mutate(
    label = if_else(
      date == last(date), 
      paste0(type, ': ', format(value, digits = 2, big.mark =',', trim = FALSE)), 
      NA_character_
    )
  ) %>% 
  ggplot(aes(x = date, y = value, color = type, label = label, fill = type)) +
    geom_line(size = 1, alpha = .6) + 
    geom_point(size = 1) +
    ggrepel::geom_label_repel(color = 'white', 
                              segment.color = 'red', 
                              na.rm = TRUE, 
                              min.segment.length = 0, 
                              #nudge_x = -2,
                              #nudge_y = -.5,
                              segment.alpha = .8,
                              max.time = 5,
                              alpha = .85,
                              max.iter = 100000,
                              force = 40,
                              force_pull = 0,
                              max.overlaps = 10,
                              size = 3,
                              seed = 1985) +
    scale_x_date(date_labels = '%B %d (%a)', date_breaks = "10 day", date_minor_breaks = "5 day") +
    labs(x = 'Date', 
         y = 'Number of individuals',
         title = 'Individuals that were Infected, Recovered and Died',
         subtitle = 'Showing in Logarithmic scale (base 10)',
         caption = "{format(max(dgs.pt.new$date), '%A, %B %d, %Y')}\n Label points to the maximum point in time" %>% glue::glue()) +
    scale_y_continuous(trans = 'log10', labels = function(ix) { format(ix, big.mark = ',', scientific = FALSE) }) +
    scale_color_viridis_d(end = .7) +
    scale_fill_viridis_d(end = .7) +
    theme_minimal() +
    theme(legend.position = 'none', axis.text.x = element_text(angle = 45, hjust = 1))
plot.data <- dgs.pt %>% 
  reshape2::melt(
    id.vars = c('country', 'date'), variable.name = 'type', measure.vars = c('deaths', 'confirmed', 'hospitalized', 'in.icu', 'recovered', 'first_vaccine', 'second_vaccine')
  ) %>% 
  mutate(type = 
           gsub('confirmed', 'Confirmed cases', type) %>% 
           gsub('first_vaccine', 'First vaccine', .) %>% 
           gsub('second_vaccine', 'Second vaccine', .) %>% 
           gsub('deaths', 'Deaths', .) %>% 
           gsub('hospitalized', 'Hospitalized', .) %>% 
           gsub('recovered', 'Recovered', .) %>% 
           gsub('tests', 'Suspected individuals', .) %>% 
           gsub('in.icu', 'In ICU', .)) %>% 
  group_by(country, type) %>% 
  arrange(desc(date)) %>% 
  mutate(value = zoo::rollapply(value, 
                                2, 
                                function(ix) { if(length(ix) <= 1) { return(ix) } else { ix[1] - sum(ix[-1]) } }, fill = c(0, 0, 0), align = 'left', partial = TRUE)) %>%
  group_by(country, type) %>% 
  arrange(date) %>% 
  filter(date > min(date)) %>%  ## values will not be for that day
  build.labels('value', 'type', show_plus = TRUE) 

roundUp <- function(x) 10^ceiling(log10(x))
my.breaks <- c(-100, -40, -20, -10, -5, -2, 0, 2, 5, 10, 30, 100, 300, 1000, 3000, 10000, 20000)

my.breaks <- c(my.breaks, max(plot.data$value, na.rm = TRUE) %>% roundUp(), -1 * (abs(min(plot.data$value, na.rm = TRUE)) %>% roundUp())) %>% 
  unique %>% 
  sort

plot.data %>% 
  ggplot(aes(x = date, y = value, color = type, label = label, fill = type)) +
    geom_hline(aes(yintercept = 0), size = .7, alpha = .3, linetype = 3) +
    geom_line(size = 1, alpha = .6) + 
    geom_point(size = 1) +
    ggrepel::geom_label_repel(color = 'white', 
                              segment.color = 'red', 
                              na.rm = TRUE, 
                              min.segment.length = 0, 
                              #nudge_x = -2,
                              #nudge_y = -.5,
                              segment.alpha = .8,
                              max.time = 5,
                              alpha = .85,
                              max.iter = 100000,
                              force = 40,
                              force_pull = 0,
                              size = 3,
                              seed = 1985) +
    labs(x = 'Date', 
         y = 'Number of individuals',
         title = 'Individuals that were Infected, Recovered and Died',
         subtitle = 'Showing in a (pseudo) logarithmic scale',
         caption = "data from {format(max(dgs.pt.new$date), '%A, %B %d, %Y')}\n Label points to the maximum point in time" %>% glue::glue()) +
    scale_x_date(date_labels = '%B %d (%a)', date_breaks = "10 day", date_minor_breaks = "5 day") +
    scale_color_viridis_d(end = .7) +
    scale_fill_viridis_d(end = .7) +
    scale_y_continuous(trans = scales::pseudo_log_trans(sigma = 1, base = 10), 
                       labels = function(ix) { format(ix, big.mark = ',', scientific = FALSE) },
                       breaks =  my.breaks) +
    theme_minimal() +
    theme(legend.position = 'none', axis.text.x = element_text(angle = 45, hjust = 1, size = 5))
plot.data <- dgs.pt %>% 
  reshape2::melt(
    id.vars = c('country', 'date'), variable.name = 'type', measure.vars = c('deaths', 'confirmed', 'hospitalized', 'in.icu', 'recovered', 'first_vaccine', 'second_vaccine')
  ) %>% 
  mutate(type = 
           gsub('confirmed', 'Confirmed cases', type) %>% 
           gsub('first_vaccine', 'First vaccine', .) %>% 
           gsub('second_vaccine', 'Second vaccine', .) %>% 
           gsub('deaths', 'Deaths', .) %>% 
           gsub('hospitalized', 'Hospitalized', .) %>% 
           gsub('recovered', 'Recovered', .) %>% 
           gsub('tests', 'Suspected individuals', .) %>% 
           gsub('in.icu', 'In ICU', .)) %>% 
  group_by(country, type) %>% 
  filter(!is.na(value)) %>% 
  arrange(desc(date)) %>% 
  mutate(value = zoo::rollapply(value, 
                                2, 
                                function(ix) { if(length(ix) <= 1) { return(ix) } else { ix[1] - sum(ix[-1]) } }, fill = c(0, 0, 0), align = 'left', partial = TRUE)) %>%
  # Remove first day of each type as the difference for the previous day is not real
  group_by(country, type) %>% 
  arrange(date) %>% 
  filter(row_number() > 1L) %>% 
  arrange(desc(date)) %>% 
  # Calculate rolling average
  mutate(value = zoo::rollapply(value, 7, mean, align = 'left', fill = c(0,0,0), partial = TRUE)) %>%
  group_by(country, type) %>% 
  arrange(date) %>% 
  filter(date > min(date)) %>%  ## values will not be for that day
  build.labels('value', 'type', digits = 0, show_plus = TRUE) 

roundUp <- function(x) 10^ceiling(log10(x))
my.breaks <- c(-100, -40, -20, -10, -5, -2, 0, 2, 5, 10, 30, 100, 300, 1000, 3000, 10000, 20000)

my.breaks <- c(my.breaks, max(plot.data$value, na.rm = TRUE) %>% roundUp(), -1 * (abs(min(plot.data$value, na.rm = TRUE)) %>% roundUp())) %>% 
  unique %>% 
  sort

plot.data %>% 
  ggplot(aes(x = date, y = value, color = type, label = label, fill = type)) +
    geom_hline(aes(yintercept = 0), size = .7, alpha = .3, linetype = 3) +
    geom_line(size = 1, alpha = .6) + 
    geom_point(size = 1) +
    ggrepel::geom_label_repel(color = 'white',
                              segment.color = 'red', 
                              na.rm = TRUE, 
                              min.segment.length = 0, 
                              #nudge_x = -2,
                              #nudge_y = -.5,
                              segment.alpha = .8,
                              size = 3,
                              alpha = .85,
                              max.time = 5,
                              max.iter = 1000000,
                              max.overlaps = 10,
                              force = 40,
                              force_pull = 0,
                              seed = 1985) +
    labs(x = 'Date', 
         y = 'Number of individuals',
         title = '7-day average of Individuals that were Infected, Recovered and Died',
         subtitle = 'Showing in a (pseudo) logarithmic scale',
         caption = "{format(max(dgs.pt.new$date), '%A, %B %d, %Y')}\n Label points to the maximum point in time" %>% glue::glue()) +
    scale_x_date(date_labels = '%B %d (%a)', date_breaks = "10 day", date_minor_breaks = "5 day") +
    scale_color_viridis_d(end = .7) +
    scale_fill_viridis_d(end = .7) +
    scale_y_continuous(trans = scales::pseudo_log_trans(sigma = 1, base = 10), 
                       labels = function(ix) { format(ix, big.mark = ',', scientific = FALSE) },
                       breaks =  my.breaks) +
    theme_minimal() +
    theme(legend.position = 'none', axis.text.x = element_text(angle = 45, hjust = 1, size = 5))

New cases / deaths by age groups

age.ix.f <- dgs.pt %>% filter(!is.na(`confirmed_m_00-09`)) %>% pull(date) %>% max
age.data.all <- get.age.data.with.labels(age.data, age.ix.f)
age.data.all.new <- get.age.new.data(age.data, age.data.all, age.ix.f)
if(age.ix.f != dgs.pt %>% pull(date) %>% max) {
  cat('*note: data update may be delayed by a couple of hours (since age data was been removed from daily report and we\'ve been retrieving it from DGS\'s covid19 dashboard, which may have a delay)*')
}

Age data may show different numbers from absolutes in the daily report. This is out of our control and dependent on the data source (DGS covid19 dashboard).

if(!is.null(age.data.all.new)) {
 my.plots.new <- get.plot.for.new(age.data.all.new, age.ix.f)
 print(my.plots.new$confirmed + theme(plot.title = element_text(size=10), plot.subtitle = element_text(size=8)))
 print(my.plots.new$deaths + theme(plot.title = element_text(size=10), plot.subtitle = element_text(size=8)))
} else {
 cat('Missing age grouping information for this day:', format(age.ix.f)) 
}

All cases / deaths by age groups

Age data may show different numbers from absolutes in the daily report. This is out of our control and dependent on the data source (DGS covid19 dashboard).

if(!is.null(age.data.all.new)) {
 my.plots.all <- get.plot.for.all(age.data.all, age.ix.f, dgs.pt = dgs.pt)
 print(my.plots.all$confirmed + theme(plot.title = element_text(size=9), plot.subtitle = element_text(size=8)))
 print(my.plots.all$deaths + theme(plot.title = element_text(size=9), plot.subtitle = element_text(size=8)))
} else {
 cat('Missing age grouping information for this day:', format(age.ix.f)) 
}

Data {.tabset .tabset-fade .tabset-pills}

Data from DGS

Only showing last 20 days

dgs.pt %>% 
  arrange(desc(date)) %>% 
  top_n(20, date) %>% 
  select(-country, -tests) %>% 
  dplyr::rename_with(~gsub("_", " ", .x, fixed = TRUE)) %>% 
  knitr::kable()

Future deaths prediction

Based on the daily new confirmed cases and current 'mortality rate' per age group (see plots above)

Only shows last 20 days

age.data.all %>% 
  dplyr::mutate(rate = if_else(death == 0 | is.nan(death) | is.nan(confirmed), 0, abs(death) / abs(confirmed))) %>% 
  dplyr::select(type, gender, rate) %>% 
  dplyr::right_join(age.data, by = c('type', 'gender')) %>% 
  dplyr::filter(age_type == 'confirmed') %>% 
  dplyr::select(-age_type) %>% 
  dplyr::group_by(country, type, gender) %>%
  dplyr::arrange(desc(date)) %>%
  dplyr::mutate(value = zoo::rollapply(value, 2, function(ix) { if(length(ix) <= 1) { return(ix) } else { ix[1] - sum(ix[-1]) } }, fill = c(0, 0, 0), align = 'left', partial = TRUE)) %>%
  dplyr::filter(value != 0) %>%
  dplyr::ungroup() %>% 
  dplyr::mutate(value = abs(value),
         predicted = value * rate) %>% 
  dplyr::select(date, type, gender, predicted) %>% 
#  top_n(10, predicted) %>% 
  reshape2::dcast(date ~ type + gender, value.var = 'predicted') %>% 
  tibble::tibble() %>% 
  dplyr::filter(date != '2020-11-20') %>% # weird number
  dplyr::filter(date != '2020-11-22') %>% # weird number
  dplyr::mutate(across(!starts_with('date'), ~tidyr::replace_na(.x, 0))) %>% 
  dplyr::mutate(across(!starts_with('date'), ~round(.x, 3))) %>% 
  dplyr::rowwise() %>% 
  dplyr::mutate(predicted_future_deaths = sum(across(!starts_with('date')))) %>% 
  dplyr::relocate(predicted_future_deaths, .after = date) %>% 
  dplyr::left_join(dgs.pt %>% 
    arrange(desc(date)) %>% 
    dplyr::mutate_if(is.integer, function(my.input) {return(zoo::rollapply(data = my.input, width = 2, FUN = function(a) {return(a[1] - mean(a[-1]))}, align = 'left', partial = TRUE))}) %>% 
    dplyr::filter(!is.nan(confirmed)) %>% 
    select(date, confirmed), by = "date") %>% 
  dplyr::mutate(confirmed = format(confirmed, big.mark = ' ')) %>% 
  dplyr::relocate(confirmed, .after = date) %>% 
  dplyr::ungroup() %>% 
  dplyr::top_n(20, date) %>% 
  dplyr::arrange(desc(date)) %>% 
  dplyr::rename_with(~gsub("_", " ", .x, fixed = TRUE)) %>% 
  knitr::kable()


averissimo/covid19.pt.data documentation built on March 29, 2022, 3:24 p.m.