rmarkdown::render('mortality.Rmd', output_file = 'mortality.md')
rmarkdown::render('mortality.Rmd', output_file = 'mortality.html')
rmarkdown::clean_site()
rmarkdown::render_site(quiet = TRUE)
knitr::opts_chunk$set(
  collapse = TRUE,
  echo = FALSE,
  comment = "#>",
  fig.height = 10,
  fig.width = 10
)
devtools::load_all()
library(pdftools)
library(rvest)
library(dplyr)
library(tibble)
library(DT)
library(anytime)
library(glue)
library(reshape2)
library(zoo)
library(xml2)
library(ggplot2)
library(ggrepel)
library(plotly)

Sys.setlocale('LC_TIME', 'en_GB.UTF-8')

Updated on r format(Sys.time(), '%A, %d %B %Y (%d %Y-%m-%d %H:%m %Z)')

Description & Code {.tabset .tabset-fade .tabset-pills}

This analysis retrieves data directly from the Portuguese Health ministry death certificate site (SICO) and makes an yearly comparison.

Hide source code

Show source code

Day analysis for every year

# Download from DGS
json <- download.evm() %>% rjson::fromJSON()

# Process into multiple columns
dat <- sapply(1 + seq_along(json$x$data[-1]), function(ix) {
  my.year <- json$x$data[[ix]]
  my.year[sapply(my.year, is.null)] <- NA
  tmp <- tibble::tibble(!!as.name(2009 + ix - 2) := unlist(my.year, recursive=FALSE))
  # current day info may not be finalized
  if (2009 + ix - 2 == format(Sys.Date(), "%Y")) {
    tmp[which(is.na(tmp[,1]) %>% as.vector)[1] - 1, 1] <- NA
  }
  return(tmp)
})

# Merge data
mortality <- dplyr::bind_cols(day = json$x$data[[1]], dat) %>% 
  mutate(day = factor(day, levels = day))

# Add some statistics (mean, sd, max, min)
mortality.with.stats <- mortality %>% 
  replace(is.na(.), NA) %>%
  dplyr::rowwise() %>% 
  mutate(meanPre2020 = mean(c_across(cols = -c("2020", 'day')), na.rm = TRUE),
         mean = mean(c_across(cols = -c("meanPre2020", 'day')), na.rm = TRUE),
         sdPre2020 = sd(c_across(cols = -c("2020","meanPre2020", "mean", 'day')), na.rm = TRUE),
         sd = sd(c_across(cols = -c("meanPre2020", "mean", "sdPre2020", 'day')), na.rm = TRUE)) %>% 
  mutate(maxPre2020 = max(c_across(cols = colnames(.)[colnames(mortality) != 2020][grepl('^[0-9]+$', colnames(.))]), na.rm = TRUE),
         max = max(c_across(cols = colnames(.)[grepl('^[0-9]+$', colnames(.))]), na.rm = TRUE),
         minPre2020 = min(c_across(cols = colnames(.)[colnames(mortality) != 2020][grepl('^[0-9]+$', colnames(.))]), na.rm = TRUE),
         min = min(c_across(cols = colnames(.)[grepl('^[0-9]+$', colnames(.))]), na.rm = TRUE)) %>% 
  dplyr::ungroup()

Poisson baseline distribution vs. 2020

pois.levels <- c('2021-lambda.5',
                 '2020-lambda.5', 
                 'pre2020-lambda', 
                 'pre2020-lambda.5', 
                 'pre2020-ic95.5', 
                 'pre2020-ic99.5')
pois.labels <- c('2020 (smooth regression)',  # this is new
                 #
                 '2020',
                 '2021 (smooth regression)',  # this is new
                 '2021',
                 'Confidence interval 95% 2015-19',
                 'Confidence interval 99% 2015-19',
                 'Base line for 2015-19 (non-averaged)',
                 'Base line for 2015-19')

average.days <- 7

mortality.pois <- mortality %>% 
  reshape2::melt(id.vars = c('day'), variable.name = 'year') %>% 
  filter(!is.na(value)) %>% 
  mutate(year = as.numeric(as.character(year))) %>% 
  #filter(year >= 2015) %>% 
  mutate(group = if_else(year >= 2020, as.character(year), 'pre2020')) %>% 
  group_by(group, day) %>% 
  dplyr::arrange(day, group)

mortality.pois.transf <- mortality.pois  %>% 
  summarise(lambda = mean(value, na.rm = TRUE), .groups = 'keep') %>% 
  mutate(ic95 = qpois(.95, lambda = lambda, lower.tail = TRUE),
         ic99 = qpois(.99, lambda = lambda, lower.tail = TRUE)) %>% 
  dplyr::group_by(group) %>% 
  mutate(lambda.5 = zoo::rollapply(lambda, width = average.days, FUN = function(x) {mean(x,na.rm=TRUE)}, partial = TRUE, fill = NA, align = 'center'),
         ic95.5 = qpois(.95, lambda = lambda.5, lower.tail = TRUE),
         ic99.5 = qpois(.99, lambda = lambda.5, lower.tail = TRUE)) %>% 
  #
  reshape2::melt(id.vars = c('group', 'day')) %>% 
  mutate(new.group = paste0(group, '-', variable)) %>% 
  filter(new.group %in% pois.levels) %>% 
  mutate(ymin = min(value),
         new.group = factor(new.group, levels = pois.levels)) %>% 
  arrange(day, group)

mortality.plot.poisson <- mortality.pois.transf %>% 
  ggplot(aes(x = day, group = new.group, color = new.group)) +
  geom_ribbon(aes(y = value, fill= new.group, ymin = ymin - abs(ymin * .05), ymax = value), alpha = .4, position = 'identity', 
              data = mortality.pois.transf %>% filter(new.group %in% c('pre2020-lambda.5','pre2020-ic99.5','pre2020-ic95.5')),
              show.legend = FALSE) +
  geom_line(aes(y = value, color = new.group)) +
  geom_line(aes(y = value, color = new.group), data = mortality.pois.transf %>% filter(group == '2021'), size = 1.5) +
  geom_line(aes(y = value, color = new.group), data = mortality.pois.transf %>% filter(group == '2020'), size = 1.5) +
  geom_smooth(aes(y = value, color = '2021 (smooth regression)'), alpha = .4, data = mortality.pois.transf %>% filter(group == '2021'), span = 1, method = 'loess', formula = y ~ x, na.rm = TRUE, show.legend = FALSE, se = FALSE) +
  geom_smooth(aes(y = value, color = '2020 (smooth regression)'), alpha = .4, data = mortality.pois.transf %>% filter(group == '2020'), span = 0.1, method = 'loess', formula = y ~ x, na.rm = TRUE, show.legend = FALSE, se = FALSE) +
  #geom_line(aes(y = lambda.5, color = new.grou~p), data = mortality.pois.transf %>% filter(group == 'pre2020')) +
  #geom_line(aes(y = ic95.5), color = 'purple', data = mortality.pois.transf %>% filter(group == 'pre2020')) +
  #geom_line(aes(y = ic99.5), color = 'orange', data = mortality.pois.transf %>% filter(group == 'pre2020')) +
  #geom_line(aes(y = lambda.5), data = mortality.pois.transf %>% filter(group == '2020')) +
  scale_x_discrete(breaks = function(ix) {
      ix[c(FALSE, TRUE, TRUE, TRUE, TRUE)] <- FALSE
      return(ix)
    }) +
  scale_color_discrete("", labels = pois.labels) +
  labs(title = '2020 and 2021 vs Baseline using Poisson distribution', 
       subtitle = 'Data calculated with centered average of {average.days} days' %>% glue::glue(),
       caption = "Data retrieved on {format(Sys.Date(), '%B %d, %Y')}" %>% glue::glue()) +
  xlab('Day of year') +
  ylab('Number of deaths') +
  theme_minimal() +
  theme(legend.key = element_rect(fill = "white", colour = "white"), legend.position = 'top', axis.text.x = element_text(angle = 45, hjust = 1, size = 5))

Sample plot

show.year.mortality(mortality.with.stats, seq(2019, 2020))

Yearly stats from 2009-2020

yearly.stats <- mortality[c(rep(0, which(mortality$day == 'Mar-15') - 1), seq(from = which(mortality$day == 'Mar-15'), nrow(mortality)) > 0) & as.vector(!is.na(mortality[,'2020'])),] %>% 
  rowwise() %>% 
  replace(is.na(.), NA) %>%
  #mutate(maxPre2020    = max(c_across(colnames(.)[!grepl('(^2020)',colnames(.)) & grepl('^[0-9]+$',colnames(.))]), na.rm = TRUE),
  #       minPre2020    = min(c_across(colnames(.)[!grepl('(^2020)',colnames(.)) & grepl('^[0-9]+$',colnames(.))]), na.rm = TRUE),
  #       meanPre2020   = mean(c_across(colnames(.)[!grepl('(^2020)',colnames(.)) & grepl('^[0-9]+$',colnames(.))]), na.rm = TRUE),
  #       medianPre2020 = median(c_across(colnames(.)[!grepl('(^2020)',colnames(.)) & grepl('^[0-9]+$',colnames(.))]), na.rm = TRUE)) %>% 
  reshape2::melt(id.vars = 'day') %>% 
  dplyr::group_by(variable) %>% 
  dplyr::summarise(mean = mean(value, na.rm = TRUE), 
            sd = sd(value, na.rm = TRUE),
            sum = sum(value, na.rm = TRUE),
            median = median(value, na.rm = TRUE),
            max = max(value, na.rm = TRUE),
            min = min(value, na.rm = TRUE),
            .groups = 'rowwise')
yearly.comp <- sapply(seq(nrow(yearly.stats)), function(ix) { yearly.stats$sum - yearly.stats$sum[ix]}) %>%
  data.frame()
yearly.comp[upper.tri(yearly.comp)] <- NA
colnames(yearly.comp) <- yearly.stats$variable
yearly.comp <- bind_cols(year = yearly.stats$variable, yearly.comp)
yearly.comp.rel <- sapply(seq(nrow(yearly.stats)), function(ix) { yearly.stats$sum / yearly.stats$sum[ix]}) %>%
  data.frame()
yearly.comp.rel[upper.tri(yearly.comp.rel)] <- NA
colnames(yearly.comp.rel) <- yearly.stats$variable
yearly.comp.rel <- bind_cols(year = yearly.stats$variable, yearly.comp.rel)

Yearly stats calculated (max, min, mean, median and mean +/- StDev)

yearly.stats.2 <- mortality[c(rep(0, which(mortality$day == 'Mar-15') - 1), seq(from = which(mortality$day == 'Mar-15'), nrow(mortality)) > 0) & as.vector(!is.na(mortality[,'2020'])),] %>% 
  rowwise() %>% 
  replace(is.na(.), NA) %>%
  mutate(maxPre2020    = max(c_across(colnames(.)[!grepl('(^2020)',colnames(.)) & grepl('^[0-9]+$',colnames(.))]), na.rm = TRUE),
         minPre2020    = min(c_across(colnames(.)[!grepl('(^2020)',colnames(.)) & grepl('^[0-9]+$',colnames(.))]), na.rm = TRUE),
         meanPre2020   = mean(c_across(colnames(.)[!grepl('(^2020)',colnames(.)) & grepl('^[0-9]+$',colnames(.))]), na.rm = TRUE),
         meanPlusStDevPre2020   = mean(c_across(colnames(.)[!grepl('(^2020)',colnames(.)) & grepl('^[0-9]+$',colnames(.))]), na.rm = TRUE) + sd(c_across(colnames(.)[!grepl('(^2020)',colnames(.)) & grepl('^[0-9]+$',colnames(.))]), na.rm = TRUE),
         meanMinusStDevPre2020   = mean(c_across(colnames(.)[!grepl('(^2020)',colnames(.)) & grepl('^[0-9]+$',colnames(.))]), na.rm = TRUE) - sd(c_across(colnames(.)[!grepl('(^2020)',colnames(.)) & grepl('^[0-9]+$',colnames(.))]), na.rm = TRUE),
         medianPre2020 = median(c_across(colnames(.)[!grepl('(^2020)',colnames(.)) & grepl('^[0-9]+$',colnames(.))]), na.rm = TRUE)) %>% 
  reshape2::melt(id.vars = 'day') %>% 
  dplyr::group_by(variable) %>% 
  dplyr::summarise(mean = mean(value, na.rm = TRUE), 
            sd = sd(value, na.rm = TRUE),
            sum = sum(value, na.rm = TRUE),
            median = median(value, na.rm = TRUE),
            max = max(value, na.rm = TRUE),
            min = min(value, na.rm = TRUE),
            .groups = 'rowwise') %>% 
  filter(grepl('202[0-9]$', variable)) %>% 
  mutate(variable = factor(variable, levels = c( 'medianPre2020', 'minPre2020', 'meanMinusStDevPre2020', 'meanPre2020', 'meanPlusStDevPre2020', 'maxPre2020', '2020'))) %>% 
  arrange(variable)
yearly.comp.rel.2 <- sapply(seq(nrow(yearly.stats.2)), function(ix) { yearly.stats.2$sum / yearly.stats.2$sum[ix]}) %>%
  data.frame()
yearly.comp.rel.2[upper.tri(yearly.comp.rel.2)] <- NA
colnames(yearly.comp.rel.2) <- yearly.stats.2$variable
yearly.comp.rel.2 <- bind_cols(year = yearly.stats.2$variable, yearly.comp.rel.2)

External causes data

external.raw <- download.causas.externas()

json.ext <- external.raw$json %>% 
  purrr::map(function(ix) {rjson::fromJSON(ix)})

external.causes <- lapply(seq_along(external.raw$year), function(ix.year) {
  # Process into multiple columns
  json.tmp <- json.ext[[ix.year]]
  dat <- sapply(1 + seq_along(json.tmp$x$data[-1]), function(ix) {
    my.colnames <- xml2::read_xml(json.tmp$x$container) %>% 
      rvest::html_nodes('th') %>%
      gsub(' ?</?th> ?', '', .)

    my.year <- json.tmp$x$data[[ix]]
    my.year[sapply(my.year, is.null)] <- NA
    tmp <- tibble::tibble(!!as.name(my.colnames[ix]) := unlist(my.year, recursive=FALSE))
    return(tmp)
  })
  dat$year <- rep(external.raw$year[ix.year], length(dat[[1]]))

  # Merge data
  return(dplyr::bind_cols(day = json.tmp$x$data[[1]], dat) %>% 
    mutate(day = factor(day, levels = json.ext[[1]]$x$data[[1]])))
}) %>% 
  bind_rows %>% 
  dplyr::relocate(year, .before = day) %>% 
  mutate(year = factor(year))

External causes plots

external.causes.plot.covid <- external.causes %>% 
  filter(day %in% levels(external.causes$day)[levels(external.causes$day) %in% (external.causes %>% filter(year == '2020') %>% pull(day))]) %>% 
  filter(day %in% levels(external.causes$day)[seq(from = which(levels(external.causes$day) == 'Mar-15'), to = length(levels(external.causes$day)))]) %>% 
  group_by(year) %>% 
  summarise(across(where(is.numeric), ~ sum(.x , na.rm = TRUE)), .groups = 'keep') %>% 
  reshape2::melt(id = 'year') %>% 
  ggplot(aes(y = year, x = variable)) +
  geom_tile(aes(fill = value)) +
  geom_text(aes(label = value), color = 'white') +
  scale_fill_gradient(low = "orange", high = "red", na.value = '#00000020') +
  scale_x_discrete(position = "top") +
  labs(title = "External causes of death from \'March 15 to {format(Sys.Date(), '%B %d')}'" %>% glue::glue(), subtitle = 'All years show the same period', caption = "Accidents, suicides and others. It excludes natural causes or diseases\nData retrieved on {format(Sys.Date(), '%B %d, %Y')}" %>% glue::glue()) + 
  xlab('External cause of death') +
  ylab('Year') +
  theme_minimal() +
  theme(legend.position = 'none')

external.causes.plot.full.year <- external.causes %>% 
  group_by(year) %>% 
  summarise(across(where(is.numeric), ~ sum(.x , na.rm = TRUE)), .groups = 'keep') %>% 
  reshape2::melt(id = 'year') %>% 
  ggplot(aes(y = year, x = variable)) +
  geom_tile(aes(fill = value)) +
  geom_text(aes(label = value), color = 'white') +
  scale_fill_gradient(low = "orange", high = "red", na.value = '#00000020') +
  scale_x_discrete(position = "top") +
  labs(title = 'External causes of death', subtitle = 'All years show the same period', caption = "Accidents, suicides and others. It excludes natural causes or diseases\nData retrieved on {format(Sys.Date(), '%B %d, %Y')}" %>% glue::glue()) + 
  xlab('External cause of death') +
  ylab('Year') +
  theme_minimal() +
  theme(legend.position = 'none')

Age group data

age.group.raw <- download.age.groups()

json.age.group <- age.group.raw$json %>% 
  purrr::map(function(ix) {rjson::fromJSON(ix)})

age.group <- lapply(seq_along(age.group.raw$year), function(ix.year) {
  # Process into multiple columns
  json.tmp <- json.age.group[[ix.year]]
  dat <- sapply(1 + seq_along(json.tmp$x$data[-1]), function(ix) {
    my.colnames <- xml2::read_xml(json.tmp$x$container) %>% 
      rvest::html_nodes('th') %>%
      gsub(' ?</?th> ?', '', .)

    my.year <- json.tmp$x$data[[ix]]
    my.year[sapply(my.year, is.null)] <- NA
    tmp <- tibble::tibble(!!as.name(my.colnames[ix]) := unlist(my.year, recursive=FALSE))
    return(tmp)
  })
  dat$year <- rep(age.group.raw$year[ix.year], length(dat[[1]]))

  # Merge data
  return(dplyr::bind_cols(day = json.tmp$x$data[[1]], dat) %>% 
    mutate(day = factor(day, levels = json.age.group[[1]]$x$data[[1]])))
}) %>% 
  bind_rows %>% 
  dplyr::relocate(year, .before = day) %>% 
  mutate(year = factor(year))

Plots for age group

age.group.plot.covid <- age.group %>% 
  filter(day %in% levels(age.group$day)[levels(age.group$day) %in% (age.group %>% filter(year == '2020') %>% pull(day))]) %>% 
  filter(day %in% levels(age.group$day)[seq(from = which(levels(age.group$day) == 'Mar-15'), to = length(levels(age.group$day)))]) %>% 
  rowwise() %>% 
  mutate(total = sum(c_across(-c('year', 'day')))) %>% 
  ungroup() %>% 
  group_by(year) %>% 
  summarise(across(where(is.numeric), ~ sum(.x , na.rm = TRUE)), .groups = 'keep') %>% 
  reshape2::melt(id = c('year', 'total')) %>% 
  mutate(label = factor(paste0(year, '\n(', format(total, big.mark = ' '),')'))) %>% 
  ggplot(aes(y = label, x = variable)) +
  geom_tile(aes(fill = value)) +
  geom_text(aes(label = value), color = 'white') +
  scale_fill_gradient(low = "orange", high = "red", na.value = '#00000020') +
  scale_x_discrete(position = "top") +
  labs(title = "Mortality by age group from \'March 15 to {format(Sys.Date(), '%B %d')}'" %>% glue::glue(), subtitle = 'All years show the same period', caption = "Data retrieved on {format(Sys.Date(), '%B %d, %Y')}" %>% glue::glue()) + 
  xlab('Age group') +
  ylab('Year') +
  theme_minimal() +
  theme(legend.position = 'none')

age.group.plot.full.year <- age.group %>% 
  rowwise() %>% 
  mutate(total = sum(c_across(-c('year', 'day')))) %>% 
  ungroup() %>% 
  group_by(year) %>% 
  summarise(across(where(is.numeric), ~ sum(.x , na.rm = TRUE)), .groups = 'keep') %>% 
  reshape2::melt(id = c('year', 'total')) %>% 
  mutate(label = factor(paste0(year, '\n(', format(total, big.mark = ' '),')'))) %>% 
  ggplot(aes(y = label, x = variable)) +
  geom_tile(aes(fill = value)) +
  geom_text(aes(label = value), color = 'white') +
  scale_fill_gradient(low = "orange", high = "red", na.value = '#00000020') +
  scale_x_discrete(position = "top") +
  labs(title = 'Mortality by age group', subtitle = 'All years show the same period (last year may be incomplete)', caption = "Data retrieved on {format(Sys.Date(), '%B %d, %Y')}" %>% glue::glue()) +  
  xlab('Age group') +
  ylab('Year') +
  theme_minimal() +
  theme(legend.position = 'none')

2020 vs. Baseline

mortality.plot.poisson

Year vs. 2020 {.tabset .tabset-fade .tabset-pills}

Yearly comparison against 2020

2019 vs. 2020

show.year.mortality(mortality.with.stats, seq(2019, 2020))

2018 vs. 2020

show.year.mortality(mortality.with.stats, c(2018, 2020))

2017 vs. 2020

show.year.mortality(mortality.with.stats, c(2017, 2020))

2016 vs. 2020

show.year.mortality(mortality.with.stats, c(2016, 2020))

2015 vs. 2020

show.year.mortality(mortality.with.stats, c(2015, 2020))

2014 vs. 2020

show.year.mortality(mortality.with.stats, c(2014, 2020))

2013 vs. 2020

show.year.mortality(mortality.with.stats, c(2013, 2020))

2012 vs. 2020

show.year.mortality(mortality.with.stats, c(2012, 2020))

2011 vs. 2020

show.year.mortality(mortality.with.stats, c(2011, 2020))

2010 vs. 2020

show.year.mortality(mortality.with.stats, c(2010, 2020))

2009 vs. 2020

show.year.mortality(mortality.with.stats, c(2009, 2020))

'March-15 -> r format(Sys.Date(), '%B-%d')' yearly comparison {.tabset .tabset-fade .tabset-pills}

Total numbers

show.mortality.comp(yearly.comp) +
  labs(title = "Absolute comparison between years (March-15 to {format(Sys.Date(), '%B-%d')})" %>% glue::glue())

Relative numbers (in percentage)

show.mortality.comp(yearly.comp.rel, is.percentage = TRUE) +
  labs(title = "Relative comparison between years (March-15 to {format(Sys.Date(), '%B-%d')})" %>% glue::glue())

'March-15 -> r format(Sys.Date(), '%B-%d')' yearly comparisons with 2009-2019 stats {.tabset .tabset-fade .tabset-pills}

Total numbers

yearly.comp.2 <- sapply(seq(nrow(yearly.stats.2)), function(ix) { yearly.stats.2$sum - yearly.stats.2$sum[ix]}) %>%
  data.frame()
yearly.comp.2[upper.tri(yearly.comp.2)] <- NA
colnames(yearly.comp.2) <- yearly.stats.2$variable
yearly.comp.2 <- bind_cols(year = yearly.stats.2$variable, yearly.comp.2)


show.mortality.comp(yearly.comp.2) +
  labs(title = 'Absolute comparison between pseudo-years', caption = "Max/min/mean/median are calculated day by day in period 2009-2019 (for example, Max is the worse case scenario from that period)\nData retrieved on {format(Sys.Date(), '%B %d, %Y')}" %>% glue::glue()) + 
  theme(axis.text.x = element_text(angle = 45, hjust = 0))

Relative numbers (in percentage)

show.mortality.comp(yearly.comp.rel.2, is.percentage = TRUE) +
  labs(title = 'Relative comparison between pseudo-years', caption = "Max/min/mean/median are calculated day by day in period 2009-2019 (for example, Max is the worse case scenario from that period)\nData retrieved on {format(Sys.Date(), '%B %d, %Y')}" %>% glue::glue()) + 
  theme(axis.text.x = element_text(angle = 45, hjust = 0))

External causes of death {.tabset .tabset-fade .tabset-pills}

Accidents, suicides and others. It excludes natural causes or diseases.

Period of COVID-19

external.causes.plot.covid

Full year (r format(Sys.Date(), '%Y') is still incomplete)

external.causes.plot.full.year

Mortality by age groups {.tabset .tabset-fade .tabset-pills}

Period of COVID-19

age.group.plot.covid

Full year (r format(Sys.Date(), '%Y') is still incomplete)

age.group.plot.full.year
age.group %>% 
  ggplot() +
  #geom_line(aes(x = day, group = year, color = year)) +
  geom_smooth(aes(x = day, y = `Eventual suicídio`, color = year, group = year), span = 0.1, method = 'loess', formula = y ~ x, na.rm = TRUE, se = FALSE) +
  scale_x_discrete(breaks = function(ix) {
      ix[c(FALSE, TRUE, TRUE, TRUE, TRUE)] <- FALSE
      return(ix)
    }) +
  theme_minimal() +
  theme(legend.key = element_rect(fill = "white", colour = "white"), legend.position = 'top', axis.text.x = element_text(angle = 45, hjust = 1, size = 5))



averissimo/r-analysis-covid19 documentation built on April 24, 2021, 11:01 a.m.