rmarkdown::render('mortality.Rmd', output_file = 'mortality.md') rmarkdown::render('mortality.Rmd', output_file = 'mortality.html')
rmarkdown::clean_site() rmarkdown::render_site(quiet = TRUE)
knitr::opts_chunk$set( collapse = TRUE, echo = FALSE, comment = "#>", fig.height = 10, fig.width = 10 ) devtools::load_all() library(pdftools) library(rvest) library(dplyr) library(tibble) library(DT) library(anytime) library(glue) library(reshape2) library(zoo) library(xml2) library(ggplot2) library(ggrepel) library(plotly) Sys.setlocale('LC_TIME', 'en_GB.UTF-8')
Updated on r format(Sys.time(), '%A, %d %B %Y (%d %Y-%m-%d %H:%m %Z)')
This analysis retrieves data directly from the Portuguese Health ministry death certificate site (SICO) and makes an yearly comparison.
# Download from DGS json <- download.evm() %>% rjson::fromJSON() # Process into multiple columns dat <- sapply(1 + seq_along(json$x$data[-1]), function(ix) { my.year <- json$x$data[[ix]] my.year[sapply(my.year, is.null)] <- NA tmp <- tibble::tibble(!!as.name(2009 + ix - 2) := unlist(my.year, recursive=FALSE)) # current day info may not be finalized if (2009 + ix - 2 == format(Sys.Date(), "%Y")) { tmp[which(is.na(tmp[,1]) %>% as.vector)[1] - 1, 1] <- NA } return(tmp) }) # Merge data mortality <- dplyr::bind_cols(day = json$x$data[[1]], dat) %>% mutate(day = factor(day, levels = day)) # Add some statistics (mean, sd, max, min) mortality.with.stats <- mortality %>% replace(is.na(.), NA) %>% dplyr::rowwise() %>% mutate(meanPre2020 = mean(c_across(cols = -c("2020", 'day')), na.rm = TRUE), mean = mean(c_across(cols = -c("meanPre2020", 'day')), na.rm = TRUE), sdPre2020 = sd(c_across(cols = -c("2020","meanPre2020", "mean", 'day')), na.rm = TRUE), sd = sd(c_across(cols = -c("meanPre2020", "mean", "sdPre2020", 'day')), na.rm = TRUE)) %>% mutate(maxPre2020 = max(c_across(cols = colnames(.)[colnames(mortality) != 2020][grepl('^[0-9]+$', colnames(.))]), na.rm = TRUE), max = max(c_across(cols = colnames(.)[grepl('^[0-9]+$', colnames(.))]), na.rm = TRUE), minPre2020 = min(c_across(cols = colnames(.)[colnames(mortality) != 2020][grepl('^[0-9]+$', colnames(.))]), na.rm = TRUE), min = min(c_across(cols = colnames(.)[grepl('^[0-9]+$', colnames(.))]), na.rm = TRUE)) %>% dplyr::ungroup()
pois.levels <- c('2021-lambda.5', '2020-lambda.5', 'pre2020-lambda', 'pre2020-lambda.5', 'pre2020-ic95.5', 'pre2020-ic99.5') pois.labels <- c('2020 (smooth regression)', # this is new # '2020', '2021 (smooth regression)', # this is new '2021', 'Confidence interval 95% 2015-19', 'Confidence interval 99% 2015-19', 'Base line for 2015-19 (non-averaged)', 'Base line for 2015-19') average.days <- 7 mortality.pois <- mortality %>% reshape2::melt(id.vars = c('day'), variable.name = 'year') %>% filter(!is.na(value)) %>% mutate(year = as.numeric(as.character(year))) %>% #filter(year >= 2015) %>% mutate(group = if_else(year >= 2020, as.character(year), 'pre2020')) %>% group_by(group, day) %>% dplyr::arrange(day, group) mortality.pois.transf <- mortality.pois %>% summarise(lambda = mean(value, na.rm = TRUE), .groups = 'keep') %>% mutate(ic95 = qpois(.95, lambda = lambda, lower.tail = TRUE), ic99 = qpois(.99, lambda = lambda, lower.tail = TRUE)) %>% dplyr::group_by(group) %>% mutate(lambda.5 = zoo::rollapply(lambda, width = average.days, FUN = function(x) {mean(x,na.rm=TRUE)}, partial = TRUE, fill = NA, align = 'center'), ic95.5 = qpois(.95, lambda = lambda.5, lower.tail = TRUE), ic99.5 = qpois(.99, lambda = lambda.5, lower.tail = TRUE)) %>% # reshape2::melt(id.vars = c('group', 'day')) %>% mutate(new.group = paste0(group, '-', variable)) %>% filter(new.group %in% pois.levels) %>% mutate(ymin = min(value), new.group = factor(new.group, levels = pois.levels)) %>% arrange(day, group) mortality.plot.poisson <- mortality.pois.transf %>% ggplot(aes(x = day, group = new.group, color = new.group)) + geom_ribbon(aes(y = value, fill= new.group, ymin = ymin - abs(ymin * .05), ymax = value), alpha = .4, position = 'identity', data = mortality.pois.transf %>% filter(new.group %in% c('pre2020-lambda.5','pre2020-ic99.5','pre2020-ic95.5')), show.legend = FALSE) + geom_line(aes(y = value, color = new.group)) + geom_line(aes(y = value, color = new.group), data = mortality.pois.transf %>% filter(group == '2021'), size = 1.5) + geom_line(aes(y = value, color = new.group), data = mortality.pois.transf %>% filter(group == '2020'), size = 1.5) + geom_smooth(aes(y = value, color = '2021 (smooth regression)'), alpha = .4, data = mortality.pois.transf %>% filter(group == '2021'), span = 1, method = 'loess', formula = y ~ x, na.rm = TRUE, show.legend = FALSE, se = FALSE) + geom_smooth(aes(y = value, color = '2020 (smooth regression)'), alpha = .4, data = mortality.pois.transf %>% filter(group == '2020'), span = 0.1, method = 'loess', formula = y ~ x, na.rm = TRUE, show.legend = FALSE, se = FALSE) + #geom_line(aes(y = lambda.5, color = new.grou~p), data = mortality.pois.transf %>% filter(group == 'pre2020')) + #geom_line(aes(y = ic95.5), color = 'purple', data = mortality.pois.transf %>% filter(group == 'pre2020')) + #geom_line(aes(y = ic99.5), color = 'orange', data = mortality.pois.transf %>% filter(group == 'pre2020')) + #geom_line(aes(y = lambda.5), data = mortality.pois.transf %>% filter(group == '2020')) + scale_x_discrete(breaks = function(ix) { ix[c(FALSE, TRUE, TRUE, TRUE, TRUE)] <- FALSE return(ix) }) + scale_color_discrete("", labels = pois.labels) + labs(title = '2020 and 2021 vs Baseline using Poisson distribution', subtitle = 'Data calculated with centered average of {average.days} days' %>% glue::glue(), caption = "Data retrieved on {format(Sys.Date(), '%B %d, %Y')}" %>% glue::glue()) + xlab('Day of year') + ylab('Number of deaths') + theme_minimal() + theme(legend.key = element_rect(fill = "white", colour = "white"), legend.position = 'top', axis.text.x = element_text(angle = 45, hjust = 1, size = 5))
Sample plot
show.year.mortality(mortality.with.stats, seq(2019, 2020))
yearly.stats <- mortality[c(rep(0, which(mortality$day == 'Mar-15') - 1), seq(from = which(mortality$day == 'Mar-15'), nrow(mortality)) > 0) & as.vector(!is.na(mortality[,'2020'])),] %>% rowwise() %>% replace(is.na(.), NA) %>% #mutate(maxPre2020 = max(c_across(colnames(.)[!grepl('(^2020)',colnames(.)) & grepl('^[0-9]+$',colnames(.))]), na.rm = TRUE), # minPre2020 = min(c_across(colnames(.)[!grepl('(^2020)',colnames(.)) & grepl('^[0-9]+$',colnames(.))]), na.rm = TRUE), # meanPre2020 = mean(c_across(colnames(.)[!grepl('(^2020)',colnames(.)) & grepl('^[0-9]+$',colnames(.))]), na.rm = TRUE), # medianPre2020 = median(c_across(colnames(.)[!grepl('(^2020)',colnames(.)) & grepl('^[0-9]+$',colnames(.))]), na.rm = TRUE)) %>% reshape2::melt(id.vars = 'day') %>% dplyr::group_by(variable) %>% dplyr::summarise(mean = mean(value, na.rm = TRUE), sd = sd(value, na.rm = TRUE), sum = sum(value, na.rm = TRUE), median = median(value, na.rm = TRUE), max = max(value, na.rm = TRUE), min = min(value, na.rm = TRUE), .groups = 'rowwise')
yearly.comp <- sapply(seq(nrow(yearly.stats)), function(ix) { yearly.stats$sum - yearly.stats$sum[ix]}) %>% data.frame() yearly.comp[upper.tri(yearly.comp)] <- NA colnames(yearly.comp) <- yearly.stats$variable yearly.comp <- bind_cols(year = yearly.stats$variable, yearly.comp)
yearly.comp.rel <- sapply(seq(nrow(yearly.stats)), function(ix) { yearly.stats$sum / yearly.stats$sum[ix]}) %>% data.frame() yearly.comp.rel[upper.tri(yearly.comp.rel)] <- NA colnames(yearly.comp.rel) <- yearly.stats$variable yearly.comp.rel <- bind_cols(year = yearly.stats$variable, yearly.comp.rel)
yearly.stats.2 <- mortality[c(rep(0, which(mortality$day == 'Mar-15') - 1), seq(from = which(mortality$day == 'Mar-15'), nrow(mortality)) > 0) & as.vector(!is.na(mortality[,'2020'])),] %>% rowwise() %>% replace(is.na(.), NA) %>% mutate(maxPre2020 = max(c_across(colnames(.)[!grepl('(^2020)',colnames(.)) & grepl('^[0-9]+$',colnames(.))]), na.rm = TRUE), minPre2020 = min(c_across(colnames(.)[!grepl('(^2020)',colnames(.)) & grepl('^[0-9]+$',colnames(.))]), na.rm = TRUE), meanPre2020 = mean(c_across(colnames(.)[!grepl('(^2020)',colnames(.)) & grepl('^[0-9]+$',colnames(.))]), na.rm = TRUE), meanPlusStDevPre2020 = mean(c_across(colnames(.)[!grepl('(^2020)',colnames(.)) & grepl('^[0-9]+$',colnames(.))]), na.rm = TRUE) + sd(c_across(colnames(.)[!grepl('(^2020)',colnames(.)) & grepl('^[0-9]+$',colnames(.))]), na.rm = TRUE), meanMinusStDevPre2020 = mean(c_across(colnames(.)[!grepl('(^2020)',colnames(.)) & grepl('^[0-9]+$',colnames(.))]), na.rm = TRUE) - sd(c_across(colnames(.)[!grepl('(^2020)',colnames(.)) & grepl('^[0-9]+$',colnames(.))]), na.rm = TRUE), medianPre2020 = median(c_across(colnames(.)[!grepl('(^2020)',colnames(.)) & grepl('^[0-9]+$',colnames(.))]), na.rm = TRUE)) %>% reshape2::melt(id.vars = 'day') %>% dplyr::group_by(variable) %>% dplyr::summarise(mean = mean(value, na.rm = TRUE), sd = sd(value, na.rm = TRUE), sum = sum(value, na.rm = TRUE), median = median(value, na.rm = TRUE), max = max(value, na.rm = TRUE), min = min(value, na.rm = TRUE), .groups = 'rowwise') %>% filter(grepl('202[0-9]$', variable)) %>% mutate(variable = factor(variable, levels = c( 'medianPre2020', 'minPre2020', 'meanMinusStDevPre2020', 'meanPre2020', 'meanPlusStDevPre2020', 'maxPre2020', '2020'))) %>% arrange(variable)
yearly.comp.rel.2 <- sapply(seq(nrow(yearly.stats.2)), function(ix) { yearly.stats.2$sum / yearly.stats.2$sum[ix]}) %>% data.frame() yearly.comp.rel.2[upper.tri(yearly.comp.rel.2)] <- NA colnames(yearly.comp.rel.2) <- yearly.stats.2$variable yearly.comp.rel.2 <- bind_cols(year = yearly.stats.2$variable, yearly.comp.rel.2)
external.raw <- download.causas.externas() json.ext <- external.raw$json %>% purrr::map(function(ix) {rjson::fromJSON(ix)}) external.causes <- lapply(seq_along(external.raw$year), function(ix.year) { # Process into multiple columns json.tmp <- json.ext[[ix.year]] dat <- sapply(1 + seq_along(json.tmp$x$data[-1]), function(ix) { my.colnames <- xml2::read_xml(json.tmp$x$container) %>% rvest::html_nodes('th') %>% gsub(' ?</?th> ?', '', .) my.year <- json.tmp$x$data[[ix]] my.year[sapply(my.year, is.null)] <- NA tmp <- tibble::tibble(!!as.name(my.colnames[ix]) := unlist(my.year, recursive=FALSE)) return(tmp) }) dat$year <- rep(external.raw$year[ix.year], length(dat[[1]])) # Merge data return(dplyr::bind_cols(day = json.tmp$x$data[[1]], dat) %>% mutate(day = factor(day, levels = json.ext[[1]]$x$data[[1]]))) }) %>% bind_rows %>% dplyr::relocate(year, .before = day) %>% mutate(year = factor(year))
external.causes.plot.covid <- external.causes %>% filter(day %in% levels(external.causes$day)[levels(external.causes$day) %in% (external.causes %>% filter(year == '2020') %>% pull(day))]) %>% filter(day %in% levels(external.causes$day)[seq(from = which(levels(external.causes$day) == 'Mar-15'), to = length(levels(external.causes$day)))]) %>% group_by(year) %>% summarise(across(where(is.numeric), ~ sum(.x , na.rm = TRUE)), .groups = 'keep') %>% reshape2::melt(id = 'year') %>% ggplot(aes(y = year, x = variable)) + geom_tile(aes(fill = value)) + geom_text(aes(label = value), color = 'white') + scale_fill_gradient(low = "orange", high = "red", na.value = '#00000020') + scale_x_discrete(position = "top") + labs(title = "External causes of death from \'March 15 to {format(Sys.Date(), '%B %d')}'" %>% glue::glue(), subtitle = 'All years show the same period', caption = "Accidents, suicides and others. It excludes natural causes or diseases\nData retrieved on {format(Sys.Date(), '%B %d, %Y')}" %>% glue::glue()) + xlab('External cause of death') + ylab('Year') + theme_minimal() + theme(legend.position = 'none') external.causes.plot.full.year <- external.causes %>% group_by(year) %>% summarise(across(where(is.numeric), ~ sum(.x , na.rm = TRUE)), .groups = 'keep') %>% reshape2::melt(id = 'year') %>% ggplot(aes(y = year, x = variable)) + geom_tile(aes(fill = value)) + geom_text(aes(label = value), color = 'white') + scale_fill_gradient(low = "orange", high = "red", na.value = '#00000020') + scale_x_discrete(position = "top") + labs(title = 'External causes of death', subtitle = 'All years show the same period', caption = "Accidents, suicides and others. It excludes natural causes or diseases\nData retrieved on {format(Sys.Date(), '%B %d, %Y')}" %>% glue::glue()) + xlab('External cause of death') + ylab('Year') + theme_minimal() + theme(legend.position = 'none')
age.group.raw <- download.age.groups() json.age.group <- age.group.raw$json %>% purrr::map(function(ix) {rjson::fromJSON(ix)}) age.group <- lapply(seq_along(age.group.raw$year), function(ix.year) { # Process into multiple columns json.tmp <- json.age.group[[ix.year]] dat <- sapply(1 + seq_along(json.tmp$x$data[-1]), function(ix) { my.colnames <- xml2::read_xml(json.tmp$x$container) %>% rvest::html_nodes('th') %>% gsub(' ?</?th> ?', '', .) my.year <- json.tmp$x$data[[ix]] my.year[sapply(my.year, is.null)] <- NA tmp <- tibble::tibble(!!as.name(my.colnames[ix]) := unlist(my.year, recursive=FALSE)) return(tmp) }) dat$year <- rep(age.group.raw$year[ix.year], length(dat[[1]])) # Merge data return(dplyr::bind_cols(day = json.tmp$x$data[[1]], dat) %>% mutate(day = factor(day, levels = json.age.group[[1]]$x$data[[1]]))) }) %>% bind_rows %>% dplyr::relocate(year, .before = day) %>% mutate(year = factor(year))
age.group.plot.covid <- age.group %>% filter(day %in% levels(age.group$day)[levels(age.group$day) %in% (age.group %>% filter(year == '2020') %>% pull(day))]) %>% filter(day %in% levels(age.group$day)[seq(from = which(levels(age.group$day) == 'Mar-15'), to = length(levels(age.group$day)))]) %>% rowwise() %>% mutate(total = sum(c_across(-c('year', 'day')))) %>% ungroup() %>% group_by(year) %>% summarise(across(where(is.numeric), ~ sum(.x , na.rm = TRUE)), .groups = 'keep') %>% reshape2::melt(id = c('year', 'total')) %>% mutate(label = factor(paste0(year, '\n(', format(total, big.mark = ' '),')'))) %>% ggplot(aes(y = label, x = variable)) + geom_tile(aes(fill = value)) + geom_text(aes(label = value), color = 'white') + scale_fill_gradient(low = "orange", high = "red", na.value = '#00000020') + scale_x_discrete(position = "top") + labs(title = "Mortality by age group from \'March 15 to {format(Sys.Date(), '%B %d')}'" %>% glue::glue(), subtitle = 'All years show the same period', caption = "Data retrieved on {format(Sys.Date(), '%B %d, %Y')}" %>% glue::glue()) + xlab('Age group') + ylab('Year') + theme_minimal() + theme(legend.position = 'none') age.group.plot.full.year <- age.group %>% rowwise() %>% mutate(total = sum(c_across(-c('year', 'day')))) %>% ungroup() %>% group_by(year) %>% summarise(across(where(is.numeric), ~ sum(.x , na.rm = TRUE)), .groups = 'keep') %>% reshape2::melt(id = c('year', 'total')) %>% mutate(label = factor(paste0(year, '\n(', format(total, big.mark = ' '),')'))) %>% ggplot(aes(y = label, x = variable)) + geom_tile(aes(fill = value)) + geom_text(aes(label = value), color = 'white') + scale_fill_gradient(low = "orange", high = "red", na.value = '#00000020') + scale_x_discrete(position = "top") + labs(title = 'Mortality by age group', subtitle = 'All years show the same period (last year may be incomplete)', caption = "Data retrieved on {format(Sys.Date(), '%B %d, %Y')}" %>% glue::glue()) + xlab('Age group') + ylab('Year') + theme_minimal() + theme(legend.position = 'none')
mortality.plot.poisson
Yearly comparison against 2020
show.year.mortality(mortality.with.stats, seq(2019, 2020))
show.year.mortality(mortality.with.stats, c(2018, 2020))
show.year.mortality(mortality.with.stats, c(2017, 2020))
show.year.mortality(mortality.with.stats, c(2016, 2020))
show.year.mortality(mortality.with.stats, c(2015, 2020))
show.year.mortality(mortality.with.stats, c(2014, 2020))
show.year.mortality(mortality.with.stats, c(2013, 2020))
show.year.mortality(mortality.with.stats, c(2012, 2020))
show.year.mortality(mortality.with.stats, c(2011, 2020))
show.year.mortality(mortality.with.stats, c(2010, 2020))
show.year.mortality(mortality.with.stats, c(2009, 2020))
r format(Sys.Date(), '%B-%d')
' yearly comparison {.tabset .tabset-fade .tabset-pills}show.mortality.comp(yearly.comp) + labs(title = "Absolute comparison between years (March-15 to {format(Sys.Date(), '%B-%d')})" %>% glue::glue())
show.mortality.comp(yearly.comp.rel, is.percentage = TRUE) + labs(title = "Relative comparison between years (March-15 to {format(Sys.Date(), '%B-%d')})" %>% glue::glue())
r format(Sys.Date(), '%B-%d')
' yearly comparisons with 2009-2019 stats {.tabset .tabset-fade .tabset-pills}yearly.comp.2 <- sapply(seq(nrow(yearly.stats.2)), function(ix) { yearly.stats.2$sum - yearly.stats.2$sum[ix]}) %>% data.frame() yearly.comp.2[upper.tri(yearly.comp.2)] <- NA colnames(yearly.comp.2) <- yearly.stats.2$variable yearly.comp.2 <- bind_cols(year = yearly.stats.2$variable, yearly.comp.2) show.mortality.comp(yearly.comp.2) + labs(title = 'Absolute comparison between pseudo-years', caption = "Max/min/mean/median are calculated day by day in period 2009-2019 (for example, Max is the worse case scenario from that period)\nData retrieved on {format(Sys.Date(), '%B %d, %Y')}" %>% glue::glue()) + theme(axis.text.x = element_text(angle = 45, hjust = 0))
show.mortality.comp(yearly.comp.rel.2, is.percentage = TRUE) + labs(title = 'Relative comparison between pseudo-years', caption = "Max/min/mean/median are calculated day by day in period 2009-2019 (for example, Max is the worse case scenario from that period)\nData retrieved on {format(Sys.Date(), '%B %d, %Y')}" %>% glue::glue()) + theme(axis.text.x = element_text(angle = 45, hjust = 0))
Accidents, suicides and others. It excludes natural causes or diseases.
external.causes.plot.covid
r format(Sys.Date(), '%Y')
is still incomplete)external.causes.plot.full.year
age.group.plot.covid
r format(Sys.Date(), '%Y')
is still incomplete)age.group.plot.full.year
age.group %>% ggplot() + #geom_line(aes(x = day, group = year, color = year)) + geom_smooth(aes(x = day, y = `Eventual suicídio`, color = year, group = year), span = 0.1, method = 'loess', formula = y ~ x, na.rm = TRUE, se = FALSE) + scale_x_discrete(breaks = function(ix) { ix[c(FALSE, TRUE, TRUE, TRUE, TRUE)] <- FALSE return(ix) }) + theme_minimal() + theme(legend.key = element_rect(fill = "white", colour = "white"), legend.position = 'top', axis.text.x = element_text(angle = 45, hjust = 1, size = 5))
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.