library(tufte) # invalidate cache when the tufte version changes knitr::opts_chunk$set(tidy = FALSE, cache.extra = packageVersion('tufte')) options(htmltools.dir.version = FALSE) knitr::opts_chunk$set(warning = FALSE, message = FALSE)
## devtools::install_github(bergant/trafficaccidents) library(trafficaccidents) #devtools::install_github("bergant/datamodelr") library(datamodelr) #library(tidyverse) library(ggplot2) library(viridis) library(ggjoy) library(dplyr) library(tidyr) library(rmarkdown) library(knitr) display_levels <- function(col_name) { event %>% left_join(calendar, by = "date") %>% filter(year %in% 2014:2016) %>% group_by_(col_name, "year") %>% summarise(Frequency = n()) %>% spread(key = "year", "Frequency") %>% kable(., row.names = TRUE, format.args = list(big.mark = ",")) }
This tutorial is an overview of
trafficaccidents
data package.
The package contains information about traffic accidents in Slovenia examined by
the police from 2005 to 2016.
Original files^[The original data is published on
http://www.policija.si/]
were converted to R data frames without changes, except for:
The tutorial was created using R language [@R-base] and several R extensions. See the list of all used packages at the end of the document.
Note that this is not a thorough analysis of traffic accidents. The only goal is to bring the data and visualization tools to a larger group of users.
Information about accidents is stored in three data frames:
event
contains accident events with date, time, position and other
accident attributes. party
contains attributes of the parties involved.calendar
contains all dates from
r paste(lubridate::year(range(event$date)), collapse = " to ")
.dm_from_data_frames(event, party, calendar) %>% dm_add_references( party$src_file == event$src_file, party$event_id == event$event_id, event$date == calendar$date ) %>% dm_create_graph(col_attr = c("column","type"), graph_attrs = "rankdir = LR") %>% dm_render_graph
The majority of accidents
(r round(mean(as.numeric(!is.na(event$pos_x)))* 100)
%)
are geocoded. Use pos_x
and pos_y
columns for geographic location.^[
Columns pos_x
and pos_y
are geocoded in D48/GK coordinate reference system.
Use columns lon and lat for WGS84]
For example, we can plot all accidents locations as points, coloured by
road category (road_type
):
library(dplyr) library(ggplot2) library(viridis) library(trafficaccidents) dat_roads <- event %>% filter(!is.na(road_type), !is.na(pos_x)) %>% arrange(desc(road_type)) ggplot(data = dat_roads, aes(pos_y, pos_x, color = road_type) ) + geom_point(alpha = 0.03, size = 0.2) + coord_fixed() + scale_color_viridis(discrete = TRUE, end = 0.7, option = "A") + guides(colour = guide_legend( override.aes = list(alpha = 1, size = 2), title = "Road category")) + theme_void() + theme(legend.position = c(0.95,0.3))
Working days have different time distribution than weekends and holidays.
days_count <- calendar %>% group_by(year, working_day) %>% summarise(days = n()) accidents_per_hour <- event %>% left_join(calendar, by = "date") %>% group_by(year, working_day, hour) %>% count %>% left_join(days_count, by = c("year", "working_day")) %>% mutate(p = n / days) ggplot(accidents_per_hour, aes(x = hour, y = p)) + geom_point(alpha = 0.2, size = 0.2, position = position_jitter(0.1)) + geom_smooth(method = "loess", span = 1/3, color = "gray") + scale_x_continuous(breaks = 1:7*3, name = "Hour", minor_breaks = NULL) + scale_y_continuous(limits = c(0, NA), breaks = 0:6 * 2, name = "Average number of accidents") + coord_cartesian(y=c(0, 9)) + facet_wrap(~ working_day, ncol = 1, scales = "fixed") + theme_minimal()
With "joy plot" from ggjoy
package [@R-ggjoy] we can
discover the differences of time distributions between road categories.
The peaks at rush hour are more visible on roads than on city streets.
library(ggjoy) dat_rt_wd <- event %>% left_join(calendar, by = "date") %>% filter( working_day == "Working day", !is.na(road_type) ) ggplot(dat_rt_wd) + geom_joy(aes(x = hour, y = road_type, fill = road_type), alpha = 0.5 ) + scale_x_continuous(breaks = 1:7*3, minor_breaks = NULL, limits = c(0,23), name = "Hour") + scale_fill_viridis(discrete = TRUE, end = 0.7, option = "A")+ scale_y_discrete(limits = rev(levels(dat_rt_wd$road_type)), name = NULL)+ theme_minimal() + theme(legend.position = "none")
With animation it is possible to see accident locations by hour. The following animation was created with gganimate package [@R-gganimate].
All accidents are marked with injury severity (column injury
). The plot
below compares time distributions of accidents for different levels of
severities.
dat <- event %>% left_join(calendar, by = "date") ggplot(dat) + geom_joy(aes(x = hour, y = injury, alpha = injury), fill = "darkred") + scale_x_continuous(breaks = 1:7*3, minor_breaks = NULL, limits = c(0,23), name = "Hour") + scale_y_discrete(limits = rev(levels(dat$injury)), name = NULL)+ scale_alpha_discrete(range = c(0.06, 0.7), guide = "none") + theme_minimal() + facet_wrap(~working_day, ncol = 1)
Police classifies each accident by main cause. The plot below represent the number of accidents as a circle size and injury severity as color opacity for different accident causes:
dat_type <- event %>% group_by(cause, injury) %>% count ggplot(dat_type, aes(x = 1, y=1, size = sqrt(n) / 7, alpha = injury)) + scale_size_identity() + geom_point(color = "darkred") + scale_y_discrete(limits = rev(levels(dat_type$cause))) + coord_fixed(0.7) + scale_alpha_discrete(range = c(0.06, 0.7)) + guides(alpha = guide_legend(override.aes = list(size = 6))) + theme_void(base_size = 12) + #theme(strip.text = element_text(size = 10)) + labs(x = NULL, y = NULL, alpha = "Severity") + facet_wrap(~cause, ncol = 3)
The accident_type
column defines what happened (e.g. side collision or vehicle rollover).
The plot below explores its relation to accident cause.
dat_cause_type <- event %>% group_by(cause, event_type, injury) %>% count ggplot(dat_cause_type, aes(x = event_type, y=cause, size = sqrt(n) / 7, alpha = injury)) + geom_point(color = "darkred") + scale_size_identity() + scale_alpha_discrete(range = c(0.06, 0.7)) + guides(alpha = guide_legend(override.aes = list(size = 10))) + scale_x_discrete(position = "top") + scale_y_discrete(limits = rev(levels(dat_cause_type$cause))) + coord_fixed() + theme_minimal() + theme( axis.text.x = element_text(angle=90,hjust = 0, vjust = 0.3, size = 12), axis.text.y = element_text(size = 12), legend.position = "bottom" ) + labs(x = "Accident type", y = "Cause", alpha = "Severity")
There can be several parties involved in same accident. The data
includes also some cases where there are several parties at fault for the
same accident.
Here the events_party
data frame is created with single
event and some attributes from one party at fault.
party_at_fault <- party %>% filter(accident_role == "At fault") other_party <- party %>% filter(accident_role != "At fault") events_party <- party_at_fault %>% group_by(src_file, event_id) %>% summarise( age = first(age), gender = first(gender), alco = first(alcotest) > 0, experience = first(experience_y) + first(experience_m)/12, traffic_role = first(traffic_role) ) events_party %>% group_by(traffic_role) %>% summarise(accidents = n()) %>% arrange(desc(accidents)) %>% head(8) %>% kable
Most frequent parties at fault are car drivers. Is it true, that there are more accidents involving young drivers, driving at night?
hour_intervals <- setNames( c(0, 6, 22), c("Night (22h - 06h)", "Day (6h-21h)", "Night (22h - 06h)") ) events_party_car_drivers <- events_party %>% filter(traffic_role %in% c("Car driver")) %>% filter(age >= 18) %>% inner_join(event, by = c("src_file", "event_id")) %>% inner_join(calendar, by = "date") %>% mutate( hour_interval = findInterval(hour, hour_intervals), hour_interval = names(hour_intervals)[hour_interval], age = round(age) ) events_party_car_drivers %>% filter(date >= as.Date("2011-01-01")) %>% group_by(age, hour_interval, src_file) %>% count %>% ggplot(aes(x = age, y = n)) + #geom_point(alpha = 0.7) + geom_point(alpha = 0.5, mapping = aes(color = src_file)) + geom_smooth(alpha = 0.2, method = "loess") + facet_wrap(~hour_interval, scales = "free_y") + labs(x = "Age", y = "Number of accidents", color = NULL) + scale_x_continuous(breaks = 1:10*10, limits = c(18, 85)) + scale_color_brewer(palette = "Blues") + theme_bw()
Alcohol involved accidents, number of accidents by age at day/night:
events_party_car_drivers %>% filter(date >= as.Date("2011-01-01"), alco) %>% group_by(age, hour_interval, src_file) %>% count %>% ggplot(aes(x = age, y = n)) + #geom_point(alpha = 0.7) + geom_point(alpha = 0.3, mapping = aes(color = src_file)) + geom_smooth(method = "loess", color = "orange", span = 0.6) + facet_wrap(~hour_interval) + labs(x = "Age", y = "Number of accidents", color = NULL) + scale_x_continuous(breaks = 1:10*10, limits = c(18, 80)) + scale_color_brewer(palette = "Oranges") + theme_bw()+ theme(legend.position = "none")
for(p in sessionInfo()$otherPkgs) { cat(sep = "", "**", p$Package, "** ", p$Version, "\n\n ", format(citation(package = p$Package)[[1]], style = "html"), "\n\n" ) }
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.