# Show the source of every chunk in the rendered document.
knitr::opts_chunk$set(echo = TRUE)

# Packages used by the analysis.
library(dplyr)
library(ggplot2)
library(leaflet)
library(lubridate)

# Fix the RNG seed so the randomly sampled ship is reproducible.
set.seed(42)
The first step is to get a basic overview of Marina’s dataset. Let’s read it and print a sample of the data frame.
# Load the raw ship records from the comma-separated source file.
# `fill = TRUE` pads rows that have fewer fields than the others instead of
# aborting the read. `TRUE` is spelled out because `T` is an ordinary,
# reassignable variable.
marine <- read.table(
  file = "../../../data/raw/ships.csv",
  header = TRUE,
  sep = ",",
  dec = ".",
  fill = TRUE
)

# Preview the first rows.
head(marine)
Although it takes around 10 seconds to read the data, I will not use a different package (such as data.table) just to read it faster.
Let's see which columns have missing data and what percentage of each column is missing.
# Percentage of missing values per column, sorted from lowest to highest.
sort(colMeans(is.na(marine))) * 100
We can observe missing data in the SHIP_ID variable, which will be used later as an id. Therefore, I am going to remove these rows, since I will not be able to use them.
# Keep only the records that carry a ship identifier.
marine <- dplyr::filter(marine, !is.na(SHIP_ID))
Now I have missing data only for LENGTH, WIDTH, DWT, and DESTINATION. I think it is ok for now.
As SHIP_ID is an identifier, it should be unique. My assumption is that a vessel with a given name and ship type should not appear under more than one id. Let's check!
# For every (SHIPNAME, ship_type) pair, count how many distinct SHIP_IDs it
# appears under, listing the pairs with the most ids first.
name_id <- marine %>%
  dplyr::group_by(SHIPNAME, ship_type) %>%
  dplyr::summarise(n = dplyr::n_distinct(SHIP_ID)) %>%
  # summarise() only removes the last grouping level; drop the remaining one
  # so the result is a plain, ungrouped table.
  dplyr::ungroup() %>%
  dplyr::arrange(dplyr::desc(n))

name_id
I will remove the [SAT-AIS] ship, as it seems to be related to the AIS technology rather than to an actual vessel. I am going to keep the remaining ones, and later I will pick one of the ids at random instead of removing them.
# Discard the records whose ship name is the "[SAT-AIS]" placeholder.
marine <- dplyr::filter(marine, SHIPNAME != "[SAT-AIS]")
Another possibility is the opposite: Can I have different ship names for the same SHIP_ID?
# For every (SHIP_ID, ship_type) pair, count how many distinct ship names it
# appears under, and keep only the pairs that have more than one name.
id_name <- marine %>%
  dplyr::group_by(SHIP_ID, ship_type) %>%
  dplyr::summarise(n = dplyr::n_distinct(SHIPNAME)) %>%
  # summarise() only removes the last grouping level; drop the remaining one
  # so the result is a plain, ungrouped table.
  dplyr::ungroup() %>%
  dplyr::arrange(dplyr::desc(n)) %>%
  dplyr::filter(n > 1)

id_name
It happens a lot. I will remove the id 4666609 and explore the remaining ones.
# Discard every record of ship id 4666609.
marine <- dplyr::filter(marine, SHIP_ID != 4666609)
# Show the distinct (SHIP_ID, ship_type, SHIPNAME) combinations for the ids
# found in `id_name`, ordered so that the names sharing an id sit together.
marine %>%
  dplyr::filter(SHIP_ID %in% id_name[["SHIP_ID"]]) %>%
  dplyr::select(SHIP_ID, ship_type, SHIPNAME) %>%
  unique() %>%
  dplyr::arrange(SHIP_ID, ship_type)
Most of them seem to be the same. Again, I am going to keep them, and later I will pick one at random.
# Pick one ship id at random. The draw is made over the whole SHIP_ID column,
# so an id that appears in more rows is more likely to be selected.
ship_id <- sample(x = marine$SHIP_ID, size = 1)

# All records of the selected ship, ordered by DATETIME.
marine_plt <- dplyr::arrange(
  dplyr::filter(marine, SHIP_ID == ship_id),
  DATETIME
)

# Scatter plot of the recorded positions (longitude vs. latitude).
# Using ggplot2 just to make it easy to show on github
ggplot2::ggplot(data = marine_plt) +
  ggplot2::geom_point(
    mapping = ggplot2::aes(x = LON, y = LAT),
    color = "steelblue"
  ) +
  ggplot2::theme_bw()

# Interactive alternative, kept for reference:
# leaflet::leaflet(data = marine_plt) %>%
#   leaflet::addCircleMarkers(
#     lng = ~LON,
#     lat = ~LAT,
#     label = ~SHIPNAME,
#     color = "white",
#     radius = 1
#   ) %>%
#   leaflet::addProviderTiles(
#     provider = leaflet::providers$CartoDB.DarkMatter,
#     options = leaflet::providerTileOptions(noWrap = TRUE)
#   )
Now I am going to look at some summary statistics and plots, searching for oddities.
# Ship types: number of distinct values, then the values themselves.
dplyr::n_distinct(marine[["ship_type"]])
unique(marine[["ship_type"]])

# Ports: number of distinct values, then the values themselves.
dplyr::n_distinct(marine[["port"]])
unique(marine[["port"]])

# Number of distinct ships.
dplyr::n_distinct(marine[["SHIP_ID"]])

# Destinations: number of distinct values, then the most frequent ones.
dplyr::n_distinct(marine[["DESTINATION"]])
head(
  sort(
    table(marine[["DESTINATION"]]),
    decreasing = TRUE
  )
)
# Summary statistics of SPEED.
summary(marine[["SPEED"]]) # 1550? :O

# Distribution of SPEED over all records.
# Too many zeros
# NOTE(review): scales::label_number_si() is reported as deprecated in newer
# scales releases - confirm against the installed version.
ggplot2::ggplot(data = marine) +
  ggplot2::geom_histogram(
    mapping = ggplot2::aes(x = SPEED),
    fill = "steelblue",
    bins = 100
  ) +
  ggplot2::labs(x = "Speed", y = "Count") +
  ggplot2::scale_y_continuous(labels = scales::label_number_si()) +
  ggplot2::theme_bw()
Let's remove the observations that correspond to parked vessels.
# Records for which the `is_parked` flag is not set.
marine_np <- dplyr::filter(marine, !is_parked)

# Summary statistics of SPEED for the non-parked records.
summary(marine_np[["SPEED"]])

# Distribution of SPEED for the non-parked records.
# NOTE(review): scales::label_number_si() is reported as deprecated in newer
# scales releases - confirm against the installed version.
ggplot2::ggplot(data = marine_np) +
  ggplot2::geom_histogram(
    mapping = ggplot2::aes(x = SPEED),
    fill = "steelblue",
    bins = 100
  ) +
  ggplot2::labs(x = "Speed", y = "Count") +
  ggplot2::scale_y_continuous(labels = scales::label_number_si()) +
  ggplot2::theme_bw()
I think this variable has some inconsistencies. However, I am still going to present it in the Shiny app.
# Distribution of LENGTH over all records.
# NOTE(review): scales::label_number_si() is reported as deprecated in newer
# scales releases - confirm against the installed version.
ggplot2::ggplot(data = marine) +
  ggplot2::geom_histogram(
    mapping = ggplot2::aes(x = LENGTH),
    fill = "steelblue",
    bins = 50
  ) +
  ggplot2::labs(x = "Length (m)", y = "Count") +
  ggplot2::scale_y_continuous(labels = scales::label_number_si()) +
  ggplot2::theme_bw()
# Distribution of DWT over all records.
# NOTE(review): scales::label_number_si() is reported as deprecated in newer
# scales releases - confirm against the installed version.
ggplot2::ggplot(data = marine) +
  ggplot2::geom_histogram(
    mapping = ggplot2::aes(x = DWT),
    fill = "steelblue",
    bins = 50
  ) +
  ggplot2::labs(x = "Deadweight (Ton.)", y = "Count") +
  ggplot2::scale_y_continuous(labels = scales::label_number_si()) +
  ggplot2::theme_bw()
# Parse DATETIME into date-time values with lubridate's year-month-day
# hour-minute-second parser.
marine <- dplyr::mutate(
  marine,
  DATETIME = lubridate::ymd_hms(DATETIME)
)

# Earliest and latest timestamps in the data.
range(marine[["DATETIME"]])
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.