# Show each chunk's source code alongside its output in the rendered document.
knitr::opts_chunk$set(echo = TRUE)
# Attach order is kept as-is: packages attached later can mask names from
# packages attached earlier.
library(dplyr)
library(ggplot2)
library(leaflet)
library(lubridate)

# Fix the RNG seed so the random vessel picked with sample() is reproducible.
set.seed(42)

The Marine dataset

The first step is to gather some basic information about the Marine dataset. Let's read it and print a sample of the data frame.

# Read the raw ship observations: comma-separated, "." as decimal mark, with a
# header row. fill = TRUE pads rows that have fewer fields than the header
# instead of raising an error.
# NOTE(review): read.table() defaults to quote = "\"'" and comment.char = "#",
# so an apostrophe or "#" inside a field (e.g. a ship name) could shift columns,
# and fill = TRUE would hide that. Confirm against the raw file, or consider
# read.csv(), which disables both.
marine <- read.table(
  file = "../../../data/raw/ships.csv",
  header = TRUE,
  sep = ",",
  dec = ".",
  fill = TRUE
)

# Preview the first rows of the data frame.
head(marine)

Although it takes around 10 seconds to read the data, I will not use a different package (such as data.table) just to read it faster.

Missing data

Let's see which columns have missing data and their proportion.

# Percentage of missing values per column, sorted from least to most missing.
(marine %>%
  is.na() %>%
  colMeans() %>%
  sort()) * 100

We can observe missing data in the SHIP_ID variable, which will be used later as an identifier. Therefore, I am going to remove these rows, since I will not be able to use them.

# Keep only the observations that carry a SHIP_ID.
marine <- dplyr::filter(marine, !is.na(SHIP_ID))

Now I have missing data only for LENGTH, WIDTH, DWT, and DESTINATION. I think it is ok for now.

SHIP_ID column

As SHIP_ID is an identifier, it should be unique. My assumption is that the same vessel name within a given ship type should not map to more than one SHIP_ID. Let's check!

# For every (SHIPNAME, ship_type) pair, count how many distinct SHIP_IDs it
# maps to, and list the pairs with the most ids first.
# .groups = "drop" returns an ungrouped result: without it the summary stays
# grouped by SHIPNAME and summarise() emits a regrouping message.
name_id <- marine %>%
  dplyr::group_by(SHIPNAME, ship_type) %>%
  dplyr::summarise(n = dplyr::n_distinct(SHIP_ID), .groups = "drop") %>%
  dplyr::arrange(dplyr::desc(n))

name_id

I will remove the [SAT-AIS] ship, as it seems to be something related to the AIS technology. As for the remaining ones, I am going to keep them, and later I will pick one of the IDs at random instead of removing them.

# Discard every observation whose ship name is the "[SAT-AIS]" placeholder.
marine <- dplyr::filter(marine, SHIPNAME != "[SAT-AIS]")

Another possibility is the opposite: Can I have different ship names for the same SHIP_ID?

# For every (SHIP_ID, ship_type) pair, count how many distinct ship names it
# carries, and keep only the pairs with more than one name, largest count first.
# .groups = "drop" returns an ungrouped result: without it the summary stays
# grouped by SHIP_ID and summarise() emits a regrouping message.
# Filtering before sorting yields the same rows in the same order while
# sorting fewer of them.
id_name <- marine %>%
  dplyr::group_by(SHIP_ID, ship_type) %>%
  dplyr::summarise(n = dplyr::n_distinct(SHIPNAME), .groups = "drop") %>%
  dplyr::filter(n > 1) %>%
  dplyr::arrange(dplyr::desc(n))

id_name

It happens a lot. I will remove the id 4666609 and explore the remaining ones.

# Drop the vessel with id 4666609 altogether.
marine <- dplyr::filter(marine, SHIP_ID != 4666609)

# List the distinct (id, type, name) combinations for the ids that were found
# with more than one ship name, ordered so variants of one id sit together.
marine %>%
  dplyr::filter(SHIP_ID %in% id_name$SHIP_ID) %>%
  dplyr::select(SHIP_ID, ship_type, SHIPNAME) %>%
  dplyr::distinct() %>%
  dplyr::arrange(SHIP_ID, ship_type)

Most of them seem to be the same. Again I am going to keep them and later I will pick one at random.

Let's see what we have for each vessel:

# Draw one SHIP_ID at random from the observations (ids with more rows are
# proportionally more likely to be drawn).
ship_id <- sample(marine$SHIP_ID, size = 1)

# All observations of the drawn vessel, ordered by their DATETIME value.
marine_plt <- dplyr::arrange(
  dplyr::filter(marine, SHIP_ID == ship_id),
  DATETIME
)

# Scatter plot of the recorded positions (longitude vs. latitude).
ggplot2::ggplot(data = marine_plt, mapping = ggplot2::aes(x = LON, y = LAT)) +
  ggplot2::geom_point(color = "steelblue") +
  ggplot2::theme_bw()

# Using ggplot2 just to make it easy to show on github
# leaflet::leaflet(data = marine_plt) %>%
#   leaflet::addCircleMarkers(
#     lng = ~LON,
#     lat = ~LAT,
#     label = ~SHIPNAME,
#     color = "white",
#     radius = 1
#   ) %>%
#   leaflet::addProviderTiles(
#     provider = leaflet::providers$CartoDB.DarkMatter,
#     options = leaflet::providerTileOptions(noWrap = T)
#   )

Some stats

Now I am going to visualize some statistics and plots looking for oddities.

Ship types

# Number of distinct ship types, followed by the distinct values themselves.
marine %>%
  dplyr::pull(ship_type) %>%
  dplyr::n_distinct()

marine %>%
  dplyr::pull(ship_type) %>%
  unique()

Ports

# Number of distinct ports, followed by the distinct values themselves.
marine %>%
  dplyr::pull(port) %>%
  dplyr::n_distinct()

marine %>%
  dplyr::pull(port) %>%
  unique()

Ships

# Number of distinct vessels, identified by SHIP_ID.
marine %>%
  dplyr::pull(SHIP_ID) %>%
  dplyr::n_distinct()

Destination

# Number of distinct destinations.
marine %>%
  dplyr::pull(DESTINATION) %>%
  dplyr::n_distinct()

# The six most frequent destinations with their observation counts.
head(
  sort(table(marine[["DESTINATION"]]), decreasing = TRUE),
  n = 6L
)

Speed

# Summary statistics of SPEED; a value of 1550 was flagged here as suspicious.
summary(marine$SPEED)

# Histogram of SPEED over all observations (flagged: too many zeros).
# NOTE(review): scales::label_number_si() is deprecated as of scales 1.2.0;
# kept here so the output matches the installed version.
ggplot2::ggplot(data = marine, mapping = ggplot2::aes(x = SPEED)) +
  ggplot2::geom_histogram(fill = "steelblue", bins = 100) +
  ggplot2::labs(x = "Speed", y = "Count") +
  ggplot2::scale_y_continuous(labels = scales::label_number_si()) +
  ggplot2::theme_bw()

Let's remove those observations for parked vessels.

# Observations where the is_parked flag is not set.
marine_np <- dplyr::filter(marine, !is_parked)

# Summary statistics of SPEED for the non-parked observations.
summary(marine_np$SPEED)

# Same histogram as for the full data, restricted to non-parked observations.
ggplot2::ggplot(data = marine_np, mapping = ggplot2::aes(x = SPEED)) +
  ggplot2::geom_histogram(fill = "steelblue", bins = 100) +
  ggplot2::labs(x = "Speed", y = "Count") +
  ggplot2::scale_y_continuous(labels = scales::label_number_si()) +
  ggplot2::theme_bw()

I think this variable has some inconsistency. However, I am going to present it in the shiny app.

Length

# Histogram of LENGTH across all observations, using 50 bins.
ggplot2::ggplot(data = marine, mapping = ggplot2::aes(x = LENGTH)) +
  ggplot2::geom_histogram(fill = "steelblue", bins = 50) +
  ggplot2::labs(x = "Length (m)", y = "Count") +
  ggplot2::scale_y_continuous(labels = scales::label_number_si()) +
  ggplot2::theme_bw()

DWT

# Histogram of DWT (deadweight) across all observations, using 50 bins.
ggplot2::ggplot(data = marine, mapping = ggplot2::aes(x = DWT)) +
  ggplot2::geom_histogram(fill = "steelblue", bins = 50) +
  ggplot2::labs(x = "Deadweight (Ton.)", y = "Count") +
  ggplot2::scale_y_continuous(labels = scales::label_number_si()) +
  ggplot2::theme_bw()

Datetime

# Parse DATETIME from its year-month-day hour:minute:second text form into a
# date-time column, replacing the original column.
marine <- dplyr::mutate(marine, DATETIME = lubridate::ymd_hms(DATETIME))

# Earliest and latest timestamp in the data.
range(marine$DATETIME)


DouglasMesquita/marineApp documentation built on Dec. 17, 2021, 5:29 p.m.