# inst/extdata/analysis/02_preprocessing.R

# Packages
library(dplyr)
library(lubridate)
library(purrr)
library(stringr)
library(rvest)

library(geosphere)

source(file = "inst/extdata/analysis/utils.R", encoding = "UTF-8")

# Reading the raw ships dataset (comma-separated, with a header row).
# "NA", "NaN", "--" and empty fields are all read as missing values.
# NOTE(review): read.table() keeps its defaults quote = "\"'" and
# comment.char = "#", so an apostrophe or "#" inside a field could be
# misparsed and then silently padded by fill = TRUE -- confirm the raw file
# contains none.
marine <- read.table(
  file = "data/raw/ships.csv",
  header = TRUE,
  sep = ",",
  dec = ".",
  fill = TRUE,
  na.strings = c("NA", "NaN", "--", "")
)

# Cleaning steps:
# - Removing SHIP_ID = 4666609 and SHIPNAME = "[SAT-AIS]"
#   (see 01_data_exploration.md).
#   NOTE(review): `!=` inside filter() evaluates to NA for missing values, so
#   rows with an NA SHIP_ID or an NA SHIPNAME are dropped here as well --
#   confirm that losing the NA SHIPNAME rows is intended.
# - Removing the PORT variable (the lower-case `port` column is kept instead)
# - Removing the SHIPTYPE variable (the `ship_type` column is kept instead)
# - Using lower case for variable names
# - Keeping only the vessels with a valid id (not NA); kept as an explicit
#   guard even though the first filter already removes NA ids
# - Transforming the datetime column into a date-time object
# - Getting the link address of each flag and vessel image
#   (get_flag_link() / get_vessel_link() come from utils.R)
# - Using title case as the pattern for categorical variables
# - Keeping (and ordering) only the columns used afterwards
marine <- marine %>%
  dplyr::filter(SHIP_ID != 4666609, SHIPNAME != "[SAT-AIS]") %>%
  dplyr::select(-PORT, -SHIPTYPE) %>%
  dplyr::rename_all(.funs = stringr::str_to_lower) %>%
  dplyr::filter(!is.na(ship_id)) %>%
  dplyr::mutate(
    datetime = lubridate::ymd_hms(datetime),
    flag_img = get_flag_link(flag),
    vessel_img = get_vessel_link(ship_id),
    port = stringr::str_to_title(port),
    destination = stringr::str_to_title(destination),
    ship_name = stringr::str_to_title(shipname)
  ) %>%
  # Namespace-qualified like every other verb in this pipeline, so a package
  # attached later (e.g. from utils.R) cannot mask select().
  dplyr::select(
    datetime, lat, lon, speed,
    port, destination,
    flag, flag_img,
    ship_type, ship_name, ship_id, vessel_img,
    length, dwt,
    is_parked
  )

# Rename every column containing "ship" so that it says "vessel" instead
# (ship_type -> vessel_type, ship_name -> vessel_name, ship_id -> vessel_id).
names(marine) <- gsub(
  pattern = "ship",
  replacement = "vessel",
  x = names(marine),
  fixed = TRUE
)

# Assumption: a new sail starts whenever the trio (vessel_id, port,
# destination) changes. Rows with an NA destination are kept.
# For every message, attach the position and timestamp of the previous message
# of the same sail, plus the elapsed time between the two in minutes.
# Sorting happens before grouping: arrange() orders the whole table by
# datetime, so each sail is then in chronological order for lag().
marine <- marine %>%
  dplyr::arrange(datetime) %>%
  dplyr::group_by(vessel_id, port, destination) %>%
  dplyr::mutate(
    lon_lag1 = dplyr::lag(lon),
    lat_lag1 = dplyr::lag(lat),
    datetime_lag1 = dplyr::lag(datetime),
    time_diff = as.numeric(
      difftime(datetime, datetime_lag1, units = "mins")
    )
  ) %>%
  dplyr::ungroup()

# Haversine distance between each message's position and the previous
# position of the same sail (the *_lag1 columns).
marine[["dist"]] <- geosphere::distHaversine(
  p1 = cbind(marine[["lon"]], marine[["lat"]]),
  p2 = cbind(marine[["lon_lag1"]], marine[["lat_lag1"]])
)

# Summary statistics presented in the final map.
# Step 1 - one row per sail (vessel_id, port, destination). Rows are sorted by
#   decreasing `dist` (ties: most recent message first; arrange() places
#   missing values last), so every dplyr::first() below reads the message with
#   the longest hop between two consecutive messages of that sail.
# Step 2 - one row per vessel: keep the sail whose longest hop is the largest
#   (ties: the sail that started most recently).
marine_stats <- marine %>%
  dplyr::group_by(vessel_id, port, destination) %>%
  dplyr::arrange(dplyr::desc(dist), dplyr::desc(datetime)) %>%
  dplyr::summarise(
    vessel_name = dplyr::first(vessel_name),
    vessel_type = dplyr::first(vessel_type),
    vessel_img = dplyr::first(vessel_img),
    flag = dplyr::first(flag),
    flag_img = dplyr::first(flag_img),
    start_date = min(datetime),
    final_date = max(datetime),
    # Number of messages in the sail, and how many of them were not parked.
    n_msg = dplyr::n(),
    n_msg_mov = sum(!is_parked),
    # Sum of the speed over the non-parked messages only. `!` binds looser
    # than `*` in R, so `!is_parked * speed` would parse as
    # `!(is_parked * speed)` and count rows instead of summing speeds; subset
    # explicitly to avoid that.
    speed_mov_sum = sum(speed[!is_parked]),
    # Mean speed while moving; NaN (0 / 0) when the sail has no moving message.
    speed_mov = speed_mov_sum / n_msg_mov,
    length = dplyr::first(length),
    dwt = dplyr::first(dwt),
    # Both end points, distance and duration of the longest hop of the sail.
    lat = dplyr::first(lat),
    lon = dplyr::first(lon),
    lat_lag1 = dplyr::first(lat_lag1),
    lon_lag1 = dplyr::first(lon_lag1),
    dist = dplyr::first(dist),
    time_diff = dplyr::first(time_diff)
  ) %>%
  dplyr::ungroup() %>%
  dplyr::group_by(vessel_id) %>%
  dplyr::arrange(dplyr::desc(dist), dplyr::desc(start_date)) %>%
  dplyr::slice(1) %>%
  dplyr::ungroup()

# Persisting both results as .rda files: the message-level table and the
# per-vessel summary.
save(list = "marine", file = "data/marine.rda")
save(list = "marine_stats", file = "data/marine_stats.rda")
# DouglasMesquita/marineApp documentation built on Dec. 17, 2021, 5:29 p.m.