data-raw/clean_lemis.R

# LEMIS data cleaning script based on WildDB script originally written
# by Allison White


# Load packages
library(assertthat)
library(dplyr)
library(readr)
library(stringr)
library(tidyr)

h <- here::here
source(h("data-raw", "R", "lemis_cleaning_functions.R"))

#==============================================================================


# Merge all LEMIS data


# Merge yearly LEMIS CSV files into one data frame
lemis.cols <- c(
  "control_number",
  "species_code",
  "genus",
  "species",
  "subspecies",
  "specific_name",
  "generic_name",
  "wildlife_description",
  "quantity",
  "unit",
  "value",
  "country_origin_iso2c",
  "country_imp_exp_iso2c",
  "purpose",
  "source_",
  "action",
  "disposition",
  "disposition_date",
  "shipment_date",
  "import_export",
  "port",
  "us_co",
  "foreign_co",
  "file_num"
)

yearly.files <- dir(path = h("data-raw", "csv_by_year"), full.names = TRUE)
lemis_raw <- data.frame()

for (file in yearly.files) {

  print(file)

  lemis_raw <-
    rbind(
      lemis_raw,
      read_csv(file,
               col_names = lemis.cols,
               skip = 1,
               col_types = cols(.default = col_character()),
               na = character()
      )
    )
}

# Generate values for NA checking
periods <- sapply(1:100, function(x) paste0(rep(".", x), collapse = ""))
asterisks <- sapply(1:100, function(x) paste0(rep("*", x), collapse = ""))
dashes <- sapply(1:100, function(x) paste0(rep("-", x), collapse = ""))
slashes <- sapply(1:100, function(x) paste0(rep("/", x), collapse = ""))

na.characters <- c(
  periods, asterisks, dashes, slashes,
  "", " ", "NA", "N/A", "NULL",
  "*8", "*****8", "`*", "*`", "**`", "******`"
)

# Convert NA values
lemis <- lemis_raw %>%
  mutate_all(
    list(~ if_else(. %in% na.characters, NA_character_, .))
  )

# Eliminate rows of all NA values
lemis <- lemis[apply(select(lemis, -file_num), 1, function(x) any(!is.na(x))), ]
# (should eliminate 28 rows with all missing data)

#==============================================================================


# Clean up control number 2006798504

# In our raw data, the record for control number 2006798504 actually contains
# a 'foreign_co' value with multiple other records embedded within it. We need
# to extract these "hidden records" and clean up the surrounding data


# Extract the "hidden records"
hidden.rows <- lemis %>%
  # Filter down to the problematic record
  filter(control_number == "2006798504") %>%
  # Pull out the 'foreign_co' value
  pull(foreign_co) %>%
  # Split along the newline character
  str_split("\n") %>%
  # Create a data frame that contains one row for each record that was
  # hiding in the 'foreign_co' column
  data.frame() %>%
  # Get rid of the first row, since this only corresponds to the actual
  # 'foreign_co' value of the original problematic record
  slice(-1) %>%
  # Separate out the data frame into columns based on tab characters
  separate(col = 1, into = lemis.cols, sep = "\t") %>%
  # Clean up the 'foreign_co' column
  mutate(foreign_co = str_replace(foreign_co, "\r\r", ""))

# The row following the problematic row in the raw data was cut off and
# incomplete. We need to combine information from the last row of
# "hidden.rows" with this data
last.row.index <- nrow(hidden.rows)
incomplete.row <- lemis[lemis$control_number == "38735", ]

# The split between the last row of "hidden.rows" and "incomplete.row" happened
# over the 'shipment_date' column. From examining the raw data, it was apparent
# that the correct 'shipment_date' for the record is "01/18/2006"
hidden.rows[last.row.index, "shipment_date"] <- "01/18/2006"
# The second through the fifth columns of "incomplete.row" actually hold what
# should be the final four columns of data for the last row in "hidden.rows"
hidden.rows[last.row.index, 20:23] <- incomplete.row[ , 2:5]

# Now, remove the incomplete row from the lemis data
lemis <- lemis[lemis$control_number != "38735", ]

# Replace the 'foreign_co' value of the original problematic record
lemis[lemis$control_number == "2006798504", "foreign_co"] <- "EL ARPA"

# Replace empty strings in hidden.rows (i.e., "" or " ") with NA values
hidden.rows <- hidden.rows %>%
  mutate_at(
    vars(colnames(.)),
    list(~ ifelse(. == "" | . == " ", NA_character_, .))
  ) %>%
  # Change date string formatting
  mutate(
    disposition_date = as.character(as.Date(disposition_date, format = "%m/%d/%Y")),
    shipment_date = as.character(as.Date(shipment_date, format = "%m/%d/%Y")),
  ) %>%
  # And indicate "file_num" is 1 for these records
  mutate(file_num = "1")

# Bind on the previously hidden rows
lemis <- bind_rows(lemis, hidden.rows)

#==============================================================================


# Clean up columns


lemis <- lemis %>%
  mutate(
    control_number = as.numeric(control_number),
    species_code = toupper(species_code),
    genus = tolower(genus),
    species = tolower(species),
    subspecies = tolower(subspecies),
    specific_name = toupper(specific_name),
    generic_name = toupper(generic_name),
    description = toupper(wildlife_description), # uppercase codes
    quantity = as.numeric(gsub(",", "", quantity)),
    unit = toupper(unit), # uppercase codes
    value = as.numeric(gsub(",", "", value)),
    country_origin = toupper(country_origin_iso2c), # uppercase codes
    country_imp_exp = toupper(country_imp_exp_iso2c), # uppercase codes
    purpose = toupper(purpose), # uppercase codes
    source = toupper(source_), # uppercase codes
    action = toupper(action), # uppercase codes
    disposition = toupper(disposition), # uppercase codes
    disposition_date = as.Date(disposition_date, format = "%Y-%m-%d"),
    shipment_date = as.Date(shipment_date, format = "%Y-%m-%d"),
    import_export = toupper(import_export),
    port = toupper(port), # uppercase codes
    us_co = toupper(us_co),
    foreign_co = toupper(foreign_co)
  )

#==============================================================================


# Further data filtering


# 1)
# What proportion of the data does not have a value of "I" in the
# 'import_export' column?
nrow(lemis[lemis$import_export != "I", ])/nrow(lemis)
# Remove any "E", "T", and NA records from the 'import_export' column,
# leaving only importation data
lemis <- lemis %>%
  filter(
    import_export != "E",
    import_export != "T",
    !is.na(import_export)
  )

assert_that(sum(lemis$import_export == "I") == nrow(lemis))


# 2)
# Identify problematic records that have NA values for 'value' and are
# otherwise exact duplicates of other records
control.numbers.cant.be.dups <-
  as.numeric(names(which(table(lemis$control_number) == 1)))

control.numbers.w.NA.values <- lemis %>%
  group_by(control_number) %>%
  summarize(n_NAs = sum(is.na(value))) %>%
  filter(n_NAs >= 1) %>%
  pull(control_number)

grouping.vars <-
  colnames(lemis)[!(colnames(lemis) %in% c("value", "file_num"))]

problem.row.set <- lemis %>%
  filter(
    !(control_number %in% control.numbers.cant.be.dups),
    control_number %in% control.numbers.w.NA.values
  ) %>%
  group_by_at(grouping.vars) %>%
  summarize(
    row.count = n(),
    NA.count = sum(is.na(value)),
    distinct_value_values = paste(unique(value), collapse = ", "),
    distinct_file_num_values = paste(unique(file_num), collapse = ", ")
  ) %>%
  filter(row.count > 1) %>%
  ungroup()

dups <- problem.row.set %>%
  filter(NA.count > 0,
         str_detect(distinct_value_values, ",")) %>%
  mutate(value = NA_real_,
         file_num = str_extract(distinct_file_num_values, "."))

dup.years <- dups %>%
  pull(shipment_date) %>%
  str_extract(., "20.{1,2}") %>%
  unique(.)

# Verify all of these records come from 2013 data
assert_that(dup.years == "2013")

# Remove the duplicate, NA-containing records from the data
lemis.row.count <- nrow(lemis)
dups.NA.count <- sum(dups$NA.count)
lemis <- anti_join(lemis, dups, by = lemis.cols)
assert_that(nrow(lemis) == (lemis.row.count - dups.NA.count))


# 3)
# Manually remove likely duplicate records
# Note: most of these are clear duplicates where the product was recorded
# in year 2013 file 1 and replicated in a later file number with
# value data present and some other field altered (for example, 'country_origin'
# or 'purpose' may have changed between the two entries)
manually.curated.duplicates.for.removal <-
  read_csv(
    h("data-raw", "data", "manually_curated_duplicates_for_removal.csv"),
    col_types = list(
      subspecies = col_character(),
      value = col_double(),
      file_num = col_character()
    )
  ) %>%
  mutate(
    disposition_date = as.Date(disposition_date, format = "%m/%d/%y"),
    shipment_date = as.Date(shipment_date, format = "%m/%d/%y"),
  )

# Remove the manually-curated duplicates from the data
lemis.row.count <- nrow(lemis)
dups.count <- nrow(manually.curated.duplicates.for.removal)
lemis <- anti_join(lemis, manually.curated.duplicates.for.removal, by = lemis.cols)
assert_that(nrow(lemis) == (lemis.row.count - dups.count))
# Check that all control_numbers for which duplicates were removed still
# remain in the data (i.e., we removed some, but not all, records for a
# given 'control_number')
n.control.numbers.remaining <-
  sum(unique(manually.curated.duplicates.for.removal$control_number) %in% lemis$control_number)
assert_that(n.control.numbers.remaining == n_distinct(manually.curated.duplicates.for.removal$control_number))


# 4)
# Identify problematic records that have NA values for 'value',
# conflicting values for 'us_co'/'foreign_co', and are
# otherwise exact duplicates of other records
grouping.vars2 <-
  colnames(lemis)[!(colnames(lemis) %in% c("value", "us_co", "foreign_co", "file_num"))]

problem.row.set2 <- lemis %>%
  filter(
    !(control_number %in% control.numbers.cant.be.dups),
    control_number %in% control.numbers.w.NA.values
  ) %>%
  group_by_at(grouping.vars2) %>%
  summarize(
    row.count = n(),
    NA.count = sum(is.na(value)),
    distinct_value_values = paste(unique(value), collapse = ", "),
    distinct_us_co_values = paste(unique(us_co), collapse = "@~"),
    distinct_foreign_co_values = paste(unique(foreign_co), collapse = "@~"),
    distinct_file_num_values = paste(unique(file_num), collapse = ",")
  ) %>%
  filter(row.count > 1) %>%
  ungroup()

dups2 <- problem.row.set2 %>%
  filter(NA.count > 0,
         str_detect(distinct_value_values, ",")) %>%
  mutate(
    value = NA_real_,
    us_co = case_when(
      str_detect(distinct_us_co_values, "@~") ~ str_extract(distinct_us_co_values, ".*(?=@~)"),
      TRUE ~ distinct_us_co_values
    ),
    us_co = ifelse(us_co == "NA", NA_character_, us_co),
    foreign_co = case_when(
      str_detect(distinct_foreign_co_values, "@~") ~ str_extract(distinct_foreign_co_values, ".*(?=@~)"),
      TRUE ~ distinct_foreign_co_values
    ),
    foreign_co = ifelse(foreign_co == "NA", NA_character_, foreign_co),
    file_num = str_extract(distinct_file_num_values, ".")
  )

dup.years2 <- dups2 %>%
  pull(shipment_date) %>%
  str_extract(., "20.{1,2}") %>%
  unique(.)

# Verify all of these records come from 2013 data
assert_that(dup.years2 == "2013")

# Remove the duplicate, NA-containing records from the data
lemis.row.count <- nrow(lemis)
dups2.NA.count <- sum(dups2$NA.count)
lemis <- anti_join(lemis, dups2, by = lemis.cols)
assert_that(nrow(lemis) == (lemis.row.count - dups2.NA.count))

# Manually change 'us_co' and 'foreign_co' values where they still
# disagree within 'control_number' because some records record
# exemptions for these fields while others simply record missing values
lemis <- lemis %>%
  mutate(
    us_co = case_when(
      control_number == 2013302852 ~ NA_character_,
      control_number == 2013314117 ~ NA_character_,
      TRUE ~ us_co
    ),
    foreign_co = case_when(
      control_number == 2013302852 ~ NA_character_,
      control_number == 2013320235 ~ NA_character_,
      TRUE ~ foreign_co
    )
  )

# Assert that all control numbers now only have one associated
# 'country_imp_exp', 'us_co', 'foreign_co', and 'port'
lemis.grouped <- group_by(lemis, control_number)

assert_that(max(summarize(lemis.grouped, n = n_distinct(country_imp_exp))$n) == 1)
assert_that(max(summarize(lemis.grouped, n = n_distinct(us_co))$n) == 1)
assert_that(max(summarize(lemis.grouped, n = n_distinct(foreign_co))$n) == 1)
assert_that(max(summarize(lemis.grouped, n = n_distinct(port))$n) == 1)


# 5)
# Address potential exact duplicate records
grouping.vars3 <-
  colnames(lemis)[!(colnames(lemis) %in% c("file_num"))]

problem.row.set3 <- lemis %>%
  filter(!(control_number %in% control.numbers.cant.be.dups)) %>%
  group_by_at(grouping.vars3) %>%
  summarize(
    row.count = n(),
    distinct_file_num_values = paste(unique(file_num), collapse = ", ")
  ) %>%
  filter(row.count > 1) %>%
  ungroup()

# How many of these "duplicate" records are in fact from the same original
# data file, indicating they are probably not errors generated from
# collating together multiple data sheets within years?
from.same.file.count <-
  sum(!str_detect(problem.row.set3$distinct_file_num_values, ","))

assert_that(from.same.file.count == nrow(problem.row.set3))
# Since duplicate records of a given record all come from the same
# original data file, it's probably best to keep them and treat them
# as intentionally duplicated product records

#==============================================================================


# Generate a 'cleaning_notes' column to hold automatically generated notes
lemis$cleaning_notes <- rep(NA_character_, nrow(lemis))

#==============================================================================


# Cleaning of non-standard descriptions present in the LEMIS data


sort(unique(lemis$description))

valid.description.codes <-
  c(
    "BAL", "BAR", "BOC", "BOD", "BON", "BOP", "BUL", "CAL",
    "CAP", "CAR", "CAV", "CHP", "CLA", "CLO", "COR", "CPR",
    "CUL", "CUT", "DEA", "DER", "DPL", "EAR", "EGG", "EGL",
    "ESH", "EXT", "FEA", "FIB", "FIG", "FIN", "FLO", "FOO",
    "FPT", "FRU", "GAB", "GAL", "GAR", "GEN", "GRS", "HAI",
    "HAP", "HOC", "HOP", "HOR", "IJW", "IVC", "IVP", "JWL",
    "KEY", "LEG", "LIV", "LOG", "LPL", "LPS", "LVS", "MEA",
    "MED", "MUS", "NES", "OIL", "PIV", "PLA", "PLY", "POW",
    "ROC", "ROO", "RUG", "SAW", "SCA", "SDL", "SEE", "SHE",
    "SHO", "SID", "SKE", "SKI", "SKP", "SKU", "SOU", "SPE",
    "SPR", "STE", "SWI", "TAI", "TEE", "TIM", "TRI", "TRO",
    "TUS", "UNS", "VEN", "WAX", "WNG", "WPR"
  )

# Which values are not in the valid codes?
index.invalid.codes <- !(unique(lemis$description) %in% valid.description.codes)
sort(unique(lemis$description)[index.invalid.codes])

# Convert irregular values to good values
lemis <- lemis %>%
  mutate(
    description = case_when(
      # Change "GMT" to "GAR", under the assumption this was meant to indicate
      # "garment"
      description == "GMT" ~ "GAR",
      # Change "JEW" to "JWL" when the foreign company is an arts and crafts
      # dealer, under the assumption this was meant to indicate "jewelry"
      description == "JEW" & foreign_co == "BAYEAD ARTS AND CRAFTS" ~ "JWL",
      # Change "LI" to "LIV"
      description == "LI" ~ "LIV",
      # Change "LPC" to "LPS", under the assumption this was a typographic
      # error
      description == "LPC" ~ "LPS",
      # Change "LPW" to "LPS", under the assumption this was a typographic
      # error
      description == "LPW" ~ "LPS",
      # Change "MAE" to "MEA" when the unit is a weight, under the assumption
      # this was meant to indicate "meat"
      description == "MAE" & unit == "KG" ~ "MEA",
      # Change "SK" to "SKI" when the foreign company is "BASS RIVER FARMS",
      # under the assumption this was meant to indicate "skin"
      description == "SK" & foreign_co == "BASS RIVER FARMS" ~ "SKI",
      # Change "SP" and "SPW" to "SPR" when the generic name is either "MOLLUSC",
      # "CLAM", or "SHELL" and unit is "NO", under the assumption this
      # was meant to indicate a shell product
      description == "SP" & generic_name %in% c("MOLLUSC", "CLAM", "SHELL") &
        unit == "NO" ~ "SPR",
      description == "SPW" & generic_name %in% c("MOLLUSC", "CLAM", "SHELL") &
        unit == "NO" ~ "SPR",
      # Change "TWO" to "TRO" when species_code is "ELAN", under the
      # assumption this was meant to indicate a trophy
      description == "TWO" & species_code == "ELAN" ~ "TRO",
      TRUE ~ description
    )
  )

# Which values are not in the valid codes?
index.invalid.codes <- !(unique(lemis$description) %in% valid.description.codes)
sort(unique(lemis$description)[index.invalid.codes])

# Remove remaining non-standard descriptions
lemis <- get_cleaned_lemis("description", valid.description.codes)

# Assertion for quality checking. All levels should be a valid code
assert_that(all(levels(lemis$description) %in%
  c(valid.description.codes, "non-standard value")))

summary(lemis$description)

#==============================================================================


# Cleaning of non-standard units present in the LEMIS data


sort(unique(lemis$unit))

valid.unit.codes <-
  c("C2", "C3", "CM", "GM", "KG", "LT", "M2", "M3", "MG", "ML", "MT", "NO")

# Which values are not in the valid codes?
index.invalid.codes <- !(unique(lemis$unit) %in% valid.unit.codes)
sort(unique(lemis$unit)[index.invalid.codes])

# The unit "LB" may stand for pounds, so we need to change this to kilograms
# where 1 LB = 0.454 KG
lemis <- lemis %>%
  mutate(
    # backup quantity and unit columns to preserve the originals
    quantity_original_value = quantity,
    unit_original_value = unit,
    # convert pounds to kilograms by multiplying the pound units by 0.453592
    quantity = case_when(
      unit == "LB" ~ quantity * 0.453592,
      TRUE ~ quantity
    ),
    unit = ifelse(unit == "LB", "KG", unit)
  )

# The unit "GL" may stand for gallons, so we need to change this to liters
# where 1 gallon = 3.78541 liters
lemis <- lemis %>%
  mutate(
    # convert gallons to liters by multiplying the gallon units by 3.78541
    quantity = case_when(
      unit == "GL" ~ quantity * 3.78541,
      TRUE ~ quantity
    ),
    unit = ifelse(unit == "GL", "LT", unit)
  )

# Convert irregular values to good values
lemis <- lemis %>%
  mutate(unit = case_when(
    # All variations of the "number of specimens" entries should be recoded
    # as "NO" ("CT" is likely "count" while "PC" is likely "piece")
    unit %in% c("CT", "N", "N0", "PC") ~ "NO",
    # Recode "L" as "liters"
    unit == "L" ~ "LT",
    TRUE ~ unit
    )
  )

# Which values are not in the valid codes?
index.invalid.codes <- !(unique(lemis$unit) %in% valid.unit.codes)
sort(unique(lemis$unit)[index.invalid.codes])

# Remove remaining non-standard units
lemis <- get_cleaned_lemis("unit", valid.unit.codes)

# Assertion for quality checking. All levels should be a valid code
assert_that(all(levels(lemis$unit) %in%
  c(valid.unit.codes, "non-standard value")))

summary(lemis$unit)

#==============================================================================


# Cleaning of non-standard country origins present in the LEMIS data


sort(unique(lemis$country_origin))

valid.country.codes <- read_csv("inst/extdata/iso_2_country_codes.csv") %>%
  pull(Value)
# Add on other valid codes
valid.country.codes <- c(
  valid.country.codes,
  "BL", "BQ", "CW", "GG", "IM", "JE", "ME", "MF", "PC", "PS", "RS",
  "SX", "TL", "VS", "XX", "YU", "ZR", "ZZ"
)

# Which values are not in the valid codes?
index.invalid.codes <- !(unique(lemis$country_origin) %in% valid.country.codes)
sort(unique(lemis$country_origin)[index.invalid.codes])

# Convert irregular values to good values
lemis <- lemis %>%
  mutate(
    country_origin = case_when(
      # Change "1D" to "ID", indicating Indonesia
      country_origin == "1D" & country_imp_exp == "ID" ~ "ID",
      # Change "FP" to "PF", indicating French Polynesia
      country_origin == "FP" ~ "PF",
      # Change "FS" to "FM", indicating the Federated States of Micronesia
      country_origin == "FS" ~ "FM",
      # Change "UK" to "GB", indicating the United Kingdom
      country_origin == "UK" & country_imp_exp == "GB" ~ "GB",
      # Change "UN" ("unknown"?) to "XX" since that represents unknown country
      country_origin == "UN" ~ "XX",
      # Change "X" to "XX" since that represents unknown country
      country_origin == "X" ~ "XX",
      # Change NA values to "XX" since that represents unknown country
      is.na(country_origin) ~ "XX",
      TRUE ~ country_origin
    )
  )

# Which values are not in the valid codes?
index.invalid.codes <- !(unique(lemis$country_origin) %in% valid.country.codes)
sort(unique(lemis$country_origin)[index.invalid.codes])

# Remove remaining non-standard country origins
lemis <- get_cleaned_lemis("country_origin", valid.country.codes)

# Assertion for quality checking. All levels should be a valid code
assert_that(all(levels(lemis$country_origin) %in%
  c(valid.country.codes, "non-standard value")))

summary(lemis$country_origin)

#==============================================================================


# Cleaning of non-standard country import/export present in the LEMIS data


sort(unique(lemis$country_imp_exp))

# Which values are not in the valid codes?
index.invalid.codes <- !(unique(lemis$country_imp_exp) %in% valid.country.codes)
sort(unique(lemis$country_imp_exp)[index.invalid.codes])

# Convert irregular values to good values
lemis <- lemis %>%
  mutate(
    country_imp_exp = case_when(
      # Change "**" to "XX" since that represents unknown country
      str_detect(country_imp_exp, fixed("**", TRUE)) ~ "XX",
      # Change "FS" to "FM", indicating the Federated States of Micronesia
      country_imp_exp == "FS" ~ "FM",
      # Change NA values to "XX" since that represents unknown country
      is.na(country_imp_exp) ~ "XX",
      TRUE ~ country_imp_exp
    )
  )

# Which values are not in the valid codes?
index.invalid.codes <- !(unique(lemis$country_imp_exp) %in% valid.country.codes)
sort(unique(lemis$country_imp_exp)[index.invalid.codes])

# Remove remaining non-standard country import/export
lemis <- get_cleaned_lemis("country_imp_exp", valid.country.codes)

# Assertion for quality checking. All levels should be a valid code
assert_that(all(levels(lemis$country_imp_exp) %in%
  c(valid.country.codes, "non-standard value")))

summary(lemis$country_imp_exp)

#==============================================================================


# Cleaning of non-standard purposes present in the LEMIS data


sort(unique(lemis$purpose))

valid.purpose.codes <-
  c("B", "E", "G", "H", "L", "M", "P", "Q", "S", "T", "Y", "Z")

# Which values are not in the valid codes?
index.invalid.codes <- !(unique(lemis$purpose) %in% valid.purpose.codes)
sort(unique(lemis$purpose)[index.invalid.codes])

# Remove remaining non-standard purposes
lemis <- get_cleaned_lemis("purpose", valid.purpose.codes)

# Assertion for quality checking. All levels should be a valid code
assert_that(all(levels(lemis$purpose) %in%
  c(valid.purpose.codes, "non-standard value")))

summary(lemis$purpose)

#==============================================================================


# Cleaning of non-standard sources present in the LEMIS data


sort(unique(lemis$source))

valid.source.codes <-
  c("A", "C", "D", "F", "I", "P", "R", "U", "W")

# Which values are not in the valid codes?
index.invalid.codes <- !(unique(lemis$source) %in% valid.source.codes)
sort(unique(lemis$source)[index.invalid.codes])

# Remove remaining non-standard sources
lemis <- get_cleaned_lemis("source", valid.source.codes)

# Assertion for quality checking. All levels should be a valid code
assert_that(all(levels(lemis$source) %in%
  c(valid.source.codes, "non-standard value")))

summary(lemis$source)

#==============================================================================


# Cleaning of non-standard actions present in the LEMIS data


sort(unique(lemis$action))

valid.action.codes <- c("C", "R")

# Which values are not in the valid codes?
index.invalid.codes <- !(unique(lemis$action) %in% valid.action.codes)
sort(unique(lemis$action)[index.invalid.codes])

# Assertion for quality checking. All levels should be a valid code
assert_that(all(levels(lemis$action) %in%
  c(valid.action.codes, "non-standard value")))

summary(as.factor(lemis$action))

#==============================================================================


# Cleaning of non-standard dispositions present in the LEMIS data


sort(unique(lemis$disposition))

valid.disposition.codes <- c("A", "C", "R", "S")

# Which values are not in the valid codes?
index.invalid.codes <- !(unique(lemis$disposition) %in% valid.disposition.codes)
sort(unique(lemis$disposition)[index.invalid.codes])

# Remove remaining non-standard dispositions
lemis <- get_cleaned_lemis("disposition", valid.disposition.codes)

# Assertion for quality checking. All levels should be a valid code
assert_that(all(levels(lemis$disposition) %in%
  c(valid.disposition.codes, "non-standard value")))

summary(lemis$disposition)

#==============================================================================


# Cleaning of non-standard ports present in the LEMIS data


sort(unique(lemis$port))

valid.port.codes <-
  c(
    "1", "2", "3", "4", "5", "6", "7", "8",
    "AG", "AL", "AN", "AT", "BA", "BL", "BN",
    "BO", "BV", "CA", "CH", "CL", "CP", "CX",
    "DE", "DF", "DG", "DL", "DN", "DR", "DS",
    "DU", "EA", "EL", "FB", "GP", "HA", "HN",
    "HO", "HS", "IF", "JK", "JU", "LA", "LK",
    "LO", "LR", "LV", "MC", "ME", "MI", "MP",
    "NF", "NG", "NO", "NW", "NY", "PA", "PB",
    "PH", "PL", "PT", "PX", "RY", "SE", "SF",
    "SJ", "SL", "SP", "SS", "SU", "SW", "SY",
    "TP", "XX"
  )

# Which values are not in the valid codes?
index.invalid.codes <- !(unique(lemis$port) %in% valid.port.codes)
sort(unique(lemis$port)[index.invalid.codes])

# Convert irregular values to good values
lemis <- lemis %>%
  mutate(
    port = case_when(
      # Change "03" to "3"
      port == "03" ~ "3",
      # Change NA values to "XX" since that represents unknown port of entry
      is.na(port) ~ "XX",
      TRUE ~ port
    )
  )

# Which values are not in the valid codes?
index.invalid.codes <- !(unique(lemis$port) %in% valid.port.codes)
sort(unique(lemis$port)[index.invalid.codes])

# Remove remaining non-standard ports
lemis <- get_cleaned_lemis("port", valid.port.codes)

# Assertion for quality checking. All levels should be a valid code
assert_that(all(levels(lemis$port) %in%
  c(valid.port.codes, "non-standard value")))

summary(lemis$port)

#==============================================================================


# Clean date columns


# Clean dates for control number 2006690535
lemis <- lemis %>%
  mutate(
    shipment_date = case_when(
      control_number == 2006690535 & is.na(shipment_date) ~ "2006-01-18",
      TRUE ~ as.character(shipment_date)
    ),
    disposition_date = case_when(
      control_number == 2006690535 & is.na(disposition_date) ~ "2006-01-27",
      TRUE ~ as.character(disposition_date)
    )
  )

# Verify that the vast majority of disposition dates occur on or after the
# shipment date
filter(lemis, !is.na(shipment_date) & !is.na(disposition_date)) %>%
  mutate(
    disposition_date = as.Date(disposition_date, format = "%Y-%m-%d"),
    shipment_date = as.Date(shipment_date, format = "%Y-%m-%d")
  ) %>%
  mutate(date_test = (disposition_date - shipment_date) >= 0) %>%
  summarize(sum(date_test)/n())

# Clean disposition dates
lemis <- lemis %>%
  mutate(
    disposition_date = as.Date(disposition_date, format = "%Y-%m-%d"),
    shipment_date = as.Date(shipment_date, format = "%Y-%m-%d"),
    disposition_date_original_value = disposition_date,
    # extract only the year component of the disposition and shipment dates
    disposition_year = format(disposition_date, "%Y"),
    shipment_year = format(shipment_date, "%Y"),
    disposition_date = case_when(
      # clean cases where 'disposition_date' is far later than 'shipment_date'
      shipment_date == "2012-02-13" & disposition_date == "2016-02-16" ~
        "2012-02-16",
      shipment_date == "2014-06-15" & disposition_date == "2017-06-17" ~
        "2014-06-17",
      shipment_date == "2014-12-17" & disposition_date == "2019-12-17" ~
        "2014-12-17",
      shipment_year == "2002" & disposition_year == "2020" ~
        str_replace(disposition_date, "2020", "2002"),
      shipment_date == "2002-03-27" & disposition_date == "2027-03-27" ~
        "2002-03-27",
      shipment_year == "2003" & disposition_year == "2030" ~
        str_replace(disposition_date, "2030", "2003"),
      shipment_date == "2002-11-27" & disposition_date == "2030-02-06" ~
        "2003-02-06",
      shipment_year == "2003" & disposition_year == "2033" ~
        str_replace(disposition_date, "2033", "2003"),
      shipment_year == "2004" & disposition_year == "2044" ~
        str_replace(disposition_date, "2044", "2004"),
      shipment_year == "2001" & disposition_year == "2201" ~
        str_replace(disposition_date, "2201", "2001"),
      shipment_year == "2002" & disposition_year == "2202" ~
        str_replace(disposition_date, "2202", "2002"),
      shipment_year == "2001" & disposition_year == "2991" ~
        str_replace(disposition_date, "2991", "2001"),
      shipment_year == "2003" & disposition_year == "3003" ~
        str_replace(disposition_date, "3003", "2003"),
      shipment_year == "2004" & disposition_year == "3004" ~
        str_replace(disposition_date, "3004", "2004"),
      shipment_date == "2003-12-28" & disposition_date == "5004-01-06" ~
        "2004-01-06",
      # clean cases where 'disposition_date' is far earlier than 'shipment_date'
      disposition_date == "1900-01-01" ~ NA_character_,
      disposition_date == "1933-07-15" ~ NA_character_,
      (shipment_year == "2003" | shipment_year == "2004") &
        disposition_year == "1996" ~ NA_character_,
      shipment_date == "2002-07-23" & disposition_date == "1954-07-26" ~
        "2002-07-26",
      shipment_date == "2002-07-24" & disposition_date == "1954-07-26" ~
        "2002-07-26",
      shipment_date == "2002-01-07" & disposition_date == "1992-01-11" ~
        "2002-01-11",
      shipment_date == "2000-02-29" & disposition_date == "1996-03-04" ~
        "2000-03-04",
      shipment_year == "2000" & disposition_year == "1998" ~
        str_replace(disposition_date, "1998", "2000"),
      # keep all others
      TRUE ~ as.character(disposition_date)
    )
  )

date.corrections.file <- read_csv("data-raw/data/disposition_date_corrections.csv") %>%
  mutate_at(c("disposition_date", "shipment_date",
              "new_disposition_date"),
            list(~ as.character(as.Date(., format = "%m/%d/%y")))) %>%
  filter(!is.na(new_disposition_date))

for(i in 1:nrow(date.corrections.file)) {

  lemis[which(lemis$disposition_date == date.corrections.file$disposition_date[i] &
                lemis$shipment_date == date.corrections.file$shipment_date[i]),
        "disposition_date"] <-
    date.corrections.file$new_disposition_date[i]
}

# Assert that all control numbers now only have one associated 'shipment_date'
lemis.grouped <- lemis %>%
  mutate_at(
    c("country_imp_exp",
      "port"),
    list(~ as.character(.))
  ) %>%
  group_by(control_number)

assert_that(max(summarize(lemis.grouped, n = n_distinct(country_imp_exp))$n) == 1)
assert_that(max(summarize(lemis.grouped, n = n_distinct(us_co))$n) == 1)
assert_that(max(summarize(lemis.grouped, n = n_distinct(foreign_co))$n) == 1)
assert_that(max(summarize(lemis.grouped, n = n_distinct(port))$n) == 1)
assert_that(max(summarize(lemis.grouped, n = n_distinct(shipment_date))$n) == 1)

#==============================================================================


# Data saving

na.characters <- c("na", "?", ".", "/", "`", "=", "-")

lemis_intermediate <- lemis %>%
  # extract only the year component of the disposition and shipment dates
  mutate(
    disposition_date = as.Date(disposition_date, format = "%Y-%m-%d"),
    shipment_date = as.Date(shipment_date, format = "%Y-%m-%d"),
    disposition_year = as.numeric(format(disposition_date, "%Y")),
    shipment_year = as.numeric(format(shipment_date, "%Y"))
  ) %>%
  # select columns to keep
  select(
    control_number,
    species_code,
    genus,
    species,
    subspecies,
    specific_name,
    generic_name,
    description,
    quantity,
    unit,
    value,
    country_origin,
    country_imp_exp,
    purpose,
    source,
    action,
    disposition,
    disposition_date,
    disposition_year,
    shipment_date,
    shipment_year,
    import_export,
    port,
    us_co,
    foreign_co,
    cleaning_notes
  ) %>%
  # change column types
  mutate_all(list(~ as.character(.))) %>%
  mutate_at(
    c("control_number",
      "quantity",
      "value",
      "disposition_year",
      "shipment_year"),
    list(~ as.integer(.))
  ) %>%
  # clean remaining values that should be NA characters
  mutate_if(
    is.character,
    list(~ if_else(. %in% na.characters, NA_character_, .))
  ) %>%
  arrange(shipment_date, control_number, species_code)

# Write a cleaned CSV file of intermediate LEMIS data
write_csv(
  lemis_intermediate,
  h("data-raw", "lemis_intermediate.csv")
)

#==============================================================================


# Directory cleanup

# Delete subdirectories containing intermediate LEMIS files, if desired
# unlink(h("data-raw", "raw_data"), recursive = TRUE)
# unlink(h("data-raw", "csv_by_year"), recursive = TRUE)
kephelps/LEMIS documentation built on June 9, 2025, 7:56 a.m.