# data-raw/clean-sitc-to-naics.R

################################################################################
## setup
################################################################################
# The global environment is intentionally NOT wiped with rm(list = ls()):
# every object this script needs is load()ed explicitly below, and clearing
# the workspace would destroy unrelated objects in the caller's session.
# record when the script was run
date()

# load packages
library(tidyverse)


################################################################################
# SITC4 to NAICS (combined) via HS (combined)
################################################################################
# load cleaned data
# The code below expects these files to restore `hs_sitc4` (with columns
# SITC4_5d and HS_6d) and `hs_naics` (with columns HS_6d and NAICS_6d).
load("./data/hs_sitc4.RData")
load("./data/hs_naics.RData")

# subset: keep only the code columns needed to link SITC4 to NAICS via HS
hs.sitc4.sub <- hs_sitc4 %>%
  select(SITC4_5d, HS_6d)

# NOTE: hs.naics.sub is reused by the SITC3, SITC2 and SITC1 sections below
hs.naics.sub <- hs_naics %>%
  select(HS_6d, NAICS_6d)

# merge: a full join keeps HS codes present on only one side, leaving the
# other classification's code as NA for those rows
sitc4.naics.m <- full_join(hs.sitc4.sub, hs.naics.sub,
                           by = "HS_6d")

# clean
sitc4_naics <- sitc4.naics.m %>%
  # HS was only the bridge: drop it and collapse repeated SITC4-NAICS pairs
  select(-HS_6d) %>%
  distinct() %>%
  # derive the shorter codes by truncating the most detailed ones
  mutate(SITC4_4d = str_sub(SITC4_5d, start = 1, end = 4),
         SITC4_3d = str_sub(SITC4_5d, start = 1, end = 3),
         SITC4_2d = str_sub(SITC4_5d, start = 1, end = 2),
         SITC4_1d = str_sub(SITC4_5d, start = 1, end = 1),
         NAICS_5d = str_sub(NAICS_6d, start = 1, end = 5),
         NAICS_4d = str_sub(NAICS_6d, start = 1, end = 4),
         NAICS_3d = str_sub(NAICS_6d, start = 1, end = 3),
         NAICS_2d = str_sub(NAICS_6d, start = 1, end = 2)) %>%
  select(SITC4_5d, SITC4_4d, SITC4_3d, SITC4_2d, SITC4_1d,
         NAICS_6d, NAICS_5d, NAICS_4d, NAICS_3d, NAICS_2d) %>%
  arrange(SITC4_5d) %>%
  # drop rows in which every column is NA; all other columns are substrings
  # of SITC4_5d or NAICS_6d, so testing these two covers the whole row
  filter(!is.na(SITC4_5d) | !is.na(NAICS_6d)) %>%
  # relabel 2-digit codes that belong to a combined range; any other value
  # (including NA) passes through unchanged
  mutate(NAICS_2d = case_when(
    NAICS_2d %in% c("31", "32", "33") ~ "31-33",
    NAICS_2d %in% c("44", "45") ~ "44-45",
    NAICS_2d %in% c("48", "49") ~ "48-49",
    TRUE ~ NAICS_2d
  ))

# save
save(sitc4_naics,
     file = "./data/sitc4_naics.RData", compress = "xz")


################################################################################
# SITC3 to NAICS (combined) via HS (combined)
################################################################################
# load cleaned data
# The code below expects this file to restore `hs_sitc3` (with columns
# SITC3_5d and HS_6d).
load("./data/hs_sitc3.RData")

# subset: keep only the code columns needed to link SITC3 to NAICS via HS
hs.sitc3.sub <- hs_sitc3 %>%
  select(SITC3_5d, HS_6d)

# merge: hs.naics.sub (HS_6d, NAICS_6d) was built in the SITC4 section above;
# a full join keeps HS codes present on only one side, with NA on the other
sitc3.naics.m <- full_join(hs.sitc3.sub, hs.naics.sub,
                           by = "HS_6d")

# clean
sitc3_naics <- sitc3.naics.m %>%
  # HS was only the bridge: drop it and collapse repeated SITC3-NAICS pairs
  select(-HS_6d) %>%
  distinct() %>%
  # derive the shorter codes by truncating the most detailed ones
  mutate(SITC3_4d = str_sub(SITC3_5d, start = 1, end = 4),
         SITC3_3d = str_sub(SITC3_5d, start = 1, end = 3),
         SITC3_2d = str_sub(SITC3_5d, start = 1, end = 2),
         SITC3_1d = str_sub(SITC3_5d, start = 1, end = 1),
         NAICS_5d = str_sub(NAICS_6d, start = 1, end = 5),
         NAICS_4d = str_sub(NAICS_6d, start = 1, end = 4),
         NAICS_3d = str_sub(NAICS_6d, start = 1, end = 3),
         NAICS_2d = str_sub(NAICS_6d, start = 1, end = 2)) %>%
  select(SITC3_5d, SITC3_4d, SITC3_3d, SITC3_2d, SITC3_1d,
         NAICS_6d, NAICS_5d, NAICS_4d, NAICS_3d, NAICS_2d) %>%
  arrange(SITC3_5d) %>%
  # drop rows in which every column is NA; all other columns are substrings
  # of SITC3_5d or NAICS_6d, so testing these two covers the whole row
  filter(!is.na(SITC3_5d) | !is.na(NAICS_6d)) %>%
  # relabel 2-digit codes that belong to a combined range; any other value
  # (including NA) passes through unchanged
  mutate(NAICS_2d = case_when(
    NAICS_2d %in% c("31", "32", "33") ~ "31-33",
    NAICS_2d %in% c("44", "45") ~ "44-45",
    NAICS_2d %in% c("48", "49") ~ "48-49",
    TRUE ~ NAICS_2d
  ))

# save
save(sitc3_naics,
     file = "./data/sitc3_naics.RData", compress = "xz")


################################################################################
# SITC2 to NAICS (combined) via HS (combined)
################################################################################
# load cleaned data
# The code below expects this file to restore `hs_sitc2` (with columns
# SITC2_5d and HS_6d).
load("./data/hs_sitc2.RData")

# subset: keep only the code columns needed to link SITC2 to NAICS via HS
hs.sitc2.sub <- hs_sitc2 %>%
  select(SITC2_5d, HS_6d)

# merge: hs.naics.sub (HS_6d, NAICS_6d) was built in the SITC4 section above;
# a full join keeps HS codes present on only one side, with NA on the other
sitc2.naics.m <- full_join(hs.sitc2.sub, hs.naics.sub,
                           by = "HS_6d")

# clean
sitc2_naics <- sitc2.naics.m %>%
  # HS was only the bridge: drop it and collapse repeated SITC2-NAICS pairs
  select(-HS_6d) %>%
  distinct() %>%
  # derive the shorter codes by truncating the most detailed ones
  mutate(SITC2_4d = str_sub(SITC2_5d, start = 1, end = 4),
         SITC2_3d = str_sub(SITC2_5d, start = 1, end = 3),
         SITC2_2d = str_sub(SITC2_5d, start = 1, end = 2),
         SITC2_1d = str_sub(SITC2_5d, start = 1, end = 1),
         NAICS_5d = str_sub(NAICS_6d, start = 1, end = 5),
         NAICS_4d = str_sub(NAICS_6d, start = 1, end = 4),
         NAICS_3d = str_sub(NAICS_6d, start = 1, end = 3),
         NAICS_2d = str_sub(NAICS_6d, start = 1, end = 2)) %>%
  select(SITC2_5d, SITC2_4d, SITC2_3d, SITC2_2d, SITC2_1d,
         NAICS_6d, NAICS_5d, NAICS_4d, NAICS_3d, NAICS_2d) %>%
  arrange(SITC2_5d) %>%
  # drop rows in which every column is NA; all other columns are substrings
  # of SITC2_5d or NAICS_6d, so testing these two covers the whole row
  filter(!is.na(SITC2_5d) | !is.na(NAICS_6d)) %>%
  # relabel 2-digit codes that belong to a combined range; any other value
  # (including NA) passes through unchanged
  mutate(NAICS_2d = case_when(
    NAICS_2d %in% c("31", "32", "33") ~ "31-33",
    NAICS_2d %in% c("44", "45") ~ "44-45",
    NAICS_2d %in% c("48", "49") ~ "48-49",
    TRUE ~ NAICS_2d
  ))

# save
save(sitc2_naics,
     file = "./data/sitc2_naics.RData", compress = "xz")


################################################################################
# SITC1 to NAICS (combined) via HS (combined)
################################################################################
# load cleaned data
# The code below expects this file to restore `hs_sitc1` (with columns
# SITC1_5d and HS_6d).
load("./data/hs_sitc1.RData")

# subset: keep only the code columns needed to link SITC1 to NAICS via HS
hs.sitc1.sub <- hs_sitc1 %>%
  select(SITC1_5d, HS_6d)

# merge: hs.naics.sub (HS_6d, NAICS_6d) was built in the SITC4 section above;
# a full join keeps HS codes present on only one side, with NA on the other
sitc1.naics.m <- full_join(hs.sitc1.sub, hs.naics.sub,
                           by = "HS_6d")

# clean
sitc1_naics <- sitc1.naics.m %>%
  # HS was only the bridge: drop it and collapse repeated SITC1-NAICS pairs
  select(-HS_6d) %>%
  distinct() %>%
  # derive the shorter codes by truncating the most detailed ones
  mutate(SITC1_4d = str_sub(SITC1_5d, start = 1, end = 4),
         SITC1_3d = str_sub(SITC1_5d, start = 1, end = 3),
         SITC1_2d = str_sub(SITC1_5d, start = 1, end = 2),
         SITC1_1d = str_sub(SITC1_5d, start = 1, end = 1),
         NAICS_5d = str_sub(NAICS_6d, start = 1, end = 5),
         NAICS_4d = str_sub(NAICS_6d, start = 1, end = 4),
         NAICS_3d = str_sub(NAICS_6d, start = 1, end = 3),
         NAICS_2d = str_sub(NAICS_6d, start = 1, end = 2)) %>%
  select(SITC1_5d, SITC1_4d, SITC1_3d, SITC1_2d, SITC1_1d,
         NAICS_6d, NAICS_5d, NAICS_4d, NAICS_3d, NAICS_2d) %>%
  arrange(SITC1_5d) %>%
  # drop rows in which every column is NA; all other columns are substrings
  # of SITC1_5d or NAICS_6d, so testing these two covers the whole row
  filter(!is.na(SITC1_5d) | !is.na(NAICS_6d)) %>%
  # relabel 2-digit codes that belong to a combined range; any other value
  # (including NA) passes through unchanged
  mutate(NAICS_2d = case_when(
    NAICS_2d %in% c("31", "32", "33") ~ "31-33",
    NAICS_2d %in% c("44", "45") ~ "44-45",
    NAICS_2d %in% c("48", "49") ~ "48-49",
    TRUE ~ NAICS_2d
  ))

# save
save(sitc1_naics,
     file = "./data/sitc1_naics.RData", compress = "xz")
# insongkim/concordance documentation built on Jan. 25, 2023, 4:55 p.m.