create_data/6.esco_onet_crosswalk.R

# This script uses ateco2esco table as a bridge between ATECO sectors and SOC codes.
# It will match ISCO codes with the equivalent SOC and then paste the corresponding ATECO sector

library(openxlsx)
library(tidyverse)
library(magrittr)

# Loading data

ateco2esco <- readRDS("create_data/ateco2esco_skills.rds") %>%
  ungroup() %>%
  unnest(cols =  c(data)) %>%
  ungroup() %>%
  select(ateco_code, ateco_sector, ateco_sector_it, isco_code, confidence) %>%
  mutate(isco_code = substr(isco_code, 1, 4)) %>%
  group_by(ateco_sector,
           ateco_sector_it,
           ateco_code,
           isco_code) %>%
  mutate(confidence = max(confidence)) %>%
  ungroup() %>%
  unique()

ONET_DB <- readRDS("create_data/ONET_DB.rds") %>%
  mutate(soc_code = substr(soc_code, 1, 7)) %>%
  ungroup()

esco_onet_crosswalk <- read.xlsx("C://Users/Luis/Desktop/isco_soc_crosswalk.xlsx",
                                 sheet = 2) %>%
  select(1, 4) %>%
  rename(soc_code = 1, isco_code = 2) %>%
  .[-(1:5), ]

# Joining tables to enrich information

# Joining ONET with the ISCO codes
temp <- inner_join(ONET_DB, esco_onet_crosswalk, by = c("soc_code"))

# Joining previous table with the ATECO codes
soc_ateco_crosswalk <- inner_join(temp, ateco2esco, by = c("isco_code")) %>%
  select(soc_code,
         isco_code,
         ateco_code,
         ateco_sector,
         ateco_sector_it,
         confidence) %>%
  # Removing duplicates because of how inner_join works here
  unique()

# Helper function
soc2ateco <- function(soc_code) {
  row <- match(soc_code, soc_ateco_crosswalk$soc_code)
  if (!is.na(row)) {
    ateco_code <- soc_ateco_crosswalk$ateco_code[[row]]
  } else {
    ateco_code <- NULL
  }
  return(ateco_code)
}

soc2ateco_confidence <- function(soc_code) {
  row <- match(soc_code, soc_ateco_crosswalk$soc_code)
  if (!is.na(row)) {
    ateco_code_confidence <- soc_ateco_crosswalk$confidence[[row]]
  } else {
    ateco_code_confidence <- NULL
  }
  return(ateco_code_confidence)
}

ateco_code2name <- function(ateco_code) {
  if (length(ateco_code) > 0) {
    row <- match(ateco_code, ateco2esco$ateco_code)
    if (!is.na(row)) {
      ateco_sector <- ateco2esco$ateco_sector[[row]]
    } else {
      ateco_sector <- "Unknown"
    }
    return(ateco_sector)
  } else {

    return("Unknown")
  }

}

ateco_code2name_it <- function(ateco_code) {
  if (length(ateco_code) > 0) {
    row <- match(ateco_code, ateco2esco$ateco_code)
    if (!is.na(row)) {
      ateco_sector <- ateco2esco$ateco_sector_it[[row]]
    } else {
      ateco_sector <- "Unknown"
    }
    return(ateco_sector)
  } else {

    return("Unknown")
  }

}


# Mapping ATECO codes from SOC codes
ateco_code <- purrr::map(ONET_DB$soc_code, soc2ateco)
ateco_code_confidence <- purrr::map(ONET_DB$soc_code, soc2ateco_confidence)
ateco_sector <- vector("character", length(ateco_code))
ateco_sector_it <- vector("character", length(ateco_code))

for (i in 1:length(ateco_code)) {
  ateco_sector[i] <- ateco_code2name(ateco_code[[i]])
  ateco_sector_it[i] <- ateco_code2name_it(ateco_code[[i]])
}


# Formatting data
ateco2onet_skills <- readRDS("create_data/ONET_DB.rds") %>%
  mutate(ateco_code = ateco_code,
         ateco_sector = ateco_sector,
         ateco_sector_it = ateco_sector,
         confidence = ateco_code_confidence) %>%
  filter(ateco_sector != "Unknown") %>%
  # Removing technologies
  select(-technology) %>%
  # Unnesting skills
  unnest(cols = c("skill")) %>%
  ungroup()
# Setting correct type to be able to sort
ateco2onet_skills$ateco_code %<>% as.character
# Sorting and grouping
ateco2onet_skills <- ateco2onet_skills %>%
  arrange(ateco_code) %>%
  nest(-ateco_code)


ateco2onet_technologies <- readRDS("create_data/ONET_DB.rds") %>%
  mutate(ateco_code = ateco_code,
         ateco_sector = ateco_sector,
         ateco_sector_it = ateco_sector,
         confidence = ateco_code_confidence) %>%
  filter(ateco_sector != "Unknown") %>%
  # Removing technologies
  select(-skill) %>%
  # Unnesting skills
  unnest(cols = c("technology")) %>%
  ungroup()

ateco2onet_technologies$ateco_code %<>% as.character

ateco2onet_technologies <- ateco2onet_technologies %>%
  arrange(ateco_code) %>%
  nest(-ateco_code)


# Saving final ONET skills table with ATECO codes
saveRDS(ateco2onet_skills, file = "create_data/ateco2onet_skills.rds")
saveRDS(ateco2onet_technologies, file = "create_data/ateco2onet_technologies.rds")
ldbolanos/standards documentation built on Aug. 7, 2020, 8:13 p.m.