create_data/5.ateco2esco_crosswalk.R

library(textyr)
library(tidyverse)
library(openxlsx)
library(googleLanguageR)

# Computing mappping between 4 digit ISCO codes and ATECO sector
ateco_df <- read.xlsx("C://Users/Luis/Desktop/Ateco 2007 VI cifra 17 dicembre 2008 1.xlsx") %>%
  select(-3) %>%
  rename(ateco_code = 1, ateco_sector_it = 2) %>%
  filter(stringr::str_length(ateco_code) == 2) %>%
  mutate(ateco_sector_it = stringr::str_to_lower(ateco_sector_it))

# Translating ATECO sectors to English
translatedText <- gl_translate(ateco_df$ateco_sector_it, target = "en")

ateco_df <- ateco_df %>%
  mutate(ateco_sector_en = translatedText$translatedText)

# Loading ISCO 4 digit codes
isco_structure <- readRDS("create_data/ISCO_structure.rds") %>%
  filter(stringr::str_length(code) == 4) %>%
  rename(isco_en = title, isco_code = code, isco_it = title_it)

# Building all possible combinations between ISCO codes and ATECO sectors
combined_df <- merge(isco_structure, ateco_df) %>%
  select(isco_en, ateco_sector_en, everything())
# Computing semantic similarity between pairs
result_en_en <- semantic_similarity(data = combined_df,
                                    method = c("BERT"),
                                    lang = "ENG",
                                    preprocessing = FALSE)

combined_df <- merge(isco_structure, ateco_df) %>%
  select(isco_en, ateco_sector_it, everything())
result_en_it <- semantic_similarity(data = combined_df,
                                    method = c("BERT"),
                                    lang = "ITA",
                                    preprocessing = FALSE)

combined_df <- merge(isco_structure, ateco_df) %>%
  select(isco_it, ateco_sector_en, everything())
result_it_en <- semantic_similarity(data = combined_df,
                                    method = c("BERT"),
                                    lang = "ITA",
                                    preprocessing = FALSE)

combined_df <- merge(isco_structure, ateco_df) %>%
  select(isco_it, ateco_sector_it, everything())
result_it_it <- semantic_similarity(data = combined_df,
                                    method = c("BERT"),
                                    lang = "ITA",
                                    preprocessing = FALSE)
# Merging both results
results <- bind_rows(result_en_en, result_en_it, result_it_en, result_it_it)

# Choosing top result for each ATECO sector
n <- 1
results <- results %>%
  group_by(isco_en) %>%
  arrange(isco_en, desc(score_BERT)) %>%
  slice_head(n=n) %>%
  select(isco_en, ateco_sector_en, score_BERT, everything()) %>%
  arrange(score_BERT) %>%
  ungroup()

results_non_militar <- results %>%
  filter(substr(isco_code,1,1) != 0)

results_militar <- results %>%
  filter(substr(isco_code,1,1) == 0)

# Exploring results with low confidence
# results %>%
#   filter(score_BERT < 0.5) %>%
#   View()

rows_to_fix <- read.csv("create_data/rows_to_fix.csv", header = FALSE) %>%
  pull(1)

matches_to_fix <- results[rows_to_fix, ]$isco_en

rows_to_copy <- read.csv("create_data/rows_to_copy.csv", header = FALSE, colClasses = c("character")) %>%
  rename(ateco_code = V1)

matches_to_copy <- left_join(rows_to_copy, ateco_df, by = c("ateco_code" = "ateco_code")) %>%
  select(ateco_sector_en) %>%
  pull(1)

matches_to_copy_it <- left_join(rows_to_copy, ateco_df, by = c("ateco_code" = "ateco_code")) %>%
  select(ateco_sector_it) %>%
  pull(1)

results_non_militar[rows_to_fix, ]$ateco_sector_en <- matches_to_copy
results_non_militar[rows_to_fix, ]$ateco_sector_it <- matches_to_copy_it
results_non_militar[rows_to_fix, ]$ateco_code <- rows_to_copy$ateco_code

results_militar <- results_militar %>%
  mutate(ateco_sector_en = NA,
         ateco_sector_it = NA,
         ateco_code = NA)

results <- bind_rows(results_non_militar, results_militar)


#################################################################

# After having the mapping, let us use that table to enrich the ESCO DB

# Loading tables for crosswalk

esco_df <- readRDS("create_data/ESCO_DB.rds") %>%
  select(occupation_en, occupation_it, occupation_uri, isco_code, data) %>%
  mutate(isco_code_4 = substr(isco_code, 1, 4))

enrichment_data <- results %>%
  select(ateco_sector_en, ateco_sector_it, ateco_code, isco_code, score_BERT) %>%
  ungroup() %>%
  rename(isco_code_tmp = "isco_code")

ateco2esco_skills <- inner_join(esco_df, enrichment_data, by = c("isco_code_4" = "isco_code_tmp")) %>%
  select(-isco_code_4)

# Formatting the output
ateco2esco_skills <- ateco2esco_skills %>%
  unnest(cols = c("data")) %>%
  # Selecting columns of interest
  select(ateco_sector_en,
         ateco_sector_it,
         ateco_code,
         skill,
         skill_it,
         is_digital,
         is_soft,
         score_BERT,
         occupation_en,
         occupation_it,
         occupation_uri,
         isco_code) %>%
  # Removing military because it has no category in ATECO
  # filter(!(substr(isco_code,1,1) == '0')) %>%
  # Pooling BERT scores to leave only maximum one
  group_by(ateco_sector_en,
           ateco_sector_it,
           ateco_code,
           skill,
           skill_it,
           is_digital,
           is_soft) %>%
  mutate(score_BERT = max(score_BERT)) %>%
  ungroup() %>%
  # Renaming columns
  rename(confidence = "score_BERT",
         ateco_sector = "ateco_sector_en") %>%
  # Reordering columns
  select(ateco_code,
         ateco_sector,
         ateco_sector_it,
         occupation_en,
         occupation_it,
         isco_code,
         occupation_uri,
         confidence,
         skill,
         skill_it,
         is_digital,
         is_soft) %>%
  # Nesting the professiones from which the skills came
  arrange(ateco_code) %>%
  nest(-ateco_code) %>%
  arrange(ateco_code)


saveRDS(object = ateco2esco_skills, file="create_data/ateco2esco_skills.rds")
ldbolanos/standards documentation built on Aug. 7, 2020, 8:13 p.m.