library(textyr)
library(tidyverse)
library(openxlsx)
library(googleLanguageR)
# Computing mappping between 4 digit ISCO codes and ATECO sector
ateco_df <- read.xlsx("C://Users/Luis/Desktop/Ateco 2007 VI cifra 17 dicembre 2008 1.xlsx") %>%
select(-3) %>%
rename(ateco_code = 1, ateco_sector_it = 2) %>%
filter(stringr::str_length(ateco_code) == 2) %>%
mutate(ateco_sector_it = stringr::str_to_lower(ateco_sector_it))
# Translating ATECO sectors to English
translatedText <- gl_translate(ateco_df$ateco_sector_it, target = "en")
ateco_df <- ateco_df %>%
mutate(ateco_sector_en = translatedText$translatedText)
# Loading ISCO 4 digit codes
isco_structure <- readRDS("create_data/ISCO_structure.rds") %>%
filter(stringr::str_length(code) == 4) %>%
rename(isco_en = title, isco_code = code, isco_it = title_it)
# Building all possible combinations between ISCO codes and ATECO sectors
combined_df <- merge(isco_structure, ateco_df) %>%
select(isco_en, ateco_sector_en, everything())
# Computing semantic similarity between pairs
result_en_en <- semantic_similarity(data = combined_df,
method = c("BERT"),
lang = "ENG",
preprocessing = FALSE)
combined_df <- merge(isco_structure, ateco_df) %>%
select(isco_en, ateco_sector_it, everything())
result_en_it <- semantic_similarity(data = combined_df,
method = c("BERT"),
lang = "ITA",
preprocessing = FALSE)
combined_df <- merge(isco_structure, ateco_df) %>%
select(isco_it, ateco_sector_en, everything())
result_it_en <- semantic_similarity(data = combined_df,
method = c("BERT"),
lang = "ITA",
preprocessing = FALSE)
combined_df <- merge(isco_structure, ateco_df) %>%
select(isco_it, ateco_sector_it, everything())
result_it_it <- semantic_similarity(data = combined_df,
method = c("BERT"),
lang = "ITA",
preprocessing = FALSE)
# Merging both results
results <- bind_rows(result_en_en, result_en_it, result_it_en, result_it_it)
# Choosing top result for each ATECO sector
n <- 1
results <- results %>%
group_by(isco_en) %>%
arrange(isco_en, desc(score_BERT)) %>%
slice_head(n=n) %>%
select(isco_en, ateco_sector_en, score_BERT, everything()) %>%
arrange(score_BERT) %>%
ungroup()
results_non_militar <- results %>%
filter(substr(isco_code,1,1) != 0)
results_militar <- results %>%
filter(substr(isco_code,1,1) == 0)
# Exploring results with low confidence
# results %>%
# filter(score_BERT < 0.5) %>%
# View()
rows_to_fix <- read.csv("create_data/rows_to_fix.csv", header = FALSE) %>%
pull(1)
matches_to_fix <- results[rows_to_fix, ]$isco_en
rows_to_copy <- read.csv("create_data/rows_to_copy.csv", header = FALSE, colClasses = c("character")) %>%
rename(ateco_code = V1)
matches_to_copy <- left_join(rows_to_copy, ateco_df, by = c("ateco_code" = "ateco_code")) %>%
select(ateco_sector_en) %>%
pull(1)
matches_to_copy_it <- left_join(rows_to_copy, ateco_df, by = c("ateco_code" = "ateco_code")) %>%
select(ateco_sector_it) %>%
pull(1)
results_non_militar[rows_to_fix, ]$ateco_sector_en <- matches_to_copy
results_non_militar[rows_to_fix, ]$ateco_sector_it <- matches_to_copy_it
results_non_militar[rows_to_fix, ]$ateco_code <- rows_to_copy$ateco_code
results_militar <- results_militar %>%
mutate(ateco_sector_en = NA,
ateco_sector_it = NA,
ateco_code = NA)
results <- bind_rows(results_non_militar, results_militar)
#################################################################
# After having the mapping, let us use that table to enrich the ESCO DB
# Loading tables for crosswalk
esco_df <- readRDS("create_data/ESCO_DB.rds") %>%
select(occupation_en, occupation_it, occupation_uri, isco_code, data) %>%
mutate(isco_code_4 = substr(isco_code, 1, 4))
enrichment_data <- results %>%
select(ateco_sector_en, ateco_sector_it, ateco_code, isco_code, score_BERT) %>%
ungroup() %>%
rename(isco_code_tmp = "isco_code")
ateco2esco_skills <- inner_join(esco_df, enrichment_data, by = c("isco_code_4" = "isco_code_tmp")) %>%
select(-isco_code_4)
# Formatting the output
ateco2esco_skills <- ateco2esco_skills %>%
unnest(cols = c("data")) %>%
# Selecting columns of interest
select(ateco_sector_en,
ateco_sector_it,
ateco_code,
skill,
skill_it,
is_digital,
is_soft,
score_BERT,
occupation_en,
occupation_it,
occupation_uri,
isco_code) %>%
# Removing military because it has no category in ATECO
# filter(!(substr(isco_code,1,1) == '0')) %>%
# Pooling BERT scores to leave only maximum one
group_by(ateco_sector_en,
ateco_sector_it,
ateco_code,
skill,
skill_it,
is_digital,
is_soft) %>%
mutate(score_BERT = max(score_BERT)) %>%
ungroup() %>%
# Renaming columns
rename(confidence = "score_BERT",
ateco_sector = "ateco_sector_en") %>%
# Reordering columns
select(ateco_code,
ateco_sector,
ateco_sector_it,
occupation_en,
occupation_it,
isco_code,
occupation_uri,
confidence,
skill,
skill_it,
is_digital,
is_soft) %>%
# Nesting the professiones from which the skills came
arrange(ateco_code) %>%
nest(-ateco_code) %>%
arrange(ateco_code)
saveRDS(object = ateco2esco_skills, file="create_data/ateco2esco_skills.rds")
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.