# This script merges ESCO and ONET onto a single data structure
library(tidyverse)
library(googleLanguageR)
library(parallel)
library(textyr)
ateco2esco_skills <- readRDS("create_data/ateco2esco_skills.rds") %>%
unnest(cols = c("data")) %>%
# Changing to generic name for the entry
rename(item = "skill") %>%
rename(item_it = "skill_it") %>%
# Adding column to specify type of entry
mutate(type_of_item = "skill") %>%
# Adding fake columns to resemble ONET data
mutate(bright_outlook = NA,
green = NA,
apprenticeship = NA) %>%
# Generic name for the occupation code
rename(code = "isco_code") %>%
# Generic name for the URI
rename(href = "occupation_uri") %>%
# Adding meta-info
mutate(source = "ESCO") %>%
# Reordering columns
select(item,
item_it,
is_digital,
is_soft,
type_of_item,
occupation_en,
occupation_it,
green,
bright_outlook,
apprenticeship,
code,
href,
ateco_sector,
ateco_sector_it,
ateco_code,
confidence,
source)
ateco2onet_skills <- readRDS("create_data/ateco2onet_skills.rds") %>%
unnest(cols = c("data")) %>%
rename(item = "skill") %>%
mutate(type_of_item = "skill") %>%
rename(code = "soc_code") %>%
rename(occupation_en = "occupation") %>%
# Adding missing fields
mutate(occupation_it = NA) %>%
mutate(skill_it = NA) %>%
mutate(item_it = NA) %>%
# Adding meta-info
mutate(source = "ONET") %>%
# Reordering columns
select(item,
item_it,
is_digital,
is_soft,
type_of_item,
occupation_en,
occupation_it,
green,
bright_outlook,
apprenticeship,
code,
href,
ateco_sector,
ateco_sector_it,
ateco_code,
confidence,
source)
ateco2onet_technologies <- readRDS("create_data/ateco2onet_technologies.rds") %>%
unnest(cols = c("data")) %>%
rename(item = "technology") %>%
mutate(type_of_item = "technology") %>%
rename(code = "soc_code") %>%
rename(occupation_en = "occupation") %>%
mutate(occupation_it = NA) %>%
mutate(item_it = NA) %>%
# Adding missing fields
mutate(is_digital = TRUE) %>%
mutate(is_soft = FALSE) %>%
# Adding meta-info
mutate(source = "ONET") %>%
# Reordering columns
select(item,
item_it,
is_digital,
is_soft,
type_of_item,
occupation_en,
occupation_it,
green,
bright_outlook,
apprenticeship,
code,
href,
ateco_sector,
ateco_sector_it,
ateco_code,
confidence,
source)
# Filling NAs in ateco2esco item_it
to_translate <- ateco2esco_skills %>%
filter(is.na(item_it))
complete_records <- ateco2esco_skills %>%
filter(!is.na(item_it))
translated_text <- gl_translate(to_translate$item, target = "it")
to_translate <- to_translate %>%
mutate(item_it = translated_text$translatedText)
### Re tagging these items
# Applying technimetro
# Calculate the number of cores
no_cores <- detectCores() - 1
# Initiate cluster
cl <- makeCluster(no_cores)
# Tagging English skills
is_digital_skill_en <- parLapply(cl,
to_translate$item,
text_tagger) %>%
tibble() %>%
unnest(cols = c(.))
is_digital_skill_en <- purrr::map(is_digital_skill_en$entity, is_empty)
# Tagging Italian skills
is_digital_skill_it <- parLapply(cl,
to_translate$item_it,
text_tagger) %>%
tibble() %>%
unnest(cols = c(.))
is_digital_skill_it <- purrr::map(is_digital_skill_it$entity, is_empty)
is_digital_skill <- !(unlist(is_digital_skill_en) & unlist(is_digital_skill_it))
to_translate <- to_translate %>%
mutate(is_digital = is_digital_skill)
# Applying soft skills tagger
# Tagging English skills
is_soft <- parLapply(cl,
to_translate$item,
text_tagger, type = "soft_skills") %>%
tibble() %>%
unnest(cols = c(.))
is_soft <- purrr::map(is_soft$entity, is_empty)
is_soft <- !(unlist(is_soft))
to_translate <- to_translate %>%
select(-is_soft) %>%
mutate(is_soft = is_soft) %>%
select(item,
item_it,
is_digital,
is_soft,
type_of_item,
occupation_en,
occupation_it,
green,
bright_outlook,
apprenticeship,
code,
href,
ateco_sector,
ateco_sector_it,
ateco_code,
confidence,
source)
###
ateco2esco_skills_clean <- bind_rows(complete_records, to_translate)
saveRDS(object = ateco2esco_skills_clean, file = "create_data/ateco2esco_skills_clean.rds")
# Translating items from ONET
# Skills
item_it <- gl_translate(ateco2onet_skills$item, target = "it")
ateco2onet_skills <- ateco2onet_skills %>%
select(-item_it) %>%
mutate(item_it = item_it$translatedText)
occupation_it <- gl_translate(ateco2onet_skills$occupation_en, target = "it")
ateco2onet_skills <- ateco2onet_skills %>%
select(-occupation_it) %>%
mutate(occupation_it = occupation_it$translatedText) %>%
select(item,
item_it,
is_digital,
is_soft,
type_of_item,
occupation_en,
occupation_it,
green,
bright_outlook,
apprenticeship,
code,
href,
ateco_sector,
ateco_sector_it,
ateco_code,
confidence,
source)
saveRDS(object = ateco2onet_skills, file = "create_data/ateco2onet_skills_clean.rds")
# Technologies
item_it <- gl_translate(ateco2onet_technologies$item, target = "it")
ateco2onet_technologies <- ateco2onet_technologies %>%
select(-item_it) %>%
mutate(item_it = item_it$translatedText)
occupation_it <- gl_translate(ateco2onet_technologies$occupation_en, target = "it")
ateco2onet_technologies <- ateco2onet_technologies %>%
select(-occupation_it) %>%
mutate(occupation_it = occupation_it$translatedText) %>%
select(item,
item_it,
is_digital,
is_soft,
type_of_item,
occupation_en,
occupation_it,
green,
bright_outlook,
apprenticeship,
code,
href,
ateco_sector,
ateco_sector_it,
ateco_code,
confidence,
source)
saveRDS(object = ateco2onet_technologies, file = "create_data/ateco2onet_technologies_clean.rds")
# Let us calculate the three most similar skills for each entry from the other data source
# Most similar ONET to each ESCO
ateco2esco_skills_clean <- readRDS("create_data/ateco2esco_skills_clean.rds") %>%
mutate(most_similar = NA)
ateco2onet_skills_clean <- readRDS("create_data/ateco2onet_skills_clean.rds") %>%
mutate(most_similar = NA)
ateco2onet_skills_clean$confidence <- as.double(ateco2onet_skills_clean$confidence)
ateco2onet_technologies_clean <- readRDS("create_data/ateco2onet_technologies_clean.rds")
ateco2onet_technologies_clean$confidence <- as.double(ateco2onet_technologies_clean$confidence)
esco_names <- names(ateco2esco_skills_clean) %>%
gsub("(.*)", "esco_\\1", .)
onet_names <- names(ateco2onet_skills_clean) %>%
gsub("(.*)", "onet_\\1", .)
names(ateco2esco_skills_clean) <- esco_names
names(ateco2onet_skills_clean) <- onet_names
start <- Sys.time()
step <- 1000
for (j in seq(from = 1,to = nrow(ateco2esco_skills_clean), by = step)) {
print(j)
if (j != 114001) {
esco_item <- ateco2esco_skills_clean$esco_item[j:(j+step-1)]
all_combinations <- ateco2onet_skills_clean %>%
pull(onet_item) %>%
unique() %>%
as_tibble() %>%
rename(onet_item = 1) %>%
merge(., tibble(esco_item))
top_3_similar <- semantic_similarity(all_combinations,
lang = "ENG",
method = "average_word_embeddings_glove.6B.300d")
top_3_similar <- top_3_similar %>%
group_by(esco_item) %>%
arrange(desc(score_BERT)) %>%
slice_head(n=3)
for (i in j:(j+step-1)){
top3_similar_by_item <- top_3_similar %>%
filter(esco_item == ateco2esco_skills_clean$esco_item[i]) %>%
pull(onet_item) %>%
as.vector()
ateco2esco_skills_clean$esco_most_similar[i] <- list(top3_similar_by_item)
}
} else {
esco_item <- ateco2esco_skills_clean$esco_item[j:nrow(ateco2esco_skills_clean)]
all_combinations <- ateco2onet_skills_clean %>%
pull(onet_item) %>%
unique() %>%
as_tibble() %>%
rename(onet_item = 1) %>%
merge(., tibble(esco_item))
top_3_similar <- semantic_similarity(all_combinations,
lang = "ENG",
method = "average_word_embeddings_glove.6B.300d")
top_3_similar <- top_3_similar %>%
group_by(esco_item) %>%
arrange(desc(score_BERT)) %>%
slice_head(n=3)
for (i in j:nrow(ateco2esco_skills_clean)){
top3_similar_by_item <- top_3_similar %>%
filter(esco_item == ateco2esco_skills_clean$esco_item[i]) %>%
pull(onet_item) %>%
as.vector()
ateco2esco_skills_clean$esco_most_similar[i] <- list(top3_similar_by_item)
}
}
}
end <- Sys.time()
end-start
# Most similar ESCO to each ONET
start <- Sys.time()
step <- 100
for (j in seq(from = 1, to = nrow(ateco2onet_skills_clean), by = step)) {
print(j)
if (j != 11201) {
onet_item <- ateco2onet_skills_clean$onet_item[j:(j+step-1)]
all_combinations <- ateco2esco_skills_clean %>%
pull(esco_item) %>%
unique() %>%
as_tibble() %>%
rename(esco_item = 1) %>%
merge(., tibble(onet_item))
top_3_similar <- semantic_similarity(all_combinations,
lang = "ENG",
method = "average_word_embeddings_glove.6B.300d")
top_3_similar <- top_3_similar %>%
group_by(onet_item) %>%
arrange(desc(score_BERT)) %>%
slice_head(n=3)
for (i in j:(j+step-1)){
top3_similar_by_item <- top_3_similar %>%
filter(onet_item == ateco2onet_skills_clean$onet_item[i]) %>%
pull(esco_item) %>%
as.vector()
ateco2onet_skills_clean$onet_most_similar[i] <- list(top3_similar_by_item)
}
} else {
onet_item <- ateco2onet_skills_clean$onet_item[j:nrow(ateco2onet_skills_clean)]
all_combinations <- ateco2esco_skills_clean %>%
pull(esco_item) %>%
unique() %>%
as_tibble() %>%
rename(esco_item = 1) %>%
merge(., tibble(onet_item))
top_3_similar <- semantic_similarity(all_combinations,
lang = "ENG",
method = "average_word_embeddings_glove.6B.300d")
top_3_similar <- top_3_similar %>%
group_by(onet_item) %>%
arrange(desc(score_BERT)) %>%
slice_head(n=3)
for (i in j:nrow(ateco2onet_skills_clean)){
top3_similar_by_item <- top_3_similar %>%
filter(onet_item == ateco2onet_skills_clean$onet_item[i]) %>%
pull(esco_item) %>%
as.vector()
ateco2onet_skills_clean$onet_most_similar[i] <- list(top3_similar_by_item)
}
}
}
end <- Sys.time()
end-start
esco_names <- names(ateco2esco_skills_clean) %>%
gsub("esco_", "", .)
names(ateco2esco_skills_clean) <- esco_names
saveRDS(object = ateco2esco_skills_clean, file = "create_data/ateco2esco_skills_clean_with_similar.rds")
onet_names <- names(ateco2onet_skills_clean) %>%
gsub("onet_", "", .)
names(ateco2onet_skills_clean) <- onet_names
saveRDS(object = ateco2onet_skills_clean, file = "create_data/ateco2onet_skills_clean_with_similar.rds")
# Merging everything and saving
ateco2esco_skills_clean_with_similar <- readRDS("create_data/ateco2esco_skills_clean_with_similar.rds")
ateco2onet_skills_clean_with_similar <- readRDS("create_data/ateco2onet_skills_clean_with_similar.rds")
ateco2onet_skills_clean_with_similar$confidence <- as.double(ateco2onet_skills_clean_with_similar$confidence)
ateco2onet_technologies_clean_with_similar <- readRDS("create_data/ateco2onet_technologies_clean.rds") %>%
mutate(most_similar = NA)
ateco2onet_technologies_clean_with_similar$confidence <- as.double(ateco2onet_technologies_clean_with_similar$confidence)
standards_DB <- bind_rows(ateco2esco_skills_clean_with_similar,
ateco2onet_skills_clean_with_similar,
ateco2onet_technologies_clean_with_similar)
saveRDS(object = standards_DB, file = "create_data/standards_DB.rds")
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.