create_data/7.merge_databases.R

# This script merges ESCO and ONET onto a single data structure

library(tidyverse)
library(googleLanguageR)
library(parallel)
library(textyr)

ateco2esco_skills <- readRDS("create_data/ateco2esco_skills.rds") %>%
  unnest(cols = c("data")) %>%
  # Changing to generic name for the entry
  rename(item = "skill") %>%
  rename(item_it = "skill_it") %>%
  # Adding column to specify type of entry
  mutate(type_of_item = "skill") %>%
  # Adding fake columns to resemble ONET data
  mutate(bright_outlook = NA,
         green = NA,
         apprenticeship = NA) %>%
  # Generic name for the occupation code
  rename(code = "isco_code") %>%
  # Generic name for the URI
  rename(href = "occupation_uri") %>%
  # Adding meta-info
  mutate(source = "ESCO") %>%
  # Reordering columns
  select(item,
         item_it,
         is_digital,
         is_soft,
         type_of_item,
         occupation_en,
         occupation_it,
         green,
         bright_outlook,
         apprenticeship,
         code,
         href,
         ateco_sector,
         ateco_sector_it,
         ateco_code,
         confidence,
         source)

ateco2onet_skills <- readRDS("create_data/ateco2onet_skills.rds") %>%
  unnest(cols = c("data")) %>%
  rename(item = "skill")  %>%
  mutate(type_of_item = "skill") %>%
  rename(code = "soc_code") %>%
  rename(occupation_en = "occupation") %>%
  # Adding missing fields
  mutate(occupation_it = NA) %>%
  mutate(skill_it = NA) %>%
  mutate(item_it = NA) %>%
  # Adding meta-info
  mutate(source = "ONET") %>%
  # Reordering columns
  select(item,
         item_it,
         is_digital,
         is_soft,
         type_of_item,
         occupation_en,
         occupation_it,
         green,
         bright_outlook,
         apprenticeship,
         code,
         href,
         ateco_sector,
         ateco_sector_it,
         ateco_code,
         confidence,
         source)

ateco2onet_technologies <- readRDS("create_data/ateco2onet_technologies.rds") %>%
  unnest(cols = c("data")) %>%
  rename(item = "technology")  %>%
  mutate(type_of_item = "technology") %>%
  rename(code = "soc_code") %>%
  rename(occupation_en = "occupation") %>%
  mutate(occupation_it = NA) %>%
  mutate(item_it = NA) %>%
  # Adding missing fields
  mutate(is_digital = TRUE) %>%
  mutate(is_soft = FALSE) %>%
  # Adding meta-info
  mutate(source = "ONET") %>%
  # Reordering columns
  select(item,
         item_it,
         is_digital,
         is_soft,
         type_of_item,
         occupation_en,
         occupation_it,
         green,
         bright_outlook,
         apprenticeship,
         code,
         href,
         ateco_sector,
         ateco_sector_it,
         ateco_code,
         confidence,
         source)



# Filling NAs in ateco2esco item_it

to_translate <- ateco2esco_skills %>%
  filter(is.na(item_it))

complete_records <- ateco2esco_skills %>%
  filter(!is.na(item_it))

translated_text <- gl_translate(to_translate$item, target = "it")

to_translate <- to_translate %>%
  mutate(item_it = translated_text$translatedText)


### Re tagging these items

# Applying technimetro
# Calculate the number of cores
no_cores <- detectCores() - 1
# Initiate cluster
cl <- makeCluster(no_cores)
# Tagging English skills
is_digital_skill_en <- parLapply(cl,
                                 to_translate$item,
                                 text_tagger) %>%
  tibble() %>%
  unnest(cols = c(.))
is_digital_skill_en <- purrr::map(is_digital_skill_en$entity, is_empty)
# Tagging Italian skills
is_digital_skill_it <- parLapply(cl,
                                 to_translate$item_it,
                                 text_tagger) %>%
  tibble() %>%
  unnest(cols = c(.))
is_digital_skill_it <- purrr::map(is_digital_skill_it$entity, is_empty)

is_digital_skill <- !(unlist(is_digital_skill_en) & unlist(is_digital_skill_it))

to_translate <- to_translate %>%
  mutate(is_digital = is_digital_skill)

# Applying soft skills tagger
# Tagging English skills
is_soft <- parLapply(cl,
                     to_translate$item,
                     text_tagger, type = "soft_skills") %>%
  tibble() %>%
  unnest(cols = c(.))
is_soft <- purrr::map(is_soft$entity, is_empty)

is_soft <- !(unlist(is_soft))

to_translate <- to_translate %>%
  select(-is_soft) %>%
  mutate(is_soft = is_soft) %>%
  select(item,
         item_it,
         is_digital,
         is_soft,
         type_of_item,
         occupation_en,
         occupation_it,
         green,
         bright_outlook,
         apprenticeship,
         code,
         href,
         ateco_sector,
         ateco_sector_it,
         ateco_code,
         confidence,
         source)

###


ateco2esco_skills_clean <- bind_rows(complete_records, to_translate)

saveRDS(object = ateco2esco_skills_clean, file = "create_data/ateco2esco_skills_clean.rds")

# Translating items from ONET

# Skills

item_it <- gl_translate(ateco2onet_skills$item, target = "it")

ateco2onet_skills <- ateco2onet_skills %>%
  select(-item_it) %>%
  mutate(item_it = item_it$translatedText)

occupation_it <- gl_translate(ateco2onet_skills$occupation_en, target = "it")

ateco2onet_skills <- ateco2onet_skills %>%
  select(-occupation_it) %>%
  mutate(occupation_it = occupation_it$translatedText) %>%
  select(item,
         item_it,
         is_digital,
         is_soft,
         type_of_item,
         occupation_en,
         occupation_it,
         green,
         bright_outlook,
         apprenticeship,
         code,
         href,
         ateco_sector,
         ateco_sector_it,
         ateco_code,
         confidence,
         source)

saveRDS(object = ateco2onet_skills, file = "create_data/ateco2onet_skills_clean.rds")

# Technologies

item_it <- gl_translate(ateco2onet_technologies$item, target = "it")

ateco2onet_technologies <- ateco2onet_technologies %>%
  select(-item_it) %>%
  mutate(item_it = item_it$translatedText)

occupation_it <- gl_translate(ateco2onet_technologies$occupation_en, target = "it")

ateco2onet_technologies <- ateco2onet_technologies %>%
  select(-occupation_it) %>%
  mutate(occupation_it = occupation_it$translatedText) %>%
  select(item,
         item_it,
         is_digital,
         is_soft,
         type_of_item,
         occupation_en,
         occupation_it,
         green,
         bright_outlook,
         apprenticeship,
         code,
         href,
         ateco_sector,
         ateco_sector_it,
         ateco_code,
         confidence,
         source)

saveRDS(object = ateco2onet_technologies, file = "create_data/ateco2onet_technologies_clean.rds")


# Let us calculate the three most similar skills for each entry from the other data source

# Most similar ONET to each ESCO

ateco2esco_skills_clean <- readRDS("create_data/ateco2esco_skills_clean.rds") %>%
  mutate(most_similar = NA)

ateco2onet_skills_clean <- readRDS("create_data/ateco2onet_skills_clean.rds") %>%
  mutate(most_similar = NA)
ateco2onet_skills_clean$confidence <- as.double(ateco2onet_skills_clean$confidence)

ateco2onet_technologies_clean <- readRDS("create_data/ateco2onet_technologies_clean.rds")
ateco2onet_technologies_clean$confidence <- as.double(ateco2onet_technologies_clean$confidence)

esco_names <- names(ateco2esco_skills_clean) %>%
  gsub("(.*)", "esco_\\1", .)

onet_names <- names(ateco2onet_skills_clean) %>%
  gsub("(.*)", "onet_\\1", .)

names(ateco2esco_skills_clean) <- esco_names
names(ateco2onet_skills_clean) <- onet_names

start <-  Sys.time()
step <-  1000
for (j in seq(from = 1,to =  nrow(ateco2esco_skills_clean), by = step)) {

  print(j)

  if (j != 114001) {

    esco_item <- ateco2esco_skills_clean$esco_item[j:(j+step-1)]

    all_combinations <- ateco2onet_skills_clean %>%
      pull(onet_item) %>%
      unique() %>%
      as_tibble() %>%
      rename(onet_item = 1) %>%
      merge(., tibble(esco_item))

    top_3_similar <- semantic_similarity(all_combinations,
                                         lang = "ENG",
                                         method = "average_word_embeddings_glove.6B.300d")
    top_3_similar <- top_3_similar %>%
      group_by(esco_item) %>%
      arrange(desc(score_BERT)) %>%
      slice_head(n=3)

    for (i in j:(j+step-1)){

      top3_similar_by_item <- top_3_similar %>%
        filter(esco_item == ateco2esco_skills_clean$esco_item[i]) %>%
        pull(onet_item) %>%
        as.vector()

      ateco2esco_skills_clean$esco_most_similar[i] <- list(top3_similar_by_item)
    }
  } else {

    esco_item <- ateco2esco_skills_clean$esco_item[j:nrow(ateco2esco_skills_clean)]

    all_combinations <- ateco2onet_skills_clean %>%
      pull(onet_item) %>%
      unique() %>%
      as_tibble() %>%
      rename(onet_item = 1) %>%
      merge(., tibble(esco_item))

    top_3_similar <- semantic_similarity(all_combinations,
                                         lang = "ENG",
                                         method = "average_word_embeddings_glove.6B.300d")
    top_3_similar <- top_3_similar %>%
      group_by(esco_item) %>%
      arrange(desc(score_BERT)) %>%
      slice_head(n=3)

    for (i in j:nrow(ateco2esco_skills_clean)){

      top3_similar_by_item <- top_3_similar %>%
        filter(esco_item == ateco2esco_skills_clean$esco_item[i]) %>%
        pull(onet_item) %>%
        as.vector()

      ateco2esco_skills_clean$esco_most_similar[i] <- list(top3_similar_by_item)
    }
  }

}


end <-  Sys.time()
end-start


# Most similar ESCO to each ONET

start <-  Sys.time()
step <-  100
for (j in seq(from = 1, to = nrow(ateco2onet_skills_clean), by = step)) {

  print(j)

  if (j != 11201) {

    onet_item <- ateco2onet_skills_clean$onet_item[j:(j+step-1)]

    all_combinations <- ateco2esco_skills_clean %>%
      pull(esco_item) %>%
      unique() %>%
      as_tibble() %>%
      rename(esco_item = 1) %>%
      merge(., tibble(onet_item))

    top_3_similar <- semantic_similarity(all_combinations,
                                         lang = "ENG",
                                         method = "average_word_embeddings_glove.6B.300d")
    top_3_similar <- top_3_similar %>%
      group_by(onet_item) %>%
      arrange(desc(score_BERT)) %>%
      slice_head(n=3)

    for (i in j:(j+step-1)){

      top3_similar_by_item <- top_3_similar %>%
        filter(onet_item == ateco2onet_skills_clean$onet_item[i]) %>%
        pull(esco_item) %>%
        as.vector()

      ateco2onet_skills_clean$onet_most_similar[i] <- list(top3_similar_by_item)
    }
  } else {

    onet_item <- ateco2onet_skills_clean$onet_item[j:nrow(ateco2onet_skills_clean)]

    all_combinations <- ateco2esco_skills_clean %>%
      pull(esco_item) %>%
      unique() %>%
      as_tibble() %>%
      rename(esco_item = 1) %>%
      merge(., tibble(onet_item))

    top_3_similar <- semantic_similarity(all_combinations,
                                         lang = "ENG",
                                         method = "average_word_embeddings_glove.6B.300d")
    top_3_similar <- top_3_similar %>%
      group_by(onet_item) %>%
      arrange(desc(score_BERT)) %>%
      slice_head(n=3)

    for (i in j:nrow(ateco2onet_skills_clean)){

      top3_similar_by_item <- top_3_similar %>%
        filter(onet_item == ateco2onet_skills_clean$onet_item[i]) %>%
        pull(esco_item) %>%
        as.vector()

      ateco2onet_skills_clean$onet_most_similar[i] <- list(top3_similar_by_item)
    }
  }

}


end <-  Sys.time()
end-start

esco_names <- names(ateco2esco_skills_clean) %>%
  gsub("esco_", "", .)
names(ateco2esco_skills_clean) <- esco_names
saveRDS(object = ateco2esco_skills_clean, file = "create_data/ateco2esco_skills_clean_with_similar.rds")

onet_names <- names(ateco2onet_skills_clean) %>%
  gsub("onet_", "", .)
names(ateco2onet_skills_clean) <- onet_names
saveRDS(object = ateco2onet_skills_clean, file = "create_data/ateco2onet_skills_clean_with_similar.rds")


# Merging everything and saving

ateco2esco_skills_clean_with_similar <- readRDS("create_data/ateco2esco_skills_clean_with_similar.rds")
ateco2onet_skills_clean_with_similar <- readRDS("create_data/ateco2onet_skills_clean_with_similar.rds")
ateco2onet_skills_clean_with_similar$confidence <- as.double(ateco2onet_skills_clean_with_similar$confidence)

ateco2onet_technologies_clean_with_similar <- readRDS("create_data/ateco2onet_technologies_clean.rds") %>%
  mutate(most_similar = NA)
ateco2onet_technologies_clean_with_similar$confidence <- as.double(ateco2onet_technologies_clean_with_similar$confidence)


standards_DB <- bind_rows(ateco2esco_skills_clean_with_similar,
                          ateco2onet_skills_clean_with_similar,
                          ateco2onet_technologies_clean_with_similar)


saveRDS(object = standards_DB, file = "create_data/standards_DB.rds")
ldbolanos/standards documentation built on Aug. 7, 2020, 8:13 p.m.