# create_data/4.get_ONET_DB.R

library(jsonlite)
library(tidyverse)
library(parallel)
library(httr)
library(textyr)

# Download the O*NET career catalogue ----
#
# Walks the paginated "My Next Move" career listing and, for every career,
# queries the matching O*NET OnLine occupation and a fixed set of its summary
# resources. The outcome is `ONET_careers`: one row per career with the
# career metadata plus a nested `data` tibble of list-columns
# (technology_skills, skills, abilities, knowledge, tools).

# NOTE(review): these credentials are committed in plain text. Consider
# loading them from an environment variable rather than the source file.
onet_auth <- "Basic dGV4dHlfYml6Ojg5NDlyems="

# Perform an authenticated GET and return the parsed JSON body.
# With `required = TRUE` an HTTP error aborts the script; otherwise it is only
# reported as a warning and whatever body came back is still parsed, so a
# single failing occupation/resource does not stop the whole scrape.
onet_get <- function(url, required = FALSE) {
  resp <- GET(url,
              add_headers(Authorization = onet_auth,
                          Accept = "application/json"))
  if (required) {
    stop_for_status(resp)
  } else {
    warn_for_status(resp)
  }
  content(resp, "parsed", type = "application/json")
}

# Collect the name of every entry of a parsed JSON array. With `nested = TRUE`
# the name is read from `entry$title$name`, otherwise from `entry$name`.
# Entries without a name are skipped; an empty input yields NULL.
collect_names <- function(entries, nested = FALSE) {
  unlist(
    lapply(entries, function(entry) {
      if (nested) entry$title$name else entry$name
    }),
    use.names = FALSE
  )
}

# Summary resources worth an extra request; everything else is ignored.
resource_titles <- c("Technology Skills",
                     "Skills",
                     "Abilities",
                     "Knowledge",
                     "Tools Used",
                     "Tools &amp; Technology")

ONET_careers <- tibble(.rows = NULL)

next_url <- "https://services.onetcenter.org/ws/mnm/careers/"

# Placeholder that keeps the loop running until the first response reports
# the real total.
tot_careers <- 1000000

while (nrow(ONET_careers) < tot_careers) {

  # The listing itself is mandatory: without it there is nothing to iterate.
  response_careers <- onet_get(next_url, required = TRUE)

  # Find the following page. The URL is reset first so that a page without a
  # "next" link is detected instead of being fetched again.
  next_url <- NULL
  for (link in response_careers$link) {
    if (identical(link$rel, "next")) {
      next_url <- link$href
    }
  }

  careers <- response_careers$career
  page_rows <- vector("list", length(careers))

  # For each career
  for (k in seq_along(careers)) {
    career <- careers[[k]]
    career_code <- career$code

    url_occupation <- paste0(
      "https://services.onetcenter.org/ws/online/occupations/",
      career_code,
      "/"
    )
    response_occupation <- onet_get(url_occupation)

    technology_skills <- c()
    skills <- c()
    abilities <- c()
    knowledge <- c()
    tools <- c()
    # NOTE(review): tools_technology is collected but never stored in the
    # record below — confirm whether that omission is intentional.
    tools_technology <- c()

    # For each resource retrieve data
    for (resource in response_occupation$summary_resources$resource) {

      # Avoid retrieving other resources
      if (!(resource$title %in% resource_titles)) {
        next
      }

      payload <- onet_get(resource$href)

      switch(
        resource$title,
        "Technology Skills" = {
          technology_skills <- c(
            technology_skills,
            collect_names(payload$category, nested = TRUE)
          )
        },
        "Skills" = {
          skills <- c(skills, collect_names(payload$element))
        },
        "Abilities" = {
          abilities <- c(abilities, collect_names(payload$element))
        },
        "Knowledge" = {
          knowledge <- c(knowledge, collect_names(payload$element))
        },
        "Tools Used" = {
          tools <- c(tools, collect_names(payload$category, nested = TRUE))
        },
        "Tools &amp; Technology" = {
          tools_technology <- c(
            tools_technology,
            collect_names(payload$tools, nested = TRUE),
            collect_names(payload$technology, nested = TRUE)
          )
        }
      )
    }

    # Wrapping each vector in a list gives exactly one row per career; the
    # metadata columns are then used as nesting keys so the list-columns end
    # up inside a single `data` cell.
    page_rows[[k]] <- tibble(technology_skills = list(technology_skills),
                             skills = list(skills),
                             abilities = list(abilities),
                             knowledge = list(knowledge),
                             tools = list(tools)) %>%
      mutate(occupation = career$title,
             soc_code = career_code,
             href = career$href,
             bright_outlook = career$tags$bright_outlook,
             green = career$tags$green,
             apprenticeship = career$tags$apprenticeship) %>%
      group_by(occupation, soc_code, href,
               bright_outlook, green, apprenticeship) %>%
      nest() %>%
      ungroup()
  }

  # Append the whole page at once instead of once per career.
  ONET_careers <- bind_rows(ONET_careers, page_rows)

  tot_careers <- response_careers$total

  print(paste0("Queried ",
               nrow(ONET_careers),
               " of ",
               tot_careers,
               " careers..."))

  # Last page reached: stop even if fewer rows than announced were gathered,
  # otherwise the same page would be requested over and over.
  if (is.null(next_url)) {
    if (nrow(ONET_careers) < tot_careers) {
      warning("Career listing ended after ", nrow(ONET_careers),
              " of ", tot_careers, " careers.", call. = FALSE)
    }
    break
  }
}


# Keep at most the first 974 careers and persist the raw scrape.
# head() clamps to the rows that exist, so a shorter scrape is not indexed
# past its end.
# NOTE(review): 974 is hard-coded — presumably the catalogue size when the
# script was written; confirm against `tot_careers`.
ONET_careers <- head(ONET_careers, 974)

saveRDS(ONET_careers, file = "create_data/ONET_DB_raw.rds")

# Reformatting data like ESCO_DB ----
#
# Reads the raw scrape back, flattens the nested `data` column and, for every
# career, builds two tibbles: its unique skills (flagged as digital / soft)
# and its unique technologies. Both are nested per career and attached to the
# final `ONET_DB` table, which replaces the original list-columns.

ONET_DB_raw <- readRDS("create_data/ONET_DB_raw.rds") %>%
  unnest(cols = c("data")) %>%
  replace_na(list(skills = "None", abilities = "None", knowledge = "None"))

# Source columns feeding each output (previously addressed by the positions
# 7 and 11 for technologies and 8-10 for skills).
technology_cols <- c("technology_skills", "tools")
skill_cols <- c("skills", "abilities", "knowledge")
stopifnot(all(c(technology_cols, skill_cols) %in% names(ONET_DB_raw)))

n_rows <- nrow(ONET_DB_raw)
skill_rows <- vector("list", n_rows)
technology_rows <- vector("list", n_rows)

# Leave one core free, but never ask for fewer than one worker
# (detectCores() may return 1 or NA).
no_cores <- max(1L, detectCores() - 1L, na.rm = TRUE)
# Initiate cluster
cl <- makeCluster(no_cores)

# The cluster is shut down even if tagging fails part-way through.
tryCatch({
  for (i in seq_len(n_rows)) {
    # Concatenating all columns
    skill <- c()
    technology <- c()

    for (col in technology_cols) {
      technology <- c(technology, ONET_DB_raw[[i, col]])
    }
    for (col in skill_cols) {
      skill <- c(skill, ONET_DB_raw[[i, col]])
    }

    skill_tibble <- tibble(unlist(skill)) %>%
      unique()
    technology_tibble <- tibble(unlist(technology)) %>%
      unique()

    # Entries coming from the skill columns are never marked as digital.
    is_digital <- rep(FALSE, nrow(skill_tibble))

    # Applying soft skills tagger
    # Tagging English skills
    is_soft <- parLapply(cl,
                         skill_tibble[, 1],
                         text_tagger, type = "soft_skills") %>%
      tibble() %>%
      unnest(cols = c(.))
    is_soft <- purrr::map(is_soft$entity, is_empty)

    # A skill counts as soft when the tagger produced a non-empty entity.
    is_soft <- !(unlist(is_soft))

    skill_rows[[i]] <- skill_tibble %>%
      rename("skill" = 1) %>%
      mutate(is_digital = is_digital) %>%
      mutate(is_soft = is_soft) %>%
      mutate(row = i)

    technology_rows[[i]] <- technology_tibble %>%
      rename("technology" = 1) %>%
      mutate(row = i)
  }
}, finally = stopCluster(cl))

# One nested tibble per career, keyed by the row index in ONET_DB_raw.
skills_concat_per_row <- bind_rows(skill_rows) %>%
  group_by(row) %>%
  nest()

technology_concat_per_row <- bind_rows(technology_rows) %>%
  group_by(row) %>%
  nest()

ONET_DB <- ONET_DB_raw %>%
  mutate(skill = skills_concat_per_row$data,
         technology = technology_concat_per_row$data) %>%
  select(-(technology_skills:tools))

saveRDS(ONET_DB, file = "create_data/ONET_DB.rds")
# Source: ldbolanos/standards documentation, built on Aug. 7, 2020, 8:13 p.m.