R/occupations_classify.R

Defines functions classify_occupation

Documented in classify_occupation

#' Classify occupations
#'
#' @description
#' This function takes advantage of the hierarchical structure of the ESCO-ISCO mapping and matches multilingual free-text with the
#' \href{https://ec.europa.eu/esco/portal/home}{ESCO} occupations vocabulary in order to map semi-structured vacancy data into the official
#' ESCO-ISCO classification.
#'
#' @param corpus A data.frame or a data.table that contains the id and the text variables.
#' @param id_col The name of the id variable.
#' @param text_col The name of the text variable.
#' @param lang The language that the text is in.
#' @param num_leaves The number of occupations/neighbors that are kept when matching.
#' @param isco_level The \href{https://ec.europa.eu/esco/portal/escopedia/Occupation}{ISCO} level of the suggested occupations.
#' Can be either 1, 2, 3, 4 for ISCO occupations, or NULL that returns ESCO occupations.
#' @param max_dist String distance used for fuzzy matching. The \code{\link[stringdist]{amatch}} function from the stringdist package is used.
#' @param string_dist String dissimilarity measurement. Available string distance metrics: \code{\link[stringdist]{stringdist-metrics}}.
#'
#' @details
#' First, the input text is cleansed and tokenized. The tokens are then matched with the ESCO occupations vocabulary, created from
#' the preferred and alternative labels of the occupations. They are joined with the \code{\link[=tf_idf]{tfidf}}
#' weighted tokens of the ESCO occupations and the sum of the tf-idf score is used to retrieve the suggested ontologies. Technically speaking, the
#' suggested ESCO occupations are retrieved by solving the optimization problem, \deqn{\arg\max_d\left\{\vec{u}_{binary}\cdot \vec{u}_d\right\}}
#' where, \eqn{\vec{u}_{binary}} stands for the binary representation of a query to the ESCO-vocabulary space,
#' while, \eqn{\vec{u}_d} is the ESCO occupation normalized vector generated by the tf-idf numerical statistic.
#' If an ISCO level is specified, the k-nearest neighbors algorithm is used to determine the suggested occupation, classified by a plurality vote in the corresponding hierarchical level of its neighbors.
#'
#' Before the suggestions are returned, the preferred label of each suggested occupation is added to the result, using the
#' \code{\link{occupations_bundle}} and \code{\link{isco_occupations_bundle}} as look-up tables.
#'
#' @return Either a data.table with the id, the preferred label and the suggested ESCO occupation URIs (num_leaves predictions for each id),
#' or a data.table with the id, the preferred label and the suggested ISCO group of the inputted level (one for each id).
#'
#' @export
#'
#' @references
#' M.P.J. van der Loo (2014). \href{https://journal.r-project.org/archive/2014-1/loo.pdf}{The stringdist package for approximate string matching}. R Journal 6(1) pp 111-122.
#'
#' Gweon, H., Schonlau, M., Kaczmirek, L., Blohm, M., & Steiner, S. (2017). \href{https://doi.org/10.1515/jos-2017-0006}{Three Methods for Occupation Coding Based on Statistical Learning, Journal of Official Statistics}, 33(1), 101-122.
#'
#' Arthur Turrell, Bradley J. Speigner, Jyldyz Djumalieva, David Copple, James Thurgood (2019).
#' \href{https://www.nber.org/papers/w25837}{Transforming Naturally Occurring Text Data Into Economic Statistics:
#' The Case of Online Job Vacancy Postings}.
#'
#' ESCO Service Platform - \href{https://ec.europa.eu/esco/portal/document/en/87a9f66a-1830-4c93-94f0-5daa5e00507e}{
#' The ESCO Data Model documentation}
#'
#' @import data.table
#' @import magrittr
#' @importFrom stringdist amatch
#' @importFrom utils head
#'
#' @examples
#' corpus <- data.frame(
#'  id = 1:3,
#'  text = c(
#'    "Junior Architect Engineer",
#'    "Cashier at McDonald's",
#'    "Priest at St. Martin Catholic Church"
#'  )
#' )
#' classify_occupation(corpus = corpus, isco_level = 3, lang = "en", num_leaves = 5)
#'
classify_occupation <- function(corpus, id_col = "id", text_col = "text", lang = "en",
                                num_leaves = 10, isco_level = 3, max_dist = 0.1, string_dist = NULL) {

  # due to NSE notes in R CMD check
  NULL -> language -> term -> text -> tfIdf -> id -> iscoGroup -> weight_sum -> isco_nn -> conceptUri -> preferredLabel
  occupations_bundle <- occupations_bundle
  isco_occupations_bundle <- isco_occupations_bundle

  if(!any("data.frame" %in% class(corpus)))
    stop("Corpus must be either a data.frame or a data.table.")

  if(!all(c(id_col, text_col) %in% names(corpus)))
    stop(paste0("Corpus must contain the specified variables: ", id_col, " and ", text_col, "."))

  # Prepare corpus.
  corpus <- data.table(corpus)
  setnames(corpus, c(id_col, text_col), c("id", "text"))

  # Prepare the weighted tokens and the vocabulary.
  weightTokens <- tfidf_tokens[language == lang]
  if(nrow(weightTokens) == 0)
    stop(paste0(lang, " is not an acceptable language."))
  vocabulary <- unique(tfidf_tokens[language == lang][, list(term)])[order(term)]

  # Cleanse, tokenize free-text and remove stopwords.
  corpus[, text := cleansing_corpus(as.character(text))]
  freeTextTokensList <- lapply(strsplit(corpus$text, split = " "), function(x) x[!x %in% get_stopwords(lang)])
  names(freeTextTokensList) <- corpus$id

  freeTextTokensDT <- lapply(freeTextTokensList, data.table) %>%
    rbindlist(idcol = TRUE) %>%
    setnames(c("id", "term"))

  # Match free-text with the vocabulary.
  vocaIndexes <- match(freeTextTokensDT$term, vocabulary$term)
  if(!is.null(string_dist))
    vocaIndexes[is.na(vocaIndexes)] <- amatch(freeTextTokensDT$term[is.na(vocaIndexes)], vocabulary$term, maxDist = max_dist, method = string_dist)

  matches <- data.table(id = freeTextTokensDT$id, term = vocabulary[vocaIndexes]$term)[!is.na(term)]

  # Join the free-text matches with the tfidf weighted tokens and keep the top num_leaves using a weighted sum model.
  predictions <- merge(
    matches,
    weightTokens,
    allow.cartesian = TRUE
  )[, list(weight_sum = sum(tfIdf)), by = c("id", "class")][order(id, -weight_sum)][, head(.SD, num_leaves), by = "id"]

  if(!is.null(isco_level) && !isco_level %in% 1:4)
    stop("The ISCO level parameter must be 1, 2, 3, 4 or NULL.")

  setnames(predictions, "class", "conceptUri")

  # The K-Nearest Neighbors algorithm and the suggested occupations are used to determine the most popular occupation of the requested ISCO level.
  if(!is.null(isco_level)) {
    predictions <- merge(predictions, occupations_bundle[, list(conceptUri, iscoGroup)], on = "conceptUri")
    predictions[, iscoGroup := substr(iscoGroup, 0, isco_level)]
    predictions <- predictions[, list(isco_nn = .N), by = c("id", "iscoGroup")
        ][order(id, -isco_nn)
          ][, head(.SD, 1), by = "id"]
    # Add new variable, the preferred label of the ISCO occupation.
    predictions <- merge(
      predictions,
      isco_occupations_bundle,
      on = "iscoGroup"
    )[order(id, -isco_nn)][, list(id, iscoGroup, preferredLabel)]
  } else {
    # Add new variable, the preferred label of the ESCO occupations.
    predictions <- merge(
      predictions,
      occupations_bundle[, list(conceptUri, preferredLabel)],
      on = "conceptUri"
    )[order(id, -weight_sum)][, list(id, conceptUri, preferredLabel)]
  }
  setnames(predictions, "id", id_col)

  predictions
}

Try the labourR package in your browser

Any scripts or data that you put into this service are public.

labourR documentation built on July 18, 2020, 5:06 p.m.