Nothing
#' Classify occupations
#'
#' @description
#' This function takes advantage of the hierarchical structure of the ESCO-ISCO mapping and matches multilingual free-text with the
#' \href{https://ec.europa.eu/esco/portal/home}{ESCO} occupations vocabulary in order to map semi-structured vacancy data into the official
#' ESCO-ISCO classification.
#'
#' @param corpus A data.frame or a data.table that contains the id and the text variables.
#' @param id_col The name of the id variable.
#' @param text_col The name of the text variable.
#' @param lang The language that the text is in.
#' @param num_leaves The number of occupations/neighbors that are kept when matching.
#' @param isco_level The \href{https://ec.europa.eu/esco/portal/escopedia/Occupation}{ISCO} level of the suggested occupations.
#' Can be either 1, 2, 3, 4 for ISCO occupations, or NULL that returns ESCO occupations.
#' @param max_dist String distance used for fuzzy matching. The \code{\link[stringdist]{amatch}} function from the stringdist package is used.
#' @param string_dist String dissimilarity measurement. Available string distance metrics: \code{\link[stringdist]{stringdist-metrics}}.
#'
#' @details
#' First, the input text is cleansed and tokenized. The tokens are then matched with the ESCO occupations vocabulary, created from
#' the preferred and alternative labels of the occupations. They are joined with the \code{\link[=tf_idf]{tfidf}}
#' weighted tokens of the ESCO occupations and the sum of the tf-idf score is used to retrieve the suggested ontologies. Technically speaking, the
#' suggested ESCO occupations are retrieved by solving the optimization problem, \deqn{\arg\max_d\left\{\vec{u}_{binary}\cdot \vec{u}_d\right\}}
#' where, \eqn{\vec{u}_{binary}} stands for the binary representation of a query to the ESCO-vocabulary space,
#' while, \eqn{\vec{u}_d} is the ESCO occupation normalized vector generated by the tf-idf numerical statistic.
#' If an ISCO level is specified, the k-nearest neighbors algorithm is used to determine the suggested occupation, classified by a plurality vote in the corresponding hierarchical level of its neighbors.
#'
#' Before the suggestions are returned, the preferred label of each suggested occupation is added to the result, using the
#' \code{\link{occupations_bundle}} and \code{\link{isco_occupations_bundle}} as look-up tables.
#'
#' @return Either a data.table with the id, the preferred label and the suggested ESCO occupation URIs (num_leaves predictions for each id),
#' or a data.table with the id, the preferred label and the suggested ISCO group of the inputted level (one for each id).
#'
#' @export
#'
#' @references
#' M.P.J. van der Loo (2014). \href{https://journal.r-project.org/archive/2014-1/loo.pdf}{The stringdist package for approximate string matching}. R Journal 6(1) pp 111-122.
#'
#' Gweon, H., Schonlau, M., Kaczmirek, L., Blohm, M., & Steiner, S. (2017). \href{https://doi.org/10.1515/jos-2017-0006}{Three Methods for Occupation Coding Based on Statistical Learning, Journal of Official Statistics}, 33(1), 101-122.
#'
#' Arthur Turrell, Bradley J. Speigner, Jyldyz Djumalieva, David Copple, James Thurgood (2019).
#' \href{https://www.nber.org/papers/w25837}{Transforming Naturally Occurring Text Data Into Economic Statistics:
#' The Case of Online Job Vacancy Postings}.
#'
#' ESCO Service Platform - \href{https://ec.europa.eu/esco/portal/document/en/87a9f66a-1830-4c93-94f0-5daa5e00507e}{
#' The ESCO Data Model documentation}
#'
#' @import data.table
#' @import magrittr
#' @importFrom stringdist amatch
#' @importFrom utils head
#'
#' @examples
#' corpus <- data.frame(
#' id = 1:3,
#' text = c(
#' "Junior Architect Engineer",
#' "Cashier at McDonald's",
#' "Priest at St. Martin Catholic Church"
#' )
#' )
#' classify_occupation(corpus = corpus, isco_level = 3, lang = "en", num_leaves = 5)
#'
classify_occupation <- function(corpus, id_col = "id", text_col = "text", lang = "en",
num_leaves = 10, isco_level = 3, max_dist = 0.1, string_dist = NULL) {
# due to NSE notes in R CMD check
NULL -> language -> term -> text -> tfIdf -> id -> iscoGroup -> weight_sum -> isco_nn -> conceptUri -> preferredLabel
occupations_bundle <- occupations_bundle
isco_occupations_bundle <- isco_occupations_bundle
if(!any("data.frame" %in% class(corpus)))
stop("Corpus must be either a data.frame or a data.table.")
if(!all(c(id_col, text_col) %in% names(corpus)))
stop(paste0("Corpus must contain the specified variables: ", id_col, " and ", text_col, "."))
# Prepare corpus.
corpus <- data.table(corpus)
setnames(corpus, c(id_col, text_col), c("id", "text"))
# Prepare the weighted tokens and the vocabulary.
weightTokens <- tfidf_tokens[language == lang]
if(nrow(weightTokens) == 0)
stop(paste0(lang, " is not an acceptable language."))
vocabulary <- unique(tfidf_tokens[language == lang][, list(term)])[order(term)]
# Cleanse, tokenize free-text and remove stopwords.
corpus[, text := cleansing_corpus(as.character(text))]
freeTextTokensList <- lapply(strsplit(corpus$text, split = " "), function(x) x[!x %in% get_stopwords(lang)])
names(freeTextTokensList) <- corpus$id
freeTextTokensDT <- lapply(freeTextTokensList, data.table) %>%
rbindlist(idcol = TRUE) %>%
setnames(c("id", "term"))
# Match free-text with the vocabulary.
vocaIndexes <- match(freeTextTokensDT$term, vocabulary$term)
if(!is.null(string_dist))
vocaIndexes[is.na(vocaIndexes)] <- amatch(freeTextTokensDT$term[is.na(vocaIndexes)], vocabulary$term, maxDist = max_dist, method = string_dist)
matches <- data.table(id = freeTextTokensDT$id, term = vocabulary[vocaIndexes]$term)[!is.na(term)]
# Join the free-text matches with the tfidf weighted tokens and keep the top num_leaves using a weighted sum model.
predictions <- merge(
matches,
weightTokens,
allow.cartesian = TRUE
)[, list(weight_sum = sum(tfIdf)), by = c("id", "class")][order(id, -weight_sum)][, head(.SD, num_leaves), by = "id"]
if(!is.null(isco_level) && !isco_level %in% 1:4)
stop("The ISCO level parameter must be 1, 2, 3, 4 or NULL.")
setnames(predictions, "class", "conceptUri")
# The K-Nearest Neighbors algorithm and the suggested occupations are used to determine the most popular occupation of the requested ISCO level.
if(!is.null(isco_level)) {
predictions <- merge(predictions, occupations_bundle[, list(conceptUri, iscoGroup)], on = "conceptUri")
predictions[, iscoGroup := substr(iscoGroup, 0, isco_level)]
predictions <- predictions[, list(isco_nn = .N), by = c("id", "iscoGroup")
][order(id, -isco_nn)
][, head(.SD, 1), by = "id"]
# Add new variable, the preferred label of the ISCO occupation.
predictions <- merge(
predictions,
isco_occupations_bundle,
on = "iscoGroup"
)[order(id, -isco_nn)][, list(id, iscoGroup, preferredLabel)]
} else {
# Add new variable, the preferred label of the ESCO occupations.
predictions <- merge(
predictions,
occupations_bundle[, list(conceptUri, preferredLabel)],
on = "conceptUri"
)[order(id, -weight_sum)][, list(id, conceptUri, preferredLabel)]
}
setnames(predictions, "id", id_col)
predictions
}
Any scripts or data that you put into this service are public.
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.