occupationCoding: Supervised Learning for Occupation Coding

Documented in predictLogisticRegressionWithPenalization

#' Predict codes using a logistic regression model
#'
#' Function does the same preprocessing as in \code{\link{trainLogisticRegressionWithPenalization}} and calls the glmnet predict-function.
#'
#' (Not confirmed:) Since overfitting is not an issue with occupational data, our experience is: the smaller \code{lambda}, the better are the predictions. Check if this holds for your data using \code{deviance(model)}.
#'
#' @param model the output created from \code{\link{trainLogisticRegressionWithPenalization}}
#' @param newdata eiter a data.table created with \code{\link{removeFaultyAndUncodableAnswers_And_PrepareForAnalysis}} or a character vector
#' @param lambda see \code{\link[glmnet]{glmnet}}
#'
#' @seealso \code{\link{trainLogisticRegressionWithPenalization}}, \code{\link[glmnet]{glmnet}}
#'
#' @return a data.table of class \code{occupationalPredictions} that contains predicted probabilities \code{pred.prob} for every combination of \code{ans} and \code{pred.code}. pred.code may not cover the full set of possible codes.
#' @import data.table
#' @import text2vec
#' @export
#'
#' @examples
#' # set up data
#' data(occupations)
#' allowed.codes <- c("71402", "71403", "63302", "83112", "83124", "83131", "83132", "83193", "83194", "-0004", "-0030")
#' allowed.codes.titles <- c("Office clerks and secretaries (without specialisation)-skilled tasks", "Office clerks and secretaries (without specialisation)-complex tasks", "Gastronomy occupations (without specialisation)-skilled tasks",
#'  "Occupations in child care and child-rearing-skilled tasks", "Occupations in social work and social pedagogics-highly complex tasks", "Pedagogic specialists in social care work and special needs education-unskilled/semiskilled tasks", "Pedagogic specialists in social care work and special needs education-skilled tasks", "Supervisors in education and social work, and of pedagogic specialists in social care work", "Managers in education and social work, and of pedagogic specialists in social care work",
#'  "Not precise enough for coding", "Student assistants")
#' proc.occupations <- removeFaultyAndUncodableAnswers_And_PrepareForAnalysis(occupations, colNames = c("orig_answer", "orig_code"), allowed.codes, allowed.codes.titles)
#'
#' ## split sample
#' set.seed(3451345)
#' n.test <- 50
#' group <- sample(c(rep("test", n.test), rep("training", nrow(proc.occupations) - n.test)))
#' splitted.data <- split(proc.occupations, group)
#'
#' # train model and make predictions
#' model <- trainLogisticRegressionWithPenalization(splitted.data$train, preprocessing = list(stopwords = tm::stopwords("de"), stemming = "de", countWords = FALSE), tuning = list(alpha = 0.05, maxit = 50^5, nlambda = 100, thresh = 1e-5))
#' predictLogisticRegressionWithPenalization(model, c("test", "HIWI", "Hilfswissenschaftler"))
#' res <- predictLogisticRegressionWithPenalization(model, splitted.data$test)
#'
#' # look at most probable answer from each id
#' res[, .SD[which.max(pred.prob), list(ans, true.code = code, pred.code, acc = code == pred.code)], by = id]
#' res[, .SD[which.max(pred.prob), list(ans, true.code = code, pred.code, acc = code == pred.code)], by = id][, mean(acc)] # calculate accurac of predictions
#'
#' # for further analysis we usually require further processing:
#' produceResults(expandPredictionResults(res, allowed.codes, method.name = "LogisticRegression"), k = 1, n = n.test, num.codes = length(allowed.codes))
#'
#' #######################################################
#' ## RUN A GRID SEARCH (takes some time)
#' \donttest{
#' model.grid <- data.table(expand.grid(stopwords = c(TRUE, FALSE), stemming = c(FALSE, "de"), countWords = c(TRUE, FALSE), alpha = c(0, 0.05, 0.2), thresh = 1e-4, stringsAsFactors =FALSE))
#'
#' # save results here
#' model.grid2 <- rbind(model.grid, model.grid, model.grid, model.grid)
#' model.grid2[, lambda_ind := rep(c(50, 70, 80, 100), each = nrow(model.grid))]
#'
#' # Do grid search
#' for (i in 1:nrow(model.grid)) {
#'   res.model <- trainLogisticRegressionWithPenalization(splitted.data$train, preprocessing = list(stopwords = if (model.grid[i, stopwords]) tm::stopwords("de") else character(0),
#'                                                                                                  stemming = if (model.grid[i, stemming == "de"]) "de" else NULL,
#'                                                                                                  countWords = model.grid[i, countWords]),
#'                                                        tuning = list(alpha = model.grid[i, alpha],
#'                                                                      maxit = 10^6, nlambda = 100, thresh = model.grid[i, thresh]))
#'
#'   for (j in 1:4) { # loop over all lambda_ind-values (c(50, 70, 80, 100))
#'     # if glmnet does not converge, we want to use lambda_ind = length(res.model$lambda)
#'     model.grid2[nrow(model.grid)*(j-1) + i, lambda_ind := min(lambda_ind, length(res.model$lambda))]
#'     lambdav <- res.model$lambda[model.grid2[nrow(model.grid)*(j-1) + i, lambda_ind]]
#'     res.proc <- expandPredictionResults(predictLogisticRegressionWithPenalization(res.model, splitted.data$test, lambda = lambdav), allowed.codes = allowed.codes, method.name = paste0("glmnet.elnet.Stopwords=", model.grid[i, stopwords], "Stemming=", model.grid[i, stemming], "Countwords=", model.grid[i, countWords], "Lambda=", lambdav))
#'
#'     ac <- accuracy(calcAccurateAmongTopK(res.proc, k = 1), n = nrow(splitted.data$test))
#'     ll <- logLoss(res.proc)
#'     sh <- sharpness(res.proc)
#'
#'     model.grid2[nrow(model.grid)*(j-1) + i, lambda := lambdav]
#'     model.grid2[nrow(model.grid)*(j-1) + i, acc := ac[, acc]]
#'     model.grid2[nrow(model.grid)*(j-1) + i, acc.se := ac[, se]]
#'     model.grid2[nrow(model.grid)*(j-1) + i, acc.N := ac[, N]]
#'     model.grid2[nrow(model.grid)*(j-1) + i, acc.prob0 := ac[, count.pred.prob0]]
#'     model.grid2[nrow(model.grid)*(j-1) + i, loss.full := ll[1, logscore]]
#'     model.grid2[nrow(model.grid)*(j-1) + i, loss.full.se := ll[1, se]]
#'     model.grid2[nrow(model.grid)*(j-1) + i, loss.full.N := ll[1, N]]
#'     model.grid2[nrow(model.grid)*(j-1) + i, loss.sub := ll[2, logscore]]
#'     model.grid2[nrow(model.grid)*(j-1) + i, loss.sub.se := ll[2, se]]
#'     model.grid2[nrow(model.grid)*(j-1) + i, loss.sub.N := ll[2, N]]
#'     model.grid2[nrow(model.grid)*(j-1) + i, sharp := sh[, sharpness]]
#'     model.grid2[nrow(model.grid)*(j-1) + i, sharp.se := sh[, se]]
#'     model.grid2[nrow(model.grid)*(j-1) + i, sharp.N := sh[, N]]
#'   }
#' }
#'
#' # how does alpha and lambda behave?
#' model.grid2[order(alpha, -lambda)][stopwords == FALSE & stemming == FALSE & countWords == FALSE,]
#' # Pick one combination of alpha and lambda and explore the influence of stemming, stopwords, and stemming
#' model.grid2[alpha == 0.05 & lambda < 0.03][order(-lambda, stemming)]
#'
#' }
predictLogisticRegressionWithPenalization <- function(model, newdata, lambda = min(model$lambda)) {

  # get character vector depending on type of input
  if ("occupationData" %in% class(newdata)) {
    ans <- newdata[, ans]
    print("Depending on model training, some very infrequent codes that were less than 2 times in the training data are not in the result from this function.")
  }
  if (is.character(newdata)) {
    ans <- newdata
  }

  # preprocessing
  preprocessing <- model$preprocessing

  ansP <- stringPreprocessing(ans)

  # prepare text for efficient computation -> transform to sparse matrix
  matrix <- asDocumentTermMatrix(ansP, vect.vocab = model$vect.vocab,
                                 stopwords = preprocessing$stopwords,
                                 stemming = preprocessing$stemming,
                                 type = "dgCMatrix")$dtm

  # include feature for number of words
  if (preprocessing$countWords) {
    ans_freq <- sapply(strsplit(ansP, " "), length)
    matrix <- cbind(matrix, ans_freq)
  }

  if (min(lambda) < min(model$lambda)) warning("Your provided lambda is too small. It is replaced by the smallest value allowed, depending on your model estimation.")

  # make predictions
  predProb <- predict(model, newx=matrix, s = lambda, type = "response")

  # and bring them in a nice format
  predProb2 <- data.table(ans = rep(ans, times = dim(predProb)[2]),
                          pred.code = rep(attr(predProb, "dimnames")[[2]], each = length(ans)),
                          pred.prob = as.vector(predProb[,,1]))

  # add additional columns from new data to the result
  if ("occupationData" %in% class(newdata)) {
    for (i in seq_along(names(newdata))) {
      if (names(newdata)[i] != "ans") {
        set(predProb2, i = NULL, j = names(newdata)[i], value = unlist(rep(newdata[, i, with = FALSE], times = dim(predProb)[2])))
      }
    }

    class(predProb2) <- c(class(predProb2), "occupationalPredictions")
  }

  return(predProb2)
}