#' Predict codes using a logistic regression model
#'
#' Does the same preprocessing as in \code{\link{trainLogisticRegressionWithPenalization}} and then calls the \code{predict} function from \pkg{glmnet}.
#'
#' Not yet confirmed: since overfitting is not an issue with occupational data, our experience is that the smaller \code{lambda}, the better the predictions. Check whether this holds for your data using \code{deviance(model)} (see the example below).
#'
#' @param model the output created from \code{\link{trainLogisticRegressionWithPenalization}}
#' @param newdata either a data.table created with \code{\link{removeFaultyAndUncodableAnswers_And_PrepareForAnalysis}} or a character vector
#' @param lambda see \code{\link[glmnet]{glmnet}}
#'
#' @seealso \code{\link{trainLogisticRegressionWithPenalization}}, \code{\link[glmnet]{glmnet}}
#'
#' @return a data.table of class \code{occupationalPredictions} that contains a predicted probability \code{pred.prob} for every combination of \code{ans} and \code{pred.code}. Note that \code{pred.code} may not cover the full set of possible codes (see \code{\link{expandPredictionResults}}).
#' @import data.table
#' @import text2vec
#' @export
#'
#' @examples
#' # set up data
#' data(occupations)
#' allowed.codes <- c("71402", "71403", "63302", "83112", "83124", "83131", "83132", "83193", "83194", "-0004", "-0030")
#' allowed.codes.titles <- c("Office clerks and secretaries (without specialisation)-skilled tasks", "Office clerks and secretaries (without specialisation)-complex tasks", "Gastronomy occupations (without specialisation)-skilled tasks",
#' "Occupations in child care and child-rearing-skilled tasks", "Occupations in social work and social pedagogics-highly complex tasks", "Pedagogic specialists in social care work and special needs education-unskilled/semiskilled tasks", "Pedagogic specialists in social care work and special needs education-skilled tasks", "Supervisors in education and social work, and of pedagogic specialists in social care work", "Managers in education and social work, and of pedagogic specialists in social care work",
#' "Not precise enough for coding", "Student assistants")
#' proc.occupations <- removeFaultyAndUncodableAnswers_And_PrepareForAnalysis(occupations, colNames = c("orig_answer", "orig_code"), allowed.codes, allowed.codes.titles)
#'
#' ## split sample
#' set.seed(3451345)
#' n.test <- 50
#' group <- sample(c(rep("test", n.test), rep("training", nrow(proc.occupations) - n.test)))
#' splitted.data <- split(proc.occupations, group)
#'
#' # train model and make predictions
#' model <- trainLogisticRegressionWithPenalization(splitted.data$training, preprocessing = list(stopwords = tm::stopwords("de"), stemming = "de", countWords = FALSE), tuning = list(alpha = 0.05, maxit = 50^5, nlambda = 100, thresh = 1e-5))
#' predictLogisticRegressionWithPenalization(model, c("test", "HIWI", "Hilfswissenschaftler"))
#' res <- predictLogisticRegressionWithPenalization(model, splitted.data$test)
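#'
#' # As suggested in Details, inspect the deviance path to see how lambda behaves
#' # on your data (a quick sketch; the index 50 below is an arbitrary choice):
#' plot(model$lambda, deviance(model), type = "l", log = "x")
#' # predict with an explicitly chosen lambda instead of the default min(model$lambda)
#' res.lam <- predictLogisticRegressionWithPenalization(model, splitted.data$test,
#'                                                      lambda = model$lambda[min(50, length(model$lambda))])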
#'
#' # look at most probable answer from each id
#' res[, .SD[which.max(pred.prob), list(ans, true.code = code, pred.code, acc = code == pred.code)], by = id]
#' res[, .SD[which.max(pred.prob), list(ans, true.code = code, pred.code, acc = code == pred.code)], by = id][, mean(acc)] # calculate accuracy of predictions
#'
#' # for further analysis we usually require further processing:
#' produceResults(expandPredictionResults(res, allowed.codes, method.name = "LogisticRegression"), k = 1, n = n.test, num.codes = length(allowed.codes))
#'
#' #######################################################
#' ## RUN A GRID SEARCH (takes some time)
#' \donttest{
#' model.grid <- data.table(expand.grid(stopwords = c(TRUE, FALSE), stemming = c(FALSE, "de"), countWords = c(TRUE, FALSE), alpha = c(0, 0.05, 0.2), thresh = 1e-4, stringsAsFactors = FALSE))
#'
#' # save results here
#' model.grid2 <- rbind(model.grid, model.grid, model.grid, model.grid)
#' model.grid2[, lambda_ind := rep(c(50, 70, 80, 100), each = nrow(model.grid))]
#'
#' # Do grid search
#' for (i in 1:nrow(model.grid)) {
#'   res.model <- trainLogisticRegressionWithPenalization(splitted.data$training,
#'     preprocessing = list(stopwords = if (model.grid[i, stopwords]) tm::stopwords("de") else character(0),
#'                          stemming = if (model.grid[i, stemming == "de"]) "de" else NULL,
#'                          countWords = model.grid[i, countWords]),
#'     tuning = list(alpha = model.grid[i, alpha],
#'                   maxit = 10^6, nlambda = 100, thresh = model.grid[i, thresh]))
#'
#'   for (j in 1:4) { # loop over all lambda_ind-values (c(50, 70, 80, 100))
#'     idx <- nrow(model.grid) * (j - 1) + i # row of model.grid2 to fill
#'     # if glmnet does not converge, we want to use lambda_ind = length(res.model$lambda)
#'     model.grid2[idx, lambda_ind := min(lambda_ind, length(res.model$lambda))]
#'     lambdav <- res.model$lambda[model.grid2[idx, lambda_ind]]
#'     res.proc <- expandPredictionResults(predictLogisticRegressionWithPenalization(res.model, splitted.data$test, lambda = lambdav),
#'                                         allowed.codes = allowed.codes,
#'                                         method.name = paste0("glmnet.elnet.Stopwords=", model.grid[i, stopwords], "Stemming=", model.grid[i, stemming], "Countwords=", model.grid[i, countWords], "Lambda=", lambdav))
#'
#'     ac <- accuracy(calcAccurateAmongTopK(res.proc, k = 1), n = nrow(splitted.data$test))
#'     ll <- logLoss(res.proc)
#'     sh <- sharpness(res.proc)
#'
#'     model.grid2[idx, lambda := lambdav]
#'     model.grid2[idx, acc := ac[, acc]]
#'     model.grid2[idx, acc.se := ac[, se]]
#'     model.grid2[idx, acc.N := ac[, N]]
#'     model.grid2[idx, acc.prob0 := ac[, count.pred.prob0]]
#'     model.grid2[idx, loss.full := ll[1, logscore]]
#'     model.grid2[idx, loss.full.se := ll[1, se]]
#'     model.grid2[idx, loss.full.N := ll[1, N]]
#'     model.grid2[idx, loss.sub := ll[2, logscore]]
#'     model.grid2[idx, loss.sub.se := ll[2, se]]
#'     model.grid2[idx, loss.sub.N := ll[2, N]]
#'     model.grid2[idx, sharp := sh[, sharpness]]
#'     model.grid2[idx, sharp.se := sh[, se]]
#'     model.grid2[idx, sharp.N := sh[, N]]
#'   }
#' }
#'
#' # how do alpha and lambda behave?
#' model.grid2[order(alpha, -lambda)][stopwords == FALSE & stemming == FALSE & countWords == FALSE,]
#' # Pick one combination of alpha and lambda and explore the influence of stopwords, stemming, and countWords
#' model.grid2[alpha == 0.05 & lambda < 0.03][order(-lambda, stemming)]
#'
#' }
predictLogisticRegressionWithPenalization <- function(model, newdata, lambda = min(model$lambda)) {
  # get character vector depending on the type of input
  if ("occupationData" %in% class(newdata)) {
    ans <- newdata[, ans]
    message("Depending on model training, codes that appeared fewer than two times in the training data may be missing from the result of this function.")
  }
  if (is.character(newdata)) {
    ans <- newdata
  }

  # preprocessing
  preprocessing <- model$preprocessing
  ansP <- stringPreprocessing(ans)

  # prepare text for efficient computation -> transform to sparse matrix
  matrix <- asDocumentTermMatrix(ansP, vect.vocab = model$vect.vocab,
                                 stopwords = preprocessing$stopwords,
                                 stemming = preprocessing$stemming,
                                 type = "dgCMatrix")$dtm

  # include feature for number of words
  if (preprocessing$countWords) {
    ans_freq <- sapply(strsplit(ansP, " "), length)
    matrix <- cbind(matrix, ans_freq)
  }

  if (min(lambda) < min(model$lambda)) warning("The provided lambda is too small. It is replaced by the smallest value allowed, which depends on your model estimation.")

  # make predictions: predProb is an array with one row per answer, one column
  # per code, and a third dimension over lambda (only the first lambda is used below)
  predProb <- predict(model, newx = matrix, s = lambda, type = "response")

  # and bring them into a nice (long) format: one row per answer-code combination
  predProb2 <- data.table(ans = rep(ans, times = dim(predProb)[2]),
                          pred.code = rep(attr(predProb, "dimnames")[[2]], each = length(ans)),
                          pred.prob = as.vector(predProb[, , 1]))

  # add the additional columns from newdata to the result
  if ("occupationData" %in% class(newdata)) {
    for (i in seq_along(names(newdata))) {
      if (names(newdata)[i] != "ans") {
        set(predProb2, i = NULL, j = names(newdata)[i], value = unlist(rep(newdata[, i, with = FALSE], times = dim(predProb)[2])))
      }
    }
    class(predProb2) <- c(class(predProb2), "occupationalPredictions")
  }

  return(predProb2)
}