predictLogisticRegressionWithPenalization: Predict codes using a logistic regression model
In malsch/occupationCoding: Supervised Learning for Occupation Coding

View source: R/predictLogisticRegressionWithPenalization.R

predictLogisticRegressionWithPenalization

R Documentation

Predict codes using a logistic regression model

Description

Function does the same preprocessing as in trainLogisticRegressionWithPenalization and calls the glmnet predict-function.

Usage

predictLogisticRegressionWithPenalization(
  model,
  newdata,
  lambda = min(model$lambda)
)

Arguments

`model`	the output created from `trainLogisticRegressionWithPenalization`
`newdata`	eiter a data.table created with `removeFaultyAndUncodableAnswers_And_PrepareForAnalysis` or a character vector
`lambda`	see `glmnet`

Details

(Not confirmed:) Since overfitting is not an issue with occupational data, our experience is: the smaller lambda, the better are the predictions. Check if this holds for your data using deviance(model).

Value

a data.table of class occupationalPredictions that contains predicted probabilities pred.prob for every combination of ans and pred.code. pred.code may not cover the full set of possible codes.

Examples

# set up data
data(occupations)
allowed.codes <- c("71402", "71403", "63302", "83112", "83124", "83131", "83132", "83193", "83194", "-0004", "-0030")
allowed.codes.titles <- c("Office clerks and secretaries (without specialisation)-skilled tasks", "Office clerks and secretaries (without specialisation)-complex tasks", "Gastronomy occupations (without specialisation)-skilled tasks",
 "Occupations in child care and child-rearing-skilled tasks", "Occupations in social work and social pedagogics-highly complex tasks", "Pedagogic specialists in social care work and special needs education-unskilled/semiskilled tasks", "Pedagogic specialists in social care work and special needs education-skilled tasks", "Supervisors in education and social work, and of pedagogic specialists in social care work", "Managers in education and social work, and of pedagogic specialists in social care work",
 "Not precise enough for coding", "Student assistants")
proc.occupations <- removeFaultyAndUncodableAnswers_And_PrepareForAnalysis(occupations, colNames = c("orig_answer", "orig_code"), allowed.codes, allowed.codes.titles)

## split sample
set.seed(3451345)
n.test <- 50
group <- sample(c(rep("test", n.test), rep("training", nrow(proc.occupations) - n.test)))
splitted.data <- split(proc.occupations, group)

# train model and make predictions
model <- trainLogisticRegressionWithPenalization(splitted.data$train, preprocessing = list(stopwords = tm::stopwords("de"), stemming = "de", countWords = FALSE), tuning = list(alpha = 0.05, maxit = 50^5, nlambda = 100, thresh = 1e-5))
predictLogisticRegressionWithPenalization(model, c("test", "HIWI", "Hilfswissenschaftler"))
res <- predictLogisticRegressionWithPenalization(model, splitted.data$test)

# look at most probable answer from each id
res[, .SD[which.max(pred.prob), list(ans, true.code = code, pred.code, acc = code == pred.code)], by = id]
res[, .SD[which.max(pred.prob), list(ans, true.code = code, pred.code, acc = code == pred.code)], by = id][, mean(acc)] # calculate accurac of predictions

# for further analysis we usually require further processing:
produceResults(expandPredictionResults(res, allowed.codes, method.name = "LogisticRegression"), k = 1, n = n.test, num.codes = length(allowed.codes))

#######################################################
## RUN A GRID SEARCH (takes some time)

model.grid <- data.table(expand.grid(stopwords = c(TRUE, FALSE), stemming = c(FALSE, "de"), countWords = c(TRUE, FALSE), alpha = c(0, 0.05, 0.2), thresh = 1e-4, stringsAsFactors =FALSE))

# save results here
model.grid2 <- rbind(model.grid, model.grid, model.grid, model.grid)
model.grid2[, lambda_ind := rep(c(50, 70, 80, 100), each = nrow(model.grid))]

# Do grid search
for (i in 1:nrow(model.grid)) {
  res.model <- trainLogisticRegressionWithPenalization(splitted.data$train, preprocessing = list(stopwords = if (model.grid[i, stopwords]) tm::stopwords("de") else character(0),
                                                                                                 stemming = if (model.grid[i, stemming == "de"]) "de" else NULL,
                                                                                                 countWords = model.grid[i, countWords]),
                                                       tuning = list(alpha = model.grid[i, alpha],
                                                                     maxit = 10^6, nlambda = 100, thresh = model.grid[i, thresh]))

  for (j in 1:4) { # loop over all lambda_ind-values (c(50, 70, 80, 100))
    # if glmnet does not converge, we want to use lambda_ind = length(res.model$lambda)
    model.grid2[nrow(model.grid)*(j-1) + i, lambda_ind := min(lambda_ind, length(res.model$lambda))]
    lambdav <- res.model$lambda[model.grid2[nrow(model.grid)*(j-1) + i, lambda_ind]]
    res.proc <- expandPredictionResults(predictLogisticRegressionWithPenalization(res.model, splitted.data$test, lambda = lambdav), allowed.codes = allowed.codes, method.name = paste0("glmnet.elnet.Stopwords=", model.grid[i, stopwords], "Stemming=", model.grid[i, stemming], "Countwords=", model.grid[i, countWords], "Lambda=", lambdav))

    ac <- accuracy(calcAccurateAmongTopK(res.proc, k = 1), n = nrow(splitted.data$test))
    ll <- logLoss(res.proc)
    sh <- sharpness(res.proc)

    model.grid2[nrow(model.grid)*(j-1) + i, lambda := lambdav]
    model.grid2[nrow(model.grid)*(j-1) + i, acc := ac[, acc]]
    model.grid2[nrow(model.grid)*(j-1) + i, acc.se := ac[, se]]
    model.grid2[nrow(model.grid)*(j-1) + i, acc.N := ac[, N]]
    model.grid2[nrow(model.grid)*(j-1) + i, acc.prob0 := ac[, count.pred.prob0]]
    model.grid2[nrow(model.grid)*(j-1) + i, loss.full := ll[1, logscore]]
    model.grid2[nrow(model.grid)*(j-1) + i, loss.full.se := ll[1, se]]
    model.grid2[nrow(model.grid)*(j-1) + i, loss.full.N := ll[1, N]]
    model.grid2[nrow(model.grid)*(j-1) + i, loss.sub := ll[2, logscore]]
    model.grid2[nrow(model.grid)*(j-1) + i, loss.sub.se := ll[2, se]]
    model.grid2[nrow(model.grid)*(j-1) + i, loss.sub.N := ll[2, N]]
    model.grid2[nrow(model.grid)*(j-1) + i, sharp := sh[, sharpness]]
    model.grid2[nrow(model.grid)*(j-1) + i, sharp.se := sh[, se]]
    model.grid2[nrow(model.grid)*(j-1) + i, sharp.N := sh[, N]]
  }
}

# how does alpha and lambda behave?
model.grid2[order(alpha, -lambda)][stopwords == FALSE & stemming == FALSE & countWords == FALSE,]
# Pick one combination of alpha and lambda and explore the influence of stemming, stopwords, and stemming
model.grid2[alpha == 0.05 & lambda < 0.03][order(-lambda, stemming)]

malsch/occupationCoding documentation built on March 14, 2024, 8:09 a.m.