predictSimilarityBasedReasoning: Predict codes using a Similarity Based Probability Model
In malsch/occupationCoding: Supervised Learning for Occupation Coding

View source: R/predictSimilarityBasedReasoning.R

predictSimilarityBasedReasoning

R Documentation

Predict codes using a Similarity Based Probability Model

Description

Function does the same preprocessing as in trainSimilarityBasedReasoning and makes probability predictions. If the verbal answer is not similar to any entry from the coding index, it just predicts all codes to have probability 1/model$num.allowed.codes.

Usage

predictSimilarityBasedReasoning(model, newdata, parallel = FALSE)

Arguments

`model`	the output created from `trainSimilarityBasedReasoning`.
`newdata`	eiter a data.table created with `removeFaultyAndUncodableAnswers_And_PrepareForAnalysis` or a character vector.
`parallel`	If `model$dist.type == "substring"` we may set this TRUE to use parallel computing. It has not been tested yet if this makes predictions faster.

Value

a data.table of class occupationalPredictions that contains predicted probabilities pred.prob for every combination of ans and pred.code. pred.code may not cover the full set of possible codes, but will contain a code "-9999" for every individual that provides a probability for all categories missing.

Examples

# set up data
data(occupations)
allowed.codes <- c("71402", "71403", "63302", "83112", "83124", "83131", "83132", "83193", "83194", "-0004", "-0030")
allowed.codes.titles <- c("Office clerks and secretaries (without specialisation)-skilled tasks", "Office clerks and secretaries (without specialisation)-complex tasks", "Gastronomy occupations (without specialisation)-skilled tasks",
 "Occupations in child care and child-rearing-skilled tasks", "Occupations in social work and social pedagogics-highly complex tasks", "Pedagogic specialists in social care work and special needs education-unskilled/semiskilled tasks", "Pedagogic specialists in social care work and special needs education-skilled tasks", "Supervisors in education and social work, and of pedagogic specialists in social care work", "Managers in education and social work, and of pedagogic specialists in social care work",
 "Not precise enough for coding", "Student assistants")
proc.occupations <- removeFaultyAndUncodableAnswers_And_PrepareForAnalysis(occupations, colNames = c("orig_answer", "orig_code"), allowed.codes, allowed.codes.titles)

## split sample
set.seed(3451345)
n.test <- 50
group <- sample(c(rep("test", n.test), rep("training", nrow(proc.occupations) - n.test)))
splitted.data <- split(proc.occupations, group)
attr(splitted.data$training, "classification")$code <- attr(proc.occupations, "classification")$code

####### train models
# first model uses dist.type = wordwise and some other recommended settings (n.draws could be higher)
simBasedModel <- trainSimilarityBasedReasoning(data = splitted.data$training,
                              coding_index_w_codes = coding_index_excerpt,
                              coding_index_without_codes = frequent_phrases,
                              preprocessing = list(stopwords = tm::stopwords("de"), stemming = NULL, strPreprocessing = TRUE, removePunct = FALSE),
                              dist.type = "wordwise",
                              dist.control = list(method = "osa", weight = c(d = 1, i = 1, s = 1, t = 1)),
                              threshold = c(max = 3, use = 1), simulation.control = list(n.draws = 50, check.normality = FALSE),
                              tmp_folder = "similarityTables")

res <- predictSimilarityBasedReasoning(simBasedModel, splitted.data$test)

# look at most probable answer from each id
res[, .SD[which.max(pred.prob), list(ans, true.code = code, pred.code, acc = code == pred.code)], by = id]
res[, .SD[which.max(pred.prob), list(ans, true.code = code, pred.code, acc = code == pred.code)], by = id][, mean(acc)] # calculate aggrement rate

# for further analysis we usually require further processing:
produceResults(expandPredictionResults(res, allowed.codes, method.name = "WordwiseSimilarityOsa1111"), k = 1, n = n.test, num.codes = length(allowed.codes))

# second model uses dist.type = substring and some other recommended settings (n.draws could be higher)
simBasedModel <- trainSimilarityBasedReasoning(data = splitted.data$training,
                              coding_index_w_codes = coding_index_excerpt,
                              coding_index_without_codes = frequent_phrases,
                              preprocessing = list(stopwords = NULL, stemming = NULL, strPreprocessing = TRUE, removePunct = FALSE),
                              dist.type = "substring",
                              dist.control = list(method = "substring", weight = numeric()),
                              threshold = c(0, 0), simulation.control = list(n.draws = 50, check.normality = FALSE),
                              tmp_folder = "similarityTables")

res <- predictSimilarityBasedReasoning(simBasedModel, splitted.data$test)
res <- predictSimilarityBasedReasoning(simBasedModel, splitted.data$test, parallel = TRUE) # if method = substring was used during training, we can speed up the predictions
produceResults(expandPredictionResults(res, allowed.codes, method.name = "substringSimilarity"), k = 1, n = n.test, num.codes = length(allowed.codes))


# third model uses dist.type = fulltext and some other recommended settings (n.draws could be higher)
simBasedModel <- trainSimilarityBasedReasoning(data = proc.occupations,
                              coding_index_w_codes = coding_index_excerpt,
                              coding_index_without_codes = frequent_phrases,
                              preprocessing = list(stopwords = NULL, stemming = NULL, strPreprocessing = TRUE, removePunct = FALSE),
                              dist.type = "fulltext",
                              dist.control = list(method = "osa", weight = c(d = 1, i = 1, s = 1, t = 1)),
                              threshold = c(max = 3, use = 1), simulation.control = list(n.draws = 50, check.normality = FALSE),
                              tmp_folder = "similarityTables")
res <- predictSimilarityBasedReasoning(simBasedModel, splitted.data$test)
produceResults(expandPredictionResults(res, allowed.codes, method.name = "FulltextSimilarityOsa1111"), k = 1, n = n.test, num.codes = length(allowed.codes))

#######################################################
## RUN A GRID SEARCH (takes some time)

# create a grid of all tuning combinations to try
model.grid <- data.table(expand.grid(stopwords = FALSE, stemming = FALSE, strPreprocessing = TRUE, removePunct = FALSE,
                                     dist.type = c("wordwise", "fulltext"), # dist.control = list(method = "osa", weight = c(d = 1, i = 1, s = 1, t = 1)
                                     threshold = c(0, 2),
                                     n.draws = c(5, 30), stringsAsFactors = FALSE))
model.grid <- rbind(model.grid, data.table(expand.grid(stopwords = FALSE, stemming = FALSE, strPreprocessing = TRUE, removePunct = FALSE,
                                                       dist.type = "substring",
                                                       threshold = FALSE,
                                                       n.draws = 30, stringsAsFactors = FALSE)))


for (i in 1:nrow(model.grid)) { #
  simBasedModel <- trainSimilarityBasedReasoning(data = splitted.data$training,
                                                 coding_index_w_codes = coding_index_excerpt,
                                                 coding_index_without_codes = frequent_phrases,
                                                 preprocessing = list(stopwords = if (model.grid[i, stopwords]) tm::stopwords("de") else character(0),
                                                                      stemming = if (model.grid[i, stemming == "de"]) "de" else NULL,
                                                                      strPreprocessing = model.grid[i, strPreprocessing],
                                                                      removePunct = model.grid[i, removePunct]),
                                                dist.type = model.grid[i, dist.type],
                                                 dist.control = list(method = "osa", weight = c(d = 1, i = 1, s = 1, t = 1)),
                                                 threshold = c(max = 3, use = model.grid[i, threshold]), simulation.control = list(n.draws = model.grid[i, n.draws], check.normality = FALSE),
                                                 tmp_folder = "similarityTables")

  if (model.grid[i, dist.type] == "substring") {
   # parallelization is only implemented (and helpful if dist.type = substring)
    res.proc2 <- expandPredictionResults(predictSimilarityBasedReasoning(simBasedModel, splitted.data$test, parallel = TRUE), allowed.codes = allowed.codes, method.name = "SimilarityBased")
  } else {
    res.proc2 <- expandPredictionResults(predictSimilarityBasedReasoning(simBasedModel, splitted.data$test, parallel = FALSE), allowed.codes = allowed.codes, method.name = "SimilarityBased")
  }

  ac <- accuracy(calcAccurateAmongTopK(res.proc2, k = 1), n = nrow(splitted.data$test))
  ll <- logLoss(res.proc2)
  sh <- sharpness(res.proc2)

  model.grid[i, acc := ac[, acc]]
  model.grid[i, acc.se := ac[, se]]
  model.grid[i, acc.N := ac[, N]]
  model.grid[i, acc.prob0 := ac[, count.pred.prob0]]
  model.grid[i, loss.full := ll[1, logscore]]
  model.grid[i, loss.full.se := ll[1, se]]
  model.grid[i, loss.full.N := ll[1, N]]
  model.grid[i, loss.sub := ll[2, logscore]]
  model.grid[i, loss.sub.se := ll[2, se]]
  model.grid[i, loss.sub.N := ll[2, N]]
  model.grid[i, sharp := sh[, sharpness]]
  model.grid[i, sharp.se := sh[, se]]
  model.grid[i, sharp.N := sh[, N]]
  model.grid[i, sum.pred.prob1 := res.proc2[, .SD[which.max(pred.prob), pred.prob], by = id][, sum(V1)]]
}

model.grid[order(dist.type, threshold, n.draws)]

malsch/occupationCoding documentation built on March 14, 2024, 8:09 a.m.