# predict_readability ------
#' predict readability score from a fitted BT model
#'
#' Predicts the \eqn{lambda} for a given text in `newdata`, from a fitted
#' Bradley-Terry model object `object`.
#'
#' @param object a fitted [BradleyTerry2::BTm()] model object
#' @param newdata a character or [corpus][quanteda::corpus] object containing the
#' texts whose readability values will be predicted. If omitted, the fitted
#' values from `object` are used.
#' @param reference_top,reference_bottom the \eqn{lambda} values of reference
#' texts against which each predicted text will be compared for difficulty or
#' rescaled. The default value for `reference_top` is the \eqn{lambda} applied
#' to all of [data_corpus_fifthgrade()], and is used as the baseline value for
#' calculating the probability that a given text is easier, as well as the
#' anchor value of 100 to which texts are rescaled. The default value for
#' `reference_bottom` is the \eqn{lambda} of the most difficult text in the
#' State of the Union corpus (see `f999866.csv`), and is used as the anchor
#' value of 0 to which texts are rescaled.
#' @param bootstrap_n number of bootstrap replicates used to compute the
#' interval columns; `0` (the default) skips bootstrapping
#' @param baseline_year a scalar or vector of baseline year(s) to use for the
#' Google word-frequency reference: a year ending in 0, from 1790 to 2000
#' @param verbose logical; if `TRUE` print status messages
#' @return a data.frame with rows named by the text names, and columns
#' consisting of: \describe{ \item{`lambda`}{the estimated \eqn{lambda} for each
#' text} \item{`prob`}{the probability that the text is easier than the
#' reference lambda, which defaults to the \eqn{lambda} applied to all of
#' [data_corpus_fifthgrade()]} \item{`scaled`}{the \eqn{lambda} rescaled to a
#' 0-100 scale of "ease", where 100 and 0 are anchored by the fifth-grade texts
#' and the hardest text from the State of the Union corpus, respectively, unless
#' otherwise specified by the user} } When `bootstrap_n > 0`, additional columns
#' `lambda_lo`, `lambda_hi`, `prob_lo`, `prob_hi`, `scaled_lo`, and `scaled_hi`
#' give 95% bootstrap percentile intervals.
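#' @details The probability that a text is easier than the reference text is
#' computed from the fitted Bradley-Terry model as
#' \deqn{Pr(easier) = exp(\lambda) / (exp(\lambda) + exp(\lambda_{top}))}
#' where \eqn{\lambda_{top}} is `reference_top`.  The `scaled` value is the
#' linear rescaling
#' \deqn{scaled = 100 (\lambda - \lambda_{bottom}) / (\lambda_{top} - \lambda_{bottom})}
#' so that `reference_bottom` maps to 0 and `reference_top` maps to 100.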
#' @import BradleyTerry2
#' @importFrom data.table as.data.table
#' @export
#' @examples
#' \dontrun{
#' head(predict_readability(data_BTm_bms))
#' ## lambda prob scaled
#' ## 100014 -3.296731 0.24593816 60.51612
#' ## 100028 -3.190470 0.26617180 64.26088
#' ## 100029 -3.719532 0.17607128 45.61617
#' ## 100033 -4.703668 0.07396423 10.93416
#' ## 100034 -3.289739 0.24723716 60.76252
#' ## 100045 -2.780185 0.35346383 78.71976
#'
#' txts <- c(fifthgrade = paste(as.character(data_corpus_fifthgrade), collapse = " "),
#'           data_corpus_inaugural[c(1:2, 9:10, 54:58)])
#' predict_readability(data_BTm_bms, newdata = txts)
#' ## lambda prob scaled
#' ## fifthgrade -2.128336 0.51199792 102.84175
#' ## 1789-Washington -5.494969 0.03493749 -96.46991
#' ## 1793-Washington -2.852801 0.33705102 59.95195
#' ## 1821-Monroe -3.629638 0.18949402 13.96156
#' ## 1825-Adams -4.138627 0.12321942 -16.17163
#' ## 2001-Bush -2.273380 0.47575815 94.25482
#' ## 2005-Bush -2.583155 0.39967525 75.91551
#' ## 2009-Obama -2.529601 0.41259115 79.08604
#' ## 2013-Obama -2.747889 0.36087883 66.16295
#' ## 2017-Trump -2.359702 0.45428669 89.14440
#'
#' years <- c(2000, as.integer(substring(names(txts)[-1], 1, 4)))
#' predict_readability(data_BTm_bms, newdata = txts, baseline_year = years)
#' ## lambda prob scaled
#' ## fifthgrade -2.128338 0.51199736 102.84162
#' ## 1789-Washington -5.494972 0.03493741 -96.47004
#' ## 1793-Washington -2.852803 0.33705052 59.95182
#' ## 1821-Monroe -3.629640 0.18949368 13.96143
#' ## 1825-Adams -4.138629 0.12321918 -16.17177
#' ## 2001-Bush -2.273383 0.47575759 94.25469
#' ## 2005-Bush -2.583158 0.39967471 75.91538
#' ## 2009-Obama -2.529603 0.41259061 79.08591
#' ## 2013-Obama -2.747891 0.36087832 66.16282
#' ## 2017-Trump -2.359704 0.45428614 89.14426
#'
#' names(txts) <- gsub("ington", "", names(txts))
#' pr <- predict_readability(data_BTm_bms, newdata = txts[c(1:3, 9:10)], bootstrap_n = 100)
#' format(pr, digits = 4)
#' ## lambda prob scaled lambda_lo lambda_hi prob_lo prob_hi scaled_lo scaled_hi
#' ## fifthgrade -2.172 0.50105 100.15 -2.210 -2.135 0.491591 0.51032 98.81 101.455
#' ## 1789-Wash -5.676 0.02931 -23.35 -6.917 -4.870 0.008664 0.06336 -67.06 5.076
#' ## 1793-Wash -3.560 0.20036 51.22 -4.524 -2.609 0.087218 0.39358 17.25 84.765
#' ## 2013-Obama -2.791 0.35107 78.35 -2.914 -2.645 0.323485 0.38483 74.00 83.469
#' ## 2017-Trump -2.381 0.44904 92.79 -2.511 -2.213 0.417178 0.49080 88.22 98.704
#'
#' predict_readability(data_BTm_bms, newdata = "The cat in the hat ate green eggs and ham.")
#' ## lambda prob scaled
#' ## 1 -1.125721 0.7408932 137.0248
#'
#' }
predict_readability <- function(object, newdata = NULL,
                                reference_top = -2.1763368548,
                                reference_bottom = -3.865467, bootstrap_n = 0,
                                baseline_year = 2000,
                                verbose = FALSE) {
    UseMethod("predict_readability")
}
#' @export
#' @importFrom scales rescale
#' @importFrom data.table data.table setnames
#' @importFrom utils packageVersion
#' @importFrom stats coef quantile
predict_readability.BTm <- function(object, newdata = NULL,
                                    reference_top = -2.1763368548,
                                    reference_bottom = -3.865467, bootstrap_n = 0,
                                    baseline_year = 2000,
                                    verbose = FALSE) {
    `:=` <- NA
    prob <- lambda <- scaled <- resample <- NULL
    time1 <- proc.time()
    if (verbose)
        message("Starting predict_readability (sophistication v",
                packageVersion("sophistication"), ")...")
    if (missing(newdata) && bootstrap_n > 0)
        warning("bootstrap_n ignored; only new texts can be bootstrapped")
    if (verbose)
        message(" ...using ", deparse(substitute(object)), " as fitted BT model",
                appendLF = FALSE)
    # extract the coefficients
    coeffs <- coef(object)
    # strip the "[ID]" from the coef names
    names(coeffs) <- gsub("[ID]", "", names(coeffs), fixed = TRUE)
    if (missing(newdata)) {
        if (verbose) message(" and as newdata")
        if (bootstrap_n > 0) {
            warning("bootstrapping can only be applied to new data")
            bootstrap_n <- 0
        }
        newdata_docnames <- levels(object$data$easier$ID)
        # get newdata from object, if not supplied
        newdata <- as.data.table(object$data[names(coeffs)])
        newdata[, c("_docid", "resample") := list(levels(object$data$easier$ID), 0)]
    } else {
        if (verbose) message("; ", deparse(substitute(newdata)), " as newdata")
        # otherwise compute covars on newdata supplied as character or corpus
        if (is.corpus(newdata)) newdata <- as.character(newdata)
        if (is.character(newdata)) {
            newdata_docnames <- names(newdata)
            newdata <- get_covars_from_newdata(newdata, bootstrap_n,
                                               names(coeffs),
                                               baseline_year = baseline_year,
                                               verbose = verbose)
        } else {
            stop("newdata must be a character or corpus object")
        }
    }
    # make the google_min coefficient name from the newdata fitting match the
    # fitted data, so that the prediction will work
    names(newdata)[grep("^google_min", names(newdata))] <-
        grep("^google_min", names(coeffs), value = TRUE)
    # error checks
    if (!all(whichfound <- names(coeffs) %in% names(newdata))) {
        warning("dropping coefficients for which variables are not found in newdata:\n",
                names(coeffs)[!whichfound])
        coeffs <- coeffs[whichfound]
    }
    # select just the ID vars and coefficients
    newdata <- newdata[, c("_docid", "resample", names(coeffs)), with = FALSE]
    # compute the predictions, output as a named vector
    if (verbose) message(" ...computing predicted values")
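    # lambda is the linear predictor: the sum of covariates weighted by the fitted coefficients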
    newdata$lambda <- apply(newdata[, names(coeffs), with = FALSE], 1,
                            function(z) sum(z * coeffs))
    # compute the probability that a text is easier than the reference text
    newdata[, prob := exp(lambda) / (exp(lambda) + exp(reference_top))]
    # compute the rescaled lambda, using the references for the 0 and 100 endpoints
    newdata[, scaled := scales::rescale(lambda, to = c(0, 100),
                                        from = c(reference_bottom, reference_top))]
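    # (equivalent to 100 * (lambda - reference_bottom) / (reference_top - reference_bottom))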
    # combine into data.frame
    result <- as.data.frame(newdata[resample == 0, list(lambda, prob, scaled)])
    # add CIs if any
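    # the intervals are the 2.5% and 97.5% percentiles across the bootstrap replicates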
    if (bootstrap_n > 0) {
        result <- cbind(result,
                        newdata[resample > 0,
                                list(lambda_lo = quantile(lambda, 0.025, na.rm = TRUE),
                                     lambda_hi = quantile(lambda, 0.975, na.rm = TRUE)),
                                by = "_docid"][, 2:3, with = FALSE])
        result <- cbind(result,
                        newdata[resample > 0,
                                list(prob_lo = quantile(prob, 0.025, na.rm = TRUE),
                                     prob_hi = quantile(prob, 0.975, na.rm = TRUE)),
                                by = "_docid"][, 2:3, with = FALSE])
        result <- cbind(result,
                        newdata[resample > 0,
                                list(scaled_lo = quantile(scaled, 0.025, na.rm = TRUE),
                                     scaled_hi = quantile(scaled, 0.975, na.rm = TRUE)),
                                by = "_docid"][, 2:3, with = FALSE])
    }
    rownames(result) <- newdata_docnames
    if (verbose)
        message(" ...finished; elapsed time: ",
                format((proc.time() - time1)[3], digits = 3), " seconds.")
    result
}
# get_covars_from_newdata ----
get_covars_from_newdata <- function(x, bootstrap_n = 0, covar_selection = NULL,
                                    baseline_year = 2000, verbose = FALSE) {
    UseMethod("get_covars_from_newdata")
}
get_covars_from_newdata.character <- function(x, bootstrap_n = 0,
                                              covar_selection = NULL,
                                              baseline_year = 2000,
                                              verbose = FALSE) {
    get_covars_from_newdata(corpus(x), bootstrap_n, covar_selection,
                            baseline_year, verbose = verbose)
}
get_covars_from_newdata.corpus <- function(x, bootstrap_n = 0,
                                           covar_selection = NULL,
                                           baseline_year = 2000,
                                           verbose = FALSE) {
    .SD <- .N <- `:=` <- NULL
    covars <- get_covars_new(x, baseline_year = baseline_year, verbose = verbose)
    # covars <- covars_make_all(dt$texts, dependency = FALSE, verbose = verbose)
    # covars_select <- c(covar_selection, c("C", "St", "n_noun", "ntoken", "W"))
    # dt <- cbind(dt, covars[, covars_select])
    result <- computed_aggregated_covars(covars, 0)
    if (verbose && bootstrap_n > 0)
        message(" ...recombining bootstrapped sentence-level quantities")
    for (i in seq_len(bootstrap_n)) {
        result <- rbind(result,
                        computed_aggregated_covars(covars[, .SD[sample(.N, replace = TRUE)],
                                                          by = "doc_id"], i))
    }
    setnames(result, "doc_id", "_docid")
    result
}
computed_aggregated_covars <- function(y, i) {
    `:=` <- .N <- NA
    C <- W <- n_noun <- google_min <- St <- resample <- n_chars <- n_token <- NULL
    y <- y[, list(C = sum(n_chars, na.rm = TRUE), W = sum(n_token, na.rm = TRUE), St = .N,
                  n_noun = sum(n_noun, na.rm = TRUE),
                  google_min = min(google_min, na.rm = TRUE)),
           by = "doc_id"]
y[, c("meanWordChars", "pr_noun", "meanSentenceChars", "google_min") :=
list ( C / W , n_noun/W, C / St , google_min)]
y[, resample := i]
y
}
# get_covars_new --------------
## compute all covariates based on spacy parse
## needs an entire text, not parsed into sentences
get_covars_new <- function(x, baseline_year = 2000, verbose = FALSE) {
    UseMethod("get_covars_new")
}
get_covars_new.character <- function(x, baseline_year = 2000, verbose = FALSE) {
    get_covars_new(corpus(x), baseline_year = baseline_year, verbose = verbose)
}
get_covars_new.corpus <- function(x, baseline_year = 2000, verbose = FALSE) {
    google_min <- pos <- `:=` <- nchars <- token <- sentence_id <- years <- NULL
    doc_id <- .N <- NULL
    if (verbose) message(" ...tagging parts of speech")
    suppressMessages(
        spacyr::spacy_initialize()
    )
    result <- data.table(spacyr::spacy_parse(as.character(x), tag = FALSE, lemma = FALSE,
                                             entity = FALSE, dependency = FALSE))
    # remove punctuation and whitespace tokens
    result <- result[pos != "PUNCT" & pos != "SPACE"]
    # if baseline_year is a vector, repeat each year for the tokens of its document
    if (length(baseline_year) > 1)
        baseline_year <- rep(baseline_year,
                             result[, list(years = length(sentence_id)), by = doc_id][, years])
    if (verbose) message(" ...computing word lengths in characters")
    result[, nchars := stringi::stri_length(token)]
    if (verbose) message(" ...computing baselines from Google frequencies")
    bl_google <-
        suppressWarnings(make_baselines_google(result$token, baseline_word = "the",
                                               baseline_year = baseline_year)[, 2])
    result[, google_min := bl_google]
    if (verbose) message(" ...aggregating to sentence level")
    result[,
           list(doc_id = doc_id[1],
                n_noun = sum(pos == "NOUN", na.rm = TRUE),
                n_chars = sum(nchars, na.rm = TRUE),
                google_min = min(google_min, na.rm = TRUE),
                n_token = .N),
           by = c("sentence_id", "doc_id")]
}