.bs_sent <- function(x) {
    quanteda::tokens(x, what = "sentence")[[1]]
}

.bs_sent2 <- function(x) {
    ## to emulate tokenizers::tokenize_sentences
    purrr::map(x, .bs_sent)
}
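## A minimal sketch of the expected behaviour (hypothetical input, relying
## on quanteda's sentence tokenizer):
## .bs_sent("It rained. We stayed in.")
## #> [1] "It rained."    "We stayed in."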
cal_avg_sentence_length <- function(input_text) {
    purrr::map(input_text, .bs_sent) %>%
        purrr::map(tokenizers::tokenize_words) %>%
        purrr::map(~purrr::map_int(., length)) %>%
        purrr::map_dbl(mean)
}
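## Hypothetical usage: mean number of words per sentence, one value per
## document. The first document has sentences of 3 and 2 words:
## cal_avg_sentence_length(c("One two three. Four five.", "Six seven."))
## #> [1] 2.5 2.0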
## .parse_text_udpipe <- function(input_text, udpipe_model) {
## as.data.frame(udpipe::udpipe_annotate(object = udpipe_model, x = input_text))
## }
.find_central_verb <- function(parse_tree_data) {
    parse_tree_data %>%
        dplyr::filter(stringr::str_detect(pos, "^VERB")) %>%
        dplyr::pull(token_id) -> res
    if (length(res) == 0) {
        ## fall back to the Penn Treebank tag when no UPOS verb is found
        parse_tree_data %>%
            dplyr::filter(stringr::str_detect(tag, "^VB")) %>%
            dplyr::pull(token_id) -> res
        if (length(res) == 0) {
            ## warning("No Verb. Getting Root instead.")
            res <- .find_root(parse_tree_data)
        }
    }
    return(min(res))
}
.find_terminal <- function(parse_tree_data) {
    ## the terminal token is the last non-punctuation token of the sentence
    parse_tree_data %>%
        dplyr::filter(pos != "PUNCT") %>%
        dplyr::summarise(max_id = max(token_id)) -> res
    if (nrow(res) == 0 || res$max_id[1] == -Inf) {
        ## max() over zero rows returns -Inf: there is no non-punctuation token
        return(NA)
    }
    return(res$max_id[1])
}
.find_root <- function(parse_tree_data) {
    res <- parse_tree_data$token_id[parse_tree_data$dep_rel == "ROOT"]
    return(res[1])
}
.safe_max <- function(x, replacement) {
    if (length(x) == 0) {
        return(replacement)
    }
    return(max(x))
}
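## .safe_max() guards against max() of an empty vector, which would return
## -Inf with a warning; hypothetical calls:
## .safe_max(integer(0), 0)  # returns the replacement, 0
## .safe_max(c(2L, 5L), 0)   # returns 5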
.cal_max_dist_root_raw <- function(parse_tree_data) {
    ## build a directed graph of head -> dependent edges; the root (with
    ## head_token_id == 0) has no incoming edge
    igraph::graph_from_data_frame(parse_tree_data[parse_tree_data$head_token_id != 0, c("head_token_id", "token_id")]) -> parse_tree
    root_node_id <- which(igraph::V(parse_tree)$name == .find_root(parse_tree_data))
    termin_node_id <- which(igraph::V(parse_tree)$name == .find_terminal(parse_tree_data))
    if (length(termin_node_id) == 0) {
        termin_node_id <- NA
    }
    central_node_id <- which(igraph::V(parse_tree)$name == .find_central_verb(parse_tree_data))
    central_node_paths <- igraph::all_simple_paths(parse_tree, from = central_node_id)
    if (length(central_node_paths) == 0) {
        ## no outgoing path from the central verb: return the theoretical maximum
        syntactic_dependency <- igraph::diameter(parse_tree)
    } else {
        central_node_paths %>%
            purrr::map_int(length) %>%
            .safe_max(replacement = igraph::diameter(parse_tree)) -> syntactic_dependency
    }
    if (is.na(termin_node_id) || root_node_id == termin_node_id) {
        syntactic_depth <- 0
    } else {
        igraph::all_simple_paths(parse_tree, from = root_node_id, to = termin_node_id) %>%
            purrr::map_int(length) %>%
            .safe_max(replacement = 0) -> syntactic_depth
    }
    return(tibble::tibble(syntactic_depth = syntactic_depth, syntactic_dependency = syntactic_dependency))
}
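## A hypothetical worked example for the sentence "Cats chase mice.": the
## verb is both the root and the central verb, and every simple path from it
## spans two vertices, so depth and dependency both come out as 2.
## toy <- tibble::tibble(token_id = 1:4,
##                       head_token_id = c(2, 0, 2, 2),
##                       token = c("Cats", "chase", "mice", "."),
##                       pos = c("NOUN", "VERB", "NOUN", "PUNCT"),
##                       tag = c("NNS", "VBP", "NNS", "."),
##                       dep_rel = c("nsubj", "ROOT", "dobj", "punct"))
## .cal_max_dist_root_raw(toy)
## #> syntactic_depth = 2, syntactic_dependency = 2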
.cal_mean_dist_root_sentences <- function(parsed_content) {
    parsed_content %>%
        dplyr::group_by(sentence_id) %>%
        dplyr::group_nest() %>%
        dplyr::pull(data) %>%
        purrr::map_dfr(.cal_max_dist_root_raw) %>%
        dplyr::summarise(syntactic_depth = mean(syntactic_depth, na.rm = TRUE),
                         syntactic_dependency = mean(syntactic_dependency, na.rm = TRUE))
}
cal_mean_dist_root <- function(parsed_entire_content, parallel, .progress, plan = "multiprocess") {
    if (parallel) {
        future::plan(plan)
        furrr::future_map_dfr(parsed_entire_content, .cal_mean_dist_root_sentences, .progress = .progress)
    } else {
        purrr::map_dfr(parsed_entire_content, .cal_mean_dist_root_sentences)
    }
}
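## Hypothetical usage with a list of per-document parse data frames (the
## shape calculate_textplex() below produces via dplyr::group_nest()):
## cal_mean_dist_root(parsed_text, parallel = FALSE, .progress = FALSE)
## With parallel = TRUE, a future plan such as "multisession" can be
## supplied via `plan`; see ?future::plan.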
.cal_mtld_raw <- function(input_text, mtld_threshold = 0.72, mtld_min_ntoken = 10, backward = FALSE) {
    ## Measure of Textual Lexical Diversity (MTLD): walk through the tokens,
    ## closing a "factor" each time the running type-token ratio (TTR) drops
    ## below the threshold, then divide the token count by the number of
    ## factors (including the partial factor left over at the end).
    current_ntoken <- 0
    current_ttr <- 0
    types <- c()
    word_tokens <- quanteda::tokens(input_text, what = "word")[[1]]
    if (backward) {
        word_tokens <- rev(word_tokens)
    }
    if (length(word_tokens) < mtld_min_ntoken) {
        return(NA)
    }
    current_nfactors <- 0
    for (word in word_tokens) {
        current_ntoken <- current_ntoken + 1
        types <- unique(c(types, word))
        current_ttr <- length(types) / current_ntoken
        if (current_ttr < mtld_threshold) {
            if (current_ntoken >= mtld_min_ntoken) {
                current_nfactors <- current_nfactors + 1
            }
            ## reset for the next factor
            current_ntoken <- 0
            types <- c()
            current_ttr <- 0
        }
    }
    ## partial factor: how far the remaining segment has travelled towards
    ## the threshold
    remaining_traj <- (1 - current_ttr) / (1 - mtld_threshold)
    total_nfactors <- current_nfactors + remaining_traj
    length(word_tokens) / total_nfactors
}
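## A hypothetical worked example of the forward pass (threshold 0.72,
## minimum 10 tokens) on the 11-token text
## "the cat sat on the mat and the dog sat too":
## the running TTR first drops below 0.72 at token 10 (7 types / 10 tokens
## = 0.70), closing one full factor; the final token restarts a segment with
## TTR 1, so the partial factor is (1 - 1) / (1 - 0.72) = 0 and the forward
## score is 11 / 1 = 11.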
.cal_mtld <- function(input_text, mtld_threshold = 0.72, mtld_min_ntoken = 10) {
    mean(c(.cal_mtld_raw(input_text, mtld_threshold = mtld_threshold, mtld_min_ntoken = mtld_min_ntoken),
           .cal_mtld_raw(input_text, mtld_threshold = mtld_threshold, mtld_min_ntoken = mtld_min_ntoken, backward = TRUE)))
}
cal_mtld <- function(input_text, mtld_threshold = 0.72, mtld_min_ntoken = 10) {
purrr::map_dbl(input_text, .cal_mtld, mtld_threshold = mtld_threshold, mtld_min_ntoken = mtld_min_ntoken)
}
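## Hypothetical usage on a character vector; documents shorter than
## mtld_min_ntoken yield NA, and the first document's score averages the
## forward pass above (11) with its backward pass (roughly 11.3):
## cal_mtld(c("the cat sat on the mat and the dog sat too", "too short"))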
#' Calculate the raw scores
#'
#' This function takes a vector of characters and calculates the raw scores required for the algorithm.
#' @importFrom magrittr %>%
#' @param input_text vector of characters.
#' @param parsed_text a list of dataframes. Must be parsed with spacyr::spacy_parse(pos = TRUE, dependency = TRUE, tag = TRUE)
#' @param mtld_threshold Threshold for the calculation of MTLD score.
#' @param mtld_min_ntoken Threshold for the minimal length of a viable sentence.
#' @param parallel logical, whether or not to use parallel computation.
#' @param .progress logical, whether or not to display a progress bar. Only applicable when parallel = TRUE.
#' @param plan character, please refer to ?future::plan for other options.
#' @return tibble of raw scores
#' @examples
#' \donttest{
#' require(spacyr)
#' data(fairy)
#' spacy_initialize(model = 'en')
#' fairy_rawscore <- calculate_textplex(fairy$text)
#' }
#' @export
calculate_textplex <- function(input_text, parsed_text = NULL, mtld_threshold = 0.72, mtld_min_ntoken = 10, parallel = TRUE, .progress = TRUE, plan = "multiprocess") {
    if (is.null(parsed_text)) {
        parsed_text <- spacyr::spacy_parse(input_text, pos = TRUE, dependency = TRUE, tag = TRUE, multi_thread = parallel) %>%
            dplyr::group_by(doc_id) %>%
            dplyr::group_nest() %>%
            dplyr::pull(data)
    }
    tokenized_text <- quanteda::tokens(input_text)
    lexdiv_res <- quanteda::textstat_lexdiv(tokenized_text, measure = c("TTR"))
    ari_res <- quanteda::textstat_readability(quanteda::corpus(input_text), measure = "ARI")
    input_dfm <- quanteda::dfm(tokenized_text)
    I <- quanteda::textstat_lexdiv(tokenized_text, measure = c("I"))$I
    ## NB: the current implementation of quanteda::textstat_entropy does not
    ## adjust for ntokens as in T&B. We need to do it ourselves
    entropy <- quanteda::textstat_entropy(input_dfm)$entropy / log(quanteda::ntoken(input_dfm), 2) * 100
    lexdiv_res %>%
        dplyr::left_join(ari_res, by = "document") %>%
        dplyr::mutate(I = I,
                      avg_sl = cal_avg_sentence_length(input_text),
                      entropy = entropy,
                      mtld = cal_mtld(input_text, mtld_threshold = mtld_threshold, mtld_min_ntoken = mtld_min_ntoken)) %>%
        dplyr::bind_cols(cal_mean_dist_root(parsed_text, parallel = parallel, .progress = .progress, plan = plan))
}
#' Fit a two-factor model as per Tolochko and Boomgaarden
#'
#' This function fits a two-factor model of text complexity as per Tolochko and Boomgaarden (2019).
#' @param raw_scores raw_scores extracted by calculate_textplex()
#' @param rotate which rotation to use. The original implementation uses "oblimin". Please consult ?psych::fa for other options.
#' @return a psych::principal fit.
#' @examples
#' data(fairy_rawscore)
#' fit <- fit_two_factor_model(fairy_rawscore)
#' @export
fit_two_factor_model <- function(raw_scores, rotate = "oblimin") {
    raw_scores %>%
        dplyr::select(TTR, I, entropy, mtld, avg_sl, ARI, syntactic_depth, syntactic_dependency) %>%
        psych::principal(nfactors = 2, rotate = rotate) -> fit
    return(fit)
}
#' Fifty H.C. Andersen’s fairy tales
#'
#' First fifty fairy tales from hcandersenr::hcandersen_en.
#'
#' @docType data
#'
#' @usage data(fairy)
#'
#' @examples
#' \donttest{
#' data(fairy)
#' fairy_rawscore <- calculate_textplex(fairy$text)
#' }
"fairy"
#' Raw scores extracted from the fairy dataset
#'
#' Raw scores extracted from the fairy dataset using calculate_textplex
#'
#' @docType data
#'
#' @usage data(fairy_rawscore)
#'
#' @examples
#' data(fairy_rawscore)
#' fit <- fit_two_factor_model(fairy_rawscore)
"fairy_rawscore"
#' Raw scores extracted from the state of the union addresses
#'
#' Raw scores extracted from the state of the union addresses using calculate_textplex
#'
#' @docType data
#'
#' @usage data(sotu_rawscore)
#'
#' @examples
#' data(sotu_rawscore)
#' fit <- fit_two_factor_model(sotu_rawscore)
"sotu_rawscore"