Nothing
#' Visualize the output of the LambdaG algorithm
#'
#' This function outputs a colour-coded list of sentences belonging to the input Q text ordered from highest to lowest \eqn{\lambda_G}, as shown in Nini et al. (under review).
#'
#' @param q.data A single questioned or disputed text as a `quanteda` tokens object with the tokens being sentences (e.g. the output of [tokenize_sents()]).
#' @param k.data A known or undisputed corpus containing exclusively a single candidate author's texts as a `quanteda` tokens object with the tokens being sentences (e.g. the output of [tokenize_sents()]).
#' @param ref.data The reference dataset as a `quanteda` tokens object with the tokens being sentences (e.g. the output of [tokenize_sents()]).
#' @param N The order of the model. Default is 10.
#' @param r The number of iterations. Default is 30.
#' @param output A string detailing the file type of the colour-coded text output. Either "html" (default) or "latex".
#' @param print A string indicating the path to the folder where to print a colour-coded text file. If left empty (default), then nothing is printed.
#' @param scale A string indicating what scale to use to colour-code the text file. If "absolute" (default) then the raw \eqn{\lambda_G} is used; if "relative", then the z-score of \eqn{\lambda_G} over the Q data is used instead, thus showing relative importance.
#' @param cores The number of cores to use for parallel processing (the default is one).
#'
#' @references Nini, A., Halvani, O., Graner, L., Gherardi, V., Ishihara, S. Authorship Verification based on the Likelihood Ratio of Grammar Models. https://arxiv.org/abs/2403.08462v1
#' @return The function outputs a list of two objects: a data frame with each row being a token in the Q text and the values of \eqn{\lambda_G} for the token and sentences, in decreasing order of sentence \eqn{\lambda_G} and with the relative contribution of each token and each sentence to the final \eqn{\lambda_G} in percentage; the raw code in html or LaTeX that generates the colour-coded file. If a path is provided for the print argument then the function will also save the colour-coded text as an html or plain text file.
#'
#' @examples
#' q.data <- corpus_trim(enron.sample[1], "sentences", max_ntoken = 10) |> quanteda::tokens("sentence")
#' k.data <- enron.sample[2:5]|> quanteda::tokens("sentence")
#' ref.data <- enron.sample[6:ndoc(enron.sample)] |> quanteda::tokens("sentence")
#' outputs <- lambdaG_visualize(q.data, k.data, ref.data, r = 2)
#' outputs$table
#'
#' @export
lambdaG_visualize <- function(q.data, k.data, ref.data, N = 10, r = 30, output = "html", print = "", scale = "absolute", cores = NULL){
if(length(unique(quanteda::docvars(k.data, "author"))) != 1){
stop("The k.data does not contain a single candidate author.")
}
if(length(q.data) != 1){
stop("The q.data does not contain one text only.")
}
llr.table <- loglikelihood_table_avgllrs(q.data, k.data, ref.data, r, N, cores = cores)
# calculation of relative contribution in percentage
dplyr::filter(llr.table, lambdaG > 0) |> dplyr::pull(lambdaG) |> sum() -> pos.t.total
dplyr::filter(llr.table, lambdaG < 0) |> dplyr::pull(lambdaG) |> sum() -> neg.t.total
dplyr::slice_head(llr.table, n = 1, by = sentence_id) |> dplyr::filter(sentence_lambdaG > 0) |>
dplyr::pull(sentence_lambdaG) |> sum() -> pos.s.total
dplyr::slice_head(llr.table, n = 1, by = sentence_id) |> dplyr::filter(sentence_lambdaG < 0) |>
dplyr::pull(sentence_lambdaG) |> sum() -> neg.s.total
llr.table |>
dplyr::mutate(token_contribution = dplyr::case_when(lambdaG > 0 ~ round(lambdaG/pos.t.total*100, 2),
lambdaG < 0 ~ round(-lambdaG/neg.t.total*100, 2),
lambdaG == 0 ~ 0)) |>
dplyr::mutate(sent_contribution = dplyr::case_when(sentence_lambdaG > 0 ~
round(sentence_lambdaG/pos.s.total*100, 2),
sentence_lambdaG < 0 ~
round(-sentence_lambdaG/neg.s.total*100, 2),
sentence_lambdaG == 0 ~ 0)) ->
llr.table
if(output == "html"){
filename <- paste0(quanteda::docnames(q.data), ".html")
cc.text <- color_coding_html(llr.table, scale)
}else if(output == "latex"){
filename <- paste0(quanteda::docnames(q.data), ".txt")
cc.text <- color_coding_latex(llr.table, scale)
}
if(print != ""){
write(cc.text,
paste0(print, filename)
)
}
output.list <- list(table = llr.table, colourcoded_text = cc.text)
return(output.list)
}
extract <- function(s, N){
kgrams::kgram_freqs(s, N = N) |>
kgrams::language_model(smoother = "kn", D = 0.75) -> model
return(model)
}
loglikelihood_one_rep <- function(q.sents, k.sents, ref.sents, N, k.g, k.sent.probs){
ref.g <- ref.sents |> sample(length(k.sents)) |> extract(N)
ref.sent.probs = kgrams::probability(q.sents, ref.g)
sentence.llrs <- log10(k.sent.probs/ref.sent.probs)
table = mapply(\(id, s, sllr) {
toks = s |> stringr::str_squish() |> stringr::str_split_1(" ")
n_tokens = length(toks) + 1
res = data.frame(sentence_id = id, token_id = 1:n_tokens, t = character(n_tokens), k = numeric(n_tokens),
ref = numeric(n_tokens), llr = numeric(n_tokens), sentence_llr = sllr)
for (i in 1:n_tokens) {
topred = ifelse(i < n_tokens, toks[i], kgrams::EOS())
context = toks[ seq(from = 0, to = i - 1) ]
context = c(rep(kgrams::BOS(), N - 1), context)
context = paste0(context, collapse = " ")
res$t[i] = topred
res$k[i] = kgrams::probability(kgrams::`%|%`(topred, context), k.g)
res$ref[i] = kgrams::probability(kgrams::`%|%`(topred, context), ref.g)
res$llr[i] = log10(res$k[i]/res$ref[i])
}
res
}, 1:length(q.sents), q.sents, sentence.llrs, SIMPLIFY = FALSE)
table = dplyr::bind_rows(table)
return(table)
}
loglikelihood_table_avgllrs <- function(q.data, k.data, ref.data, r, N, cores){
k.sents = as.character(k.data)
ref.sents = as.character(ref.data)
q.sents = as.character(q.data)
k.g = extract(k.sents, N)
k.sent.probs = kgrams::probability(q.sents, k.g)
pbapply::pbreplicate(r, loglikelihood_one_rep(q.sents, k.sents, ref.sents, N, k.g, k.sent.probs),
simplify = FALSE, cl = cores) |>
dplyr::bind_rows() |>
dplyr::group_by(sentence_id, token_id, t) |>
dplyr::summarise(lambdaG = mean(llr), sentence_lambdaG = round(mean(sentence_llr), 3)) |>
dplyr::ungroup() |>
dplyr::mutate(zlambdaG = as.numeric(round(scale(lambdaG), 3))) |>
dplyr::arrange(desc(sentence_lambdaG), sentence_id, token_id) -> final.table
return(final.table)
}
color_coding_html <- function(llr.table, scale){
if(scale == "absolute"){
llr.table |> dplyr::mutate(color = dplyr::case_when(lambdaG > 0 & lambdaG < 1 ~ "#fdedec",
lambdaG >= 1 & lambdaG < 2 ~ "#f5b7b1",
lambdaG >= 2 & lambdaG < 3 ~ "#ec7063",
lambdaG >= 3 & lambdaG <= 4 ~ "#cb4335",
lambdaG > 4 ~ "#943126",
.default = "")) -> table2
}
if(scale == "relative"){
llr.table |> dplyr::mutate(color = dplyr::case_when(zlambdaG > 0.5 & zlambdaG <= 1 ~ "#FADBD8",
zlambdaG > 1 & zlambdaG <= 2 ~ "#F1948A",
zlambdaG > 2 ~ "#E74C3C",
.default = "")) -> table2
}
string = c()
for (i in 1:nrow(table2)) {
if(table2[i, "color"] != ""){
segment = paste0("<span style=\"background-color: ", table2[i, "color"], ";\">",
table2[i, "t"],
"</span>")
}else{
segment = table2[i, "t"]
}
string = paste(string, segment, sep = " ") |> stringr::str_replace_all("___EOS___", "<br>")
}
return(string)
}
color_coding_latex <- function(llr.table, scale){
if(scale == "absolute"){
llr.table |> dplyr::mutate(t = dplyr::if_else(t == "___EOS___", "[EOS]", t),
color = dplyr::case_when(lambdaG > 0 & lambdaG < 1 ~ 20,
lambdaG >= 1 & lambdaG < 2 ~ 40,
lambdaG >= 2 & lambdaG < 3 ~ 60,
lambdaG >= 3 & lambdaG <= 4 ~ 80,
lambdaG > 4 ~ 100,
.default = 0)) -> table2
}
if(scale == "relative"){
llr.table |> dplyr::mutate(t = dplyr::if_else(t == "___EOS___", "[EOS]", t),
color = dplyr::case_when(zlambdaG > 0.5 & zlambdaG <= 1 ~ 20,
zlambdaG > 1 & zlambdaG <= 2 ~ 50,
zlambdaG > 2 ~ 70,
.default = 0)) -> table2
}
string = c()
for (i in 1:nrow(table2)) {
if(table2[i, "t"] == "[EOS]"){
segment = paste0("\\code{colorlow!", table2[i, "color"], "}{\\strut ",
table2[i, "t"],
"}\\allowbreak\\newline")
}else{
segment = paste0("\\code{colorlow!", table2[i, "color"], "}{\\strut ",
table2[i, "t"],
"}\\allowbreak")
}
string <- paste0(string, segment)
}
return(string)
}
Any scripts or data that you put into this service are public.
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.