Nothing
## Keyness Analysis UI ----
keynessUI <- function() {
tabItem(
tabName = "keyness",
fluidPage(
fluidRow(
column(
8,
h3(strong("Keyness Analysis"), align = "center")
),
div(
title = t_run,
column(
1,
do.call(
"actionButton",
c(
run_bttn,
list(
inputId = "run_keyness"
)
)
)
)
),
div(
title = t_export,
column(
1,
do.call(
"actionButton",
c(
export_bttn,
list(
inputId = "keynessExport"
)
)
)
)
),
div(
title = t_report,
column(
1,
do.call(
"actionButton",
c(
report_bttn,
list(
inputId = "keynessReport"
)
)
)
)
),
div(
column(
1,
dropdown(
h4(strong("Options: ")),
br(),
# Main Configuration
div(
class = "config-section",
div(
class = "config-section-header",
icon("cog", lib = "glyphicon"),
"Main Configuration"
),
selectInput(
inputId = "keyness_approach",
label = "Analysis Approach:",
choices = c(
"Reference Corpus" = "reference_corpus",
"Two Corpus Comparison" = "two_corpus"
),
selected = "reference_corpus"
),
conditionalPanel(
condition = "input.keyness_approach == 'two_corpus'",
uiOutput("keyness_group_warning")
),
numericInput(
inputId = "keyness_n",
label = "Max Number of Terms:",
value = 1000,
min = 100,
max = 10000,
step = 100
),
numericInput(
inputId = "keyness_minchar",
label = "Min Character Length:",
value = 3,
min = 1,
max = 10,
step = 1
),
selectInput(
inputId = "keyness_measure",
label = "Keyness Measure:",
choices = c(
"G2" = "G2",
"LogOddsRatio" = "LogOddsRatio",
"Phi" = "phi",
"Delta P" = "DeltaP"
),
selected = "G2"
)
# ,
# checkboxGroupInput(
# inputId = "keyness_upos",
# label = "POS Tags:",
# choices = c("NOUN", "VERB", "ADJ", "ADV"),
# selected = c("NOUN", "VERB"),
# inline = FALSE
# )
),
# Graphical Parameters Section
tags$details(
class = "advanced-section",
tags$summary(
div(
class = "params-section-header",
style = "display: flex; justify-content: space-between; align-items: center;",
div(
icon("eye-open", lib = "glyphicon"),
" Graphical Parameters"
),
icon(
"chevron-down",
lib = "glyphicon",
style = "font-size: 12px;"
)
)
),
div(
style = "margin-top: 10px;",
h4("Bar Plot:"),
numericInput(
"Keyness_Nbarplot",
label = "N. of Words",
value = 10,
min = 1,
max = 20,
step = 1
)
),
div(
style = "margin-top: 10px;",
h4("Wordcloud:"),
fluidRow(
column(
6,
numericInput(
"Keyness_Nwc",
label = "N. of Words",
value = 50,
min = 10,
step = 1,
max = 200
)
),
column(
6,
numericInput(
"Keyness_size",
label = "Label Size",
value = 100,
min = 10,
step = 1,
max = 200
)
)
)
),
div(
style = "margin-top: 10px;",
h4("Frequency Context:"),
fluidRow(
column(
6,
numericInput(
"Keyness_freq_words",
label = "N. of Words",
value = 20,
min = 1,
step = 1,
max = 100
)
),
column(
6,
numericInput(
"Keyness_label_spacing",
label = "Label spacing",
value = 0.15,
min = 0,
step = 0.01,
max = 1
)
)
)
)
),
style = "gradient",
right = TRUE,
animate = TRUE,
circle = TRUE,
tooltip = tooltipOptions(title = "Options"),
icon = icon("sliders", lib = "font-awesome"),
width = "300px"
)
),
style = style_opt
)
),
fluidRow(
tabsetPanel(
type = "tabs",
tabPanel(
br(),
title = "Plot",
icon = icon("chart-column"),
shinycssloaders::withSpinner(
plotlyOutput(
outputId = "keyness_barplot_plotly",
height = "75vh",
width = "95.0%"
),
color = getOption("spinner.color", default = "#4F7942")
)
),
tabPanel(
br(),
title = "WordCloud",
icon = icon("chart-column"),
# Use renderUI to dynamically create wordcloud containers
fluidRow(
conditionalPanel(
condition = "input.keyness_approach == 'reference_corpus'",
h4(
"Target Corpus: High Keyness Words",
align = "center",
style = "color: #4F7942; margin-bottom: 20px;"
)
),
shinycssloaders::withSpinner(
plotOutput(
outputId = "keyness_wordcloud_plot",
height = "600px",
width = "100%"
),
color = getOption("spinner.color", default = "#4F7942")
)
)
),
tabPanel(
br(),
title = "Frequency Context Plot",
icon = icon("chart-column"),
conditionalPanel(
condition = "input.keyness_approach == 'reference_corpus'",
shinycssloaders::withSpinner(
plotlyOutput(
outputId = "keyness_frequency_plotly",
height = "80vh",
width = "95.0%"
),
color = getOption("spinner.color", default = "#4F7942")
)
),
conditionalPanel(
condition = "input.keyness_approach == 'two_corpus'",
div(
style = "padding: 50px; text-align: center;",
h4(
"Frequency Context Plot is only available for Reference Corpus approach"
),
p(
"This plot compares your corpus against a general reference corpus.",
style = "color: #666; margin-top: 20px;"
)
)
)
),
tabPanel(
title = "Table",
icon = icon("table"),
shinycssloaders::withSpinner(
DT::DTOutput("keyness_table"),
color = getOption("spinner.color", default = "#4F7942")
),
align = "center"
),
tabPanel(
"Info & References",
fluidPage(
fluidRow(
column(1),
column(
10,
div(
style = "padding: 30px; background: white; border-radius: 8px;
box-shadow: 0 2px 4px rgba(0,0,0,0.05); margin-top: 20px;",
HTML(infoTexts$keyness)
)
),
column(1)
)
)
)
)
)
)
)
}
## Keyness Analysis Server ----
keynessServer <- function(input, output, session, values) {
# Reactive counter to force re-rendering
render_counter <- reactiveVal(0)
# Check if keyness_group exists when two_corpus approach is selected
output$keyness_group_warning <- renderUI({
req(input$keyness_approach == "two_corpus")
if (!"keyness_group" %in% names(values$dfTag)) {
div(
style = "padding: 10px; background-color: #fff3cd; border: 1px solid #ffc107;
border-radius: 4px; margin: 10px 0;",
icon("exclamation-triangle", style = "color: #856404;"),
span(
style = "color: #856404; margin-left: 5px;",
strong("Warning:"),
" The 'keyness_group' variable is not defined. Please define it in the Feature Roles menu before running the analysis."
)
)
}
})
# Keyness Analysis
keyness_results <- eventReactive(
input$run_keyness,
{
# Get approach
approach <- input$keyness_approach
if (approach == "refrence_corpus") {
upos <- intersect(
values$dfTag %>%
LemmaSelection() %>%
dplyr::filter(docSelected) %>%
distinct(upos) %>%
pull(upos),
c("NOUN", "VERB", "ADJ", "ADV")
)
} else {
upos <- intersect(
values$dfTag %>%
LemmaSelection() %>%
dplyr::filter(docSelected) %>%
distinct(upos) %>%
pull(upos),
c(
"ADJ",
"ADP",
"ADV",
"AUX",
"CCONJ",
"DET",
"EMAIL",
"EMOJI",
"HASH",
"INTJ",
"IP_ADDRESS",
"MENTION",
"MULTIWORD",
"NOUN",
"NUM",
"PRON",
"PROPN",
"PUNCT",
"SCONJ",
"SYM",
"TO_REMOVE",
"URL",
"VERB"
)
)
}
# Increment counter to force re-rendering
render_counter(render_counter() + 1)
# Check if keyness_group exists for two_corpus approach
if (
approach == "two_corpus" && !"keyness_group" %in% names(values$dfTag)
) {
showNotification(
"Error: 'keyness_group' variable not found. Please define it in Feature Roles menu.",
type = "error",
duration = 10
)
return(NULL)
}
if (length(upos) == 0) {
showNotification(
"Error: No valid PoS Tag selected.",
type = "error",
duration = 10
)
return(NULL)
}
withProgress(message = 'Running Keyness Analysis...', value = 0, {
incProgress(0.3, detail = "Calculating frequencies...")
# Run keyness analysis with selected approach
results <- tall_keyness_analysis(
dfTag = values$dfTag,
approach = approach,
language = values$language,
N = input$keyness_n,
min.char = input$keyness_minchar,
upos_list = upos,
term = ifelse(
approach == "reference_corpus",
"token",
values$generalTerm
)
)
incProgress(0.7, detail = "Generating plots...")
if (is.null(input$keyness_measure)) {
measure = "G2"
} else {
measure = input$keyness_measure
}
if (is.null(input$Keyness_Nbarplot)) {
Nbarplot = 10
} else {
Nbarplot = input$Keyness_Nbarplot
}
# Generate bar plot with approach parameter
results <- c(
results,
plot_tall_keyness(
results$results,
measure = measure,
N = Nbarplot,
approach = approach
)
)
# Generate frequency context plot only for reference_corpus approach
if (approach == "reference_corpus") {
plot_frequency <- frequency_context_analysis(
results$results,
top_n = input$Keyness_freq_words,
g2_threshold = quantile(results$results$G2, 0.95),
label_spacing = input$Keyness_label_spacing
)
results <- c(
results,
plot_frequency = list(plot_frequency)
)
}
if (is.null(input$Keyness_Nwc)) {
Nwc = 100
} else {
Nwc = input$Keyness_Nwc
}
# Generate wordcloud(s) based on approach
if (approach == "reference_corpus") {
# Single wordcloud for reference corpus approach
data_target <- results$results %>%
ungroup() %>%
select(token, all_of(measure)) %>%
arrange(desc(.data[[measure]])) %>%
slice_head(n = Nwc) %>%
rename(word = token, freq = all_of(measure))
results <- c(
results,
wc_data = list(data_target)
)
values$keyness_wordcloud_plot <- wordcloud(
data_target,
shape = "circle",
rot_per = 0.2,
eccentricity = 1.3,
colors = sample(colorlist(), nrow(data_target), replace = T),
seed = values$random_seed,
max_size = input$Keyness_size
)
} else if (approach == "two_corpus") {
# For two corpus: create static wordcloud data
# Try to split by measure sign first
data_positive <- results$results %>%
ungroup() %>%
filter(.data[[measure]] > 0) %>%
select(token, all_of(measure)) %>%
arrange(desc(.data[[measure]])) %>%
slice_head(n = Nwc)
data_negative <- results$results %>%
ungroup() %>%
filter(.data[[measure]] < 0) %>%
select(token, all_of(measure)) %>%
arrange(.data[[measure]]) %>%
slice_head(n = Nwc) %>%
mutate(across(all_of(measure), abs))
# Fallback: if one corpus is empty, split top/bottom
if (nrow(data_positive) == 0 || nrow(data_negative) == 0) {
all_data <- results$results %>%
ungroup() %>%
select(token, all_of(measure)) %>%
arrange(desc(.data[[measure]]))
half_n <- min(Nwc, ceiling(nrow(all_data) / 2))
data_positive <- all_data %>%
slice_head(n = half_n)
data_negative <- all_data %>%
slice_tail(n = half_n) %>%
mutate(across(all_of(measure), abs))
}
# Prepare final data frames
data_corpus1 <- data_positive %>%
rename(word = token, freq = all_of(measure)) %>%
mutate(Corpus = "Corpus 1")
data_corpus2 <- data_negative %>%
rename(word = token, freq = all_of(measure)) %>%
mutate(Corpus = "Corpus 2")
results <- c(
results,
wc1_data = list(data_corpus1),
wc2_data = list(data_corpus2)
)
values$keyness_wordcloud_plot <- wordcloud(
bind_rows(
data_corpus1,
data_corpus2
),
shape = "circle",
rot_per = 0.2,
eccentricity = 1.3,
colors = c("#4575B4", "#D73027"),
seed = values$random_seed,
max_size = input$Keyness_size * 0.7,
facet_by = "Corpus",
facet_ncol = 2
)
}
# Format results table
results$results <- results$results %>%
rename(Word = token, Obs_Freq = O11, Exp_Freq = O12) %>%
select(
Word,
Sig_corrected,
Obs_Freq,
Exp_Freq,
G2,
RDF,
RateRatio,
OddsRatio,
LogOddsRatio,
phi,
MI,
PMI,
DeltaP
) %>%
mutate(
G2 = round(G2, 3),
RDF = round(RDF, 3),
RateRatio = round(RateRatio, 3),
OddsRatio = round(OddsRatio, 3),
LogOddsRatio = round(LogOddsRatio, 3),
phi = round(phi, 3),
MI = round(MI, 3),
PMI = round(PMI, 3),
DeltaP = round(DeltaP, 3)
)
})
values$keyness_results <- results
},
ignoreNULL = TRUE
)
# Keyness Table Output
output$keyness_table <- DT::renderDT({
req(keyness_results())
DTformat(values$keyness_results$results)
})
# Keyness Bar Plot plotly
output$keyness_barplot_plotly <- plotly::renderPlotly({
req(keyness_results())
values$keyness_results$plot_plotly_bar
})
output$keyness_frequency_plotly <- plotly::renderPlotly({
req(keyness_results())
req(input$keyness_approach == "reference_corpus")
values$keyness_results$plot_frequency
})
output$keyness_wordcloud_plot <- renderPlot(
{
req(keyness_results())
req(input$keyness_approach %in% c("reference_corpus", "two_corpus"))
if (input$keyness_approach == "reference_corpus") {
req(values$keyness_results$wc_data)
nrowdata <- nrow(values$keyness_results$wc_data)
} else if (input$keyness_approach == "two_corpus") {
req(values$keyness_results$wc1_data)
nrowdata <- nrow(values$keyness_results$wc1_data) +
nrow(values$keyness_results$wc2_data)
}
if (nrowdata == 0) {
plot.new()
text(
0.5,
0.5,
"No keywords found for Target Corpus",
cex = 1.5,
col = "gray"
)
return()
}
# Use wordcloud package for static plot
tryCatch(
{
values$keyness_wordcloud_plot
},
error = function(e) {
plot.new()
text(
0.5,
0.5,
paste("Error rendering Corpus :\n", e$message),
cex = 1,
col = "red"
)
}
)
},
bg = "white"
)
}
## Keyness Analysis Function ----
tall_keyness_analysis <- function(
dfTag,
approach = c("reference_corpus", "two_corpus"),
language = "english",
N = 2000,
min.char = 3,
upos_list = c("NOUN", "VERB"),
term = "token"
) {
# Match the approach argument
approach <- match.arg(approach)
# ============================================================================
# APPROACH 1: REFERENCE CORPUS (original implementation)
# Compare target corpus against a reference word frequency list
# ============================================================================
if (approach == "reference_corpus") {
# Load word frequency list for the specified language
word_frequency <- tall_load_wordlist(language = language)
# Calculate observed frequencies by filtering and aggregating tokens
x <- dfTag %>%
dplyr::filter(upos %in% upos_list) %>%
mutate(token = tolower(token)) %>%
dplyr::group_by(token) %>%
dplyr::summarise(n = n()) %>%
ungroup() %>%
dplyr::filter(nchar(token) > min.char) %>%
slice_max(order_by = n, n = N) %>%
rename(obsFreq = n) %>%
as_tibble()
# Calculate total number of words (excluding numbers and punctuation)
total_words <- nrow(
dfTag %>%
dplyr::filter(!upos %in% c("NUM", "PUNCT"))
)
# Calculate expected frequencies based on reference corpus
df <- word_frequency %>%
as_tibble() %>%
mutate(token = tolower(token)) %>%
group_by(token) %>%
summarise(rel_freq = sum(rel_freq, na.rm = T)) %>%
ungroup() %>%
mutate(expFreq = round(rel_freq * total_words, 0))
# Create frequency table by joining observed and expected frequencies
freq_table <- x %>%
left_join(df, by = c("token")) %>%
select(token, obsFreq, expFreq) %>%
distinct() %>%
ungroup() %>%
# replace NA in expFreq with 1
dplyr::mutate(
expFreq = ifelse(is.na(expFreq), 1, expFreq)
)
} else if (approach == "two_corpus") {
# ============================================================================
# APPROACH 2: TWO CORPUS COMPARISON
# Compare corpus 1 against corpus 2 to identify distinctive features
# ============================================================================
# Check if keyness_group column exists
if (!"keyness_group" %in% names(dfTag)) {
stop(
"Error: dfTag must contain a 'keyness_group' column with values 1 and 2 for two-corpus comparison."
)
}
# Check if keyness_group has exactly two groups (1 and 2) and remove NAs
groups <- unique(dfTag %>% drop_na(keyness_group) %>% pull(keyness_group))
if (!all(c(1, 2) %in% groups) || length(groups) != 2) {
stop(
"Error: 'keyness_group' must contain exactly two groups with values 1 and 2."
)
}
# Calculate total number of tokens in each corpus (filtered by upos)
n_tokens_corpus1 <- dfTag %>%
dplyr::filter(keyness_group == 1 & upos %in% upos_list) %>%
nrow()
n_tokens_corpus2 <- dfTag %>%
dplyr::filter(keyness_group == 2 & upos %in% upos_list) %>%
nrow()
# Calculate normalization ratio to scale corpus 2 to corpus 1 size
normalization_ratio <- n_tokens_corpus1 / n_tokens_corpus2
# Print corpus sizes for user information
message(sprintf("Corpus 1 size: %d tokens", n_tokens_corpus1))
message(sprintf("Corpus 2 size: %d tokens", n_tokens_corpus2))
message(sprintf("Normalization ratio: %.4f", normalization_ratio))
# Calculate frequencies for corpus 1
freq_corpus1 <- dfTag %>%
dplyr::filter(keyness_group == 1 & upos %in% upos_list) %>%
dplyr::rename(term_col = all_of(term)) %>%
mutate(term_col = tolower(term_col)) %>%
dplyr::group_by(term_col) %>%
dplyr::summarise(obsFreq = n()) %>%
ungroup() %>%
dplyr::filter(nchar(term_col) > min.char) %>%
dplyr::rename(token = term_col)
# freq_corpus1 <- dfTag %>%
# dplyr::filter(keyness_group == 1 & upos %in% upos_list) %>%
# mutate(token = tolower(token)) %>%
# dplyr::group_by(token) %>%
# dplyr::summarise(obsFreq = n()) %>%
# ungroup() %>%
# dplyr::filter(nchar(token) > min.char)
# Calculate frequencies for corpus 2
freq_corpus2 <- dfTag %>%
dplyr::filter(keyness_group == 2 & upos %in% upos_list) %>%
dplyr::rename(term_col = all_of(term)) %>%
mutate(term_col = tolower(term_col)) %>%
dplyr::group_by(term_col) %>%
dplyr::summarise(expFreq_raw = n()) %>%
ungroup() %>%
dplyr::filter(nchar(term_col) > min.char) %>%
dplyr::rename(token = term_col) %>%
dplyr::mutate(expFreq = expFreq_raw * normalization_ratio)
# freq_corpus2 <- dfTag %>%
# dplyr::filter(keyness_group == 2 & upos %in% upos_list) %>%
# mutate(token = tolower(token)) %>%
# dplyr::group_by(token) %>%
# dplyr::summarise(expFreq_raw = n()) %>%
# ungroup() %>%
# dplyr::filter(nchar(token) > min.char) %>%
# # Normalize frequencies of corpus 2 by scaling to corpus 1 size
# dplyr::mutate(expFreq = expFreq_raw * normalization_ratio)
# Join the two frequency tables
# Keep all tokens from both corpora
freq_table <- freq_corpus1 %>%
full_join(freq_corpus2 %>% select(token, expFreq), by = "token") %>%
# Replace NA values with 1 to avoid calculation errors
dplyr::mutate(
obsFreq = ifelse(is.na(obsFreq), 1, obsFreq),
expFreq = ifelse(is.na(expFreq), 1, expFreq)
) %>%
# Select top N tokens by sum of frequencies
dplyr::mutate(total_freq = obsFreq + expFreq) %>%
slice_max(order_by = total_freq, n = N) %>%
select(token, obsFreq, expFreq)
}
# ============================================================================
# COMMON CALCULATIONS FOR BOTH APPROACHES
# Calculate contingency table statistics and keyness measures
# ============================================================================
# Calculate contingency table statistics
stats_tb2 <- freq_table %>%
dplyr::mutate(
C1 = sum(obsFreq),
C2 = sum(expFreq),
N = C1 + C2
) %>%
dplyr::rowwise() %>%
dplyr::mutate(
R1 = obsFreq + expFreq,
R2 = N - R1,
O11 = obsFreq,
O11 = ifelse(O11 == 0, O11 + 0.1, O11),
O12 = R1 - O11,
O21 = C1 - O11,
O22 = C2 - O12
) %>%
dplyr::mutate(
E11 = (R1 * C1) / N,
E12 = (R1 * C2) / N,
E21 = (R2 * C1) / N,
E22 = (R2 * C2) / N
) %>%
dplyr::select(-obsFreq, -expFreq)
# Calculate association measures and keyness statistics
assoc_tb3 <- stats_tb2 %>%
dplyr::mutate(Rws = nrow(.)) %>%
dplyr::rowwise() %>%
# Calculate Fisher's exact test
dplyr::mutate(
p = as.vector(unlist(fisher.test(matrix(
c(O11, O12, O21, O22),
ncol = 2,
byrow = T
))[1]))
) %>%
# Calculate per thousand word frequencies
dplyr::mutate(
ptw_target = O11 / C1 * 1000,
ptw_ref = O12 / C2 * 1000
) %>%
# Calculate chi-square statistic
dplyr::mutate(
X2 = (O11 - E11)^2 /
E11 +
(O12 - E12)^2 / E12 +
(O21 - E21)^2 / E21 +
(O22 - E22)^2 / E22
) %>%
# Calculate various keyness measures
dplyr::mutate(
phi = sqrt((X2 / N)),
MI = log2(O11 / E11),
t.score = (O11 - E11) / sqrt(O11),
PMI = log2((O11 / N) / ((O11 + O12) / N) * ((O11 + O21) / N)),
DeltaP = (O11 / R1) - (O21 / R2),
LogOddsRatio = log(
((O11 + 0.5) * (O22 + 0.5)) / ((O12 + 0.5) * (O21 + 0.5))
),
G2 = 2 *
((O11 + 0.001) *
log((O11 + 0.001) / E11) +
(O12 + 0.001) * log((O12 + 0.001) / E12) +
O21 * log(O21 / E21) +
O22 * log(O22 / E22)),
# Traditional keyness measures
RateRatio = ((O11 + 0.001) / (C1 * 1000)) / ((O12 + 0.001) / (C2 * 1000)),
RateDifference = (O11 / (C1 * 1000)) - (O12 / (C2 * 1000)),
DifferenceCoefficient = RateDifference /
sum((O11 / (C1 * 1000)), (O12 / (C2 * 1000))),
OddsRatio = ((O11 + 0.5) * (O22 + 0.5)) / ((O12 + 0.5) * (O21 + 0.5)),
LLR = 2 * (O11 * (log((O11 / E11)))),
RDF = abs((O11 / C1) - (O12 / C2)),
PDiff = abs(ptw_target - ptw_ref) / ((ptw_target + ptw_ref) / 2) * 100,
SignedDKL = sum(
ifelse(O11 > 0, O11 * log(O11 / ((O11 + O12) / 2)), 0) -
ifelse(O12 > 0, O12 * log(O12 / ((O11 + O12) / 2)), 0)
)
) %>%
# Determine Bonferroni corrected significance
dplyr::mutate(
Sig_corrected = dplyr::case_when(
p / Rws > .05 ~ "n.s.",
p / Rws > .01 ~ "p < .05*",
p / Rws > .001 ~ "p < .01**",
p / Rws <= .001 ~ "p < .001***",
T ~ "N.A."
)
) %>%
# Round p-value and determine type/antitype
dplyr::mutate(
p = round(p, 5),
type = ifelse(E11 > O11, "antitype", "type"),
phi = ifelse(E11 > O11, -phi, phi),
G2 = ifelse(E11 > O11, -G2, G2)
) %>%
# Filter out non-significant results
dplyr::filter(Sig_corrected != "n.s.") %>%
# Arrange by G2 statistic
dplyr::arrange(-G2) %>%
# Remove superfluous columns
dplyr::select(
-any_of(c(
"TermCoocFreq",
"AllFreq",
"NRows",
"R1",
"R2",
"C1",
"C2",
"E12",
"E21",
"E22",
"upp",
"low",
"op",
"t.score",
"z.score",
"Rws"
))
) %>%
# Relocate important columns to the front
dplyr::relocate(any_of(c(
"token",
"type",
"Sig_corrected",
"O11",
"O12",
"ptw_target",
"ptw_ref",
"G2",
"RDF",
"RateRatio",
"RateDifference",
"DifferenceCoefficient",
"LLR",
"SignedDKL",
"PDiff",
"LogOddsRatio",
"MI",
"PMI",
"phi",
"X2",
"OddsRatio",
"DeltaP",
"p",
"E11",
"O21",
"O22"
)))
# Return results with approach information
return(list(
results = assoc_tb3,
approach = approach,
corpus1_size = if (approach == "two_corpus") n_tokens_corpus1 else NULL,
corpus2_size = if (approach == "two_corpus") n_tokens_corpus2 else NULL,
normalization_ratio = if (approach == "two_corpus") {
normalization_ratio
} else {
NULL
}
))
}
### Keyness Wordlist Functions ----
tall_download_wordlist <- function(
language,
file_dir = NULL,
overwrite = TRUE
) {
filename <- paste0(language, "_word_frequency.keyness")
if (is.null(file_dir)) {
file_dir <- paste0(homeFolder(), "/tall/language_models")
}
url <- file.path(
"https://raw.githubusercontent.com/massimoaria/tall.language.models/main/word.frequency.data",
filename
)
to <- file.path(file_dir, filename)
download_failed <- FALSE
download_message <- "OK"
dl <- suppressWarnings(try(
utils::download.file(url = url, destfile = to, mode = "wb"),
silent = TRUE
))
if (inherits(dl, "try-error")) {
download_failed <- TRUE
download_message <- as.character(dl)
} else if (inherits(dl, "integer") && dl != 0) {
download_failed <- TRUE
download_message <- "Download failed. Please check internet connectivity"
}
if (download_failed) {
message("Something went wrong")
message(download_message)
} else {
message(sprintf("Downloading finished, model stored at '%s'", to))
}
return(list(
download_failed = download_failed,
download_message = download_message,
file_wordlist = to
))
}
tall_load_wordlist <- function(
language,
file_dir = NULL
) {
wordlist_path <- paste0(homeFolder(), "/tall/language_models/")
if (!dir.exists(wordlist_path)) {
dir.create(wordlist_path, recursive = TRUE)
}
filename <- paste0(language, "_word_frequency.keyness")
file_path <- file.path(wordlist_path, filename)
if (!file.exists(file_path)) {
message(sprintf(
"Wordlist for language '%s' not found locally. Downloading...",
language
))
info <- tall_download_wordlist(
language = language,
file_dir = file_dir,
overwrite = FALSE
)
if (info$download_failed) {
return(NULL)
}
}
load(file_path)
return(word_frequency)
}
### Keyness Plotting Function ----
plot_tall_keyness <- function(
assoc_tb3,
measure = "G2",
N = 10,
approach = "reference_corpus"
) {
# Get top N and bottom N keywords
top <- assoc_tb3 %>%
dplyr::ungroup() %>%
dplyr::arrange(desc(.data[[measure]])) %>%
dplyr::slice_head(n = N)
bot <- assoc_tb3 %>%
dplyr::ungroup() %>%
dplyr::arrange(desc(.data[[measure]])) %>%
dplyr::slice_tail(n = N)
names(top)[which(names(top) %in% measure)] <- "Measure"
names(bot)[which(names(bot) %in% measure)] <- "Measure"
combined_data <- rbind(top, bot)
# Define plot title based on approach
if (approach == "two_corpus") {
plot_title <- paste0("Top ", N, " keywords for Corpus 1 vs Corpus 2")
} else {
plot_title <- paste0("Top ", N, " keywords for Target vs Reference Corpus")
}
# Create ggplot bar plot for top/bottom keywords
plot_gg_bar <- combined_data %>%
ggplot(aes(
x = reorder(token, Measure, mean),
y = Measure,
label = Measure,
fill = type
)) +
geom_bar(stat = "identity") +
geom_text(
aes(
y = ifelse(Measure > 0, Measure - 50, Measure + 50),
label = round(Measure, 1)
),
color = "white",
size = 3
) +
coord_flip() +
theme_bw() +
theme(legend.position = "none") +
scale_fill_manual(values = c("antitype" = "#D73027", "type" = "#4575B4")) +
labs(
title = plot_title,
x = "Keyword",
y = paste0("Keyness (", measure, ")")
)
# Create plotly bar plot
combined_data <- combined_data %>%
dplyr::mutate(
token = reorder(token, Measure, mean),
color = ifelse(type == "antitype", "#D73027", "#4575B4")
)
plot_plotly_bar <- plotly::plot_ly(
data = combined_data,
y = ~token,
x = ~Measure,
type = "bar",
orientation = "h",
marker = list(color = ~color),
text = ~ round(Measure, 1),
textposition = "inside",
textfont = list(color = "white", size = 12)
) %>%
plotly::layout(
title = plot_title,
xaxis = list(title = paste0("Keyness (", measure, ")")),
yaxis = list(
title = "Keyword",
tickmode = "linear",
categoryorder = "trace",
autorange = "reversed"
),
showlegend = FALSE,
margin = list(l = 100)
)
# Return results
return(list(
plot_ggplot_bar = plot_gg_bar,
plot_plotly_bar = plot_plotly_bar
))
}
### Frequency Plotting Function ----
#' Identify and Visualize Frequency Context in Keyness Results
#'
#' This function analyzes keyness results to identify specialized terminology
#' (high keyness, low frequency) versus fundamental stylistic/thematic differences
#' (high keyness, high frequency).
#'
#' @param keyness_results A data frame containing keyness analysis results with
#' columns: Word, G2, Obs_Freq (observed frequency in target corpus)
#' @param top_n Number of top high-frequency and low-frequency words to select (default: 15)
#' @param g2_threshold Minimum G2 score to consider (default: 10.83, p < 0.001)
#' @param title Plot title (default: "Frequency Context Analysis")
#' @param label_spacing Spacing factor for labels to avoid overlap (default: 0.08)
#' @param freq_threshold Frequency threshold to separate low/high frequency zones (default: NULL, uses median)
#' @return A plotly scatter plot object
#'
frequency_context_analysis <- function(
keyness_results,
top_n = 15,
g2_threshold = 10.83,
title = "Frequency Context Analysis",
label_spacing = 0.08,
freq_threshold = NULL
) {
# Load required libraries
require(plotly)
require(dplyr)
# Filter words with high keyness scores (significant keywords)
high_keyness <- keyness_results %>%
dplyr::filter(G2 >= g2_threshold) %>%
arrange(desc(G2)) %>%
rename(
Word = token,
Obs_Freq = O11
)
# Identify top N high-frequency words (fundamental differences)
high_freq_words <- high_keyness %>%
arrange(desc(Obs_Freq)) %>%
head(top_n) %>%
mutate(Category = "High Frequency\n(Fundamental Differences)")
# Identify top N low-frequency words (specialized terminology)
low_freq_words <- high_keyness %>%
arrange(Obs_Freq) %>%
head(top_n) %>%
anti_join(high_freq_words, by = "Word") %>%
mutate(Category = "Low Frequency\n(Specialized Terminology)")
# Combine selected words
selected_words <- bind_rows(high_freq_words, low_freq_words)
# Calculate frequency threshold if not provided
if (is.null(freq_threshold)) {
freq_threshold <- mean(
min(high_freq_words$Obs_Freq),
max(low_freq_words$Obs_Freq)
) #median(selected_words$Obs_Freq)
}
# Transform coordinates to log scale for calculations
selected_words <- selected_words %>%
mutate(
log_freq = log10(Obs_Freq),
log_g2 = log10(G2)
)
# Calculate axis ranges for background zones (in original scale)
x_range_orig <- range(selected_words$Obs_Freq)
y_range_orig <- range(selected_words$G2)
# Extend ranges for better visualization
x_min <- x_range_orig[1] * 0.5
x_max <- x_range_orig[2] * 2
y_min <- y_range_orig[1] * 0.5
y_max <- y_range_orig[2] * 2
# Algorithm to adjust label positions alternating above/below for nearby points
# with increased spacing for low frequency words
adjust_labels_alternating <- function(df, spacing = label_spacing) {
df <- df %>% arrange(log_freq, log_g2)
# Initialize adjusted positions and anchor positions
df$label_x <- df$log_freq
df$label_y <- df$log_g2
df$yanchor <- "bottom" # Default: label above point
# Set spacing multiplier based on category (more space for low frequency)
df$spacing_mult <- ifelse(
df$Category == "Low Frequency\n(Specialized Terminology)",
2.5,
1.0
)
# Identify clusters of nearby points
clusters <- list()
current_cluster <- c(1)
for (i in 2:nrow(df)) {
# Check if point i is close to any point in current cluster
is_close <- FALSE
for (j in current_cluster) {
dx <- df$log_freq[i] - df$log_freq[j]
dy <- df$log_g2[i] - df$log_g2[j]
dist <- sqrt(dx^2 + dy^2)
if (dist < spacing * 2) {
is_close <- TRUE
break
}
}
if (is_close) {
current_cluster <- c(current_cluster, i)
} else {
if (length(current_cluster) > 1) {
clusters[[length(clusters) + 1]] <- current_cluster
}
current_cluster <- c(i)
}
}
# Add last cluster
if (length(current_cluster) > 1) {
clusters[[length(clusters) + 1]] <- current_cluster
}
# For each cluster, alternate labels above and below with increased distance for low freq
for (cluster in clusters) {
# Sort cluster by G2 value (vertical position)
cluster_sorted <- cluster[order(df$log_g2[cluster])]
# Alternate anchor positions
for (idx in seq_along(cluster_sorted)) {
i <- cluster_sorted[idx]
mult <- df$spacing_mult[i]
if (idx %% 2 == 0) {
df$yanchor[i] <- "top" # Label below point
df$label_y[i] <- df$log_g2[i] - spacing * 0.5 * mult
} else {
df$yanchor[i] <- "bottom" # Label above point
df$label_y[i] <- df$log_g2[i] + spacing * 0.5 * mult
}
}
}
# Additional refinement: push labels apart if still overlapping
for (iter in 1:30) {
moved <- FALSE
for (i in 1:nrow(df)) {
for (j in 1:nrow(df)) {
if (i >= j) {
next
}
dx <- df$label_x[i] - df$label_x[j]
dy <- df$label_y[i] - df$label_y[j]
dist <- sqrt(dx^2 + dy^2)
# Use max spacing multiplier for the pair
max_mult <- max(df$spacing_mult[i], df$spacing_mult[j])
min_dist <- spacing * 0.8 * max_mult
# If labels are still too close, push them apart
if (dist < min_dist && dist > 0) {
push_x <- dx / dist * (min_dist - dist) / 2
push_y <- dy / dist * (min_dist - dist) / 2
df$label_x[i] <- df$label_x[i] + push_x
df$label_x[j] <- df$label_x[j] - push_x
df$label_y[i] <- df$label_y[i] + push_y
df$label_y[j] <- df$label_y[j] - push_y
moved <- TRUE
}
}
}
if (!moved) break
}
return(df)
}
# Adjust label positions with alternating strategy
selected_words <- adjust_labels_alternating(
selected_words,
spacing = label_spacing
)
# Create base scatter plot
p <- plot_ly() %>%
# Add data points for high frequency words
add_trace(
data = selected_words %>%
filter(Category == "High Frequency\n(Fundamental Differences)"),
x = ~Obs_Freq,
y = ~G2,
type = "scatter",
mode = "markers",
name = "High Frequency<br>(Fundamental Differences)",
marker = list(
size = 12,
color = "#FF8C00",
line = list(color = "white", width = 1.5),
opacity = 0.8
),
text = ~Word,
hovertemplate = paste(
"<b>%{text}</b><br>",
"Frequency: %{x}<br>",
"G² Score: %{y:.2f}<br>",
"<extra></extra>"
)
) %>%
# Add data points for low frequency words
add_trace(
data = selected_words %>%
filter(Category == "Low Frequency\n(Specialized Terminology)"),
x = ~Obs_Freq,
y = ~G2,
type = "scatter",
mode = "markers",
name = "Low Frequency<br>(Specialized Terminology)",
marker = list(
size = 12,
color = "#8B4789",
line = list(color = "white", width = 1.5),
opacity = 0.8
),
text = ~Word,
hovertemplate = paste(
"<b>%{text}</b><br>",
"Frequency: %{x}<br>",
"G² Score: %{y:.2f}<br>",
"<extra></extra>"
)
)
# Add annotations for labels with adjusted positions and alternating anchors
annotations_list <- lapply(1:nrow(selected_words), function(i) {
row <- selected_words[i, ]
list(
x = log10(row$Obs_Freq),
y = log10(row$G2),
xref = "x",
yref = "y",
text = row$Word,
xanchor = "center",
yanchor = row$yanchor, # Alternating between "top" and "bottom"
showarrow = TRUE,
arrowhead = 0,
arrowsize = 0.5,
arrowwidth = 1,
arrowcolor = "rgba(128,128,128,0.5)",
ax = (row$label_x - log10(row$Obs_Freq)) * 100,
ay = (row$label_y - log10(row$G2)) * 100,
font = list(size = 10, color = "black"),
bgcolor = "rgba(255,255,255,0.5)",
bordercolor = "rgba(128,128,128,0.6)",
borderwidth = 0.5,
borderpad = 2
)
})
# Finalize layout with background shapes and legend
p <- p %>%
layout(
xaxis = list(
title = "Observed Frequency (Target Corpus)",
type = "log",
gridcolor = "#E0E0E0",
showline = TRUE,
linecolor = "#CCCCCC"
),
yaxis = list(
title = "G² Keyness Score",
type = "log",
gridcolor = "#E0E0E0",
showline = TRUE,
linecolor = "#CCCCCC"
),
plot_bgcolor = "#F8F9FA",
paper_bgcolor = "white",
hovermode = "closest",
showlegend = TRUE,
legend = list(
x = 0.02,
y = 0.98,
xanchor = "left",
yanchor = "top",
bgcolor = "rgba(255,255,255,0.8)",
bordercolor = "#CCCCCC",
borderwidth = 1
),
# Add background rectangles using shapes (coordinates in original scale for log axes)
shapes = list(
# Low frequency zone (specialized terminology) - light purple
list(
type = "rect",
xref = "x",
yref = "y",
x0 = x_min,
y0 = y_min,
x1 = freq_threshold,
y1 = y_max,
fillcolor = "rgba(139, 71, 137, 0.08)", # Purple shade matching the points
line = list(width = 0),
layer = "below"
),
# High frequency zone (fundamental differences) - light orange
list(
type = "rect",
xref = "x",
yref = "y",
x0 = freq_threshold,
y0 = y_min,
x1 = x_max,
y1 = y_max,
fillcolor = "rgba(255, 140, 0, 0.08)", # Orange shade matching the points
line = list(width = 0),
layer = "below"
)
),
margin = list(r = 80, b = 100, l = 80, t = 80),
annotations = c(
annotations_list,
list(
list(
text = paste(
"High keyness threshold: G² ≥",
round(g2_threshold, 1)
),
xref = "paper",
yref = "paper",
x = 0.01,
y = -0.15,
xanchor = "left",
yanchor = "top",
showarrow = FALSE,
font = list(size = 10, color = "gray")
)
)
)
)
return(p)
}
Any scripts or data that you put into this service are public.
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.