########################################################################################################
# Tests of statistics for detecting multiword expressions
# JK, 18.7.2017
#
# Two functions: One for counting the expressions and one for calculating the statistics
# ************************************************************
library("quanteda")
MWEcounts <- function(candidate, text, stopword = "xxxx") {
# Build the 2^K contingency table of yes/no occurrences in `text` (a
# character vector of tokens) of the words in a K-word candidate
# expression `candidate` (a character vector), for K = 2, 3 or 4.
#
# A sliding K-token window is compared position-by-position against the
# candidate; each window contributes one count to the 0/1 pattern of
# matches (e.g. "10" = first word matched, second did not).  Windows
# containing `stopword` are skipped entirely.
#
# Returns a list:
#   expression        - the candidate words pasted into one string
#   counts            - named vector of the 2^K pattern counts
#   counts.table      - the same counts as a 2 x ... x 2 array
#   counts.data.frame - the same counts as a data frame (count, W1..WK)
K <- length(candidate)
J <- length(text) - K + 1  # number of K-token windows in the text
##############################################################################
# Fixed objects, here up to candidate length of 4 (extend as needed)
count.vectors <- list(
  c("00", "01", "10", "11")
)
count.vectors[[2]] <- paste(rep(c("0", "1"), each = 4),
                            rep(count.vectors[[1]], 2), sep = "")
count.vectors[[3]] <- paste(rep(c("0", "1"), each = 8),
                            rep(count.vectors[[2]], 2), sep = "")
#
noyes <- c("no", "yes")
# Note: the first array dimension varies fastest, i.e. it corresponds to
# the LAST digit of the pattern strings, hence the reversed W-labels.
array.dims <- list(
  list(W2 = noyes, W1 = noyes),
  list(W3 = noyes, W2 = noyes, W1 = noyes),
  list(W4 = noyes, W3 = noyes, W2 = noyes, W1 = noyes)
)
#
# gl() repetition factors are chosen so that row i of each data frame
# corresponds to element i of the matching count.vectors entry.
data.frames <- list(
  data.frame(count = NA, W1 = gl(2, 2, 4, labels = noyes),
             W2 = gl(2, 1, 4, labels = noyes)),
  data.frame(count = NA, W1 = gl(2, 4, 8, labels = noyes),
             W2 = gl(2, 2, 8, labels = noyes),
             W3 = gl(2, 1, 8, labels = noyes)),
  data.frame(count = NA, W1 = gl(2, 8, 16, labels = noyes),
             W2 = gl(2, 4, 16, labels = noyes),
             W3 = gl(2, 2, 16, labels = noyes),
             W4 = gl(2, 1, 16, labels = noyes))
)
###############################################################################
# Count the words
counts <- rep(0, 2^K)
names(counts) <- count.vectors[[K - 1]]
#
# Guard: if the text is shorter than the candidate (J <= 0), seq(J) would
# error or loop over bogus indices; an all-zero table is returned instead.
if (J > 0) {
  for (j in seq_len(J)) {
    text.j <- text[j:(j + K - 1)]
    if (all(text.j != stopword)) {
      agreement <- text.j == candidate
      tmp <- paste(as.numeric(agreement), collapse = "")  # pattern key, e.g. "10"
      counts[tmp] <- counts[tmp] + 1
    }
  }
}
counts.table <- array(counts, dim = rep(2, K), dimnames = array.dims[[K - 1]])
counts.data.frame <- data.frames[[K - 1]]
counts.data.frame$count <- counts
#
result <- list(expression = paste(candidate, collapse = " "),
               counts = counts,
               counts.table = counts.table,
               counts.data.frame = counts.data.frame)
return(result)
}
# ************************************************************
MWEstatistics <- function(counts, smooth = 0.5) {
# Calculate association statistics for a K-word candidate expression.
# The input `counts` is the output of MWEcounts(); `smooth` is added to
# every cell before model fitting so that zero cells do not make the
# estimates infinite.
#
# Returns a 1 x (9 + 2^K) matrix (row name = the expression) with columns:
#   length    - K, the number of words in the expression
#   lambda    - highest-order interaction parameter of a saturated Poisson
#               log-linear model (the association measure)
#   se.lambda - its standard error;  z.lambda - the Wald z statistic
#   LRtest    - likelihood-ratio test statistic for that parameter
#   smooth    - the smoothing constant used
#   mInfty    - 1 if the full expression never occurs (unsmoothed lambda -> -Inf)
#   Infty     - 1 if any other cell count is 0 (unsmoothed lambda -> +Inf)
#   N         - total number of windows counted
#   ...       - the 2^K raw cell counts
counts.n <- counts$counts
counts.table <- counts$counts.table
counts.df <- counts$counts.data.frame
K <- length(dim(counts.table))
results <- matrix(NA, 1, 9 + (2^K))
colnames(results) <- c("length", "lambda", "se.lambda", "z.lambda", "LRtest",
                       "smooth", "mInfty", "Infty", "N", names(counts.n))
rownames(results) <- counts$expression
results[, "length"] <- K
results[, "smooth"] <- smooth
results[, -(1:9)] <- counts.n
results[, "N"] <- sum(counts.n)
results[, "mInfty"] <- as.numeric(counts.n[2^K] == 0) # 1 if the expression never appears in the text
results[, "Infty"] <- as.numeric(any(counts.n[-(2^K)] == 0)) # 1 if the count for any lower-order margin is 0 (i.e. the log-OR is infinity)
##############################################################################
# Fixed objects, here up to candidate length of 4 (extend as needed)
loglin.margins <- list(
  list(1, 2),
  list(1:2, 2:3, c(1, 3)),
  list(1:3, 2:4, c(1, 2, 4), c(1, 3, 4))
)[[K - 1]]
# NOTE: named model.formula (not `formula`) to avoid masking stats::formula
model.formula <- list(
  count ~ W1 * W2,
  count ~ W1 * W2 * W3,
  count ~ W1 * W2 * W3 * W4
)[[K - 1]]
###############################################################################
# Estimated highest-order interaction parameter (lambda), obtained using a
# Poisson log-linear model.  The saturated-model fit lists the highest-order
# interaction coefficient last.  Warnings are suppressed because smoothed
# (non-integer) counts always trigger a harmless Poisson warning.
counts.df$count <- counts.df$count + smooth
suppressWarnings(mod1 <- glm(model.formula, family = poisson, data = counts.df))
tmp <- length(coef(mod1))  # index of the highest-order interaction term
results[, "lambda"] <- coef(mod1)[tmp]
results[, "se.lambda"] <- sqrt(diag(vcov(mod1)))[tmp]
results[, "z.lambda"] <- results[, "lambda"] / results[, "se.lambda"]
# Likelihood ratio test of the parameter, obtained from an IPF fit from the
# loglin function for the model without the highest-order interaction.
# (Note: this could also be obtained by fitting this model and the saturated
# model with glm and asking for the LR test.)
counts.table <- counts.table + smooth
mod2 <- loglin(counts.table, loglin.margins, print = FALSE)
results[, "LRtest"] <- mod2$lrt
#
return(results)
}
test_that("test that collocations do not span texts", {
  toks <- quanteda::tokens(c('this is a test', 'this also a test'))
  two <- textstat_collocations(toks, size = 2, min_count = 1)
  three <- textstat_collocations(toks, size = 3, min_count = 1)
  found <- rbind(two, three)$collocation
  # sequences straddling the document boundary must not be counted
  expect_false('test this' %in% found)
  expect_false('test this also' %in% found)
  expect_true('this also a' %in% found)
})
test_that("test that collocations only include selected features", {
  txt <- c('This is a Twitter post to @someone on #something.')
  toks <- quanteda::tokens(txt, what = 'fastest')
  # keep only purely alphabetic tokens ('This' survives, so the match is
  # case-insensitive); mentions, hashtags and punctuation are dropped
  toks <- quanteda::tokens_select(toks, "^([a-z]+)$", valuetype = "regex")
  found <- textstat_collocations(toks, min_count = 1, size = 2,
                                 tolower = FALSE)$collocation
  expect_true('This is' %in% found)
  expect_true('a Twitter' %in% found)
  expect_false('to @someone' %in% found)
  expect_false('on #something' %in% found)
})
# test_that("test that collocations and sequences are counting the same features", {
# toks <- tokens(data_corpus_inaugural[1:2], remove_punct = TRUE)
# toks <- tokens_remove(toks, stopwords("english"), padding = TRUE)
# seqs <- textstat_collocations(toks, method = 'lambda', size = 2)
# cols <- textstat_collocations(toks, method = 'lambda1', size = 2) # now is equal to `lambda`
# both <- merge(seqs, cols, by = 'collocation')
# expect_true(all(both$count.x == both$count.y))
# })
test_that("test that extractor works with collocation", {
  toks <- quanteda::tokens(quanteda::data_corpus_inaugural[2], remove_punct = TRUE)
  toks <- quanteda::tokens_remove(toks, quanteda::stopwords(), padding = TRUE)
  # subsetting with `[` must keep the collocations class intact
  top5 <- textstat_collocations(toks, size = 2)[1:5, ]
  expect_equal(nrow(top5), 5)
  expect_true(quanteda::is.collocations(top5))
})
test_that("bigrams and trigrams are all sorted correctly, issue #385", {
  toks <- quanteda::tokens(quanteda::data_corpus_inaugural[2], remove_punct = TRUE)
  toks <- quanteda::tokens_remove(toks, quanteda::stopwords("english"), padding = TRUE)
  colls <- textstat_collocations(toks, method = 'lambda', min_count = 1, size = 2:3)
  # rows of the mixed-size result must already be in decreasing z order
  expect_equal(order(colls$z, decreasing = TRUE), seq_len(nrow(colls)))
})
test_that("test the correctness of significant with smoothing", {
# With default smoothing, 'other capital' should rank first among bigrams.
toks <- quanteda::tokens('capital other capital gains other capital word2 other gains capital')
seqs <- textstat_collocations(toks, min_count=1, size = 2)
# Smoothing is applied when calculating the dice coefficient, so the dice
# coefficient could only be tested against a manually calculated result
# (kept below, disabled, for reference).
expect_equal(seqs$collocation[1], 'other capital')
# dice (disabled)
# expect_equal(seqs$dice[1], 0.625)
# pmi (disabled)
# expect_equal(seqs$pmi[1], log(2.5*(9+0.5*4)/(2.5+1.5)/(2.5+1.5)))
})
test_that("test the correctness of significant", {
# Same token stream as the smoothing test, but with smoothing = 0 so the
# statistics could be checked against hand calculations (disabled
# expectations kept below for reference).
toks <- quanteda::tokens('capital other capital gains other capital word2 other gains capital')
seqs <- textstat_collocations(toks, min_count=1, size = 2, smoothing = 0)
expect_equal(seqs$collocation[1], 'other capital')
# #dice
# # expect_equal(seqs$dice[1], 0.667, tolerance = 1e-3)
#
# #pmi
# expect_equal(seqs$pmi[1], log2(2*9/(2+1)/(2+1)))
#
# #chi2
# expect_equal(seqs$chi2[1], 2.25)
#
# #log likelihood ratio
# expect_equal(seqs$G2[1], 2.231, tolerance = 1e-3)
})
test_that("collocation is counted correctly in racing conditions, issue #381", {
  n <- 100  # n must be a large number to provoke a race condition
  repeated <- unname(rep(as.character(quanteda::data_corpus_inaugural)[1], n))
  toks <- quanteda::tokens(repeated)
  single <- textstat_collocations(toks[1], size = 2, min_count = 1)
  many <- textstat_collocations(toks, size = 2, min_count = 1)
  single <- single[order(single$collocation), ]
  many <- many[order(many$collocation), ]
  # counts from n identical copies must be exactly n times the single-text counts
  expect_true(all(single$count * n == many$count))
})
# Broken
# test_that("textstat_collocations works with corpus, character, tokens objects", {
# txt <- data_char_sampletext
# corp <- corpus(txt)
# expect_equal(
# textstat_collocations(txt, min_count = 2, size = 3),
# textstat_collocations(corp, min_count = 2, size = 3)
# )
# expect_equal(
# textstat_collocations(tokens(txt), min_count = 2, size = 3),
# textstat_collocations(tokens(txt, hash = FALSE), min_count = 2, size = 3)
# )
#
# ## THIS SHOULD BE THE SAME, BUT IS NOT BECAUSE THE REMOVED PUNCTUATION BECOMES
# ## PADS, AND IS COUNTED, WHEN IT SHOULD NOT BE COUNTED AT ALL
#
# toks <- tokens(txt)
# seqs_corp <- textstat_collocations(corp, method = "lambda", min_count = 2, size = 3)
# seqs_toks <- textstat_collocations(tokens_remove(toks, "\\p{P}", valuetype = "regex", padding = TRUE), method = "lambda", min_count = 2, size = 3)
# expect_equal(
# seqs_corp[, 1:3],
# seqs_toks[match(seqs_corp$collocation, seqs_toks$collocation), 1:3],
# check.attributes = FALSE
# )
# })
test_that("lambda & [ function",{
# Compare quanteda's lambda for the top collocation against the reference
# implementation (MWEcounts / MWEstatistics defined above in this file).
toks <- quanteda::tokens('E E G F a b c E E G G f E E f f G G')
# keep only single upper-case letters; removed tokens become pads
toks_capital <- quanteda::tokens_select(toks, "^[A-Z]$", valuetype="regex",
case_insensitive = FALSE, padding = TRUE)
seqs <- textstat_collocations(toks_capital, min_count = 1)
a_seq <- seqs[1, ]
# call Jouni's implementation on the same token stream
tt <- as.character(toks_capital)
test2 <- MWEcounts(c("G", "F"), tt)
test2_stat <- suppressWarnings(MWEstatistics(test2))
expect_equal(a_seq$collocation, 'g f')
# test2_stat is a 1-row matrix; element 2 (column "lambda") is the estimate
expect_equal(a_seq$lambda, test2_stat[2])
expect_equal(class(a_seq), c("collocations", "textstat", "data.frame"))
})
test_that("textstat_collocations.tokens works ok with zero-length documents (#940)", {
  txt <- c('I like good ice cream.', 'Me too! I like good ice cream.', '')
  toks <- quanteda::tokens(tolower(txt), remove_punct = TRUE, remove_symbols = TRUE)
  expected <- c("good ice", "i like", "ice cream", "like good")
  # character input: the empty document must not break collocation counting
  expect_equal(
    textstat_collocations(txt, size = 2, min_count = 2, tolower = TRUE)$collocation,
    expected
  )
  ## collocation count count_nested length lambda z
  ## 1 good ice 2 0 2 4.317488 2.027787
  ## 2 i like 2 0 2 4.317488 2.027787
  ## 3 ice cream 2 0 2 4.317488 2.027787
  ## 4 like good 2 0 2 4.317488 2.027787
  # tokens input must give the same collocations
  expect_equal(
    textstat_collocations(toks, size = 2, min_count = 2)$collocation,
    expected
  )
})
test_that("textstat_collocations works when texts are shorter than size", {
# Documents shorter than the requested collocation size must yield an
# empty result with the full column structure, not an error.
toks <- quanteda::tokens(c('a', 'bb', ''))
expect_equivalent(
textstat_collocations(toks, size = 2:3, min_count = 1, tolower = TRUE),
data.frame(collocation = character(0),
count = integer(0),
count_nested = integer(0),
length = numeric(0),
lambda = numeric(0),
z = numeric(0),
stringsAsFactors = FALSE))
})
test_that("textstat_collocations error when size = 1 and warn when size > 5", {
  toks <- quanteda::tokens('a b c d e f g h a b c d e f')
  # sizes 2-5 are accepted without complaint
  expect_silent(textstat_collocations(toks, size = 2:5))
  # size 1 is rejected outright
  expect_error(
    textstat_collocations(toks, size = 1:5),
    "Collocation sizes must be larger than 1"
  )
  # sizes above 5 only trigger a warning
  expect_warning(
    textstat_collocations(toks, size = 2:6),
    "Computation for large collocations may take long time"
  )
})
test_that("textstat_collocations counts sequences correctly when recursive = FALSE", {
# Per the expectations below: `count` is the total number of occurrences of
# a collocation; `count_nested` is how many of those occur inside a longer
# collocation that is itself in the result.
# Case 1: 'a b c' appears 4 times, so 4 of the 9 'a b' and all 4 'b c'
# occurrences are nested.
txt <- c("a b c . . a b c . . a b c . . . a b c",
"a b . . a b . . a b . . a b . a b")
toks <- quanteda::tokens_keep(quanteda::tokens(txt), c("a", "b", "c"), padding = TRUE)
col1 <- textstat_collocations(toks, size = 2:3)
expect_equal(col1$collocation, c('a b', 'b c', 'a b c'))
expect_equal(col1$count, c(9, 4, 4))
expect_equal(col1$count_nested, c(4, 4, 0))
# Case 2: 'b c' now also appears on its own (third document), so only the
# 2 occurrences inside 'a b c' are nested.
txt2 <- c(". . . . a b c . . a b c . . .",
"a b . . a b . . a b . . a b . a b",
"b c . . b c . b c . . . b c")
toks2 <- quanteda::tokens_keep(quanteda::tokens(txt2), c("a", "b", "c"), padding = TRUE)
col2 <- textstat_collocations(toks2, size = 2:3, min_count = 1)
expect_equal(col2$collocation, c('a b', 'b c', 'a b c'))
expect_equal(col2$count, c(7, 6, 2))
expect_equal(col2$count_nested, c(2, 2, 0))
# Case 3: non-contiguous sizes (2 and 4) - bigram nesting is attributed to
# the 4-gram 'a b c d'.
txt3 <- c(". . . . a b c d . . a b c d . . .",
"a b . . a b . . a b . . a b . a b",
"b c . . b c . b c . . . b c")
toks3 <- quanteda::tokens_keep(quanteda::tokens(txt3), c("a", "b", "c", "d"), padding = TRUE)
col3 <- textstat_collocations(toks3, size = c(2, 4), min_count = 1)
expect_equal(col3$collocation, c('a b', 'b c', 'c d', 'a b c d'))
expect_equal(col3$count, c(7, 6, 2, 2))
expect_equal(col3$count_nested, c(2, 2, 2, 0))
# Case 4: all sizes 2-4 at once; trigrams nested in the single 4-gram are
# counted as nested, the 4-gram itself is not.
txt4 <- c(". . . . a b c d . . a b c . . .")
toks4 <- quanteda::tokens_keep(quanteda::tokens(txt4), c("a", "b", "c", "d"), padding = TRUE)
col4 <- textstat_collocations(toks4, size = c(2:4), min_count = 1)
expect_equal(col4$collocation, c('a b', 'b c', 'c d', 'a b c d', 'a b c', 'b c d'))
expect_equal(col4$count, c(2, 2, 1, 1, 2, 1))
expect_equal(col4$count_nested, c(2, 2, 1, 0, 1, 1))
})
# (Stray non-R boilerplate from a web scrape, commented out so the file parses:)
# Add the following code to your website.
# For more information on customizing the embed code, read Embedding Snippets.