# tests/testthat/test-textstat_collocations.R

########################################################################################################
# Tests of statistics for detecting multiword expressions
# JK, 18.7.2017
#
# Two functions: One for counting the expressions and one for calculating the statistics

# ************************************************************

library("quanteda")

MWEcounts <- function(candidate, text, stopword = "xxxx") {
    # Build the 2^K table of yes/no occurrences in `text` (a character vector
    # of tokens) of the words in a K-word candidate expression.
    #
    # Args:
    #   candidate: character vector of K words; the fixed objects below only
    #              cover K = 2..4 (extend as needed)
    #   text:      character vector of tokens to scan with a sliding window
    #   stopword:  windows containing this token are skipped entirely
    #              (used for padding markers)
    #
    # Returns a list with elements:
    #   expression:        the candidate words pasted into a single string
    #   counts:            named numeric vector of 2^K window counts; names are
    #                      binary strings, "1" where the window word matches
    #   counts.table:      the same counts as a K-dimensional array
    #   counts.data.frame: the same counts as a data frame (count, W1..WK)
    K <- length(candidate)
    stopifnot(K >= 2, K <= 4)  # fixed objects below only support K = 2..4
    J <- length(text) - K + 1  # number of sliding windows (may be <= 0)

    ##############################################################################
    # Fixed objects, here up to candidate length of 4 (extend as needed)

    count.vectors <- list(
        c("00", "01", "10", "11")
    )
    count.vectors[[2]] <- paste0(rep(c("0", "1"), each = 4), rep(count.vectors[[1]], 2))
    count.vectors[[3]] <- paste0(rep(c("0", "1"), each = 8), rep(count.vectors[[2]], 2))
    #
    noyes <- c("no", "yes")
    array.dims <- list(
        list(W2 = noyes, W1 = noyes),
        list(W3 = noyes, W2 = noyes, W1 = noyes),
        list(W4 = noyes, W3 = noyes, W2 = noyes, W1 = noyes)
    )
    #
    data.frames <- list(
        data.frame(count = NA, W1 = gl(2, 2, 4, labels = noyes), W2 = gl(2, 1, 4, labels = noyes)),
        data.frame(count = NA, W1 = gl(2, 4, 8, labels = noyes), W2 = gl(2, 2, 8, labels = noyes), W3 = gl(2, 1, 8, labels = noyes)),
        data.frame(count = NA, W1 = gl(2, 8, 16, labels = noyes), W2 = gl(2, 4, 16, labels = noyes), W3 = gl(2, 2, 16, labels = noyes), W4 = gl(2, 1, 16, labels = noyes))
    )

    ###############################################################################
    # Count the words: slide a window of length K over the text; each window
    # yields a binary key ("1" where the window word equals the candidate word)

    counts <- rep(0, 2^K)
    names(counts) <- count.vectors[[K - 1]]
    # seq_len(max(J, 0)) avoids the 1:0 trap of seq(J) when text is shorter
    # than the candidate (J <= 0): the loop then simply does not run
    for (j in seq_len(max(J, 0))) {
        text.j <- text[j:(j + K - 1)]
        if (all(text.j != stopword)) {
            agreement <- text.j == candidate
            tmp <- paste(as.numeric(agreement), collapse = "")
            counts[tmp] <- counts[tmp] + 1
        }
    }
    counts.table <- array(counts, dim = rep(2, K), dimnames = array.dims[[K - 1]])
    counts.data.frame <- data.frames[[K - 1]]
    counts.data.frame$count <- counts
    #
    list(expression = paste(candidate, collapse = " "),
         counts = counts,
         counts.table = counts.table,
         counts.data.frame = counts.data.frame)
}

# ************************************************************

MWEstatistics <- function(counts, smooth = 0.5) {
    # Calculate association statistics for a K-word candidate expression.
    #
    # Args:
    #   counts: output of MWEcounts() (list with counts, counts.table,
    #           counts.data.frame, expression)
    #   smooth: constant added to every cell before fitting, to avoid
    #           infinite estimates when some cells are zero
    #
    # Returns a 1-row matrix (rowname = the expression) with columns:
    #   length, lambda, se.lambda, z.lambda, LRtest, smooth, mInfty, Infty,
    #   N, and one column per cell of the 2^K count table.
    counts.n <- counts$counts
    counts.table <- counts$counts.table
    counts.df <- counts$counts.data.frame
    K <- length(dim(counts.table))
    stopifnot(K >= 2, K <= 4)  # fixed objects below only support K = 2..4

    results <- matrix(NA, 1, 9 + 2^K)
    colnames(results) <- c("length", "lambda", "se.lambda", "z.lambda", "LRtest",
                           "smooth", "mInfty", "Infty", "N", names(counts.n))
    rownames(results) <- counts$expression
    results[, "length"] <- K
    results[, "smooth"] <- smooth
    results[, -(1:9)] <- counts.n
    results[, "N"] <- sum(counts.n)
    # 1 if the expression never appears in the text
    results[, "mInfty"] <- as.numeric(counts.n[2^K] == 0)
    # 1 if the count for any lower-order margin is 0 (i.e. the log-OR is infinity)
    results[, "Infty"] <- as.numeric(any(counts.n[-(2^K)] == 0))

    ##############################################################################
    # Fixed objects, here up to candidate length of 4 (extend as needed)

    loglin.margins <- list(
        list(1, 2),
        list(1:2, 2:3, c(1, 3)),
        list(1:3, 2:4, c(1, 2, 4), c(1, 3, 4))
    )[[K - 1]]
    formula <- list(
        count ~ W1 * W2,
        count ~ W1 * W2 * W3,
        count ~ W1 * W2 * W3 * W4
    )[[K - 1]]

    ###############################################################################
    # Estimated highest-order interaction parameter (lambda), obtained using a
    # saturated Poisson log-linear model on the smoothed counts

    counts.df$count <- counts.df$count + smooth
    suppressWarnings(mod1 <- glm(formula, family = poisson, data = counts.df))
    tmp <- length(coef(mod1))  # the interaction is the last coefficient
    results[, "lambda"] <- coef(mod1)[tmp]
    results[, "se.lambda"] <- sqrt(diag(vcov(mod1)))[tmp]
    results[, "z.lambda"] <- results[, "lambda"] / results[, "se.lambda"]

    # Likelihood ratio test of the parameter, obtained from an IPF fit from the
    # loglin function for the model without the highest-order interaction
    # (note: this could also be obtained by fitting this model and the
    # saturated model with glm, and asking for the LR test)

    counts.table <- counts.table + smooth
    # use FALSE, not the reassignable shorthand F
    mod2 <- loglin(counts.table, loglin.margins, print = FALSE)
    results[, "LRtest"] <- mod2$lrt
    #
    results
}

test_that("test that collocations do not span texts", {
    toks <- quanteda::tokens(c('this is a test', 'this also a test'))
    bigrams <- textstat_collocations(toks, size = 2, min_count = 1)
    trigrams <- textstat_collocations(toks, size = 3, min_count = 1)
    found <- rbind(bigrams, trigrams)$collocation

    # sequences that cross the boundary between documents must not be counted
    expect_false('test this' %in% found)
    expect_false('test this also' %in% found)
    expect_true('this also a' %in% found)
})

test_that("test that collocations only include selected features", {
    toks <- quanteda::tokens(c('This is a Twitter post to @someone on #something.'), what = 'fastest')
    kept <- quanteda::tokens_select(toks, "^([a-z]+)$", valuetype = "regex")
    found <- textstat_collocations(kept, min_count = 1, size = 2, tolower = FALSE)$collocation

    # pairs of retained word tokens are reported
    expect_true('This is' %in% found)
    expect_true('a Twitter' %in% found)

    # pairs involving removed tokens (@mentions, #hashtags) are not
    expect_false('to @someone' %in% found)
    expect_false('on #something' %in% found)
})

# test_that("test that collocations and sequences are counting the same features", {
#     toks <- tokens(data_corpus_inaugural[1:2], remove_punct = TRUE)
#     toks <- tokens_remove(toks, stopwords("english"), padding = TRUE)
#     seqs <- textstat_collocations(toks, method = 'lambda', size = 2)
#     cols <- textstat_collocations(toks, method = 'lambda1', size = 2)  # now is equal to `lambda`
#     both <- merge(seqs, cols, by = 'collocation')
#     expect_true(all(both$count.x == both$count.y))
# })

test_that("test that extractor works with collocation", {
    toks <- quanteda::tokens(quanteda::data_corpus_inaugural[2], remove_punct = TRUE)
    toks <- quanteda::tokens_remove(toks, quanteda::stopwords(), padding = TRUE)
    top5 <- textstat_collocations(toks, size = 2)[1:5, ]
    # subsetting with `[` must preserve both the rows and the class
    expect_equal(nrow(top5), 5)
    expect_true(quanteda::is.collocations(top5))
})

test_that("bigrams and trigrams are all sorted correctly, issue #385", {
    toks <- quanteda::tokens(quanteda::data_corpus_inaugural[2], remove_punct = TRUE)
    toks <- quanteda::tokens_remove(toks, quanteda::stopwords("english"), padding = TRUE)
    colls <- textstat_collocations(toks, method = 'lambda', min_count = 1, size = 2:3)
    # the result must already be in decreasing order of the z statistic
    expect_equal(order(colls$z, decreasing = TRUE), seq_len(nrow(colls)))
})

test_that("test the correctness of significant with smoothing", {
    toks <- quanteda::tokens('capital other capital gains other capital word2 other gains capital')
    colls <- textstat_collocations(toks, min_count = 1, size = 2)
    # smoothing is applied when calculating the dice, so the dice coefficient
    # is only tested against a manually calculated result

    expect_equal(colls$collocation[1], 'other capital')

    # dice
    # expect_equal(colls$dice[1], 0.625)

    # pmi
    # expect_equal(colls$pmi[1], log(2.5*(9+0.5*4)/(2.5+1.5)/(2.5+1.5)))
})

test_that("test the correctness of significant", {
    toks <- quanteda::tokens('capital other capital gains other capital word2 other gains capital')
    colls <- textstat_collocations(toks, min_count = 1, size = 2, smoothing = 0)

    expect_equal(colls$collocation[1], 'other capital')

    # dice
    # expect_equal(colls$dice[1], 0.667, tolerance = 1e-3)

    # pmi
    # expect_equal(colls$pmi[1], log2(2*9/(2+1)/(2+1)))

    # chi2
    # expect_equal(colls$chi2[1], 2.25)

    # log likelihood ratio
    # expect_equal(colls$G2[1], 2.231, tolerance = 1e-3)
})

test_that("collocation is counted correctly in racing conditions, issue #381", {
    n_rep <- 100  # NOTE: must be a large number to create racing conditions
    txt <- unname(rep(as.character(quanteda::data_corpus_inaugural)[1], n_rep))
    toks <- quanteda::tokens(txt)
    single <- textstat_collocations(toks[1], size = 2, min_count = 1)
    repeated <- textstat_collocations(toks, size = 2, min_count = 1)
    single <- single[order(single$collocation), ]
    repeated <- repeated[order(repeated$collocation), ]
    # counts over n_rep identical documents must be exactly n_rep times
    # the single-document counts
    expect_true(all(single$count * n_rep == repeated$count))
})

# Broken
# test_that("textstat_collocations works with corpus, character, tokens objects", {
#     txt <- data_char_sampletext
#     corp <- corpus(txt)
#     expect_equal(
#         textstat_collocations(txt, min_count = 2, size = 3),
#         textstat_collocations(corp, min_count = 2, size = 3)
#     )
#     expect_equal(
#         textstat_collocations(tokens(txt), min_count = 2, size = 3),
#         textstat_collocations(tokens(txt, hash = FALSE), min_count = 2, size = 3)
#     )
#
#     ## THIS SHOULD BE THE SAME, BUT IS NOT BECAUSE THE REMOVED PUNCTUATION BECOMES
#     ## PADS, AND IS COUNTED, WHEN IT SHOULD NOT BE COUNTED AT ALL
#
#     toks <- tokens(txt)
#     seqs_corp <- textstat_collocations(corp, method = "lambda", min_count = 2, size = 3)
#     seqs_toks <- textstat_collocations(tokens_remove(toks, "\\p{P}", valuetype = "regex", padding = TRUE), method = "lambda", min_count = 2, size = 3)
#     expect_equal(
#         seqs_corp[, 1:3],
#         seqs_toks[match(seqs_corp$collocation, seqs_toks$collocation), 1:3],
#         check.attributes = FALSE
#     )
# })

test_that("lambda & [ function", {
    toks <- quanteda::tokens('E E G F a b c E E G G f E E f f G G')
    caps <- quanteda::tokens_select(toks, "^[A-Z]$", valuetype = "regex",
                                    case_insensitive = FALSE, padding = TRUE)
    colls <- textstat_collocations(caps, min_count = 1)
    top <- colls[1, ]

    # call Jouni's reference implementation on the same token stream
    txt <- as.character(caps)
    ref_counts <- MWEcounts(c("G", "F"), txt)
    ref_stats <- suppressWarnings(MWEstatistics(ref_counts))

    expect_equal(top$collocation, 'g f')
    expect_equal(top$lambda, ref_stats[2])
    expect_equal(class(top), c("collocations", "textstat", "data.frame"))
})

test_that("textstat_collocations.tokens works ok with zero-length documents (#940)", {
    txt <- c('I like good ice cream.', 'Me too!  I like good ice cream.', '')
    toks <- quanteda::tokens(tolower(txt), remove_punct = TRUE, remove_symbols = TRUE)
    expected <- c("good ice", "i like", "ice cream", "like good")

    # character input, including the empty string
    expect_equal(
        textstat_collocations(txt, size = 2, min_count = 2, tolower = TRUE)$collocation,
        expected
    )
    ## reference output:
    ##   collocation count count_nested length   lambda        z
    ## 1    good ice     2            0      2 4.317488 2.027787
    ## 2      i like     2            0      2 4.317488 2.027787
    ## 3   ice cream     2            0      2 4.317488 2.027787
    ## 4   like good     2            0      2 4.317488 2.027787

    # tokens input, including the zero-length document
    expect_equal(
        textstat_collocations(toks, size = 2, min_count = 2)$collocation,
        expected
    )
})

test_that("textstat_collocations works when texts are shorter than size", {
    toks <- quanteda::tokens(c('a', 'bb', ''))
    # no document is long enough to contain a collocation,
    # so an empty (zero-row) result is expected
    empty_result <- data.frame(collocation = character(0),
                               count = integer(0),
                               count_nested = integer(0),
                               length = numeric(0),
                               lambda = numeric(0),
                               z = numeric(0),
                               stringsAsFactors = FALSE)
    expect_equivalent(
        textstat_collocations(toks, size = 2:3, min_count = 1, tolower = TRUE),
        empty_result)
})

test_that("textstat_collocations error when size = 1 and warn when size > 5", {
    toks <- quanteda::tokens('a b c d e f g h a b c d e f')
    # sizes 2..5 are the supported range
    expect_silent(textstat_collocations(toks, size = 2:5))
    # size 1 is rejected outright
    expect_error(textstat_collocations(toks, size = 1:5),
                 "Collocation sizes must be larger than 1")
    # size 6 works but warns about computation time
    expect_warning(textstat_collocations(toks, size = 2:6),
                   "Computation for large collocations may take long time")
})

test_that("textstat_collocations counts sequences correctly when recursive = FALSE", {
    # bigrams nested inside counted trigrams go into count_nested
    txt1 <- c("a b c . . a b c . . a b c . . . a b c",
              "a b . . a b . . a b . . a b . a b")
    toks1 <- quanteda::tokens_keep(quanteda::tokens(txt1), c("a", "b", "c"), padding = TRUE)

    res1 <- textstat_collocations(toks1, size = 2:3)
    expect_equal(res1$collocation, c('a b', 'b c', 'a b c'))
    expect_equal(res1$count, c(9, 4, 4))
    expect_equal(res1$count_nested, c(4, 4, 0))

    txt2 <- c(". . . . a b c . . a b c . . .",
              "a b . . a b . . a b . . a b . a b",
              "b c . . b c . b c . . . b c")
    toks2 <- quanteda::tokens_keep(quanteda::tokens(txt2), c("a", "b", "c"), padding = TRUE)

    res2 <- textstat_collocations(toks2, size = 2:3, min_count = 1)
    expect_equal(res2$collocation, c('a b', 'b c', 'a b c'))
    expect_equal(res2$count, c(7, 6, 2))
    expect_equal(res2$count_nested, c(2, 2, 0))

    # non-contiguous sizes: bigrams nested inside counted 4-grams
    txt3 <- c(". . . . a b c d . . a b c d . . .",
              "a b . . a b . . a b . . a b . a b",
              "b c . . b c . b c . . . b c")
    toks3 <- quanteda::tokens_keep(quanteda::tokens(txt3), c("a", "b", "c", "d"), padding = TRUE)

    res3 <- textstat_collocations(toks3, size = c(2, 4), min_count = 1)
    expect_equal(res3$collocation, c('a b', 'b c', 'c d', 'a b c d'))
    expect_equal(res3$count, c(7, 6, 2, 2))
    expect_equal(res3$count_nested, c(2, 2, 2, 0))

    # full range of sizes 2..4 on a single document
    txt4 <- c(". . . . a b c d . . a b c . . .")
    toks4 <- quanteda::tokens_keep(quanteda::tokens(txt4), c("a", "b", "c", "d"), padding = TRUE)

    res4 <- textstat_collocations(toks4, size = c(2:4), min_count = 1)
    expect_equal(res4$collocation, c('a b', 'b c', 'c d', 'a b c d', 'a b c', 'b c d'))
    expect_equal(res4$count, c(2, 2, 1, 1, 2, 1))
    expect_equal(res4$count_nested, c(2, 2, 1, 0, 1, 1))
})

# --- scraped page footer, not part of the original test file ---
# Try the quanteda.textstats package in your browser
# Any scripts or data that you put into this service are public.
# quanteda.textstats documentation built on Nov. 2, 2023, 5:07 p.m.