tests/testthat/test-lingmatch.R

words <- c(
  sub("*", "", unlist(lma_dict(1:9, as.regex = FALSE), use.names = FALSE), fixed = TRUE),
  vapply(seq_len(200), function(w) paste0(sample(letters, sample(9, 1)), collapse = ""), "")
)
texts <- vapply(seq_len(50), function(d) {
  paste0(sample(words, sample(100, 1), TRUE), collapse = " ")
}, "")
textsfile <- tempfile(fileext = ".csv")
write(do.call(paste, list(c("id", seq_along(texts)), c("text", texts), sep = ",")), textsfile, sep = ",")
txtfile <- tempfile(fileext = ".txt")
writeLines(texts, txtfile)
dtm <- lma_dtm(texts)

test_that("different input formats have the same results", {
  manual <- as.numeric(lma_simets(dtm, metric = "cosine"))
  expect_equal(as.numeric(lingmatch(textsfile, metric = "x")$sim), manual)
  expect_equal(as.numeric(lingmatch(txtfile)$sim), manual)
  expect_equal(as.numeric(lingmatch(texts)$sim), manual)
  expect_equal(as.numeric(lingmatch(dtm)$sim), manual)
  expect_equal(as.numeric(lingmatch(as.data.frame(as.matrix(dtm)))$sim), manual)
  expect_equal(as.numeric(lingmatch(matrix(as.character(dtm), nrow(dtm)))$sim), manual)
  expect_warning(
    dropped <- lingmatch(cbind(as.data.frame(as.matrix(dtm)), id = "a"))$sim,
    "some input variables were not"
  )
  expect_equal(as.numeric(dropped), manual)
  expect_equal(as.numeric(lingmatch(data = dtm)$sim), manual)
  data <- data.frame(text = as.factor(texts))
  base <- as.numeric(lingmatch(data$text, symmetrical = TRUE)$sim)
  expect_equal(
    as.numeric(lingmatch("text", "text", pairwise = TRUE, data = data)$sim),
    base
  )
  expect_equal(
    as.numeric(lingmatch("text", "text", pairwise = TRUE, data = data)$sim),
    base
  )
  expect_equal(
    as.numeric(lingmatch(data$text, data$text, pairwise = TRUE)$sim),
    base
  )
})

test_that("different input formats have the same results (comp)", {
  manual <- as.numeric(lma_simets(dtm[-(1:10), ], dtm[1:10, ], metric = "cosine"))
  expect_equal(as.numeric(lingmatch(textsfile, 1:10)$sim), manual)
  expect_equal(as.numeric(lingmatch(texts[-(1:10)], texts[1:10])$sim), manual)
  expect_equal(as.numeric(lingmatch(texts[-(1:10)], as.factor(texts[1:10]))$sim), manual)
  expect_equal(as.numeric(lingmatch(dtm[-(1:10), ], dtm[1:10, ])$sim), manual)
  expect_equal(as.numeric(lingmatch(
    as.data.frame(as.matrix(dtm[-(1:10), ])),
    as.data.frame(as.matrix(dtm[1:10, ]))
  )$sim), manual)
  expect_equal(
    as.numeric(lingmatch(dtm[-(1:10), -1], dtm[1:10, ])$sim),
    as.numeric(lma_simets(cbind(0, dtm[-(1:10), -1]), dtm[1:10, ], metric = "cosine"))
  )
})

test_that("different input formats have the same results (LSM)", {
  wdtm <- lma_weight(dtm, percent = TRUE)
  cdtm <- lma_termcat(wdtm)
  manual <- as.numeric(lma_simets(cdtm, "canberra"))
  expect_equal(as.numeric(lingmatch(texts, type = "lsm")$sim), manual)
  expect_equal(as.numeric(lingmatch(dtm, type = "lsm")$sim), manual)
  expect_equal(as.numeric(lingmatch(wdtm, type = "lsm")$sim), manual)
  expect_equal(as.numeric(lingmatch(cdtm, type = "lsm")$sim), manual)
  colnames(cdtm)[c(3, 6)] <- c("articles", "preps")
  expect_equal(as.numeric(lingmatch(cdtm, type = "lsm")$sim), manual)
  colnames(cdtm) <- paste0("liwc.", c(
    "personal_pronouns", "impersonal_pronouns", "articles", "auxiliary_verbs",
    "adverbs", "prepositions", "conjunctions", "negations", "quantifiers"
  ))
  expect_equal(as.numeric(lingmatch(cdtm, type = "lsm")$sim), manual)
})

test_that("different input formats have the same results (LSA)", {
  space <- lma_lspace(dtm)
  wdtm <- lma_weight(dtm, "tfidf")
  cdtm <- lma_lspace(wdtm, space)
  manual <- as.numeric(lma_simets(cdtm, metric = "cosine"))
  expect_equal(as.numeric(lingmatch(texts, space = space, type = "lsa")$sim), manual)
  expect_equal(as.numeric(lingmatch(dtm, space = space, type = "lsa")$sim), manual)
  expect_equal(as.numeric(lingmatch(wdtm, space = space, type = "lsa")$sim), manual)
  expect_equal(as.numeric(lingmatch(cdtm, type = "lsa")$sim), manual)
})

test_that("comparison columns align (named)", {
  wdtm <- lma_weight(dtm, percent = TRUE)
  cdtm <- lma_termcat(wdtm)
  manual <- as.numeric(lma_simets(cdtm, cdtm[1:5, ], "canberra"))
  expect_equal(as.numeric(lingmatch(texts, cdtm[1:5, ], type = "lsm")$sim), manual)
  expect_equal(as.numeric(lingmatch(dtm, cdtm[1:5, ], type = "lsm")$sim), manual)
  expect_equal(as.numeric(lingmatch(wdtm, cdtm[1:5, ], type = "lsm")$sim), manual)
  expect_equal(as.numeric(lingmatch(cdtm, cdtm[1:5, ], type = "lsm")$sim), manual)
})

test_that("comparison columns align (unnamed)", {
  space <- lma_lspace(dtm)
  wdtm <- lma_weight(dtm, "tfidf")
  cdtm <- lma_lspace(wdtm, space)
  manual <- as.numeric(lma_simets(cdtm, cdtm[1:5, ], metric = "cosine"))
  expect_equal(as.numeric(lingmatch(texts, cdtm[1:5, ], space = space, type = "lsa")$sim), manual)
  expect_equal(as.numeric(lingmatch(dtm, cdtm[1:5, ], space = space, type = "lsa")$sim), manual)
  expect_equal(as.numeric(lingmatch(wdtm, cdtm[1:5, ], space = space, type = "lsa")$sim), manual)
  expect_equal(as.numeric(lingmatch(cdtm, cdtm[1:5, ], space = space, type = "lsa")$sim), manual)
})

test_that("drop works", {
  ddtm <- dtm
  ddtm[, 1] <- 0
  expect_identical(
    as.numeric(lingmatch(ddtm, drop = TRUE)$sim),
    as.numeric(lingmatch(dtm[, -1])$sim)
  )
})

test_that("function comparisons work", {
  expect_equal(
    as.numeric(lingmatch(dtm, sum)$sim),
    as.numeric(lingmatch(dtm, colSums(dtm))$sim)
  )
})

test_that("profile comparisons work", {
  novels <- c(
    ppron = 10.35, ipron = 4.79, article = 8.35, auxverb = 7.77, adverb = 4.17,
    prep = 14.27, conj = 6.28, negate = 1.68, quant = 1.8
  )
  expect_equal(
    as.numeric(lingmatch(dtm, "novels", type = "lsm")$sim),
    as.numeric(lingmatch(dtm, novels, type = "lsm")$sim)
  )
})

test_that("index/logical comparisons work", {
  ind <- sample(nrow(dtm), 5)
  su <- seq_len(nrow(dtm)) %in% ind
  expect_equal(
    as.numeric(lingmatch(dtm, ind)$sim),
    as.numeric(lingmatch(dtm, dtm[ind, ])$sim[-ind, ])
  )
  expect_equal(
    as.numeric(lingmatch(dtm, su)$sim),
    as.numeric(lingmatch(dtm, dtm[su, ])$sim[!su, ])
  )
})

test_that("group comparisons work", {
  wdtm <- lma_weight(dtm, percent = TRUE)
  cdtm <- lma_termcat(wdtm)
  groups <- rep(c("a", "b"), each = nrow(dtm) / 2)
  group_means <- rbind(
    a = colMeans(cdtm[groups == "a", ]),
    b = colMeans(cdtm[groups == "b", ])
  )
  g1 <- lingmatch(dtm, group = groups, type = "lsm")$sim[, 2]
  expect_equal(g1, rowSums(lingmatch(dtm, group_means, type = "lsm")$sim * rep(
    c(1, 0, 1), nrow(dtm) / c(2, 1, 2)
  )))
  expect_true(class(lingmatch(dtm, type = "lsm", symmetrical = FALSE)$sim)[1] == "dtCMatrix")
  pw <- lingmatch(dtm, type = "lsm", symmetrical = TRUE)$sim
  expect_equal(
    as.numeric(lingmatch(dtm, "pairwise", symmetrical = TRUE, group = groups, type = "lsm")$sim$a),
    as.numeric(pw[groups == "a", groups == "a"])
  )
  expect_equal(
    lingmatch(dtm, "pairwise", mean = TRUE, group = groups, type = "lsm")$sim[, 2],
    (c(
      rowSums(pw[groups == "a", groups == "a"]),
      rowSums(pw[groups == "b", groups == "b"])
    ) - 1) / (nrow(dtm) / 2 - 1)
  )
  subgroups <- sample(c("x", "y"), nrow(dtm), TRUE)
  all_groups <- paste(groups, subgroups)
  sgmeans <- t(vapply(split(as.data.frame(cdtm), all_groups), colMeans, numeric(9)))
  g1_2 <- lingmatch(dtm, group = cbind(groups, subgroups), type = "lsm", all.levels = TRUE)$sim
  expect_equal(g1, g1_2$g1_canberra)
  expect_equal(g1_2$g1_g2_canberra, vapply(seq_along(all_groups), function(i) {
    lma_simets(cdtm[i, ], sgmeans[all_groups[i], ], "can")
  }, 0))
  group_data <- cbind(groups, subgroups)
  expect_equal(as.numeric(lingmatch(
    dtm,
    group = group_data, type = "lsm", all.levels = TRUE
  )$sim$g1_canberra), g1)
  # debugonce(lingmatch)
  g1_2p <- lingmatch(dtm, "pair", group = cbind(groups, subgroups), type = "lsm", mean = FALSE)$sim
  for (s in names(g1_2p)) {
    expect_identical(
      as.numeric(g1_2p[[s]]),
      as.numeric(lingmatch(dtm[all_groups == s, ], "pair", type = "lsm", mean = FALSE)$sim)
    )
  }
})

test_that("groups work through data", {
  data <- data.frame(
    id = as.factor(rep(seq_len(nrow(dtm) / 2), 2)),
    group = as.factor(rep(c("a", "b"), each = nrow(dtm) / 2)),
    lma_termcat(lma_weight(dtm, percent = TRUE))
  )
  categories <- colnames(data)[-(1:2)]
  group_means <- t(vapply(split(data[, categories], data$group), colMeans, numeric(9)))
  manual <- as.numeric(c(
    lma_simets(data[data$group == "a", categories], group_means["a", ], "canberra"),
    lma_simets(data[data$group == "b", categories], group_means["b", ], "canberra")
  ))
  expect_error(lingmatch(data, group = noexist, type = "lsm"), "could not find noexist")
  expect_equal(
    lingmatch(data, group = "group", type = "lsm")$sim[, 2],
    as.numeric(c(
      lma_simets(data[data$group == "a", categories], group_means["a", ], "canberra"),
      lma_simets(data[data$group == "b", categories], group_means["b", ], "canberra")
    ))
  )
  id_means <- t(vapply(split(data[, categories], data$id), colMeans, numeric(9)))
  expect_equal(
    lingmatch(data, group = id, type = "lsm")$sim[, 2],
    vapply(seq_len(nrow(data)), function(i) {
      r <- data[i, ]
      lma_simets(r[categories], id_means[r[[1]], ], "canberra")
    }, 0)
  )
})

test_that("comp.group and comp.data work", {
  data <- data.frame(
    id = as.factor(rep(seq_len(nrow(dtm) / 2), 2)),
    condition = rbinom(nrow(dtm), 1, .5),
    group = as.factor(rep(c("a", "b"), each = nrow(dtm) / 2)),
    subgroup = as.factor(rep(c("a", "b"), nrow(dtm) / 2)),
    lma_termcat(lma_weight(dtm, percent = TRUE))
  )
  cats <- names(lma_dict(1:9))
  data$prep <- 0
  data[1, cats] <- 0
  sdat <- split(data[, cats], paste0(data$group, data$condition))
  pairs <- list(a = lma_simets(sdat$a0, sdat$a1, metric = "can"), b = lma_simets(sdat$b0, sdat$b1, metric = "can"))
  expect_identical(
    as.numeric((rowSums(pairs$a) - 1) / (ncol(pairs$a) - 1)),
    as.numeric(lma_simets(sdat$a0, sdat$a1, metric = "can", mean = TRUE))
  )
  c1mean <- list(
    a = lma_simets(sdat$a0, colMeans(sdat$a1), metric = "can"),
    b = lma_simets(sdat$b0, colMeans(sdat$b1), metric = "can")
  )
  expect_equal(
    lingmatch(
      data[data$condition == 0, ],
      comp.data = data[data$condition == 1, ],
      comp.group = group, type = "lsm"
    )$sim$canberra,
    as.numeric(unlist(c1mean))
  )
  pairwise_a <- as.numeric(lma_simets(
    data[data$condition == 0 & data$group == "a", cats],
    data[data$condition == 1 & data$group == "a", cats],
    metric = "canberra"
  ))
  expect_identical(
    as.numeric(lingmatch(
      data[data$condition == 0, ], data[data$condition == 1, ],
      comp.group = group, type = "lsm"
    )$sim$a$canberra),
    pairwise_a
  )
  comp <- data[data$condition == 1, ]
  colnames(comp)[3:4] <- c("c1", "c2")
  expect_identical(
    as.numeric(lingmatch(
      data[data$condition == 0, ], comp,
      all.levels = TRUE, pairwise = TRUE,
      group = c("group", "subgroup"), comp.group = c("c1", "c2"), type = "lsm"
    )$sim$a$a$g1_canberra),
    pairwise_a
  )
  colnames(sdat$a0)[c(3, 6)] <- colnames(sdat$a1)[c(3, 6)] <- c("articles", "preps")
  expect_true(all(lingmatch(sdat$a0, colMeans(sdat$a1), metric = "can")$sim == c1mean$a))
  expect_equal(as.numeric(lingmatch(sdat$a0, comp.data = sdat$a1, metric = "can")$sim), as.numeric(c1mean$a))
  auto <- lingmatch(sdat$a0, "auto")
  expect_equal(
    as.numeric(auto$sim),
    as.numeric(lingmatch(sdat$a0, sub("auto: ", "", auto$comp.type, fixed = TRUE))$sim)
  )
})

test_that("sequential comparisons work", {
  data <- data.frame(
    pair = sort(sample(1:4, nrow(dtm), TRUE)),
    phase = 0,
    speaker = sample(c("a", "b"), nrow(dtm), TRUE),
    lma_termcat(lma_weight(dtm, percent = TRUE))
  )
  for (p in 1:4) {
    su <- data$pair == p
    data$phase[su] <- sort(sample(1:3, sum(su), TRUE))
  }
  cats <- names(lma_dict(1:9))
  h <- list(
    lingmatch(data, "seq", dict = cats, group = speaker)$sim,
    lingmatch(data, "seq", dict = cats, group = "speaker", agg = FALSE)$sim,
    lingmatch(data, "seq", dict = cats, group = c("pair", "speaker"))$sim,
    lingmatch(data, "seq", dict = cats, group = data[, c("pair", "speaker")], agg = FALSE)$sim,
    lingmatch(data, "seq", dict = cats, group = data[, c("pair", "phase", "speaker")])$sim,
    lingmatch(data, "seq", dict = cats, group = c("pair", "phase", "speaker"), agg = FALSE)$sim
  )
  expect_true(any(h[[1]][, "cosine"] - h[[2]][, "cosine"] > .01))
  expect_true(any(h[[3]][, "cosine"] - h[[4]][, "cosine"] > .01))
  expect_true(any(h[[5]][, "cosine"] - h[[6]][, "cosine"] > .01))
  for (comp in h) {
    manual <- vapply(strsplit(rownames(comp), " <-> ", fixed = TRUE), function(s) {
      if (length(s) == 1) {
        1
      } else {
        s <- strsplit(s, ", ", fixed = TRUE)
        lma_simets(
          colMeans(data[as.numeric(s[[1]]), -(1:3), drop = FALSE]),
          colMeans(data[as.numeric(s[[2]]), -(1:3), drop = FALSE]),
          "cosine"
        )
      }
    }, 0)
    expect_equal(comp[, "cosine"], manual)
  }
  reorder <- sample(seq_len(nrow(data)))
  expect_equal(h[[1]][, 1], lingmatch(
    data[reorder, ], "seq",
    dict = cats, group = speaker, order = order(reorder)
  )$sim[, 1])
})

Try the lingmatch package in your browser

Any scripts or data that you put into this service are public.

lingmatch documentation built on May 29, 2024, 11:48 a.m.