tests/testthat/test-utils-embeddings.R

test_that("get_anchor returns data object", {

    gend <- get_anchors(relation="gender")
    expect_equal(ncol(gend), 2L)

})

test_that("get_centroid works on different data types", {

    ## single c() list of terms ##
    out.c <- get_centroid(anchors=anchor.solo.c,
                          wv=fake_word_vectors)

    expect_type(out.c, "double")
    expect_identical(dim(out.c),  ce.dims)
    expect_identical(rownames(out.c), ce.name)

    ## order of single c() list of terms doesn't matter ##
    out.d <- get_centroid(anchors=anchor.solo.d,
                          wv=fake_word_vectors)

    expect_identical(out.c[1], out.d[1])

    # actual list list
    out <- get_centroid(anchors=anchor.solo.list,
                        wv=fake_word_vectors)

    expect_type(out, "double")
    expect_identical(dim(out),  ce.dims)
    expect_identical(rownames(out), ce.name)
    expect_identical(out.c[1], out[1])

    # data.frame
    out <- get_centroid(anchors=anchor.solo.df,
                        wv=fake_word_vectors)
    expect_type(out, "double")
    expect_identical(dim(out),  ce.dims)
    expect_identical(rownames(out), ce.name)
    expect_identical(out.c[1], out[1])

    # tibble
    out <- get_centroid(anchors=anchor.solo.tbl,
                        wv=fake_word_vectors)

    expect_type(out, "double")
    expect_identical(dim(out),  ce.dims)
    expect_identical(rownames(out), ce.name)
    expect_identical(out.c[1], out[1])

    ## paired list of terms ##
    ## get_centroid should only use first lsit

    # actual list list
    out <- get_centroid(anchors=anchor.pair.list,
                             wv=fake_word_vectors)
    expect_type(out, "double")
    expect_identical(dim(out),  ce.dims)
    expect_identical(rownames(out), ce.name)
    expect_identical(out.c[1], out[1])

    # data.frame
    out <- get_centroid(anchors=anchor.pair.df,
                             wv=fake_word_vectors)
    expect_type(out, "double")
    expect_identical(dim(out),  ce.dims)
    expect_identical(rownames(out), ce.name)
    expect_identical(out.c[1], out[1])

    # tibble
    out <- get_centroid(anchors=anchor.pair.tbl,
                             wv=fake_word_vectors)
    expect_type(out, "double")
    expect_identical(dim(out),  ce.dims)
    expect_identical(rownames(out), ce.name)
    expect_identical(out.c[1], out[1])

})


test_that("get_direction works on different data types", {

    ## paired list of terms ##
    # actual list list
    out <- get_direction(anchors=anchor.pair.list,
                        wv=fake_word_vectors)
    expect_type(out, "double")
    expect_identical(dim(out),  di.dims)
    expect_identical(rownames(out), di.name)

    # data.frame
    out <- get_direction(anchors=anchor.pair.df,
                        wv=fake_word_vectors)
    expect_type(out, "double")
    expect_identical(dim(out),  di.dims)
    expect_identical(rownames(out), di.name)

    # tibble
    out <- get_direction(anchors=anchor.pair.tbl,
                         wv=fake_word_vectors)
    expect_type(out, "double")
    expect_identical(dim(out),  di.dims)
    expect_identical(rownames(out), di.name)

})

test_that("get_direction errors if only one set of terms", {

    out <- get_direction(anchors=anchor.pair.list,
                            wv=fake_word_vectors,
                            method="paired")
    expect_type(out, "double")
    expect_identical(dim(out),  di.dims)
    expect_identical(rownames(out), di.name)

    out <- get_direction(anchors=anchor.pair.list,
                            wv=fake_word_vectors,
                            method="pooled")
    expect_type(out, "double")
    expect_identical(dim(out),  di.dims)
    expect_identical(rownames(out), di.name)

    out <- get_direction(anchors=anchor.pair.list,
                            wv=fake_word_vectors,
                            method="L2")
    expect_type(out, "double")
    expect_identical(dim(out),  di.dims)
    expect_identical(rownames(out), di.name)

    out <- get_direction(anchors=anchor.pair.list,
                         wv=fake_word_vectors,
                         method="PCA")
    expect_type(out, "double")
    expect_identical(dim(out),  di.dims)
    expect_identical(rownames(out), di.name)

    expect_error(get_direction(anchors=anchor.pair.list,
                            wv=fake_word_vectors,
                            method="not_a_method"),
                 "method must be 'pooled', 'paired', 'L2' or 'PCA'")

})

test_that("check if different get_direction methods work", {

    expect_error( get_direction(anchors=anchor.solo.list,
                                wv=fake_word_vectors),
                  "get_direction requires two sets of juxtaposing terms")

    expect_error( get_direction(anchors=anchor.solo.df,
                                wv=fake_word_vectors),
                  "get_direction requires two sets of juxtaposing terms")

    expect_error( get_direction(anchors=anchor.solo.tbl,
                                wv=fake_word_vectors),
                  "get_direction requires two sets of juxtaposing terms")

})

test_that(".check_term_in_embeddings works on different data types", {

    ## single list of terms ##
    # character list
    terms <- c("choose", "moon")
    out <- .check_term_in_embeddings(terms=terms,
                                     wv=fake_word_vectors,
                                     action="remove")
    expect_identical(out, terms)
    # actual list list
    terms <- list(c("choose", "moon"))
    out <- .check_term_in_embeddings(terms=terms,
                                     wv=fake_word_vectors,
                                     action="remove")
    expect_identical(out, terms)
    # data.frame
    terms <- data.frame(add = c("choose", "moon"))
    out <- .check_term_in_embeddings(terms=terms,
                                     wv=fake_word_vectors,
                                     action="remove")
    expect_identical(out, terms)
    # tibble
    terms <- tibble::tibble(add = c("choose", "moon"))
    out <- .check_term_in_embeddings(terms=terms,
                                     wv=fake_word_vectors,
                                     action="remove")
    expect_identical(out, terms)

    ## paired list of terms ##
    # actual list list
    terms <- list(add=c("choose", "moon"),
                  sub=c("decade", "this"))
    out <- .check_term_in_embeddings(terms=terms,
                                     wv=fake_word_vectors,
                                     action="remove")
    expect_identical(out, terms)
    # data.frame
    terms <- data.frame(add=c("choose", "moon"),
                        sub=c("decade", "this"))
    out <- .check_term_in_embeddings(terms=terms,
                                     wv=fake_word_vectors,
                                     action="remove")
    expect_identical(out, terms)
    # tibble
    terms <- tibble::tibble(add=c("choose", "moon"),
                            sub=c("decade", "this"))
    out <- .check_term_in_embeddings(terms=terms,
                                     wv=fake_word_vectors,
                                     action="remove")
    expect_identical(out, terms)

})

test_that(".check_term_in_embeddings removes words on different data types", {

    ## single list of terms ##
    # character list
    terms <- c("choose", "moon", "picklespit")
    expect_message(out <- .check_term_in_embeddings(terms=terms,
                                     wv=fake_word_vectors,
                                     action="remove"))

    "The following (and any associated terms) removed
    because there are no matching word vectors: picklespit"

    expect_identical(out,
                     c("choose", "moon"))

    # actual list list
    terms <- list(c("choose", "moon", "picklespit"))
    expect_message(out <- .check_term_in_embeddings(terms=terms,
                                     wv=fake_word_vectors,
                                     action="remove"))
    expect_identical(out,
                     list(c("choose", "moon")))

    # data.frame
    terms <- data.frame(add = c("choose", "moon", "picklespit"))
    expect_message(out <- .check_term_in_embeddings(terms=terms,
                                     wv=fake_word_vectors,
                                     action="remove"))
    expect_identical(out,
                     data.frame(add = c("choose", "moon")))

    # tibble
    terms <- tibble::tibble(add = c("choose", "moon", "picklespit"))
    expect_message(out <- .check_term_in_embeddings(terms=terms,
                                     wv=fake_word_vectors,
                                     action="remove"))
    expect_identical(out,
                     tibble::tibble(add = c("choose", "moon")))

    ## paired list of terms ##
    # actual list list
    terms <- list(add=c("choose", "moon","picklespit", "decade"),
                  sub=c("decade", "this", "choose", "picklespit"))
    expect_message(out <- .check_term_in_embeddings(terms=terms,
                                     wv=fake_word_vectors,
                                     action="remove"))
    expect_identical(out,
                     list(add=c("choose", "moon", "decade"),
                          sub=c("decade", "this", "choose")))
    # data.frame
    terms <- data.frame(add=c("choose", "moon","picklespit", "decade"),
                        sub=c("decade", "this", "choose", "picklespit"))
    expect_message(out <- .check_term_in_embeddings(terms=terms,
                                     wv=fake_word_vectors,
                                     action="remove"))
    expect_identical(out,
                     data.frame(add=c("choose", "moon"),
                                sub=c("decade", "this")) )

    # tibble
    terms <- tibble::tibble(add=c("choose", "moon","picklespit", "decade"),
                            sub=c("decade", "this", "choose", "picklespit"))
    expect_message(out <- .check_term_in_embeddings(terms=terms,
                                     wv=fake_word_vectors,
                                     action="remove"))
    expect_identical(out,
                     tibble::tibble(add=c("choose", "moon"),
                                    sub=c("decade", "this")) )


})


test_that(".check_term_in_embeddings removing all words will stop", {

    ## single list of terms ##
    # character list
    terms <- c("picklespit")
    expect_error(
        .check_term_in_embeddings(terms=terms,
                                  wv=fake_word_vectors,
                                  action="remove"),
        "The following have no matching word vectors: picklespit")

    # actual list list
    terms <- list(c("picklespit"))

    expect_error(
        .check_term_in_embeddings(terms=terms,
                                  wv=fake_word_vectors,
                                  action="remove"),
        "The following have no matching word vectors: picklespit")

    # data.frame
    terms <- data.frame(add = c("picklespit"))
    expect_error(
        .check_term_in_embeddings(terms=terms,
                                  wv=fake_word_vectors,
                                  action="remove"),
        "The following have no matching word vectors: picklespit")

    # tibble
    terms <- tibble::tibble(add = c("picklespit"))
    expect_error(
        .check_term_in_embeddings(terms=terms,
                                  wv=fake_word_vectors,
                                  action="remove"),
        "The following have no matching word vectors: picklespit")

    ## paired list of terms ##

    # actual list list
    terms <- list(add=c("picklespit", "mulepants"),
                  sub=c("choose", "picklespit"))
    expect_error(
        .check_term_in_embeddings(terms=terms,
                                  wv=fake_word_vectors,
                                  action="remove"),
        "The following have no matching word vectors: picklespit")

    # data.frame
    terms <- data.frame(add=c("picklespit", "decade"),
                        sub=c("choose", "picklespit"))
    expect_error(
        out <- .check_term_in_embeddings(terms=terms,
                                         wv=fake_word_vectors,
                                         action="remove"),
        "The following have no matching word vectors: picklespit")

    # tibble
    terms <- tibble::tibble(add=c("picklespit", "decade"),
                            sub=c("choose", "picklespit"))
    expect_error(
        out <- .check_term_in_embeddings(terms=terms,
                                         wv=fake_word_vectors,
                                         action="remove"),
        "The following have no matching word vectors: picklespit")


    # actual list list -- works on the opposite side too...
    terms <- list(add=c("choose", "picklespit"),
                  sub=c("mulepants", "picklespit"))
    expect_error(
        .check_term_in_embeddings(terms=terms,
                                  wv=fake_word_vectors,
                                  action="remove"),
        "The following have no matching word vectors: picklespit")

})


test_that(".check_term_in_embeddings only prints 10 bad words", {

    ## single list of terms ##
    # character list
    terms <- c("choose", "picklespit", "mulepants", "pandaboots",
               "rhinojumps", "penguinland", "tigersoda", "wildrumpus",
               "weirdal", "boul", "jawn", "handcrank", "quink")
    expect_message(
    out <- .check_term_in_embeddings(terms=terms,
                                  wv=fake_word_vectors,
                                  action="remove"))
    expect_identical(out, "choose")

})


test_that(".check_term_in_embeddings stops when words
            missing on different data types", {

    er.msg <- "The following have no matching word vectors: picklespit"

    ## single list of terms ##
    # character list
    terms <- c("choose", "moon", "picklespit")
    expect_error(.check_term_in_embeddings(terms=terms,
                                     wv=fake_word_vectors,
                                     action="stop"),  er.msg)

    # actual list list
    terms <- list(c("choose", "moon", "picklespit"))
    expect_error(.check_term_in_embeddings(terms=terms,
                                     wv=fake_word_vectors,
                                     action="stop"),  er.msg)

    # data.frame
    terms <- data.frame(add = c("choose", "moon", "picklespit"))
    expect_error(.check_term_in_embeddings(terms=terms,
                                     wv=fake_word_vectors,
                                     action="stop"), er.msg)

    # tibble
    terms <- tibble::tibble(add = c("choose", "moon", "picklespit"))
    expect_error(.check_term_in_embeddings(terms=terms,
                                     wv=fake_word_vectors,
                                     action="stop"), er.msg)

    ## paired list of terms ##
    # actual list list
    terms <- list(add=c("choose", "moon","picklespit", "decade"),
                  sub=c("decade", "this", "choose", "picklespit"))
    expect_error(.check_term_in_embeddings(terms=terms,
                                     wv=fake_word_vectors,
                                     action="stop"),  er.msg)

    # data.frame
    terms <- data.frame(add=c("choose", "moon","picklespit", "decade"),
                        sub=c("decade", "this", "choose", "picklespit"))
    expect_error(.check_term_in_embeddings(terms=terms,
                                     wv=fake_word_vectors,
                                     action="stop"),  er.msg)

    # tibble
    terms <- tibble::tibble(add=c("choose", "moon","picklespit", "decade"),
                            sub=c("decade", "this", "choose", "picklespit"))

    expect_error(.check_term_in_embeddings(terms=terms,
                                     wv=fake_word_vectors,
                                     action="stop"), er.msg)

})

test_that("get_regions seed works", {

    my.regions <- get_regions(
        wv = fake_word_vectors,
        k_regions = 20L,
        max_iter = 10L,
        seed = 442
    )

    my.regions2 <- get_regions(
        wv = fake_word_vectors,
        k_regions = 20L,
        max_iter = 10L,
        seed = 442
    )

    expect_identical(my.regions,  my.regions2)

})

test_that("find_projection will stop if lengths differ", {

    vector <- fake_word_vectors[2,1:5] #choose

    expect_error(find_projection(fake_word_vectors,
                                 vector))

})

test_that("find_rejection will stop if lengths differ", {

    vector <- fake_word_vectors[2,1:5] #choose

    expect_error(find_rejection(fake_word_vectors,
                                 vector))

})

test_that("find_projection produces matrix with correct dimensions", {

    vector <- fake_word_vectors[2,]

    project <- find_projection(fake_word_vectors, vector)

    expect_identical(length(vector), ncol(fake_word_vectors), ncol(project))

    expect_identical(nrow(fake_word_vectors), nrow(project))

})

test_that("find_rejection produces matrix with correct dimensions", {

    vector <- fake_word_vectors[2,]

    reject <- find_rejection(fake_word_vectors, vector)

    expect_identical(length(vector), ncol(fake_word_vectors), ncol(reject))

    expect_identical(nrow(fake_word_vectors), nrow(reject))

})

test_that("find_tranformation, dimensions and names", {

    norm <- find_transformation(wv=fake_word_vectors,
                                 method = "norm")
    center <- find_transformation(wv=fake_word_vectors,
                                 method = "center" )
    align <- find_transformation(wv=fake_word_vectors,
                                 ref=fake_word_vectors,
                                 method = "align" )

    fake_vectors_dgc <- methods::as(fake_word_vectors,
                                    "dgCMatrix")

    norm.dgc <- find_transformation(wv=fake_vectors_dgc,
                                method = "norm")
    center.dgc  <- find_transformation(wv=fake_vectors_dgc,
                                  method = "center" )
    align.dgc  <- find_transformation(wv=fake_vectors_dgc,
                                 ref=fake_vectors_dgc,
                                 method = "align" )

    expect_identical(dim(norm), dim(fake_word_vectors))
    expect_identical(dim(center), dim(fake_word_vectors))
    expect_identical(dim(align), dim(fake_word_vectors))

    expect_identical(rownames(norm), rownames(fake_word_vectors))
    expect_identical(rownames(center), rownames(fake_word_vectors))
    expect_identical(rownames(align), rownames(fake_word_vectors))

    expect_identical(dim(norm), dim(norm.dgc))
    expect_identical(dim(center), dim(center.dgc))
    expect_identical(dim(align), dim(align.dgc))

    expect_identical(rownames(norm), rownames(norm.dgc))
    expect_identical(rownames(center), rownames(center.dgc))
    expect_identical(rownames(align), rownames(align.dgc))

})

test_that("find_tranformation, dimensions and names", {

    tcm.approx <- text2vec::sim2(fake_word_vectors, method = "cosine")
    vocab <- intersect(rownames(tcm.dgc), rownames(tcm.approx))
    tcm.dgc.b <- tcm.dgc[vocab, vocab]
    tcm.approx <- tcm.approx[vocab, vocab]
    base <- mean((tcm.approx - tcm.dgc.b)^2)

    retro <- find_transformation(wv=fake_word_vectors,
                                 ref=tcm.dgc,
                                 method = "retrofit")

    tcm.approx <- text2vec::sim2(retro, method = "cosine")
    tcm.dgc.b <- tcm.dgc[vocab, vocab]
    tcm.approx <- tcm.approx[vocab, vocab]
    post <- mean((tcm.approx - tcm.dgc.b)^2)

    retro <- find_transformation(wv=retro,
                                ref=tcm.dgc,
                                method = "retrofit")
    
    tcm.approx <- text2vec::sim2(retro, method = "cosine")
    tcm.dgc.b <- tcm.dgc[vocab, vocab]
    tcm.approx <- tcm.approx[vocab, vocab]
    postb <- mean((tcm.approx - tcm.dgc.b)^2)

    expect_true(base > post)
    expect_true(post > postb)

    expect_identical(dim(retro), dim(fake_word_vectors))

})

Try the text2map package in your browser

Any scripts or data that you put into this service are public.

text2map documentation built on July 9, 2023, 6:35 p.m.