#' Contrast nearest neighbors
#'
#' Computes the ratio of cosine similarities between group embeddings and features
#' --that is, for any given feature it first computes the similarity between that feature
#' and each group embedding, and then takes the ratio of these two similarities.
#' This ratio captures how "discriminant" a feature is of a given group.
#'
#' @param x (quanteda) `tokens-class` object
#' @param groups (numeric, factor, character) a binary variable of the same length as `x`
#' @inheritParams dem
#' @param bootstrap (logical) if TRUE, use bootstrapping -- sample from texts with replacement and
#' re-estimate cosine ratios for each sample. Required to get std. errors.
#' @param num_bootstraps (numeric) - number of bootstraps to use
#' @param confidence_level (numeric in (0,1)) confidence level e.g. 0.95
#' @param permute (logical) - if TRUE, compute empirical p-values using a permutation test
#' @param num_permutations (numeric) - number of permutations to use
#' @param candidates (character) vector of candidate features for nearest neighbors
#' @param N (numeric) - nearest neighbors are subset to the union of the N neighbors of each group (if NULL, ratio is computed for all features)
#'
#' @return a data.frame with following columns:
#' \describe{
#' \item{`feature`}{(character) vector of feature terms corresponding to the nearest neighbors.}
#' \item{`value`}{(numeric) ratio of cosine similarities. Average over bootstrapped samples if bootstrap = TRUE.}
#' \item{`std.error`}{(numeric) std. error of the ratio of cosine similarties. Column is dropped if bootsrap = FALSE.}
#' \item{`lower.ci`}{(numeric) (if bootstrap = TRUE) lower bound of the confidence interval.}
#' \item{`upper.ci`}{(numeric) (if bootstrap = TRUE) upper bound of the confidence interval.}
#' \item{`p.value`}{(numeric) empirical p-value. Column is dropped if permute = FALSE.}
#' }
#'
#' @export
#' @rdname contrast_nns
#' @keywords contrast_nns
#' @examples
#'
#' library(quanteda)
#'
#' cr_toks <- tokens(cr_sample_corpus)
#'
#' immig_toks <- tokens_context(x = cr_toks,
#' pattern = "immigration", window = 6L, hard_cut = FALSE, verbose = TRUE)
#'
#' # sample 100 instances of the target term, stratifying by party (only for example purposes)
#' set.seed(2022L)
#' immig_toks <- tokens_sample(immig_toks, size = 100, by = docvars(immig_toks, 'party'))
#'
#' set.seed(42L)
#' party_nns <- contrast_nns(x = immig_toks,
#' groups = docvars(immig_toks, 'party'),
#' pre_trained = cr_glove_subset,
#' transform = TRUE, transform_matrix = cr_transform,
#' bootstrap = TRUE,
#' num_bootstraps = 100,
#' confidence_level = 0.95,
#' permute = TRUE, num_permutations = 10,
#' candidates = NULL, N = 20,
#' verbose = FALSE)
#'
#' head(party_nns)
contrast_nns <- function(x, groups = NULL, pre_trained = NULL, transform = TRUE, transform_matrix = NULL, bootstrap = TRUE, num_bootstraps = 100, confidence_level = 0.95, permute = TRUE, num_permutations = 100, candidates = NULL, N = 20, verbose = TRUE){
# checks
if(bootstrap && (confidence_level >= 1 || confidence_level<=0)) stop('"confidence_level" must be a numeric value between 0 and 1.', call. = FALSE) # check confidence level is between 0 and 1
if(bootstrap && num_bootstraps < 100) stop('num_bootstraps must be at least 100') # check num_bootstraps >= 100
if(class(x)[1] != "tokens") stop("data must be of class tokens")
groupvals <- unique(groups)
if(length(groupvals)!=2) stop("groups must be binary")
# add grouping variable to docvars
quanteda::docvars(x, "group") <- groups
# construct document-feature-matrix
toks_dfm <- quanteda::dfm(x, tolower = FALSE)
# construct document-embedding-matrix
toks_dem <- dem(x = toks_dfm, pre_trained = pre_trained, transform = transform, transform_matrix = transform_matrix, verbose = verbose)
# aggregate dems by group var
embeds_out1 <- toks_dem[which(toks_dem@docvars$group == groupvals[1]),]
embeds_out2 <- toks_dem[which(toks_dem@docvars$group == groupvals[2]),]
if(bootstrap){
cat('starting bootstrapping \n')
# bootstrap similarity
bootstrap_out <- replicate(num_bootstraps, bootstrap_contrast(target_embeddings1 = embeds_out1, target_embeddings2 = embeds_out2, pre_trained, candidates = candidates, norm = 'l2'), simplify = FALSE)
# sim_out1
bs_sim_out1 <- lapply(bootstrap_out, '[[', 'cos_sim1') %>% do.call(rbind,.)
sim_out1 <- apply(bs_sim_out1, 2, mean)
stderror_sim_out1 <- apply(bs_sim_out1, 2, sd)
ci_sim_out1 <- apply(bs_sim_out1, 2, function(x) x[order(x)])[c(round((1-confidence_level)*num_bootstraps),round(confidence_level*num_bootstraps)),]
nns1 <- dplyr::tibble(feature = names(sim_out1), value = unname(sim_out1), std.error = unname(stderror_sim_out1), lower.ci = unname(ci_sim_out1[1,]), upper.ci = unname(ci_sim_out1[2,]))
# sim_out2
bs_sim_out2 <- lapply(bootstrap_out, '[[', 'cos_sim2') %>% do.call(rbind,.)
sim_out2 <- apply(bs_sim_out2, 2, mean)
stderror_sim_out2 <- apply(bs_sim_out2, 2, sd)
ci_sim_out2 <- apply(bs_sim_out2, 2, function(x) x[order(x)])[c(round((1-confidence_level)*num_bootstraps),round(confidence_level*num_bootstraps)),]
nns2 <- dplyr::tibble(feature = names(sim_out2), value = unname(sim_out2), std.error = unname(stderror_sim_out2), lower.ci = unname(ci_sim_out2[1,]), upper.ci = unname(ci_sim_out2[2,]))
# sim_ratio
bs_sim_ratio <- lapply(bootstrap_out, '[[', 'sim_ratio') %>% do.call(rbind,.)
sim_ratio <- apply(bs_sim_ratio, 2, mean)
dev1 <- abs(sim_ratio - 1)
stderror_sim_ratio <- apply(bs_sim_ratio, 2, sd)
ci_sim_ratio <- apply(bs_sim_ratio, 2, function(x) x[order(x)])[c(round((1-confidence_level)*num_bootstraps),round(confidence_level*num_bootstraps)),]
nns_ratio <- dplyr::tibble(feature = names(sim_ratio), value = unname(sim_ratio), std.error = unname(stderror_sim_ratio), lower.ci = unname(ci_sim_ratio[1,]), upper.ci = unname(ci_sim_ratio[2,]))
cat('done bootstrapping \n')
}else{
# ALC embeddings
alc_embedding1 <- matrix(Matrix::colMeans(embeds_out1), nrow = 1)
alc_embedding2 <- matrix(Matrix::colMeans(embeds_out2), nrow = 1)
if(length(candidates) >0){
# sim_out1
sim_out1 <- text2vec::sim2(alc_embedding1, pre_trained[candidates,], method = 'cosine', norm = 'l2')
# sim_out2
sim_out2 <- text2vec::sim2(alc_embedding2, pre_trained[candidates,], method = 'cosine', norm = 'l2')
}else{
# sim_out1
sim_out1 <- text2vec::sim2(alc_embedding1, pre_trained, method = 'cosine', norm = 'l2')
# sim_out2
sim_out2 <- text2vec::sim2(alc_embedding2, pre_trained, method = 'cosine', norm = 'l2')
}
# nearest neighbors
nns1 <- dplyr::tibble(feature = colnames(sim_out1), value = sim_out1[1,])
nns2 <- dplyr::tibble(feature = colnames(sim_out2), value = sim_out2[1,])
# ratio of cosine similarities
sim_ratio <- sim_out1/sim_out2
dev1 <- abs(sim_ratio - 1)
nns_ratio <- dplyr::tibble(feature = colnames(sim_ratio), value = sim_ratio[1,])
}
if(permute){
# permute similarity
cat('starting permutations \n')
permute_out <- replicate(num_permutations, permute_contrast(target_embeddings1 = embeds_out1, target_embeddings2 = embeds_out2, pre_trained, candidates = candidates, norm = 'l2'), simplify = FALSE)
# pm_out1
pm_sim_out1 <- lapply(permute_out, '[[', 'cos_sim1') %>% do.call(rbind,.)
pm_comp1 <- apply(pm_sim_out1, 1, function(i) i >= sim_out1)
pvalue1 <- apply(pm_comp1, 1, function(i) sum(i)/length(i))
nns1 <- cbind(nns1, p.value = unname(pvalue1))
# pm_out2
pm_sim_out2 <- lapply(permute_out, '[[', 'cos_sim2') %>% do.call(rbind,.)
pm_comp2 <- apply(pm_sim_out2, 1, function(i) i >= sim_out2)
pvalue2 <- apply(pm_comp2, 1, function(i) sum(i)/length(i))
nns2 <- cbind(nns2, p.value = unname(pvalue2))
# pm_ratio
pm_ratio_out <- lapply(permute_out, '[[', 'sim_ratio') %>% do.call(rbind,.)
pm_dev1 <- abs(pm_ratio_out - 1)
pm_dev1 <- apply(pm_dev1, 1, function(i) i >= dev1)
pvalue_ratio <- apply(pm_dev1, 1, function(i) sum(i)/length(i))
nns_ratio <- cbind(nns_ratio, p.value = unname(pvalue_ratio))
cat('done with permutations \n')
}
# arrange
nns1 <- nns1 %>% dplyr::arrange(-value)
nns2 <- nns2 %>% dplyr::arrange(-value)
nns_ratio <- nns_ratio %>% dplyr::arrange(-value)
# subset if to union of top N nearest neighbors if specified
if(is.numeric(N)){
# subset nnss
nns1 <- nns1 %>% dplyr::slice(1:N)
nns2 <- nns2 %>% dplyr::slice(1:N)
# subset nns_ratio
nns_ratio <- nns_ratio %>% dplyr::filter(feature %in% union(nns1$feature,nns2$feature))
}
# output
return(nns_ratio)
}
# -----------------------------
#
# SUB-FUNCTIONS
#
# -----------------------------
#' Permute similarity and ratio computations
#'
#' @param target_embeddings1 ALC embeddings for group 1
#' @param target_embeddings2 ALC embeddings for group 2
#' @param pre_trained a V x D matrix of numeric values - pretrained embeddings with V = size of vocabulary and D = embedding dimensions
#' @param candidates character vector defining the candidates for nearest neighbors - e.g. output from `get_local_vocab`
#' @param norm character = c("l2", "none") - set to 'l2' for cosine similarity and to 'none' for inner product (see ?sim2 in text2vec)
#'
#' @return a list with three elements, nns for group 1, nns for group 2 and nns_ratio comparing with ratios of similarities between the two groups
#' @export
#'
# runs permutations
permute_contrast <- function(target_embeddings1 = NULL, target_embeddings2 = NULL, pre_trained = NULL, candidates = NULL, norm = NULL){
# number of observations
num_obs1 <- nrow(target_embeddings1)
num_obs2 <- nrow(target_embeddings2)
num_obs <- num_obs1 + num_obs2
# randomly sample observations for each group
obs1_perm <- sample(1:num_obs, num_obs1)
obs2_perm <- setdiff(1:num_obs, obs1_perm)
# sample embeddings
target_embeddings <- rbind(target_embeddings1, target_embeddings2)
target_embeddings <- target_embeddings[sample(1:nrow(target_embeddings)),] # shuffle
target_embeddings1_perm <- matrix(Matrix::colMeans(target_embeddings[obs1_perm,]), nrow = 1)
target_embeddings2_perm <- matrix(Matrix::colMeans(target_embeddings[obs2_perm,]), nrow = 1)
# compute similarities
pm_out <- compute_contrast(target_embeddings1_perm, target_embeddings2_perm, pre_trained = pre_trained, candidates = candidates, norm = 'l2')
# output
return(pm_out)
}
#' Bootstrap similarity and ratio computations
#'
#' @param target_embeddings1 ALC embeddings for group 1
#' @param target_embeddings2 ALC embeddings for group 2
#' @param pre_trained a V x D matrix of numeric values - pretrained embeddings with V = size of vocabulary and D = embedding dimensions
#' @param candidates character vector defining the candidates for nearest neighbors - e.g. output from `get_local_vocab`
#' @param norm character = c("l2", "none") - set to 'l2' for cosine similarity and to 'none' for inner product (see ?sim2 in text2vec)
#'
#' @return a list with three elements, nns for group 1, nns for group 2 and nns_ratio comparing with ratios of similarities between the two groups
#' @export
#'
# runs bootstraps
bootstrap_contrast <- function(target_embeddings1 = NULL, target_embeddings2 = NULL, pre_trained = NULL, candidates = NULL, norm = NULL){
# sample
bs_target_embeddings1 <- target_embeddings1[sample(1:nrow(target_embeddings1), replace = TRUE),]
bs_target_embeddings2 <- target_embeddings2[sample(1:nrow(target_embeddings2), replace = TRUE),]
# compute similarities
bs_out <- compute_contrast(bs_target_embeddings1, bs_target_embeddings2, pre_trained = pre_trained, candidates = candidates, norm = 'l2')
# output
return(bs_out)
}
#' Compute similarity and similarity ratios
#'
#' @param target_embeddings1 ALC embeddings for group 1
#' @param target_embeddings2 ALC embeddings for group 2
#' @param pre_trained a V x D matrix of numeric values - pretrained embeddings with V = size of vocabulary and D = embedding dimensions
#' @param candidates character vector defining the candidates for nearest neighbors - e.g. output from `get_local_vocab`
#' @param norm character = c("l2", "none") - set to 'l2' for cosine similarity and to 'none' for inner product (see ?sim2 in text2vec)
#'
#' @return a list with three elements, nns for group 1, nns for group 2 and nns_ratio comparing with ratios of similarities between the two groups
#' @export
#'
# computes ratio of similarities
compute_contrast <- function(target_embeddings1 = NULL, target_embeddings2 = NULL, pre_trained = NULL, candidates = NULL, norm = NULL){
# ALC embeddings
alc_embedding1 <- matrix(Matrix::colMeans(target_embeddings1), nrow = 1)
alc_embedding2 <- matrix(Matrix::colMeans(target_embeddings2), nrow = 1)
# cosine similarities
if(length(candidates) > 0){
cos_sim1 <- text2vec::sim2(alc_embedding1, pre_trained[candidates,], method = 'cosine', norm = 'l2')
cos_sim2 <- text2vec::sim2(alc_embedding2, pre_trained[candidates,], method = 'cosine', norm = 'l2')}else{
cos_sim1 <- text2vec::sim2(alc_embedding1, pre_trained, method = 'cosine', norm = 'l2')
cos_sim2 <- text2vec::sim2(alc_embedding2, pre_trained, method = 'cosine', norm = 'l2')
}
# ratio of cosine similarities
sim_ratio <- cos_sim1/cos_sim2
# output
return(list(cos_sim1 = cos_sim1, cos_sim2 = cos_sim2, sim_ratio = sim_ratio))
}
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.