# Fixture corpus: four short sentences held in a single `text` column.
test_data <- tibble(
  text = c(
    "I would not eat them here or there.",
    "I would not eat them anywhere.",
    "I would not eat green eggs and ham.",
    "I do not like them, Sam-I-am."
  )
)

# Base recipe that treats every column of the fixture as a predictor.
rec_base <- recipe(~., data = test_data)
# Create some manual data for expected results: tokenize the fixture text and
# pair each original sentence (`text`) with its tokenized form (`tokens`).
tokens <- rec_base |>
  step_tokenize(text) |>
  recipes::prep() |>
  recipes::bake(new_data = NULL) |>
  # After baking, the `text` column carries the tokenized values, so the raw
  # sentences are bound back on under a temporary name (`text1`) and both
  # columns are relabelled in the final select().
  vctrs::vec_cbind(dplyr::rename(test_data, text1 = text)) |>
  dplyr::select(text = text1, tokens = text)
# Give each distinct token an arbitrary value for comparison. Real embeddings
# will be doubles, so these are stored as doubles too.
embeddings <- tokens |>
  dplyr::mutate(tokens = vctrs::field(tokens, "tokens")) |>
  tidyr::unnest(cols = tokens) |>
  dplyr::distinct(tokens) |>
  dplyr::arrange(tokens) |>
  # There are 17 unique tokens. Each one is numbered by its sorted position and
  # that number is spelled out as its 5 lowest bits (d1 is the least
  # significant), which is enough for every token to get a unique vector.
  dplyr::mutate(
    token_num_binary = purrr::map(
      seq_along(tokens),
      function(token_index) {
        low_bits <- intToBits(token_index)[1:5]
        tibble(
          dimension = paste0("d", 1:5),
          score = as.double(low_bits)
        )
      }
    )
  ) |>
  tidyr::unnest(cols = token_num_binary) |>
  # Spread the (dimension, score) pairs into one column per dimension.
  tidyr::pivot_wider(names_from = dimension, values_from = score)

saveRDS(
  object = embeddings,
  file = test_path("emb-data", "embeddings.rds"),
  version = 2
)
# Long format: one row per token occurrence in each sentence, with that token's
# embedding dimensions joined on from `embeddings`.
sentence_embeddings_long <- tokens |>
  dplyr::mutate(tokens = vctrs::field(tokens, "tokens")) |>
  tidyr::unnest(cols = tokens) |>
  dplyr::left_join(embeddings, by = "tokens")

saveRDS(
  object = sentence_embeddings_long,
  file = test_path("emb-data", "long.rds"),
  version = 2
)
# Summarize by each statistic, and reorder to original order.

# Sum: add up every embedding dimension within each sentence, then prefix the
# dimension columns with "wordembed_text_".
sentence_embeddings_sum <- sentence_embeddings_long |>
  dplyr::select(-tokens) |>
  dplyr::group_by(text) |>
  dplyr::summarize(
    dplyr::across(dplyr::everything(), sum),
    .groups = "drop"
  ) |>
  # Every column except `text` is a numeric embedding dimension.
  dplyr::rename_with(
    \(dimension) paste("wordembed_text", dimension, sep = "_"),
    .cols = !text
  )

# Joining onto test_data puts the rows back in the fixture's sentence order.
sentence_embeddings_sum <- test_data |>
  dplyr::left_join(sentence_embeddings_sum, by = "text")

saveRDS(sentence_embeddings_sum, test_path("emb-data", "sum.rds"), version = 2)
# Mean: average every embedding dimension within each sentence, then prefix the
# dimension columns with "wordembed_text_".
sentence_embeddings_mean <- sentence_embeddings_long |>
  dplyr::select(-tokens) |>
  dplyr::group_by(text) |>
  dplyr::summarize(
    dplyr::across(dplyr::everything(), mean),
    .groups = "drop"
  ) |>
  # Every column except `text` is a numeric embedding dimension.
  dplyr::rename_with(
    \(dimension) paste("wordembed_text", dimension, sep = "_"),
    .cols = !text
  )

# Joining onto test_data puts the rows back in the fixture's sentence order.
sentence_embeddings_mean <- test_data |>
  dplyr::left_join(sentence_embeddings_mean, by = "text")

saveRDS(
  sentence_embeddings_mean,
  test_path("emb-data", "mean.rds"),
  version = 2
)
# Min: take the smallest value of every embedding dimension within each
# sentence, then prefix the dimension columns with "wordembed_text_".
sentence_embeddings_min <- sentence_embeddings_long |>
  dplyr::select(-tokens) |>
  dplyr::group_by(text) |>
  dplyr::summarize(
    dplyr::across(dplyr::everything(), min),
    .groups = "drop"
  ) |>
  # Every column except `text` is a numeric embedding dimension.
  dplyr::rename_with(
    \(dimension) paste("wordembed_text", dimension, sep = "_"),
    .cols = !text
  )

# Joining onto test_data puts the rows back in the fixture's sentence order.
sentence_embeddings_min <- test_data |>
  dplyr::left_join(sentence_embeddings_min, by = "text")

saveRDS(sentence_embeddings_min, test_path("emb-data", "min.rds"), version = 2)
# Max: take the largest value of every embedding dimension within each
# sentence, then prefix the dimension columns with "wordembed_text_".
sentence_embeddings_max <- sentence_embeddings_long |>
  dplyr::select(-tokens) |>
  dplyr::group_by(text) |>
  dplyr::summarize(
    dplyr::across(dplyr::everything(), max),
    .groups = "drop"
  ) |>
  # Every column except `text` is a numeric embedding dimension.
  dplyr::rename_with(
    \(dimension) paste("wordembed_text", dimension, sep = "_"),
    .cols = !text
  )

# Joining onto test_data puts the rows back in the fixture's sentence order.
sentence_embeddings_max <- test_data |>
  dplyr::left_join(sentence_embeddings_max, by = "text")

saveRDS(sentence_embeddings_max, test_path("emb-data", "max.rds"), version = 2)
# Add the following code to your website.
# For more information on customizing the embed code, read Embedding Snippets.