# inst/doc/guided_tour.R

## -----------------------------------------------------------------------------
library(tidyverse)
library(microbenchmark)
library(fuzzyjoin)
library(zoomerjoin)

# First corpus: the leading 500 rows of dime_data (a dataset packaged with
# zoomerjoin), with its columns renamed to "a" and "field".
corpus_1 <- setNames(head(dime_data, 500), c("a", "field"))
corpus_1

## -----------------------------------------------------------------------------
# Second corpus: the trailing 500 rows of dime_data (a dataset packaged with
# zoomerjoin), with its columns renamed to "b" and "field".
corpus_2 <- setNames(tail(dime_data, 500), c("b", "field"))
corpus_2

## -----------------------------------------------------------------------------
# Seed the RNG, then time a Jaccard inner join of the two corpora on the
# shared "field" column.
set.seed(1)
start_time <- Sys.time()
join_out <- jaccard_inner_join(
  corpus_1,
  corpus_2,
  by = "field",
  threshold = 0.8,
  n_gram_width = 6,
  n_bands = 20,
  band_width = 6
)
# Report the elapsed wall-clock time, then show the joined result.
print(difftime(Sys.time(), start_time))
print(join_out)

## -----------------------------------------------------------------------------
# Draw the curve for the same band settings used in the join above
# (20 bands, band width 6).
jaccard_curve(n_bands = 20, band_width = 6)

## -----------------------------------------------------------------------------
# Example organization names containing several spelling variants of the
# same entities.
organization_names <- c(
  "American Civil Liberties Union",
  "American Civil Liberties Union (ACLU)",
  "NRA National Rifle Association",
  "National Rifle Association NRA",
  "National Rifle Association",
  "Planned Parenthood",
  "Blue Cross"
)

# Group the names with jaccard_string_group and print the result.
standardized_organization_names <- jaccard_string_group(
  organization_names,
  band_width = 3,
  threshold = 0.5
)
print(standardized_organization_names)

# Try the zoomerjoin package in your browser

# Any scripts or data that you put into this service are public.

# zoomerjoin documentation built on April 13, 2025, 9:08 a.m.