# Nothing  (stray non-code text, apparently page-extraction residue; commented out so the script parses)
## ----include = FALSE----------------------------------------------------------
# Set knitr chunk defaults for the whole document: `collapse = TRUE` and a
# "#>" prefix for output lines (see knitr's chunk-option reference).
knitr::opts_chunk$set(
collapse = TRUE,
comment = "#>"
)
## ----setup--------------------------------------------------------------------
# Attach RapidFuzz; the string-metric and fuzzy-matching calls in the chunks
# below are expected to come from this package.
library(RapidFuzz)
## -----------------------------------------------------------------------------
# Quick start: compare "kitten" and "sitting" two ways, first as a Levenshtein
# distance and then as a normalized similarity.
levenshtein_distance("kitten", "sitting")
levenshtein_normalized_similarity("kitten", "sitting")
## -----------------------------------------------------------------------------
# Side-by-side table of eight metrics for a single pair of spellings.
s1 <- "California"
s2 <- "Kalifornia"
data.frame(
  metric = c(
    "Levenshtein", "Damerau-Levenshtein", "Hamming", "Jaro",
    "Jaro-Winkler", "LCSseq", "OSA", "Indel"
  ),
  # Only the two Jaro-family distances are rounded; the rest are used as-is.
  distance = c(
    levenshtein_distance(s1, s2),
    damerau_levenshtein_distance(s1, s2),
    hamming_distance(s1, s2),
    round(jaro_distance(s1, s2), digits = 4),
    round(jaro_winkler_distance(s1, s2), digits = 4),
    lcs_seq_distance(s1, s2),
    osa_distance(s1, s2),
    indel_distance(s1, s2)
  ),
  # Collect all eight similarities first, then round the vector in one go.
  normalized_similarity = round(
    c(
      levenshtein_normalized_similarity(s1, s2),
      damerau_levenshtein_normalized_similarity(s1, s2),
      hamming_normalized_similarity(s1, s2),
      jaro_normalized_similarity(s1, s2),
      jaro_winkler_normalized_similarity(s1, s2),
      lcs_seq_normalized_similarity(s1, s2),
      osa_normalized_similarity(s1, s2),
      indel_normalized_similarity(s1, s2)
    ),
    digits = 4
  )
)
## -----------------------------------------------------------------------------
# Same text, differing only in letter case
fuzz_ratio("New York City", "new york city")
# Partial match: one string is contained in the other
fuzz_partial_ratio("York", "New York City")
# Same words in a different order (the first string also has an extra "of")
fuzz_token_sort_ratio("City of New York", "New York City")
# Common tokens: the first string has one extra token ("NY")
fuzz_token_set_ratio("New York City NY", "New York City")
# Weighted ratio; the inputs differ in case and by a trailing "!!"
fuzz_WRatio("New York City", "new york city!!")
## -----------------------------------------------------------------------------
# The three "partial token" scorers applied to one pair of strings that share
# the words "Museum", "Modern" and "Art" but differ in order and extra tokens
fuzz_partial_token_sort_ratio("Museum of Modern Art", "Modern Art Museum NYC")
fuzz_partial_token_set_ratio("Museum of Modern Art", "Modern Art Museum NYC")
fuzz_partial_token_ratio("Museum of Modern Art", "Modern Art Museum NYC")
## -----------------------------------------------------------------------------
# Typo-ridden spellings of U.S. state names
queries <- c("Kalifornia", "Nwe York", "Texs", "Florda", "Pensylvania")
# Candidate pool: R's built-in vector of state names
states <- state.name
# Build one single-row data frame per query holding its top candidate.
# score_cutoff = 0 is passed explicitly, presumably so no candidate is
# dropped for scoring too low.
results <- lapply(queries, function(misspelled) {
  hit <- extract_best_match(misspelled, states, score_cutoff = 0)
  data.frame(
    query = misspelled,
    best_match = hit$choice,
    score = round(hit$score, digits = 2)
  )
})
# Stack the per-query rows into a single table
do.call(rbind, results)
## -----------------------------------------------------------------------------
# Find up to 5 states (limit = 5) similar to "New", scored with the
# "PartialRatio" scorer and a score cutoff of 50; `states` comes from the
# earlier chunk
extract_matches("New", states, score_cutoff = 50, limit = 5, scorer = "PartialRatio")
## -----------------------------------------------------------------------------
# States similar to "North", using a score cutoff of 70
extract_similar_strings("North", states, score_cutoff = 70)
## -----------------------------------------------------------------------------
# Show which candidate each scorer ranks first for one lowercase query.
query <- "san francisco"
cities <- c(
  "San Francisco", "San Fernando", "Santa Fe", "San Diego",
  "Francisco", "South San Francisco", "San Fran"
)
scorers <- c(
  "Ratio", "PartialRatio", "TokenSortRatio", "TokenSetRatio",
  "WRatio", "QRatio", "PartialTokenSortRatio",
  "PartialTokenSetRatio", "PartialTokenRatio", "TokenRatio"
)
# For every scorer, request up to three matches and keep only the first row.
results <- lapply(scorers, function(scorer_name) {
  top <- extract_matches(
    query, cities,
    score_cutoff = 0, limit = 3, scorer = scorer_name
  )
  data.frame(
    scorer = scorer_name,
    rank1 = top$choice[1],
    score1 = round(top$score[1], digits = 1)
  )
})
# One row per scorer
do.call(rbind, results)
## -----------------------------------------------------------------------------
# processString() with the three flag combinations used in this document,
# applied to inputs containing non-ASCII letters and surrounding spaces.
# Trim + lowercase
processString(" São Paulo ", processor = TRUE, asciify = FALSE)
# Trim + lowercase + ASCII transliteration
processString(" São Paulo ", processor = TRUE, asciify = TRUE)
# ASCII only
processString("Ñoño", processor = FALSE, asciify = TRUE)
## -----------------------------------------------------------------------------
# Effect of preprocessing on a score: the same pair is compared raw and then
# after both sides go through processString() with identical settings.
# Without preprocessing
fuzz_ratio("São Paulo", "sao paulo")
# With preprocessing
fuzz_ratio(
processString("São Paulo", processor = TRUE, asciify = TRUE),
processString("sao paulo", processor = TRUE, asciify = TRUE)
)
## -----------------------------------------------------------------------------
# Levenshtein edit operations between "saturday" and "sunday"; the result is
# stored in `ops` (reused by the next chunk) and then printed
ops <- get_editops("saturday", "sunday")
ops
## -----------------------------------------------------------------------------
# Apply the operations stored in `ops`, passing the same pair of strings
editops_apply_str(ops, "saturday", "sunday")
## -----------------------------------------------------------------------------
# LCSseq edit operations, shown for a different pair of strings
lcs_seq_editops("kitten", "sitting")
## -----------------------------------------------------------------------------
# The two words share the prefix "intern" (6 characters) before diverging
prefix_similarity("international", "internet")
prefix_normalized_similarity("international", "internet")
# The two words share the postfix "ation" (5 characters)
postfix_similarity("education", "formation")
postfix_normalized_similarity("education", "formation")
## -----------------------------------------------------------------------------
# Record-linkage demo: messy name variants on one side ...
dirty <- c(
  "J. Smith", "Jane M. Doe", "Bob Johnson Jr",
  "Alice Wonderland", "Charlie Browne"
)
# ... and the canonical reference names on the other. "David Lee" has no
# counterpart in `dirty`.
clean <- c(
  "John Smith", "Jane Mary Doe", "Robert Johnson Junior",
  "Alice Wonder", "Charles Brown", "David Lee"
)
# For each messy record, look up its top-scoring reference name and keep the
# pairing together with the rounded score as a one-row data frame.
matches <- lapply(dirty, function(record) {
  hit <- extract_best_match(record, clean, score_cutoff = 0)
  data.frame(
    dirty_record = record,
    matched_to = hit$choice,
    confidence = round(hit$score, digits = 1)
  )
})
# Combine the one-row frames into the final linkage table
do.call(rbind, matches)
## -----------------------------------------------------------------------------
# Rough timing comparison: RapidFuzz's levenshtein_distance() against adist()
# from the utils package, each called 1000 times on the same two strings.
# Seed the RNG so the two random 100-letter inputs are identical on every run;
# the measured times themselves still depend on the machine and its load.
set.seed(42)
s1 <- paste(sample(letters, 100, replace = TRUE), collapse = "")
s2 <- paste(sample(letters, 100, replace = TRUE), collapse = "")
bench <- system.time(
  for (i in seq_len(1000)) levenshtein_distance(s1, s2)
)
bench_base <- system.time(
  for (i in seq_len(1000)) adist(s1, s2)
)
data.frame(
  # adist() is exported by utils, not base, so label it with its real namespace
  method = c("RapidFuzz", "utils::adist"),
  # [[ ]] extracts the bare elapsed seconds without the duplicated
  # "elapsed" element names
  time_1000_calls = c(bench[["elapsed"]], bench_base[["elapsed"]])
)
# Any scripts or data that you put into this service are public.
# Add the following code to your website.
# For more information on customizing the embed code, read Embedding Snippets.