# Global knitr options for this vignette: collapse source and output
# into one block, and prefix printed output with "#>".
knitr::opts_chunk$set(
  collapse = TRUE,
  comment = "#>"
)
library(RapidFuzz)
RapidFuzz provides high-performance string similarity and distance functions powered by the C++ library rapidfuzz-cpp. It is useful for tasks such as record linkage, fuzzy matching, typo correction, and deduplication.
This vignette demonstrates the main features of the package using data readily available in R.
The Levenshtein distance counts the minimum number of single-character edits (insertions, deletions, substitutions) needed to transform one string into another.
# Raw edit distance: how many insertions/deletions/substitutions are needed.
levenshtein_distance("kitten", "sitting")

# The same comparison, rescaled to a similarity score in [0, 1].
levenshtein_normalized_similarity("kitten", "sitting")
Let's compare how different metrics score the same pair of strings:
# Score one pair of strings under every distance metric the package offers,
# alongside each metric's normalized similarity.
s1 <- "California"
s2 <- "Kalifornia"

data.frame(
  metric = c(
    "Levenshtein", "Damerau-Levenshtein", "Hamming", "Jaro",
    "Jaro-Winkler", "LCSseq", "OSA", "Indel"
  ),
  distance = c(
    levenshtein_distance(s1, s2),
    damerau_levenshtein_distance(s1, s2),
    hamming_distance(s1, s2),
    round(jaro_distance(s1, s2), 4),
    round(jaro_winkler_distance(s1, s2), 4),
    lcs_seq_distance(s1, s2),
    osa_distance(s1, s2),
    indel_distance(s1, s2)
  ),
  normalized_similarity = c(
    round(levenshtein_normalized_similarity(s1, s2), 4),
    round(damerau_levenshtein_normalized_similarity(s1, s2), 4),
    round(hamming_normalized_similarity(s1, s2), 4),
    round(jaro_normalized_similarity(s1, s2), 4),
    round(jaro_winkler_normalized_similarity(s1, s2), 4),
    round(lcs_seq_normalized_similarity(s1, s2), 4),
    round(osa_normalized_similarity(s1, s2), 4),
    round(indel_normalized_similarity(s1, s2), 4)
  )
)
The fuzz_* family of functions provides different strategies for comparing strings, especially useful when word order or partial matches matter.
# Exact content, different case/spacing
fuzz_ratio("New York City", "new york city")

# Partial match: one string is contained in the other
fuzz_partial_ratio("York", "New York City")

# Word order doesn't matter
fuzz_token_sort_ratio("City of New York", "New York City")

# Common tokens
fuzz_token_set_ratio("New York City NY", "New York City")

# Weighted ratio (best overall heuristic)
fuzz_WRatio("New York City", "new york city!!")
These combine the benefits of token-based comparison with partial matching:
# Sort tokens first, then look for the best partial alignment.
fuzz_partial_token_sort_ratio("Museum of Modern Art", "Modern Art Museum NYC")

# Compare token sets (shared vs. distinct words), with partial matching.
fuzz_partial_token_set_ratio("Museum of Modern Art", "Modern Art Museum NYC")

# Best of the two partial token strategies above.
fuzz_partial_token_ratio("Museum of Modern Art", "Modern Art Museum NYC")
A common task is finding the best match for a query within a list of options. RapidFuzz provides three extract functions for this.
# Misspelled state names
queries <- c("Kalifornia", "Nwe York", "Texs", "Florda", "Pensylvania")

# Candidate pool: the 50 US state names shipped with base R.
states <- state.name

# For each misspelled name, look up the single best candidate and
# collect one result row per query.
results <- lapply(queries, function(misspelled) {
  best <- extract_best_match(misspelled, states, score_cutoff = 0)
  data.frame(
    query = misspelled,
    best_match = best$choice,
    score = round(best$score, 2)
  )
})
do.call(rbind, results)
# Top 5 states most similar to "New", scored with the PartialRatio scorer.
extract_matches("New", states, score_cutoff = 50, limit = 5,
                scorer = "PartialRatio")
# Every state scoring above 70% similarity to "North".
extract_similar_strings("North", states, score_cutoff = 70)
The extract_matches() function supports 10 different scorers. The best choice depends on your data:
# Run the same query through every available scorer and record the
# top-ranked city (plus its score) for each.
query <- "san francisco"
cities <- c(
  "San Francisco", "San Fernando", "Santa Fe", "San Diego",
  "Francisco", "South San Francisco", "San Fran"
)
scorers <- c(
  "Ratio", "PartialRatio", "TokenSortRatio", "TokenSetRatio",
  "WRatio", "QRatio", "PartialTokenSortRatio", "PartialTokenSetRatio",
  "PartialTokenRatio", "TokenRatio"
)

results <- lapply(scorers, function(scorer_name) {
  m <- extract_matches(query, cities, score_cutoff = 0, limit = 3,
                       scorer = scorer_name)
  data.frame(
    scorer = scorer_name,
    rank1 = m$choice[1],
    score1 = round(m$score[1], 1)
  )
})
do.call(rbind, results)
The processString() function helps normalize strings before comparison:
# Trim + lowercase
processString(" São Paulo ", processor = TRUE, asciify = FALSE)

# Trim + lowercase + ASCII transliteration
processString(" São Paulo ", processor = TRUE, asciify = TRUE)

# ASCII only
processString("Ñoño", processor = FALSE, asciify = TRUE)
This is especially useful for matching names with accented characters:
# Without preprocessing: the accented "ã" is treated as a mismatch.
fuzz_ratio("São Paulo", "sao paulo")

# With preprocessing: both sides are normalized first, so the
# comparison ignores case and accents.
fuzz_ratio(
  processString("São Paulo", processor = TRUE, asciify = TRUE),
  processString("sao paulo", processor = TRUE, asciify = TRUE)
)
Edit operations show exactly what transformations are needed to convert one string into another.
# Levenshtein edit operations: the exact insert/delete/replace steps
# needed to turn "saturday" into "sunday".
ops <- get_editops("saturday", "sunday")
ops
# Replay the recorded operations to reconstruct the target string.
editops_apply_str(ops, "saturday", "sunday")
# Edit operations under the LCSseq metric (insertions/deletions only).
lcs_seq_editops("kitten", "sitting")
Useful for comparing strings that share beginnings or endings:
# Both words share the prefix "intern"
prefix_similarity("international", "internet")
prefix_normalized_similarity("international", "internet")

# Both words share the suffix "ation"
postfix_similarity("education", "formation")
postfix_normalized_similarity("education", "formation")
A real-world scenario: matching messy data against a clean reference list.
# Simulated "dirty" records
dirty <- c(
  "J. Smith", "Jane M. Doe", "Bob Johnson Jr",
  "Alice Wonderland", "Charlie Browne"
)

# Clean reference list
clean <- c(
  "John Smith", "Jane Mary Doe", "Robert Johnson Junior",
  "Alice Wonder", "Charles Brown", "David Lee"
)

# Link each dirty record to its closest clean counterpart, keeping the
# match score as a confidence indicator.
matches <- lapply(dirty, function(record) {
  best <- extract_best_match(record, clean, score_cutoff = 0)
  data.frame(
    dirty_record = record,
    matched_to = best$choice,
    confidence = round(best$score, 1)
  )
})
do.call(rbind, matches)
RapidFuzz is implemented in C++ and is typically much faster than alternative R implementations of string matching, including base R's `adist()`.
# Compare performance: RapidFuzz vs base R adist
#
# Fix the RNG seed so the benchmark inputs -- and therefore the measured
# workload -- are reproducible across vignette builds.
set.seed(42)
s1 <- paste(sample(letters, 100, replace = TRUE), collapse = "")
s2 <- paste(sample(letters, 100, replace = TRUE), collapse = "")

# Time 1000 repeated calls of each implementation on the same pair.
# (seq_len() is the safe counted-loop idiom; named so we don't shadow
# the {bench} package name.)
time_rapidfuzz <- system.time(
  for (i in seq_len(1000)) levenshtein_distance(s1, s2)
)
time_base <- system.time(
  for (i in seq_len(1000)) adist(s1, s2)
)

data.frame(
  method = c("RapidFuzz", "base::adist"),
  time_1000_calls = c(time_rapidfuzz["elapsed"], time_base["elapsed"])
)
| Task | Recommended Functions |
|------|-----------------------|
| Simple distance/similarity | levenshtein_*, hamming_* |
| Transpositions matter | damerau_levenshtein_*, osa_* |
| Fuzzy matching (general) | fuzz_WRatio, fuzz_QRatio |
| Partial string matching | fuzz_partial_ratio, fuzz_partial_token_* |
| Word-order independent | fuzz_token_sort_ratio, fuzz_token_set_ratio |
| Find best match in list | extract_best_match, extract_matches |
| Names with accents | processString() + any metric |
| Common prefix/suffix | prefix_*, postfix_* |
| Edit operations detail | get_editops, lcs_seq_editops, osa_editops |
Any scripts or data that you put into this service are public.
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.