# Global chunk options for this vignette: collapse output with source,
# prefix printed results with "#>", and skip evaluation on build (the
# examples call live translation services).
knitr::opts_chunk$set(
  collapse = TRUE,
  comment = "#>",
  eval = FALSE
)
# Attach the packages used throughout this vignette:
# polyglotr for detection/translation, and the tidyverse pieces
# (dplyr, tibble, purrr) for data-frame workflows.
library(polyglotr)
library(dplyr)
library(tibble)
library(purrr)
This vignette demonstrates how to detect input languages and translate only non-English text, integrating seamlessly with tidyverse workflows. This approach is particularly useful for cleaning multilingual datasets and processing mixed-language content efficiently.
Language detection and conditional translation allow you to identify each text's source language, translate only the entries that are not already in English, and leave English content untouched — saving API calls and avoiding needless round-trip translations.
Let's start with simple language detection:
# Sample texts in different languages
sample_texts <- c(
  "Hello, how are you today?",      # English
  "Bonjour, comment allez-vous?",   # French
  "Hola, ¿cómo estás hoy?",         # Spanish
  "Guten Tag, wie geht es Ihnen?",  # German
  "Ciao, come stai oggi?"           # Italian
)

# Detect the language of each sample.
# vapply() is preferred over sapply() here: it guarantees a character
# vector of the same length as the input, even for empty input, whereas
# sapply()'s return type depends on its input.
detected_languages <- vapply(sample_texts, language_detect, character(1))
print("Detected languages:")
print(detected_languages)
Create a function that only translates non-English text:
#' Translate a single text to English only when it is not already English.
#'
#' Detects the language of `text` with polyglotr::language_detect(), and
#' calls google_translate() only for non-English input.
#'
#' @param text A single character string to inspect.
#' @param target_language Target language code passed to google_translate()
#'   (default "en").
#' @return A list with four fields: `original` (the input text),
#'   `translated` (English text, or the input unchanged), `was_translated`
#'   (logical flag), and `detected_language` (the detected language code).
translate_if_not_english <- function(text, target_language = "en") {
  # Detect the language of the input text
  detected_lang <- language_detect(text)

  # Anchor the pattern so only codes *beginning* with "en" (e.g. "en",
  # "en-US", "en-GB") count as English. An unanchored "en" substring
  # match could produce false positives on other language codes.
  is_english <- grepl("^en", detected_lang, ignore.case = TRUE)

  if (is_english) {
    # Already English: return the input unchanged
    list(
      original = text,
      translated = text,
      was_translated = FALSE,
      detected_language = detected_lang
    )
  } else {
    # Not English: translate, letting the service auto-detect the source
    translated_text <- google_translate(
      text,
      target_language = target_language,
      source_language = "auto"
    )
    list(
      original = text,
      translated = translated_text,
      was_translated = TRUE,
      detected_language = detected_lang
    )
  }
}

# Test the function
test_text_fr <- "Bonjour, j'aimerais acheter un billet."
result <- translate_if_not_english(test_text_fr)
print("Conditional translation result:")
print(paste("Original:", result$original))
print(paste("Translated:", result$translated))
print(paste("Was translated:", result$was_translated))
print(paste("Detected language:", result$detected_language))
Here's a practical example with a tibble containing mixed-language rows:
# Build a demo dataset that mimics user-generated content, where rows
# arrive in several languages (English, Spanish, French, German, Italian).
mixed_data <- tibble(
  id = 1:8,
  user_feedback = c(
    "Great product, very satisfied!",                 # English
    "Excelente producto, muy satisfecho!",            # Spanish
    "Produit fantastique, je le recommande!",         # French
    "This service exceeded my expectations.",         # English
    "Der Service war wirklich hervorragend.",         # German
    "Servizio eccellente, davvero impressionante!",   # Italian
    "The delivery was fast and reliable.",            # English
    "La livraison était rapide et fiable."            # French
  ),
  rating = c(5, 5, 4, 5, 4, 5, 4, 4),
  category = rep(c("product", "service"), 4)
)

print("Original mixed-language dataset:")
print(mixed_data)
Now let's detect languages and conditionally translate:
# Wrap translate_if_not_english() so each text entry becomes a one-row
# tibble, ready to be row-bound across the whole column.
process_feedback <- function(text) {
  res <- translate_if_not_english(text)
  tibble(
    original_text = res$original,
    english_text = res$translated,
    was_translated = res$was_translated,
    detected_language = res$detected_language
  )
}

# Run the wrapper over every feedback entry and stack the rows
processed_results <- purrr::map_dfr(mixed_data$user_feedback, process_feedback)

# Attach the translation columns back onto the original data
enhanced_data <- bind_cols(mixed_data, processed_results)

print("Enhanced dataset with language detection and translation:")
print(enhanced_data)
For more sophisticated data processing workflows:
library(stringr)

#' Detect languages and conditionally translate a text column of a data frame.
#'
#' Adds four columns to `df`: `detected_lang` (detected language code, or
#' "unknown" on detection failure), `needs_translation` (logical),
#' `english_text` (translated or pass-through text), and
#' `translation_status` (one of "detection_failed", "already_english",
#' "translated", "translation_failed").
#'
#' @param df A data frame.
#' @param text_column Name of the text column, as a string.
#' @return `df` with the four columns above appended.
enhanced_language_processing <- function(df, text_column) {
  df %>%
    mutate(
      # Detect language per row; fall back to "unknown" if the
      # detection service errors out.
      detected_lang = map_chr(
        !!rlang::sym(text_column),
        ~ tryCatch(language_detect(.x), error = function(e) "unknown")
      ),
      # Anchor the pattern: only codes beginning with "en" (e.g. "en",
      # "en-US") are treated as English. An unanchored "en" could match
      # other codes that merely contain the substring.
      needs_translation = !str_detect(detected_lang, "^en"),
      # Translate only the rows flagged above; on translation failure,
      # keep the original text rather than erroring out.
      english_text = map2_chr(
        !!rlang::sym(text_column), needs_translation,
        ~ if (.y) {
          tryCatch(
            google_translate(.x, target_language = "en"),
            error = function(e) .x
          )
        } else {
          .x
        }
      ),
      # Summarize how each row was handled. A non-English row whose text
      # is unchanged after "translation" is marked as a failure.
      translation_status = case_when(
        detected_lang == "unknown" ~ "detection_failed",
        !needs_translation ~ "already_english",
        english_text != !!rlang::sym(text_column) ~ "translated",
        TRUE ~ "translation_failed"
      )
    )
}

# Apply enhanced processing
result_data <- enhanced_language_processing(mixed_data, "user_feedback")
print("Advanced processing results:")
print(result_data %>% select(id, detected_lang, needs_translation, translation_status))
Process large datasets efficiently by filtering and batching:
# Create larger sample dataset
large_dataset <- tibble(
  id = 1:20,
  content = c(
    # Mix of English and non-English content
    "Amazing service quality",             # EN
    "Fantástico servicio al cliente",      # ES
    "Service client exceptionnel",         # FR
    "Great user experience",               # EN
    "Esperienza utente eccellente",        # IT
    "Ausgezeichnete Benutzerführung",      # DE
    "Fast shipping and delivery",          # EN
    "Livraison rapide et efficace",        # FR
    "Excellent product quality",           # EN
    "Qualità del prodotto superiore",      # IT
    "Easy to use interface",               # EN
    "Interfaz muy fácil de usar",          # ES
    "Highly recommend this product",       # EN
    "Je recommande vivement ce produit",   # FR
    "Outstanding customer support",        # EN
    "Soporte al cliente sobresaliente",    # ES
    "Very satisfied with purchase",        # EN
    "Sehr zufrieden mit dem Kauf",         # DE
    "Will definitely buy again",           # EN
    "Sicuramente acquisterò di nuovo"      # IT
  ),
  timestamp = Sys.time() + sample(-1000:1000, 20),
  priority = sample(c("high", "medium", "low"), 20, replace = TRUE)
)

#' Efficiently process a mixed-language column: detect first, then
#' translate only the non-English subset, grouped into batches.
#'
#' @param df A data frame.
#' @param text_col Name of the text column, as a string.
#' @param batch_size Number of non-English rows per translation batch
#'   (default 5). Only used to assign `batch_id` labels.
#' @return `df` with added columns `detected_lang`, `is_english`,
#'   `batch_id` (NA for English rows), and `english_text`; original row
#'   order is preserved.
batch_process_languages <- function(df, text_col, batch_size = 5) {
  # First, detect languages for all entries. On detection failure we
  # default to "en" (best-effort: a failed row is skipped rather than
  # sent for translation).
  df_with_detection <- df %>%
    mutate(
      row_id = row_number(),
      detected_lang = map_chr(
        !!rlang::sym(text_col),
        ~ tryCatch(language_detect(.x), error = function(e) "en")
      ),
      # Anchored match: only codes starting with "en" count as English.
      is_english = str_detect(detected_lang, "^en")
    )

  # Split the work: English rows need no translation at all.
  english_content <- df_with_detection %>% filter(is_english)
  non_english_content <- df_with_detection %>% filter(!is_english)

  # Translate the non-English rows, labelling them with a batch id.
  if (nrow(non_english_content) > 0) {
    non_english_content <- non_english_content %>%
      mutate(
        batch_id = ceiling(row_number() / batch_size),
        # On translation failure, keep the original text.
        english_text = map_chr(
          !!rlang::sym(text_col),
          ~ tryCatch(
            google_translate(.x, target_language = "en"),
            error = function(e) .x
          )
        )
      )
  } else {
    # Keep the schema consistent even when there is nothing to translate
    # (zero-length columns are valid on a zero-row tibble).
    non_english_content <- non_english_content %>%
      mutate(batch_id = integer(0), english_text = character(0))
  }

  # English rows pass through unchanged, with no batch assignment.
  english_content <- english_content %>%
    mutate(
      batch_id = NA_integer_,
      english_text = !!rlang::sym(text_col)
    )

  # Recombine and restore the original row order.
  bind_rows(english_content, non_english_content) %>%
    arrange(row_id) %>%
    select(-row_id)
}

# Apply batch processing
processed_large <- batch_process_languages(large_dataset, "content", batch_size = 3)

# Summary statistics
summary_stats <- processed_large %>%
  summarise(
    total_entries = n(),
    english_entries = sum(is_english),
    translated_entries = sum(!is_english),
    translation_rate = mean(!is_english),
    unique_languages = n_distinct(detected_lang)
  )

print("Processing summary:")
print(summary_stats)
print("Sample of processed data:")
print(processed_large %>% select(id, detected_lang, is_english, content, english_text) %>% head(10))
Language detection and conditional translation provide powerful tools for cleaning and standardizing multilingual datasets. By integrating with tidyverse workflows, you can efficiently process mixed-language content, enabling consistent analysis and insights across diverse linguistic data sources.
Any scripts or data that you put into this service are public.
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.