R/prep_dyads.R
In ConversationAlign: Process Text and Compute Linguistic Alignment in Conversation Transcripts

Documented in prep_dyads

#' prep_dyads
#'
#' Cleans, vectorizes and appends lexical norms to all content words in a language corpus.
#' User guides options for stopword removal and lemmatization. User selects up to three psycholinguistic dimensions to yoke norms
#' on each content word in the original conversation transcript.
#' @name prep_dyads
#' @param dat_read dataframe produced from read_dyads() function
#' @param omit_stops option to remove stopwords, default TRUE
#' @param lemmatize logical, should words be lemmatized (switched to base morphological form), default is TRUE
#' @param which_stoplist user-specified stopword removal method with options including "none", "SMART", "MIT_stops", "CA_OriginalStops", or "Temple_Stopwords25".
#' "Temple_Stopwords25 is the default list
#' @param verbose display detailed output such as error messages and progress (default is TRUE)
#' @returns
#' dataframe with text cleaned and vectorized to a one word per-row format.
#' Lexical norms and metadata are appended to each content word. Cleaned text appears under a new column
#' called 'Text_Clean'. Any selected dimensions (e.g., word length) and metadata are also appended to each word along
#' with speaker identity, turn, and Event_ID (conversation identifier).
#' @importFrom dplyr select
#' @importFrom dplyr group_by
#' @importFrom dplyr mutate
#' @importFrom dplyr na_if
#' @importFrom dplyr ungroup
#' @importFrom magrittr %>%
#' @importFrom stats lag
#' @importFrom stringi stri_replace_all_fixed
#' @importFrom stringi stri_replace_all_regex
#' @importFrom stringr str_trim
#' @importFrom stringr str_squish
#' @importFrom stringr str_replace_all
#' @importFrom textstem lemmatize_strings
#' @importFrom tidyr separate_rows
#' @importFrom tidyselect all_of
#' @importFrom tidyselect any_of
#' @importFrom tidyselect contains
#' @importFrom utils select.list
#' @export

prep_dyads <- function(dat_read, lemmatize = TRUE, omit_stops = TRUE,
                       which_stoplist = "Temple_stops25", verbose = TRUE) {
  # Verification steps
  if (nrow(dat_read) == 0) {
    stop("Input dataframe is empty.")
  }
  if (!"Text_Raw" %in% names(dat_read)) {
    stop("Column 'Text_Raw' not found.")
  }
  if (!"Participant_ID" %in% names(dat_read)) {
    stop("Column 'Participant_ID' not found.")
  }

  # Define stopword lists
  stopwords_lists <- list(MIT_stops = MIT_stops,
                          Temple_stops25 = Temple_stops25,
                          SMART_stops = SMART_stops,
                          CA_orig_stops = CA_orig_stops)

  # Validate stoplist selection
  if (omit_stops && !which_stoplist %in% names(stopwords_lists)) {
    stop("Invalid stoplist specified. Choose from: ", paste(names(stopwords_lists), collapse = ", "))
  }

  # Only prompt for stoplist if omit_stops is TRUE and which_stoplist is NULL
  if (omit_stops && is.null(which_stoplist)) {
    if (verbose) {
      message("Available stopword lists:")
      message("1. Temple_stops25\n2. MIT_stops\n3. SMART_stops\n4. CA_orig_stops")
    }
    choice <- readline(prompt = "Enter number of stoplist to use (1-4): ")
    which_stoplist <- c("Temple_stops25", "MIT_stops", "SMART_stops", "CA_orig_stops")[as.integer(choice)]
  }

  # create Turn_Count variable
  dat_prep <- dat_read %>% group_by(Event_ID) %>%
    mutate(switch_mark = Participant_ID != dplyr::lag(Participant_ID, default = first(Participant_ID)),
           Turn_Count = cumsum(switch_mark) + 1) %>% select(-switch_mark) %>% ungroup()

  # Text Processing Pipeline
  dat_prep <- dat_prep %>% mutate(Participant_ID = as.factor(Participant_ID),
                                  Event_ID = as.factor(Event_ID), Text_Prep = tolower(Text_Raw)) %>% select(-Text_Raw)

  # Standardize apostrophes
  dat_prep <- dat_prep %>%
    mutate(Text_Prep = stringi::stri_replace_all_regex(Text_Prep, "[\u2018\u2019\u02BC\u201B\uFF07\u0092\u0091\u0060\u00B4\u2032\u2035]", "'"))

  # Remove non-alphabetic characters except apostrophes
  dat_prep <- dat_prep %>% mutate(Text_Prep = stringi::stri_replace_all_regex(Text_Prep, "[^a-zA-Z']", " "))

  # Clean whitespace
  dat_prep <- dat_prep %>% mutate(Text_Prep = str_squish(gsub("\\s+", " ", Text_Prep)))

  # Split into words
  dat_prep <- dat_prep %>% tidyr::separate_rows(Text_Prep, sep = "[[:space:]]+")

  # Clean text
  dat_prep <- dat_prep %>% mutate(Text_Prep = stringi::stri_replace_all_regex(Text_Prep, "[^a-z']", ""))

  # ASCII conversion
  dat_prep <- dat_prep %>% mutate(Text_Prep = iconv(Text_Prep, to = "ASCII//TRANSLIT", sub = ""),
                                  Text_Prep = stringi::stri_replace_all_regex(Text_Prep, "[^[:alnum:]']", ""))

  # Apply contractions replacement do NOT quote the column name
  dat_prep <- replacements_25(dat = dat_prep, wordcol = Text_Prep)

  # Split again after contractions
  dat_prep <- dat_prep %>% tidyr::separate_rows(Text_Prep, sep = "[[:space:]]+")

  # Final processing and lemmatization
  df_prep <- dat_prep %>% mutate(Text_Clean = ifelse(stringi::stri_isempty(Text_Prep), NA, Text_Prep),
                                 Text_Clean = if(lemmatize) textstem::lemmatize_strings(Text_Clean) else Text_Clean)

  # Stopword removal
  if (omit_stops) {
    stopwords <- stopwords_lists[[which_stoplist]]$word
    df_prep <- df_prep %>% mutate(Text_Clean = ifelse(Text_Clean %in% stopwords, NA, Text_Clean))
  }

  # Variable selection
  possible_vars <- setdiff(colnames(lookup_Jul25), "word")

  repeat {
    if (verbose) {
      message("Available variables:")
      for (i in seq_along(possible_vars)) {
        message(sprintf("%d. %s", i, possible_vars[i]))
      }
      message("\nSelect up to 3 variables you want to analyze alignment on.")
      message("Enter the numbers separated by spaces (e.g., 1 3 5) then hit enter: ")
    }

    input <- readline()
    selected_indices <- suppressWarnings(as.numeric(unlist(strsplit(trimws(input), "\\s+"))))

    if (length(selected_indices) == 0 || any(is.na(selected_indices))) {
      warning("Invalid input. Please enter numbers only, separated by spaces.")
      next
    }
    if (any(selected_indices < 1 | selected_indices > length(possible_vars))) {
      warning("Invalid selection. Please enter numbers between 1 and ", length(possible_vars))
      next
    }
    if (length(selected_indices) > 3) {
      warning("You selected more than 3 variables. Please select 3 or fewer.")
      next
    }

    myvars <- possible_vars[selected_indices]
    break
  }

  # [Rest of the function remains unchanged]
  var_selected <- lookup_Jul25 %>% dplyr::select(word, tidyselect::all_of(myvars))

  # Join with psycholinguistic measures
  df_prep <- df_prep %>% left_join(var_selected, by = c("Text_Clean" = "word")) %>%
    mutate(Exchange_Count = ceiling(Turn_Count / 2))

  # Reorder columns
  df_prep <- df_prep %>% select(Event_ID, Participant_ID, Exchange_Count, Turn_Count,
                                Text_Prep, Text_Clean, all_of(myvars), everything())

  return(df_prep)
}

Any scripts or data that you put into this service are public.

ConversationAlign documentation built on Aug. 8, 2025, 7:22 p.m.

rdrr.io home R language documentation Run R code online

CRAN packages Bioconductor packages R-Forge packages GitHub packages

Note that we can't provide technical support on individual packages. You should contact the package authors for that.

ConversationAlign
Process Text and Compute Linguistic Alignment in Conversation Transcripts

R/prep_dyads.R
In ConversationAlign: Process Text and Compute Linguistic Alignment in Conversation Transcripts

Defines functions prep_dyads

Documented in prep_dyads

Try the ConversationAlign package in your browser

R Package Documentation

Browse R Packages

We want your feedback!

ConversationAlign Process Text and Compute Linguistic Alignment in Conversation Transcripts

R/prep_dyads.R In ConversationAlign: Process Text and Compute Linguistic Alignment in Conversation Transcripts

Defines functions prep_dyads

Documented in prep_dyads

Try the ConversationAlign package in your browser

R Package Documentation

Browse R Packages

We want your feedback!

ConversationAlign
Process Text and Compute Linguistic Alignment in Conversation Transcripts

R/prep_dyads.R
In ConversationAlign: Process Text and Compute Linguistic Alignment in Conversation Transcripts