R/curate_swda_data.R

Defines functions process_speaker_info extract_utterances extract_speaker_info curate_swda_file curate_swda_data

Documented in curate_swda_data curate_swda_file extract_speaker_info extract_utterances process_speaker_info

#' Curate SWDA data
#'
#' Process and curate Switchboard Dialog Act (SWDA) data by reading all .utt
#' files from a specified directory and converting them into a structured
#' format.
#'
#' @details
#' The function expects a directory containing .utt files or subdirectories
#' with .utt files, as found in the raw SWDA data
#' (Linguistic Data Consortium. LDC97S62: Switchboard Dialog Act Corpus.)
#'
#' @param dir_path Character string. Path to the directory containing .utt
#'        files. Must be an existing directory.
#'
#' @return A data frame containing the curated SWDA data with columns:
#'   \itemize{
#'     \item doc_id: Document identifier
#'     \item damsl_tag: Dialog act annotation
#'     \item speaker_id: Unique speaker identifier
#'     \item speaker: Speaker designation (A or B)
#'     \item turn_num: Turn number in conversation
#'     \item utterance_num: Utterance number
#'     \item utterance_text: Actual spoken text
#'   }
#'
#' @examples
#' # Example using simulated data bundled with the package
#' example_data <- system.file("extdata", "simul_swda", package = "qtkit")
#' swda_data <- curate_swda_data(example_data)
#'
#' str(swda_data)
#'
#' @export
curate_swda_data <- function(dir_path) {
  # Validate directory path
  if (!dir.exists(dir_path)) {
    stop("Directory does not exist: ", dir_path)
  }
  # Find all .utt files
  utt_files <- list.files(
    path = dir_path,
    pattern = "\\.utt$",
    full.names = TRUE,
    recursive = TRUE
  )
  # Check if any files were found
  if (length(utt_files) == 0) {
    warning("No .utt files found in directory: ", dir_path)
    return(data.frame())
  }
  # Process all files and combine results using base R
  data_df <- do.call(
    rbind,
    lapply(utt_files, function(f) {
      result <- curate_swda_file(f)
      # Ensure row names don't clash when combining
      rownames(result) <- NULL
      result
    })
  )
  return(data_df)
}

#' Process a single SWDA utterance file
#'
#' @param file_path Character string. Path to the .utt file
#' @return A data frame containing processed data from the file
#' @keywords internal
curate_swda_file <- function(file_path) {
  # Read file contents
  doc_lines <- readLines(file_path)
  # Extract speaker information
  speaker_info <- extract_speaker_info(doc_lines)
  # Extract and process utterances
  utterances_df <- extract_utterances(doc_lines, speaker_info)
  return(utterances_df)
}

#' Extract speaker information from document lines
#'
#' @param doc_lines Character vector of file lines
#' @return Named list of speaker information
#' @keywords internal
extract_speaker_info <- function(doc_lines) {
  speaker_pattern <- "\\d+_\\d+_\\d+"
  speaker_line <- grep(speaker_pattern, doc_lines, value = TRUE)
  if (length(speaker_line) == 0) {
    stop("Could not find speaker information in file")
  }
  speaker_info_parts <- strsplit(
    sub(".*\\t", "", speaker_line),
    "_"
  )[[1]]
  return(list(
    doc_id = speaker_info_parts[1],
    speaker_a_id = speaker_info_parts[2],
    speaker_b_id = speaker_info_parts[3]
  ))
}

#' Extract and process utterances from document lines
#'
#' @param doc_lines Character vector of file lines
#' @param speaker_info List of speaker information
#' @return Data frame of processed utterances
#' @keywords internal
extract_utterances <- function(doc_lines, speaker_info) {
  # Find text section
  text_start <- grep("={3,}", doc_lines)[1] + 1
  text_end <- length(doc_lines)
  # Extract non-empty lines
  text_lines <- grep(
    ".+",
    trimws(doc_lines[text_start:text_end]),
    value = TRUE
  )
  # Create initial data frame
  df <- data.frame(
    doc_id = speaker_info$doc_id,
    text = text_lines,
    stringsAsFactors = FALSE
  )
  # Extract components using regular expressions
  df <- transform(df,
    damsl_tag = trimws(sub("^(.+?)\\s.*$", "\\1", text)),
    speaker_turn = sub("^.*([AB]\\.\\d+).*$", "\\1", text),
    utterance_num = sub("^.*utt(\\d+).*$", "\\1", text),
    utterance_text = trimws(sub("^.*:(.+)$", "\\1", text))
  )
  # Process speaker information
  speaker_info <- process_speaker_info(
    df$speaker_turn,
    speaker_info$speaker_a_id,
    speaker_info$speaker_b_id
  )
  # Combine all information
  result_df <- data.frame(
    doc_id = df$doc_id,
    damsl_tag = df$damsl_tag,
    speaker_id = speaker_info$speaker_id,
    speaker = speaker_info$speaker,
    turn_num = speaker_info$turn_num,
    utterance_num = df$utterance_num,
    utterance_text = df$utterance_text,
    stringsAsFactors = FALSE
  )
  return(result_df)
}

#' Process speaker turn information
#'
#' @param speaker_turn Vector of speaker turns
#' @param speaker_a_id ID for speaker A
#' @param speaker_b_id ID for speaker B
#' @return List with processed speaker information
#' @keywords internal
process_speaker_info <- function(speaker_turn, speaker_a_id, speaker_b_id) {
  # Split speaker turn information
  turn_parts <- strsplit(speaker_turn, "\\.")
  # Extract components
  speaker <- sapply(turn_parts, "[[", 1)
  turn_num <- sapply(turn_parts, "[[", 2)
  # Assign speaker IDs
  speaker_id <- ifelse(speaker == "A", speaker_a_id, speaker_b_id)
  return(list(
    speaker = speaker,
    turn_num = turn_num,
    speaker_id = speaker_id
  ))
}

Try the qtkit package in your browser

Any scripts or data that you put into this service are public.

qtkit documentation built on April 4, 2025, 4:47 a.m.