R/summarize_dyads.R
In ConversationAlign: Process Text and Compute Linguistic Alignment in Conversation Transcripts

Documented in summarize_dyads

#' summarize_dyads
#'
#' Calculates and appends 3 measures for quantifying alignment. Appends the averaged value for each selected dimension by turn and speaker. Calculates and Spearman's rank correlation between interlocutor time series and appends by transcript. Calculates the area under the curve of the absolute difference time series between interlocutor time series. The length of the difference time series can be standardized the shortest number of exchanges present in the group using an internally defined resampling function, called with resample = TRUE. Spearman's rank correlation and area under the curve become less reliable for dyads under 30 exchanges.
#'
#' @name summarize_dyads
#' @param df_prep produced in the align_dyads function
#' @param custom_lags integer vector, should any lags be added in addition to -2, 0, 2
#' @param corr_type option for computing lagged correlations turn-by-turn covariance (default='Pearson')
#' @param sumdat_only default=TRUE, group and summarize data, two rows per conversation, one row for each participant, false will fill down summary statistics across all exchanges
#' @returns either:
#' - a grouped dataframe with summary data aggregated by converation (Event_ID) and participant if sumdat_only=T.
#' - the origoinal dataframe 'filled down' with summary data (e.g., AUC, turn-by-turn correlations) for each conversation is sumdat_only=F.
#' @importFrom DescTools AUC
#' @importFrom dplyr across
#' @importFrom dplyr bind_rows
#' @importFrom dplyr bind_cols
#' @importFrom dplyr distinct
#' @importFrom dplyr group_by
#' @importFrom dplyr filter
#' @importFrom dplyr first
#' @importFrom dplyr summarise
#' @importFrom dplyr left_join
#' @importFrom dplyr mutate
#' @importFrom dplyr rename_with
#' @importFrom dplyr rename_at
#' @importFrom dplyr select
#' @importFrom dplyr ungroup
#' @importFrom magrittr %>%
#' @importFrom stringr str_replace
#' @importFrom stringr str_c
#' @importFrom tidyr fill
#' @importFrom tidyr pivot_wider
#' @importFrom tidyr pivot_longer
#' @importFrom tidyr drop_na
#' @importFrom tidyselect starts_with
#' @importFrom tidyselect ends_with
#' @importFrom tidyselect contains
#' @importFrom tidyselect everything
#' @importFrom utils tail
#' @importFrom zoo na.approx
#' @export summarize_dyads

summarize_dyads <- function(df_prep, custom_lags = NULL, sumdat_only = TRUE, corr_type = 'Pearson') {
  # Validate correlation type at the start
  if (!corr_type %in% c("Pearson", "Spearman")) {
    stop("corr_type must be either 'Pearson' or 'Spearman'")
  }

  # Extract and Summarize Meta-Data - select Event_ID, Participant_ID, and non-numeric columns
  # excluding dimension columns and specific technical columns
  df_meta <- df_prep %>% dplyr::select(Event_ID, Participant_ID,
                                       -c(Exchange_Count, Turn_Count, Text_Prep, Text_Clean),
                                       -matches("^(emo_|lex_|phon_|sem_|df_)")) %>%
    # Select only factor and character columns (excluding numeric)
    dplyr::select(where(~ is.factor(.x) | is.character(.x)))

  # Check if there are any columns left besides Event_ID and Participant_ID
  meta_cols <- setdiff(names(df_meta), c("Event_ID", "Participant_ID"))

  # Only create summary if there are metadata columns to summarize
  if (length(meta_cols) > 0) {
    df_meta_sum <- df_meta %>%
      dplyr::group_by(Event_ID, Participant_ID) %>%
      dplyr::summarise(dplyr::across(.cols = everything(),
                                     .fns = ~ dplyr::first(.x),
                                     .names = "{.col}_summary"),
                       .groups = "drop")
  } else {
    df_meta_sum <- df_meta %>%
      dplyr::select(Event_ID, Participant_ID) %>%
      dplyr::distinct()
  }

  # Generate summary data by conversation and participant
  av_df <- df_prep %>%
    dplyr::select(Event_ID, Participant_ID, Exchange_Count, Turn_Count, Text_Clean, matches("^(sem_|lex_|emo_|phon_)")) %>%
    dplyr::group_by(Event_ID, Participant_ID) %>%
    dplyr::summarize(
      across(
        matches("^(sem_|lex_|emo_|phon_)"),
        ~ mean(.x, na.rm = TRUE),
        .names = "{.col}_mean"
      ),
      across(c(Exchange_Count, Turn_Count, Text_Clean), dplyr::first),
      .groups = "drop"
    ) %>%
    dplyr::select(Event_ID, Participant_ID, contains("mean")) %>%
    tidyr::pivot_longer(
      cols = matches("^(sem_|lex_|emo_|phon_).*_mean"),
      names_to = "Dimension",
      names_pattern = "(.*)_mean",
      values_to = "Dimension_Mean"
    )

  # Clean up factors
  df_prep <- droplevels(df_prep)

  # Add participant pairs
  df_prep <- df_prep %>%
    dplyr::group_by(Event_ID) %>%
    dplyr::mutate(Participant_Pair = paste(sort(unique(Participant_ID)), collapse = "---")) %>%
    dplyr::ungroup()

  # Compute AUC and lag correlations
  auc_df <- compute_auc(df_prep = df_prep) %>% dplyr::select(c("Event_ID", contains("AUC")))

  user_lags <- c(-2, 0, 2)
  if (!is.null(custom_lags)) {
    user_lags <- sort(unique(append(user_lags, custom_lags)))
  }

  # Pass corr_type to compute_lagcorr - this is the key modification
  covar_df <- compute_lagcorr(df_prep = df_prep, lags = user_lags, corr_type = corr_type)

  # Reshape AUC data
  auc_df_long <- auc_df %>%
    tidyr::pivot_longer(
      contains("AUC"),
      names_to = c("Dimension", "reshaped"),
      names_pattern = "AUC_(.*)_(raw|scaled100)",
      values_to = "AUC"
    ) %>%
    tidyr::pivot_wider(
      names_from = reshaped,
      names_prefix = "AUC_",
      values_from = AUC
    )

  # Combine all data frames - with conditional join
  df_summarize <- av_df %>%
    dplyr::left_join(auc_df_long, by = c("Event_ID", "Dimension")) %>%
    dplyr::left_join(covar_df, by = c("Event_ID", "Dimension"))

  # Only join metadata if there were columns to summarize
  if (length(meta_cols) > 0) {
    df_summarize <- df_summarize %>%
      dplyr::left_join(df_meta_sum, by = c("Event_ID", "Participant_ID"))
  }

  # Remove row names
  row.names(df_summarize) <- NULL

  if (sumdat_only) {
    return(df_summarize)
  } else {
    # Join summary data back to original dataframe
    df_full <- df_prep %>%
      dplyr::left_join(df_summarize, by = c("Event_ID", "Participant_ID")) %>%
      # Fill down summary data within each Event_ID
      dplyr::group_by(Event_ID) %>%
      tidyr::fill(names(df_summarize)[!names(df_summarize) %in% c("Event_ID", "Participant_ID")], .direction = "downup") %>%
      dplyr::ungroup()

    return(df_full)
  }
}

Any scripts or data that you put into this service are public.

ConversationAlign documentation built on Aug. 8, 2025, 7:22 p.m.

rdrr.io home R language documentation Run R code online

CRAN packages Bioconductor packages R-Forge packages GitHub packages

Note that we can't provide technical support on individual packages. You should contact the package authors for that.

ConversationAlign
Process Text and Compute Linguistic Alignment in Conversation Transcripts

R/summarize_dyads.R
In ConversationAlign: Process Text and Compute Linguistic Alignment in Conversation Transcripts

Defines functions summarize_dyads

Documented in summarize_dyads

Try the ConversationAlign package in your browser

R Package Documentation

Browse R Packages

We want your feedback!

ConversationAlign Process Text and Compute Linguistic Alignment in Conversation Transcripts

R/summarize_dyads.R In ConversationAlign: Process Text and Compute Linguistic Alignment in Conversation Transcripts

Defines functions summarize_dyads

Documented in summarize_dyads

Try the ConversationAlign package in your browser

R Package Documentation

Browse R Packages

We want your feedback!

ConversationAlign
Process Text and Compute Linguistic Alignment in Conversation Transcripts

R/summarize_dyads.R
In ConversationAlign: Process Text and Compute Linguistic Alignment in Conversation Transcripts