R/compute_auc.R
In ConversationAlign: Process Text and Compute Linguistic Alignment in Conversation Transcripts

#' compute_auc
#'
#' internal function that computes two indices of global alignment (auc) between conversation partners for each dyad
#' @name compute_spearman
#' @importFrom dplyr bind_rows
#' @importFrom dplyr mutate
#' @importFrom dplyr select
#' @importFrom magrittr %>%
#' @returns
#' nothing - internal function used for intermediary computation piped into summarize_dyads function
#' @keywords internal
#' @noRd

compute_auc <- function(df_prep, verbose = TRUE) {
  # selects align_var by grepping on possible prefixes of dimensions
  align_var <- grep("^(emo_|lex_|sem_|phon_)", colnames(df_prep), value = TRUE, ignore.case = TRUE)

  # split the data frame into a list by event id
  df_list <- split(df_prep, f = df_prep$Event_ID)

  # iterate over each event, replacing participant names with a transient filler variable
  df_list_speakvar <- lapply(df_list, function(df){
    svec <- unique(as.character(df$Participant_ID))
    df <- df %>% dplyr::mutate(participant_var = ifelse(as.character(Participant_ID) == svec[1], "S1", "S2"),
                               Participant_Pair = paste(sort(svec), collapse = "---"))
  })

  df_speakvar <- dplyr::bind_rows(df_list_speakvar)

  # group by turn then take the average score for each turn count, then pivot on pids
  df_wide <- df_speakvar %>% dplyr::group_by(Event_ID, Exchange_Count, Participant_ID) %>%
    dplyr::summarise(dplyr::across(tidyselect::contains(align_var), ~ mean(.x, na.rm = TRUE)),
                     participant_var = dplyr::first(participant_var), Participant_Pair = dplyr::first(Participant_Pair),
                     .groups = "drop") %>%
    tidyr::pivot_wider(names_from = c("participant_var"), values_from = any_of(align_var))

  # if there is only one aligned variable manually add that variable name to participant columns
  if (length(align_var) == 1){
    colnames(df_wide)[which(colnames(df_wide) %in% c("S1", "S2"))] = paste(align_var[1], colnames(df_wide)[which(colnames(df_wide) %in% c("S1", "S2"))], sep = "_")
  }

  # should remove uneven final exchanges here - need to grab the last exchange count of each row and if it is uneven, remove it.
  for (eid in unique(as.character(df_wide$Event_ID))){
    final_exc_rows <- df_wide[which(df_wide$Event_ID == eid & df_wide$Exchange_Count == max(df_wide$Exchange_Count[which(df_wide$Event_ID == eid)])),]
    if (nrow(final_exc_rows) == 1){
      df_wide <- df_wide[-which(df_wide$Event_ID == eid & df_wide$Exchange_Count == max(df_wide$Exchange_Count[which(df_wide$Event_ID == eid)])),]
    }
  }

  # here is the big iteration:
  split_pid_df_list <- sapply(c("S1", "S2"), function(temp_pid){
    # grab the columns that are needed for grouping or contain just on participant's scores
    p_df <- df_wide %>%
      dplyr::select(c("Event_ID", "Exchange_Count", "Participant_Pair") | c(ends_with(paste0("_", temp_pid))))

    # take the first column that matches an aligned variable
    first_var_col <- p_df[,which(colnames(p_df) %in% paste(align_var, temp_pid, sep = "_"))[1]]
    # index the rows that are either not NA or are NaN values (numbers and NaN)
    p_df <- p_df[which(is.na(first_var_col) == FALSE | is.nan(unlist(first_var_col)) == TRUE),]
    p_df_list <- split(p_df, f = p_df$Event_ID)

    p_df_altered <- dplyr::bind_rows(p_df_list)
    # interpolate here - fill forward for end, fill back for front, linear interpolation for the middle
    # grouped by event id, then each function is run across all the Time series individually

    p_df_interp <- p_df_altered %>%
      dplyr::group_by(Event_ID) %>% # interpolates all missing values between two points
      dplyr::mutate(dplyr::across(tidyselect::contains(align_var),
                                  ~ zoo::na.approx(.x, na.rm = FALSE))) %>%
      tidyr::fill(tidyselect::contains(align_var), .direction = "downup") %>%
      dplyr::ungroup()

    # wrap the data frame in a list to preserve structure (lapply will pull it into a list of lists)
    df_interp_names_list <- list(p_df_interp)
  })
  # join the two participant data frame together by dyad and Exchange_Count
  widedf <- dplyr::left_join(split_pid_df_list[[1]], split_pid_df_list[[2]], by = c("Event_ID", "Participant_Pair", "Exchange_Count"))

  #iterate over each aligned dimension, selecting only the scores for that dimension and pulling a difference value and subbing it in for the actual values
  for (dimension in align_var){
    both_participant_cols <- widedf %>%
      dplyr::select(starts_with(dimension))
    # add a count to the data frame (x-axis)

    absdiffcol <- data.frame(dimension = abs(both_participant_cols[,1] - both_participant_cols[,2]))
    widedf[which(colnames(widedf) %in% paste(dimension, c("S1", "S2"), sep = "_"))] <- absdiffcol
  }

  long_diff_df <- widedf %>%
    tidyr::pivot_longer(cols = c(ends_with("_S1") | ends_with("_S2")),
                        names_to = c("dimension", "Participant_ID"),
                        names_pattern = "(.*)_([^_]+)$",
                        values_to = "score") %>% # pivot longer by dimension and pid
    # pivot each dimension to a column
    tidyr::pivot_wider(names_from = dimension, values_from = score) %>%
    dplyr::filter(Participant_ID == "S1") %>% # remove every second row per turn
    dplyr::select(-Participant_ID) # remove pid column
  # split the difference data frame into a list based on event id
  long_diff_df_list <- split(long_diff_df, f = long_diff_df$Event_ID)

  # grab the aligned dimensions as a vector to iterate over
  xdimensions <- colnames(long_diff_df)[which(colnames(long_diff_df) %in% align_var)]

  domain_auc_list <- lapply(xdimensions, function(dimension){ #iterate over emotion
    # now iterate over each dyad in the corpus

    # function to calculate AUC for single time series, return NA if there is an error
    calculate_auc <- function(domain_ts, doc_name, dimension) {
      tryCatch({
        # if time series has fewer points than the threshold, fill with NA
        if (max(domain_ts$Exchange_Count) < 3) { # hard coded to three exchanges
          #create a single row, single column dataframe with one empty value to fill in the AUC
          doc_domain_auc_df <- data.frame(domain_auc = as.double(NA),
                                          Exchanges = max(domain_ts$Exchange_Count))
        }
        else {
          domain_ts <- data.frame(domain_ts) #make single emotion Time series a data frame
          domain_auc <- DescTools::AUC(x = domain_ts[,1], y = domain_ts[,2], method = "trapezoid")
          doc_domain_auc_df <- data.frame(domain_auc,
                                          Exchanges = max(domain_ts$Exchange_Count)) #make data frame of AUC, replicated once
        }
        doc_domain_auc_df
      },
      error = function(e) {
        if (verbose) {
          message(paste("Results for dAUC will be filled with NA.\n\tTranscript:",
                        doc_name, "\n\tDimension:", dimension))
        }
        # fill the result cell with NA
        doc_domain_auc_df <- data.frame(domain_auc = as.double(NA),
                                        Exchanges = max(domain_ts$Exchange_Count))
        doc_domain_auc_df
      })
    }

    # iterate over each document
    single_doc_auc <- lapply(long_diff_df_list, function(df_prep){

      domain_ts <- df_prep %>%
        dplyr::select(Exchange_Count,
                      tidyselect::contains(dimension)) # take dimension and time

      # put the function in here
      single_doc_domain_auc <- calculate_auc(domain_ts,
                                             doc_name = as.character(df_prep$Event_ID)[1],
                                             dimension = dimension)
    })
    #bind all docs AUCs for emotion into one column and add column prefix
    all_doc_domain_auc_df <- dplyr::bind_rows(single_doc_auc)
    # drop exchange column for all but first dimension
    if (dimension != xdimensions[1]) {
      all_doc_domain_auc_df <- data.frame(all_doc_domain_auc_df[,1])
    }
    colnames(all_doc_domain_auc_df)[1] <- paste("AUC", dimension, sep = "_")
    all_doc_domain_auc_df
  })

  all_domain_df <- dplyr::bind_cols(domain_auc_list, data.frame(Event_ID = names(long_diff_df_list))) #bind all columns of AUCs into one data frame
  # throw warning if any dyads are fewer than 50 exchanges
  small_dyads <- all_domain_df[which(all_domain_df$Exchanges < 50), "Event_ID"]
  if (length(small_dyads) > 0) {
    warning(paste0("Some conversations are shorter than 50 exchanges (100 turns). ",
                   "It is recommended that conversations are longer than 50 exchanges. ",
                   "Affected conversations:\n",
                   paste(small_dyads, collapse = ", ")))
  }
  # standardize each AUC to 50
  all_domain_df_s <- all_domain_df %>%
    dplyr::mutate(dplyr::across(tidyselect::contains("AUC"), ~ (50 * .x) / Exchanges)) %>%
    dplyr::select(-Exchanges)

  output_auc <- dplyr::left_join(all_domain_df, all_domain_df_s, by = "Event_ID", suffix = c("_raw", "_scaled100")) %>%
    dplyr::select(c(Event_ID, Exchanges, dplyr::everything()))

  return(output_auc)
}

Any scripts or data that you put into this service are public.

ConversationAlign documentation built on Aug. 8, 2025, 7:22 p.m.

rdrr.io home R language documentation Run R code online

CRAN packages Bioconductor packages R-Forge packages GitHub packages

Note that we can't provide technical support on individual packages. You should contact the package authors for that.

ConversationAlign
Process Text and Compute Linguistic Alignment in Conversation Transcripts

R/compute_auc.R
In ConversationAlign: Process Text and Compute Linguistic Alignment in Conversation Transcripts

Defines functions compute_auc

Try the ConversationAlign package in your browser

R Package Documentation

Browse R Packages

We want your feedback!

ConversationAlign Process Text and Compute Linguistic Alignment in Conversation Transcripts

R/compute_auc.R In ConversationAlign: Process Text and Compute Linguistic Alignment in Conversation Transcripts

Defines functions compute_auc

Try the ConversationAlign package in your browser

R Package Documentation

Browse R Packages

We want your feedback!

ConversationAlign
Process Text and Compute Linguistic Alignment in Conversation Transcripts

R/compute_auc.R
In ConversationAlign: Process Text and Compute Linguistic Alignment in Conversation Transcripts