R/prepare_data.R
In Nestimate: Network Estimation, Bootstrap, and Higher-Order Analysis

Documented in prepare print.nestimate_data

#' Prepare Event Log Data for Network Estimation
#'
#' @description
#' Converts event log data (actor, action, time) into wide sequence format
#' suitable for \code{\link{build_network}}. Automatically parses timestamps,
#' detects sessions from time gaps, and handles tie-breaking.
#'
#' @param data Data frame with event log columns.
#' @param actor Character or character vector. Column name(s) identifying who
#'   performed the action (e.g. \code{"student"} or
#'   \code{c("student", "group")}). If missing, all data is treated as one
#'   actor.
#' @param action Character. Column name containing the action/state/code.
#' @param time Character or NULL. Column name containing timestamps.
#'   Supports ISO8601, Unix timestamps (numeric), and 40+ date/time formats.
#'   If NULL, row order defines the sequence. Default: NULL.
#' @param order Character or NULL. Column name for tie-breaking when
#'   timestamps are identical. If NULL, original row order is used.
#'   Default: NULL.
#' @param session Character, character vector, or NULL. Column name(s) for
#'   explicit session grouping (e.g. \code{"course"} or
#'   \code{c("course", "semester")}). When combined with \code{time},
#'   sessions are further split by time gaps. Default: NULL.
#' @param time_threshold Numeric. Maximum gap in seconds between consecutive
#'   events before a new session starts. Only used when \code{time} is
#'   provided. Default: 900 (15 minutes).
#' @param custom_format Character or NULL. Custom \code{strptime} format
#'   string for parsing timestamps. Default: NULL (auto-detect).
#' @param is_unix_time Logical. If TRUE, treat numeric time values as Unix
#'   timestamps. Default: FALSE (auto-detected for numeric columns).
#' @param unix_time_unit Character. Unit for Unix timestamps:
#'   \code{"seconds"}, \code{"milliseconds"}, or \code{"microseconds"}.
#'   Default: \code{"seconds"}.
#'
#' @return A list with class \code{"nestimate_data"} containing:
#' \describe{
#'   \item{sequence_data}{Data frame in wide format (one row per session,
#'     columns T1, T2, ...).}
#'   \item{long_data}{The processed long-format data with session IDs.}
#'   \item{meta_data}{Session-level metadata (session ID, actor).}
#'   \item{time_data}{Parsed time values in wide format (if time provided).}
#'   \item{statistics}{List with total_sessions, total_actions,
#'     max_sequence_length, unique_actors, etc.}
#' }
#'
#' @examples
#' df <- data.frame(
#'   student = rep(1:3, each = 5),
#'   code = sample(c("read", "write", "test"), 15, replace = TRUE),
#'   timestamp = seq.POSIXt(as.POSIXct("2024-01-01"), by = "min", length.out = 15)
#' )
#' prepared <- prepare(df, actor = "student", action = "code",
#'                     time = "timestamp")
#' net <- build_network(prepared$sequence_data, method = "relative")
#'
#' @seealso \code{\link{build_network}}, \code{\link{prepare_onehot}}
#'
#' @export
prepare <- function(data,
                         actor,
                         action,
                         time = NULL,
                         order = NULL,
                         session = NULL,
                         time_threshold = 900,
                         custom_format = NULL,
                         is_unix_time = FALSE,
                         unix_time_unit = c("seconds", "milliseconds",
                                            "microseconds")) {
  stopifnot(is.data.frame(data))
  stopifnot(is.character(action), length(action) == 1, action %in% names(data))
  stopifnot(is.numeric(time_threshold), length(time_threshold) == 1,
            time_threshold > 0)
  unix_time_unit <- match.arg(unix_time_unit)

  df <- as.data.frame(data)
  n <- nrow(df)

  # ---- Actor ----
  default_actor <- FALSE
  if (missing(actor) || is.null(actor)) {
    df$.actor <- "all"
    actor_col <- ".actor"
    default_actor <- TRUE
  } else {
    stopifnot(is.character(actor), all(actor %in% names(df)))
    if (length(actor) > 1L) {
      df$.actor <- interaction(df[, actor, drop = FALSE], sep = "-",
                               drop = TRUE)
      actor_col <- ".actor"
    } else {
      actor_col <- actor
    }
  }

  # ---- Session (explicit grouping) ----
  if (!is.null(session)) {
    stopifnot(is.character(session), all(session %in% names(df)))
    if (length(session) > 1L) {
      df$.session_explicit <- interaction(
        df[, session, drop = FALSE], sep = "-", drop = TRUE
      )
    } else {
      df$.session_explicit <- df[[session]]
    }
  }

  # ---- Order (tiebreaker) ----
  if (is.null(order)) {
    df$.order <- seq_len(n)
    order_col <- ".order"
  } else {
    stopifnot(is.character(order), length(order) == 1, order %in% names(df))
    order_col <- order
  }

  # ---- Build base grouping key (actor + session) ----
  if (!is.null(session)) {
    df$.base_group <- interaction(
      df[[actor_col]], df$.session_explicit, sep = " | ", drop = TRUE
    )
  } else {
    df$.base_group <- df[[actor_col]]
  }

  # ---- Time parsing + inferred session detection ----
  if (!is.null(time)) {
    stopifnot(is.character(time), length(time) == 1, time %in% names(df))

    # Auto-detect numeric as unix
    if (is.numeric(df[[time]])) {
      is_unix_time <- TRUE
    }

    parsed <- .parse_time(df[[time]], custom_format = custom_format,
                          is_unix_time = is_unix_time,
                          unix_time_unit = unix_time_unit)
    df$.parsed_time <- parsed

    # Sort by base_group + time + order
    df <- df[base::order(df$.base_group, df$.parsed_time, df[[order_col]]), ]

    # Infer sub-sessions from time gaps within each base group
    df$.inferred_nr <- ave(
      as.numeric(df$.parsed_time),
      df$.base_group,
      FUN = function(t) {
        gaps <- c(NA_real_, diff(t))
        new_session <- is.na(gaps) | gaps > time_threshold
        cumsum(new_session)
      }
    )

    df$.session_id <- paste0(df$.base_group, " s", df$.inferred_nr)

  } else {
    # No time: sort by base_group + order
    df <- df[base::order(df$.base_group, df[[order_col]]), ]

    # Each base group = one session
    df$.session_id <- as.character(df$.base_group)
  }

  # ---- Sequence numbering within sessions ----
  df$.sequence <- ave(seq_len(nrow(df)), df$.session_id, FUN = seq_along)

  # ---- Pivot to wide ----
  sessions <- unique(df$.session_id)
  max_len <- max(df$.sequence)

  # Build wide sequence data via matrix indexing (fast)
  seq_mat <- matrix(NA_character_, nrow = length(sessions), ncol = max_len)
  session_idx <- match(df$.session_id, sessions)
  seq_mat[cbind(session_idx, df$.sequence)] <- as.character(df[[action]])

  sequence_data <- as.data.frame(seq_mat, stringsAsFactors = FALSE)
  names(sequence_data) <- paste0("T", seq_len(max_len))

  # Build wide time data (if time was provided)
  if (!is.null(time)) {
    time_mat <- matrix(NA_real_, nrow = length(sessions), ncol = max_len)
    time_mat[cbind(session_idx, df$.sequence)] <- as.numeric(df$.parsed_time)
    time_data <- as.data.frame(time_mat)
    names(time_data) <- paste0("time_T", seq_len(max_len))
    for (j in seq_len(ncol(time_data))) {
      time_data[[j]] <- as.POSIXct(time_data[[j]], origin = "1970-01-01")
    }
  } else {
    time_data <- NULL
  }

  # Meta data
  meta_data <- data.frame(.session_id = sessions, stringsAsFactors = FALSE)
  if (!default_actor) {
    actor_map <- df[!duplicated(df$.session_id),
                    c(".session_id", actor_col), drop = FALSE]
    meta_data <- merge(meta_data, actor_map, by = ".session_id", sort = FALSE)
  }

  # Aggregate extra columns per session
  special_cols <- c(action, actor_col, time, order_col, session,
                    grep("^\\.", names(df), value = TRUE))
  extra_cols <- setdiff(names(df), special_cols)
  if (length(extra_cols) > 0L) {
    agg <- .aggregate_metadata(df, session_col = ".session_id",
                               extra_cols = extra_cols)
    meta_data <- merge(meta_data, agg, by = ".session_id", sort = FALSE)
  }

  # Statistics
  stats <- list(
    total_sessions = length(sessions),
    total_actions = nrow(df),
    max_sequence_length = max_len
  )
  if (!default_actor) {
    stats$unique_actors <- length(unique(df[[actor_col]]))
  }

  rownames(sequence_data) <- NULL

  structure(
    list(
      sequence_data = sequence_data,
      long_data = df,
      meta_data = meta_data,
      time_data = time_data,
      statistics = stats
    ),
    class = "nestimate_data"
  )
}


#' Print Method for nestimate_data
#' @param x A \code{nestimate_data} object.
#' @param ... Additional arguments (ignored).
#' @return The input object, invisibly.
#' @examples
#' events <- data.frame(
#'   actor  = c("u1","u1","u1","u2","u2","u2"),
#'   action = c("A","B","C","B","A","C"),
#'   time   = c(1,2,3,1,2,3)
#' )
#' nd <- prepare(events, action = "action",
#'               actor = "actor", time = "time")
#' print(nd)
#' @export
print.nestimate_data <- function(x, ...) {
  cat("Prepared Data for Network Estimation\n")
  s <- x$statistics
  cat(sprintf("  Sessions: %d  |  Actions: %d  |  Max length: %d\n",
              s$total_sessions, s$total_actions, s$max_sequence_length))
  if (!is.null(s$unique_actors)) {
    cat(sprintf("  Actors: %d\n", s$unique_actors))
  }
  if (!is.null(x$time_data)) {
    cat("  Time data: available\n")
  }
  invisible(x)
}


# ---- Time parsing ----

#' Parse time values from various formats
#'
#' Tries 40+ date/time formats, Unix timestamps, and custom formats.
#' Matches tna::parse_time logic.
#'
#' @param time Character or numeric vector of time values.
#' @param custom_format Character or NULL. Custom strptime format.
#' @param is_unix_time Logical. Treat numeric as Unix timestamp.
#' @param unix_time_unit Character. "seconds", "milliseconds", "microseconds".
#' @return POSIXct vector.
#' @noRd
.parse_time <- function(time, custom_format = NULL, is_unix_time = FALSE,
                        unix_time_unit = "seconds") {
  # Already POSIXct
  if (inherits(time, c("POSIXct", "POSIXlt"))) return(time)

  # Numeric Unix timestamps
  if (is.numeric(time) && is_unix_time) {
    divisor <- switch(unix_time_unit,
      seconds = 1,
      milliseconds = 1000,
      microseconds = 1e6
    )
    return(as.POSIXct(time / divisor, origin = "1970-01-01"))
  }

  # Character parsing
  time <- trimws(as.character(time))
  # Strip fractional seconds and timezone suffixes for format matching
  time_clean <- gsub("(\\.\\d{1,3})?[A-Za-z ]*$", "", time)

  # Custom format first
  if (!is.null(custom_format)) {
    parsed <- as.POSIXct(strptime(time_clean, format = custom_format))
    if (!all(is.na(parsed))) return(parsed)
  }

  # Try standard formats
  formats <- c(
    "%Y-%m-%d %H:%M:%S", "%Y-%m-%d %H:%M",
    "%Y/%m/%d %H:%M:%S", "%Y/%m/%d %H:%M",
    "%Y.%m.%d %H:%M:%S", "%Y.%m.%d %H:%M",
    "%Y-%m-%dT%H:%M:%S", "%Y-%m-%dT%H:%M", "%Y-%m-%dT%H:%M:%OS",
    "%Y-%m-%d %H:%M:%S%z", "%Y-%m-%d %H:%M%z",
    "%Y-%m-%d %H:%M:%S %z", "%Y-%m-%d %H:%M %z",
    "%Y%m%d%H%M%S", "%Y%m%d%H%M",
    "%d-%m-%Y %H:%M:%S", "%d-%m-%Y %H:%M",
    "%d/%m/%Y %H:%M:%S", "%d/%m/%Y %H:%M",
    "%d.%m.%Y %H:%M:%S", "%d.%m.%Y %H:%M",
    "%d-%m-%YT%H:%M:%S", "%d-%m-%YT%H:%M",
    "%m-%d-%Y %H:%M:%S", "%m-%d-%Y %H:%M",
    "%m/%d/%Y %H:%M:%S", "%m/%d/%Y %H:%M",
    "%m.%d.%Y %H:%M:%S", "%m.%d.%Y %H:%M",
    "%m-%d-%YT%H:%M:%S", "%m-%d-%YT%H:%M",
    "%d %b %Y %H:%M:%S", "%d %b %Y %H:%M",
    "%d %B %Y %H:%M:%S", "%d %B %Y %H:%M",
    "%b %d %Y %H:%M:%S", "%b %d %Y %H:%M",
    "%B %d %Y %H:%M:%S", "%B %d %Y %H:%M",
    "%Y-%m-%d", "%Y/%m/%d", "%Y.%m.%d",
    "%d-%m-%Y", "%d/%m/%Y", "%d.%m.%Y",
    "%m-%d-%Y", "%m/%d/%Y", "%m.%d.%Y",
    "%d %b %Y", "%d %B %Y", "%b %d %Y", "%B %d %Y"
  )

  for (fmt in formats) {
    parsed <- as.POSIXct(strptime(time_clean, format = fmt))
    if (!all(is.na(parsed))) return(parsed)
  }

  # Last resort: try as numeric Unix timestamp
  numeric_time <- suppressWarnings(as.numeric(time))
  if (!any(is.na(numeric_time))) {
    divisor <- switch(unix_time_unit, # nocov start
      seconds = 1, milliseconds = 1000, microseconds = 1e6
    )
    return(as.POSIXct(numeric_time / divisor, origin = "1970-01-01")) # nocov end
  }

  stop("Could not parse time values. Sample: ",
       paste(utils::head(time, 3), collapse = ", "),
       ". Use 'custom_format' argument.", call. = FALSE)
}


#' Aggregate extra columns per session
#'
#' Numeric columns: mean. Character/factor columns: mode (most frequent value);
#' if tied, uses the first occurring value and emits a message.
#' @noRd
.aggregate_metadata <- function(df, session_col, extra_cols) {
  sessions <- unique(df[[session_col]])
  tie_counts <- integer(0)

  agg_list <- lapply(extra_cols, function(col) {
    vals <- df[[col]]
    if (is.numeric(vals)) {
      agg <- tapply(vals, df[[session_col]], mean, na.rm = TRUE)[sessions]
      agg[is.nan(agg)] <- NA_real_
      agg
    } else {
      n_ties <- 0L
      result <- tapply(vals, df[[session_col]], function(v) {
        v <- v[!is.na(v)]
        if (length(v) == 0L) return(NA_character_)
        tab <- table(v)
        max_count <- max(tab)
        modes <- names(tab)[tab == max_count]
        if (length(modes) > 1L) {
          n_ties <<- n_ties + 1L
          v[v %in% modes][1L]
        } else {
          modes
        }
      })[sessions]
      if (n_ties > 0L) tie_counts[[col]] <<- n_ties
      result
    }
  })
  names(agg_list) <- extra_cols

  if (length(tie_counts) > 0L) {
    parts <- vapply(names(tie_counts), function(col) {
      sprintf("'%s' (%d sessions)", col, tie_counts[[col]])
    }, character(1))
    message("Metadata aggregated per session: ties resolved by first ",
            "occurrence in ", paste(parts, collapse = ", "))
  }

  result <- data.frame(
    .session_id = sessions,
    stringsAsFactors = FALSE
  )
  for (col in extra_cols) {
    result[[col]] <- unname(agg_list[[col]])
  }
  result
}