R/pedanalysis.R

Defines functions pedgenint

Documented in pedgenint

#' Calculate Generation Intervals
#'
#' Computes the generation intervals for the four gametic pathways:
#' Sire to Son (SS), Sire to Daughter (SD), Dam to Son (DS), and Dam to Daughter (DD).
#' The generation interval is defined as the age of the parents at the birth of their offspring.
#'
#' @param ped A \code{tidyped} object.
#' @param timevar Character. The name of the column containing the \strong{birth date}
#'   (or hatch date) of each individual. The column must be one of:
#'   \itemize{
#'     \item \code{Date} or \code{POSIXct} (recommended).
#'     \item A date string parseable by \code{as.POSIXct} (e.g., \code{"2020-03-15"}).
#'       Use \code{format} for non-ISO strings.
#'     \item A numeric year (e.g., \code{2020}). Automatically converted to
#'       \code{Date} (\code{"YYYY-07-01"}) with a message. For finer precision,
#'       convert to \code{Date} beforehand.
#'   }
#'   If \code{NULL}, auto-detects columns named \code{"BirthYear"}, \code{"Year"},
#'   \code{"BirthDate"}, or \code{"Date"}.
#' @param unit Character. Output time unit for the interval:
#'   \code{"year"} (default), \code{"month"}, \code{"day"}, or \code{"hour"}.
#' @param format Character. Optional format string for parsing \code{timevar}
#'   when it contains non-standard date strings (e.g., \code{"\%d/\%m/\%Y"}
#'   for \code{"15/03/2020"}).
#' @param cycle Numeric. Optional target (designed) length of one generation
#'   cycle expressed in \code{unit}s. When provided, an additional column
#'   \code{GenEquiv} is appended to the result, defined as:
#'   \deqn{GenEquiv_i = \frac{\bar{L}_i}{L_{cycle}}}
#'   where \eqn{\bar{L}_i} is the observed mean interval for pathway \eqn{i} and
#'   \eqn{L_{cycle}} is \code{cycle}. A value > 1 means the observed
#'   interval exceeds the target cycle (lower breeding efficiency).
#'   Example: for Pacific white shrimp with a 180-day target cycle, set
#'   \code{unit = "day", cycle = 180}.
#' @param by Character. Optional grouping column (e.g., \code{"Breed"}, \code{"Farm"}).
#'   If provided, intervals are calculated within each group.
#'
#' @return A \code{data.table} with columns:
#' \itemize{
#'   \item \code{Group}: Grouping level (if \code{by} is used).
#'   \item \code{Pathway}: One of \code{"SS"}, \code{"SD"}, \code{"DS"}, \code{"DD"},
#'     \code{"SO"}, \code{"DO"}, or \code{"Average"}.
#'     SS/SD/DS/DD require offspring sex; SO (Sire-Offspring) and DO (Dam-Offspring)
#'     are computed from all parent-offspring pairs regardless of offspring sex.
#'   \item \code{N}: Number of parent-offspring pairs used.
#'   \item \code{Mean}: Average generation interval in \code{unit}.
#'   \item \code{SD}: Standard deviation of the interval.
#'   \item \code{GenEquiv}: (Optional) Generation equivalents based on \code{cycle}.
#' }
#'
#' @details
#' Parent-offspring pairs with zero or negative intervals are excluded from
#' the calculation because they typically indicate data entry errors or
#' insufficient time resolution. If many zero intervals are expected (e.g.,
#' when using \code{unit = "year"} with annual spawners), consider using a
#' finer time unit such as \code{"month"} or \code{"day"}.
#'
#' Numeric year columns (e.g., \code{2020}) are automatically converted to
#' \code{Date} by appending \code{"-07-01"} (mid-year) as a reasonable default.
#' For more precise results, convert to \code{Date} before calling this function.
#'
#' @examples
#' \donttest{
#' # ---- Basic usage with package dataset (numeric Year auto-converted) ----
#' tped <- tidyped(big_family_size_ped)
#' gi <- pedgenint(tped, timevar = "Year")
#' gi
#'
#' # ---- Generation equivalents with cycle ----
#' gi2 <- pedgenint(tped, timevar = "Year", cycle = 2)
#' gi2
#' }
#'
#' @export
pedgenint <- function(ped, timevar = NULL, unit = c("year", "month", "day", "hour"),
                      format = NULL, cycle = NULL, by = NULL) {
  # Input validation
  ped <- ensure_complete_tidyped(ped, "pedgenint()")
  unit <- match.arg(unit)
  
  # Auto-detect timevar
  if (is.null(timevar)) {
    candidates <- c("BirthYear", "Year", "birth_year", "year", "BirthDate", "Date")
    match_col <- intersect(names(ped), candidates)
    if (length(match_col) > 0) {
      timevar <- match_col[1]
      message(sprintf("Using '%s' as time variable.", timevar))
    } else {
      stop("Please specify 'timevar' (e.g., 'BirthYear' or 'BirthDate').")
    }
  }
  
  if (!timevar %in% names(ped)) {
    stop(sprintf("Column '%s' not found in pedigree.", timevar))
  }
  
  # Check grouping column
  if (!is.null(by) && !by %in% names(ped)) {
    stop(sprintf("Grouping column '%s' not found.", by))
  }

  # Parse time to numeric axis
  numeric_time <- .parse_to_numeric_time(ped[[timevar]], unit = unit, format = format)
  
  # Prepare working data — include numeric parent IDs for fast integer lookup
  cols <- c("Ind", "Sire", "Dam", "Sex", "IndNum", "SireNum", "DamNum")
  if (!is.null(by)) cols <- c(cols, by)
  
  dt <- as.data.table(ped)[, ..cols]
  dt[, Time := numeric_time]
  
  # Build integer-indexed time vector instead of a named character vector.
  # time_vec[IndNum + 1L] = time for individual IndNum.
  # time_vec[1L] = NA  (sentinel for SireNum/DamNum = 0, i.e. unknown parent).
  # This replaces time_map <- setNames(dt$Time, dt$Ind) which requires a
  # 1M-entry character hash table and four O(N) hash lookups per call.
  n_ped <- nrow(ped)
  time_vec <- vector("double", n_ped + 1L)
  time_vec[1L] <- NA_real_
  time_vec[ped$IndNum + 1L] <- numeric_time   # IndNum 1-based → offset by 1
  
  calc_pathway <- function(parent_col, offspring_sex_label) {
    parent_num_col <- paste0(parent_col, "Num")  # "SireNum" or "DamNum"
    valid_offspring <- dt[!is.na(Sex) & Sex == offspring_sex_label]
    valid_pairs <- valid_offspring[get(parent_num_col) > 0L]
    
    if (nrow(valid_pairs) == 0) return(NULL)
    
    p_nums <- valid_pairs[[parent_num_col]]
    p_times <- time_vec[p_nums + 1L]  # integer array lookup — O(1) per element
    o_times <- valid_pairs$Time
    
    intervals <- o_times - p_times
    # Filter valid intervals (ignore zero/negative which might be data errors)
    valid_idx <- !is.na(intervals) & intervals > 0
    
    if (sum(valid_idx) == 0) return(NULL)
    
    path_code <- paste0(
      ifelse(parent_col == "Sire", "S", "D"),
      ifelse(offspring_sex_label == "male", "S", "D")
    )
    
    res <- data.table(
      Pathway = path_code,
      Interval = intervals[valid_idx]
    )
    
    if (!is.null(by)) {
      res[, Group := valid_pairs[[by]][valid_idx]]
    }
    
    return(res)
  }
  
  res_list <- list(
    calc_pathway("Sire", "male"),   # SS
    calc_pathway("Sire", "female"), # SD
    calc_pathway("Dam", "male"),    # DS
    calc_pathway("Dam", "female")   # DD
  )
  
  all_intervals <- rbindlist(res_list)
  
  # Compute Average from ALL parent-offspring pairs regardless of offspring sex.
  # This avoids severely underestimating N when most offspring lack Sex info.
  calc_all_pathway <- function(parent_col) {
    parent_num_col <- paste0(parent_col, "Num")
    valid_pairs <- dt[get(parent_num_col) > 0L]
    if (nrow(valid_pairs) == 0) return(NULL)
    p_nums <- valid_pairs[[parent_num_col]]
    p_times <- time_vec[p_nums + 1L]
    o_times <- valid_pairs$Time
    intervals <- o_times - p_times
    valid_idx <- !is.na(intervals) & intervals > 0
    if (sum(valid_idx) == 0) return(NULL)
    res <- data.table(
      Pathway = ifelse(parent_col == "Sire", "SO", "DO"),
      Interval = intervals[valid_idx]
    )
    if (!is.null(by)) {
      res[, Group := valid_pairs[[by]][valid_idx]]
    }
    return(res)
  }
  
  all_parent_intervals <- rbindlist(list(
    calc_all_pathway("Sire"),
    calc_all_pathway("Dam")
  ))
  
  if (nrow(all_intervals) == 0 && nrow(all_parent_intervals) == 0) {
    warning("No valid parent-offspring pairs found with birth dates.")
    res <- data.table(Pathway = character(0), N = integer(0), Mean = numeric(0), SD = numeric(0))
    attr(res, "unit") <- unit
    return(res)
  }
  
  agg_cols <- "Pathway"
  if (!is.null(by)) agg_cols <- c("Group", "Pathway")
  
  summary_dt <- if (nrow(all_intervals) > 0) {
    all_intervals[, .(
      N = .N,
      Mean = mean(Interval),
      SD = sd(Interval)
    ), by = agg_cols]
  } else {
    data.table(Pathway = character(0), N = integer(0), Mean = numeric(0), SD = numeric(0))
  }
  
  # SO/DO: Sire-Offspring and Dam-Offspring (sex-independent)
  so_do_agg_cols <- if (!is.null(by)) c("Group", "Pathway") else "Pathway"
  
  so_do_dt <- if (nrow(all_parent_intervals) > 0) {
    all_parent_intervals[, .(
      N = .N,
      Mean = mean(Interval),
      SD = sd(Interval)
    ), by = so_do_agg_cols]
  } else {
    data.table(Pathway = character(0), N = integer(0), Mean = numeric(0), SD = numeric(0))
  }
  
  # Average: all parent-offspring pairs combined
  overall_agg_cols <- if (!is.null(by)) "Group" else NULL
  
  overall_dt <- all_parent_intervals[, .(
    Pathway = "Average",
    N = .N,
    Mean = mean(Interval),
    SD = sd(Interval)
  ), by = overall_agg_cols]
  
  final_res <- rbind(summary_dt, so_do_dt, overall_dt, fill = TRUE)
  
  # Handle Cycle Length for Generation Equivalents
  if (!is.null(cycle) && is.numeric(cycle)) {
    final_res[, GenEquiv := Mean / cycle]
  }
  
  if (!is.null(by)) {
    setorderv(final_res, c("Group", "Pathway"))
  } else {
    setorder(final_res, Pathway)
  }
  
  attr(final_res, "unit") <- unit
  return(final_res[])
}

.parse_to_numeric_time <- function(x, unit = "year", format = NULL) {
  # Seconds per unit
  factors <- c(year = 365.25 * 86400, month = 30.4375 * 86400,
               day = 86400, hour = 3600)
  if (!unit %in% names(factors)) {
    stop("unit must be one of: ", paste(names(factors), collapse = ", "))
  }
  target_factor <- factors[unit]

  # --- Numeric input: assume integer years, auto-convert to Date ---
  if (is.numeric(x)) {
    message("Numeric time column detected. Converting to Date (YYYY-07-01). ",
            "For finer precision, convert to Date beforehand.")
    date_strings <- ifelse(is.na(x), NA_character_, paste0(as.integer(x), "-07-01"))
    x <- as.Date(date_strings)
  }

  # --- Date input ---
  if (inherits(x, "Date")) {
    seconds <- as.numeric(as.POSIXct(x, tz = "UTC"))
    return(seconds / target_factor)
  }

  # --- POSIXct input ---
  if (inherits(x, "POSIXct")) {
    return(as.numeric(x) / target_factor)
  }

  # --- Character input: parse date strings ---
  if (!is.character(x)) {
    stop("timevar must be Date, POSIXct, character, or numeric year. Got ",
         class(x)[1], ".")
  }

  u_x <- unique(x[!is.na(x)])
  if (length(u_x) == 0) return(rep(NA_real_, length(x)))

  parsed <- if (!is.null(format)) {
    as.POSIXct(u_x, format = format, tz = "UTC")
  } else {
    # Try ISO / common formats
    p <- suppressWarnings(as.POSIXct(u_x, tz = "UTC", optional = TRUE))
    if (all(is.na(p))) {
      p <- as.POSIXct(suppressWarnings(as.Date(u_x, optional = TRUE)), tz = "UTC")
    }
    if (all(is.na(p))) {
      p <- as.POSIXct(suppressWarnings(
        as.Date(u_x, format = "%Y/%m/%d", optional = TRUE)
      ), tz = "UTC")
    }
    # Fallback: bare year strings like "2020"
    if (all(is.na(p))) {
      num_try <- suppressWarnings(as.numeric(u_x))
      if (!all(is.na(num_try))) {
        p <- as.POSIXct(as.Date(paste0(as.integer(num_try), "-07-01")), tz = "UTC")
        message("Bare year strings detected. Converting to Date (YYYY-07-01). ",
                "For finer precision, convert to Date beforehand.")
      }
    }
    p
  }

  if (all(is.na(parsed))) {
    stop("Failed to parse time variable. Provide Date, POSIXct, or specify ",
         "'format' (e.g. '%Y-%m-%d').")
  }

  val_map <- setNames(as.numeric(parsed), u_x)
  numeric_seconds <- val_map[as.character(x)]

  return(numeric_seconds / target_factor)
}

#' Pedigree Subpopulations
#'
#' Summarizes pedigree subpopulations and group structure.
#'
#' When \code{by = NULL}, this function is a lightweight summary wrapper around
#' \code{\link{splitped}}, returning one row per disconnected pedigree component
#' plus an optional \code{"Isolated"} row for individuals with no known parents
#' and no offspring. When \code{by} is provided, it instead summarizes the pedigree
#' directly by the specified column (e.g. \code{"Gen"}, \code{"Year"}, \code{"Breed"}).
#'
#' @param ped A \code{tidyped} object.
#' @param by Character. The name of the column to group by. 
#'   If NULL, summarizes disconnected components via \code{\link{splitped}}.
#'
#' @return A \code{data.table} with columns:
#' \itemize{
#'   \item \code{Group}: Subpopulation label.
#'   \item \code{N}: Total individuals.
#'   \item \code{N_Sire}: Number of distinct sires.
#'   \item \code{N_Dam}: Number of distinct dams.
#'   \item \code{N_Founder}: Number of founders (parents unknown).
#' }
#'
#' @details
#' Use \code{pedsubpop()} when you want a compact analytical summary table.
#' Use \code{\link{splitped}} when you need the actual re-tidied sub-pedigree
#' objects for downstream plotting or analysis.
#'
#' @examples
#' tp <- tidyped(simple_ped)
#'
#' # Summarize disconnected pedigree components
#' pedsubpop(tp)
#'
#' # Summarize by an existing grouping variable
#' pedsubpop(tp, by = "Gen")
#' 
#' @export
pedsubpop <- function(ped, by = NULL) {
  ped <- ensure_tidyped(ped)
  
  if (is.null(by)) {
    splits <- splitped(ped)
    groups <- names(splits)
    n_grps <- length(splits)
    iso <- attr(splits, "isolated")
    n_iso <- length(iso)
    
    res_list <- vector("list", n_grps + (n_iso > 0))
    
    for (i in seq_along(splits)) {
      gped <- splits[[i]]
      res_list[[i]] <- list(
        Group = groups[i],
        N = nrow(gped),
        N_Sire = data.table::uniqueN(gped$Sire, na.rm = TRUE),
        N_Dam = data.table::uniqueN(gped$Dam, na.rm = TRUE),
        N_Founder = sum(is.na(gped$Sire) & is.na(gped$Dam))
      )
    }
    
    if (n_iso > 0) {
      res_list[[n_grps + 1]] <- list(
        Group = "Isolated",
        N = n_iso,
        N_Sire = 0,
        N_Dam = 0,
        N_Founder = n_iso
      )
    }
    
  } else {
    if (!by %in% names(ped)) stop(sprintf("Column '%s' not found.", by))
    
    res_list <- list(ped[, .(
      N = .N,
      N_Sire = data.table::uniqueN(Sire, na.rm = TRUE),
      N_Dam = data.table::uniqueN(Dam, na.rm = TRUE),
      N_Founder = sum(is.na(Sire) & is.na(Dam))
    ), by = by])
    
    setnames(res_list[[1]], by, "Group")
  }
  
  data.table::rbindlist(res_list)[]
}

#' Calculate Mean Relationship or Coancestry Within Groups
#'
#' Computes either the average pairwise additive genetic relationship
#' coefficients (\eqn{a_{ij}}) within cohorts, or the corrected population mean
#' coancestry used for pedigree-based diversity summaries.
#'
#' When \code{scale = "relationship"}, the returned value is the mean of the
#' off-diagonal additive relationship coefficients among the selected
#' individuals. When \code{scale = "coancestry"}, the returned value is the
#' diagonal-corrected population mean coancestry:
#' \deqn{\bar{C} = \frac{N - 1}{N} \cdot \frac{\bar{a}_{off}}{2} + \frac{1 + \bar{F}}{2N}}
#' where \eqn{\bar{a}_{off}} is the mean off-diagonal relationship, \eqn{\bar{F}}
#' is the mean inbreeding coefficient of the selected individuals, and \eqn{N}
#' is the number of selected individuals. This \eqn{\bar{C}} matches the
#' internal coancestry quantity used to derive \eqn{f_g} in \code{\link{pediv}}.
#'
#' @param ped A \code{tidyped} object.
#' @param by Character. The column name to group by (e.g., "Year", "Breed", "Generation").
#' @param reference Character vector. An optional vector of reference individual IDs to calculate
#'   relationships for. If provided, only individuals matching these IDs in each group
#'   will be used. Default is NULL (use all individuals in the group).
#' @param compact Logical. Whether to use compact representation for large families to
#'   save memory. Recommended when pedigree size exceeds 25,000. Default is FALSE.
#' @param scale Character. One of \code{"relationship"} or \code{"coancestry"}.
#'   \code{"relationship"} returns the pairwise off-diagonal mean additive
#'   relationship (current \code{pedrel()} behavior). \code{"coancestry"}
#'   returns the corrected population mean coancestry used for pedigree-based
#'   diversity calculations.
#'
#' @return A \code{data.table} with columns:
#' \itemize{
#'   \item A grouping identifier column, named after the \code{by} parameter (e.g., \code{Gen}, \code{Year}).
#'   \item \code{NTotal}: Total number of individuals in the group.
#'   \item \code{NUsed}: Number of individuals used in calculation (could be subset by reference).
#'   \item \code{MeanRel}: Present when \code{scale = "relationship"}; average of off-diagonal
#'     elements in the Additive Relationship (A) matrix for this group
#'     (\eqn{a_{ij} = 2f_{ij}}).
#'   \item \code{MeanCoan}: Present when \code{scale = "coancestry"}; diagonal-corrected
#'     population mean coancestry for this group.
#' }
#' 
#' @examples
#' \donttest{
#' library(data.table)
#' # Use the sample dataset and simulate a birth year
#' tp <- tidyped(small_ped)
#' tp$Year <- 2010 + tp$Gen
#' 
#' # Example 1: Calculate average relationship grouped by Generation (default)
#' rel_by_gen <- pedrel(tp, by = "Gen")
#' print(rel_by_gen)
#' 
#' # Example 2: Calculate average relationship grouped by Year
#' rel_by_year <- pedrel(tp, by = "Year")
#' print(rel_by_year)
#' 
#' # Example 3: Calculate corrected mean coancestry
#' coan_by_gen <- pedrel(tp, by = "Gen", scale = "coancestry")
#' print(coan_by_gen)
#' 
#' # Example 4: Filter calculations with a reference list in a chosen group
#' candidates <- c("N", "O", "P", "Q", "T", "U", "V", "X", "Y")
#' rel_subset <- pedrel(tp, by = "Gen", reference = candidates)
#' print(rel_subset)
#' }
#' 
#' @export
pedrel <- function(ped, by = "Gen", reference = NULL, compact = FALSE,
                   scale = c("relationship", "coancestry")) {
  ped <- ensure_complete_tidyped(ped, "pedrel()")
  scale <- match.arg(scale)
  if (!by %in% names(ped)) stop(sprintf("Column '%s' not found.", by))

  corrected_mean_coancestry <- function(mean_rel, mean_f, n_ref) {
    if (is.na(mean_rel) || is.na(mean_f) || n_ref < 1L) return(NA_real_)
    (n_ref - 1) / n_ref * mean_rel / 2 + (1 + mean_f) / (2 * n_ref)
  }
  
  groups <- unique(ped[[by]])
  groups <- groups[!is.na(groups)]
  
  res_list <- lapply(groups, function(g) {
    row_idx <- which(ped[[by]] == g)
    sub_ped_full <- as.data.table(ped)[row_idx]
    n_total <- nrow(sub_ped_full)
    
    if (n_total < 2) {
      warning(sprintf("Group '%s' has less than 2 individuals, returning NA_real_.", g))
      return(data.table(Group = g, NTotal = n_total, NUsed = n_total, Mean = NA_real_))
    }
    
    if (!is.null(reference)) {
      sub_ped <- sub_ped_full[Ind %in% reference]
    } else {
      sub_ped <- sub_ped_full
    }
    n_used <- nrow(sub_ped)
    
    if (n_used < 2) {
      warning(sprintf("Group '%s' has less than 2 individuals after applying 'reference', returning NA_real_.", g))
      return(data.table(Group = g, NTotal = n_total, NUsed = n_used, Mean = NA_real_))
    }
    
    local_ped <- tryCatch({
      suppressMessages(tidyped(ped, cand = sub_ped$Ind, addnum = TRUE))
    }, error = function(e) {
      if (all(is.na(sub_ped$Sire) & is.na(sub_ped$Dam))) {
        return(NULL)
      }
      stop(e)
    })
    
    if (is.null(local_ped)) {
      mean_rel <- 0.0
      mean_f_s <- 0.0
    } else {
      target_inds <- sub_ped$Ind
      
      max_dense <- 25000L
      if (!compact && nrow(local_ped) > max_dense) {
        warning(sprintf(
          paste0("Group '%s': pedigree too large (%d individuals including ancestors) ",
                 "for dense A matrix (limit: %d). Returning NA.\n",
                 "Hint: use 'reference' parameter to reduce group size, or set compact = TRUE."),
          g, nrow(local_ped), max_dense))
        return(data.table(Group = g, NTotal = n_total, NUsed = n_used, Mean = NA_real_))
      }
      
      if (isTRUE(compact)) {
        if (nrow(local_ped) > 200000L) {
          warning(sprintf(
            paste0("Group '%s': traced pedigree too large (%d individuals) for matrix ",
                   "computation (limit: 200,000). Returning NA.\n",
                   "Hint: use 'reference' to limit the traced ancestry depth."),
            g, nrow(local_ped)))
          return(data.table(Group = g, NTotal = n_total, NUsed = n_used, Mean = NA_real_))
        }
        A <- tryCatch({
          pedmat(local_ped, method = "A", sparse = FALSE, compact = TRUE)
        }, error = function(e) {
          warning(sprintf(
            paste0("Group '%s': %s\n",
                   "Hint: subset with 'reference' or use 'compact = TRUE'."),
            g, e$message))
          return(NULL)
        })
        
        if (is.null(A)) {
          return(data.table(Group = g, NTotal = n_total, NUsed = n_used, Mean = NA_real_))
        }
        
        c_map <- attr(A, "compact_map")
        c_map_target <- c_map[Ind %in% target_inds]
        
        freq_dt <- c_map_target[, .(W = .N), by = RepIndNum]
        W_idx <- freq_dt$RepIndNum
        W <- freq_dt$W
        
        A_rep <- A[W_idx, W_idx, drop = FALSE]
        sum_total_rep <- as.numeric(t(W) %*% (A_rep %*% W))
        sum_between <- sum_total_rep - sum(W^2 * diag(A_rep))
        
        sib_reps <- freq_dt[W > 1]
        sum_within <- 0.0
        
        if (nrow(sib_reps) > 0) {
          rep_parents <- unique(c_map_target[RepIndNum %in% sib_reps$RepIndNum,
                                             .(RepIndNum, rep_sire = SireNum, rep_dam = DamNum)])
          
          for (i in seq_len(nrow(sib_reps))) {
            w_i <- sib_reps$W[i]
            r_idx <- sib_reps$RepIndNum[i]
            sire_idx <- rep_parents[RepIndNum == r_idx, rep_sire]
            dam_idx <- rep_parents[RepIndNum == r_idx, rep_dam]
            A_sib <- 0.25 * (A[sire_idx, sire_idx] + A[dam_idx, dam_idx] + 2 * A[sire_idx, dam_idx])
            sum_within <- sum_within + w_i * (w_i - 1) * A_sib
          }
        }
        
        mean_rel <- (sum_between + sum_within) / (as.numeric(n_used) * (n_used - 1))
        mean_f_s <- sum(W * (diag(A_rep) - 1)) / as.numeric(n_used)
      } else {
        target_idx <- match(target_inds, local_ped$Ind)
        if (anyNA(target_idx)) {
          return(data.table(Group = g, NTotal = n_total, NUsed = n_used, Mean = NA_real_))
        }
        mean_rel <- tryCatch({
          cpp_mean_relationship(local_ped$SireNum, local_ped$DamNum, as.integer(target_idx))
        }, error = function(e) {
          warning(sprintf(
            paste0("Group '%s': %s\n",
                   "Hint: subset with 'reference' or use 'compact = TRUE'."),
            g, e$message))
          return(NA_real_)
        })

        mean_f_s <- if (is.na(mean_rel)) {
          NA_real_
        } else {
          tryCatch({
            f_traced <- cpp_calculate_inbreeding(local_ped$SireNum, local_ped$DamNum)$f
            mean(f_traced[target_idx], na.rm = TRUE)
          }, error = function(e) {
            warning(sprintf(
              paste0("Group '%s': %s\n",
                     "Hint: subset with 'reference' or use 'compact = TRUE'."),
              g, e$message))
            NA_real_
          })
        }
      }
    }

    mean_value <- if (scale == "relationship") {
      mean_rel
    } else {
      corrected_mean_coancestry(mean_rel, mean_f_s, n_used)
    }

    data.table(Group = g, NTotal = n_total, NUsed = n_used, Mean = mean_value)
  })
  
  result <- rbindlist(res_list)
  setnames(result, "Group", by)
  setnames(result, "Mean", if (scale == "relationship") "MeanRel" else "MeanCoan")
  setorderv(result, cols = by)
  return(result[])
}

#' Calculate Equi-Generate Coefficient
#'
#' Estimates the number of distinct ancestral generations using the Equi-Generate Coefficient (ECG).
#' The ECG is calculated as 1/2 of the sum of the parents' ECG values plus 1.
#'
#' @param ped A \code{tidyped} object.
#'
#' @return A \code{data.table} with columns:
#' \itemize{
#'   \item \code{Ind}: Individual ID.
#'   \item \code{ECG}: Equi-Generate Coefficient.
#'   \item \code{FullGen}: Fully traced generations (depth of complete ancestry).
#'   \item \code{MaxGen}: Maximum ancestral generations (depth of deepest ancestor).
#' }
#'
#' @examples
#' tp <- tidyped(simple_ped)
#' ecg <- pedecg(tp)
#'
#' # ECG combines pedigree depth and completeness
#' head(ecg)
#'
#' # Individuals with deeper and more complete ancestry have larger ECG values
#' ecg[order(-ECG)][1:5]
#' 
#' @references
#' Boichard, D., Maignel, L., & Verrier, E. (1997). The value of using probabilities of gene origin to measure genetic variability in a population. Genetics Selection Evolution, 29(1), 5.
#'
#' @export
pedecg <- function(ped) {
  ped <- ensure_complete_tidyped(ped, "pedecg()")
  
  n <- nrow(ped)
  ecg <- numeric(n)
  full_gen <- numeric(n)
  max_gen_val <- numeric(n)
  
  sire_nums <- ped$SireNum
  dam_nums <- ped$DamNum
  
  gen_indices <- split(ped$IndNum, ped$Gen)
  gen_levels <- sort(as.integer(names(gen_indices)))
  
  for (g in gen_levels) {
    idx <- gen_indices[[as.character(g)]]
    k <- length(idx)
    
    sires <- sire_nums[idx]
    dams <- dam_nums[idx]
    
    sire_ecg <- numeric(k) 
    dam_ecg <- numeric(k)
    sire_full <- rep(-1, k)
    dam_full <- rep(-1, k)
    sire_max <- rep(-1, k)
    dam_max <- rep(-1, k)
    
    valid_s <- !is.na(sires) & sires > 0
    if (any(valid_s)) {
      parents <- sires[valid_s]
      sire_ecg[valid_s] <- ecg[parents]
      sire_full[valid_s] <- full_gen[parents]
      sire_max[valid_s] <- max_gen_val[parents]
    }
    
    valid_d <- !is.na(dams) & dams > 0
    if (any(valid_d)) {
      parents <- dams[valid_d]
      dam_ecg[valid_d] <- ecg[parents]
      dam_full[valid_d] <- full_gen[parents]
      dam_max[valid_d] <- max_gen_val[parents]
    }
    
    term_s <- ifelse(valid_s, sire_ecg + 1, 0)
    term_d <- ifelse(valid_d, dam_ecg + 1, 0)
    ecg[idx] <- 0.5 * (term_s + term_d)
    
    full_gen[idx] <- pmin(sire_full, dam_full) + 1
    max_gen_val[idx] <- pmax(sire_max, dam_max) + 1
  }
  
  res <- data.table(
    Ind = ped$Ind,
    ECG = ecg,
    FullGen = full_gen,
    MaxGen = max_gen_val
  )

  return(res[])
}

#' Pedigree Statistics
#'
#' Calculates comprehensive statistics for a pedigree, including population structure,
#' generation intervals, and ancestral depth.
#'
#' @param ped A \code{tidyped} object.
#' @param timevar Optional character. Name of the column containing the
#'   \strong{birth date} (or hatch date) of each individual.
#'   Accepted column formats:
#'   \itemize{
#'     \item \code{Date} or \code{POSIXct} (recommended).
#'     \item A date string parseable by \code{as.POSIXct}
#'       (e.g., \code{"2020-06-15"}).  Use \code{format} via \code{...}
#'       for non-ISO strings.
#'     \item A numeric year (e.g., \code{2020}).  Automatically converted
#'       to \code{Date} (\code{"YYYY-07-01"}) with a message.
#'   }
#'   If \code{NULL}, attempts auto-detection from common column names
#'   (\code{"BirthYear"}, \code{"Year"}, \code{"BirthDate"}, etc.).
#' @param unit Character. Time unit for reporting generation intervals:
#'   \code{"year"} (default), \code{"month"}, \code{"day"}, or \code{"hour"}.
#' @param cycle Numeric. Optional target generation cycle length in
#'   \code{unit}s. When provided, \code{gen_intervals} will include a
#'   \code{GenEquiv} column (observed Mean / cycle). See
#'   \code{\link{pedgenint}} for details.
#' @param ecg Logical. Whether to compute equivalent complete generations
#'   for each individual via \code{\link{pedecg}}. Default \code{TRUE}.
#' @param genint Logical. Whether to compute generation intervals via
#'   \code{\link{pedgenint}}. Requires a detectable \code{timevar} column.
#'   Default \code{TRUE}.
#' @param ... Additional arguments passed to \code{\link{pedgenint}},
#'   e.g., \code{format} for custom date parsing or \code{by} for grouping.
#'
#' @return An object of class \code{pedstats}, which is a list containing:
#' \itemize{
#'   \item \code{summary}: A \code{data.table} with one row summarising the
#'     whole pedigree.  Columns:
#'     \itemize{
#'       \item \code{N} — total number of individuals.
#'       \item \code{NSire} — number of unique sires.
#'       \item \code{NDam} — number of unique dams.
#'       \item \code{NFounder} — number of founder individuals
#'         (both parents unknown).
#'       \item \code{MaxGen} — maximum generation number.
#'     }
#'   \item \code{ecg}: A \code{data.table} with one row per individual
#'     (\code{NULL} if \code{ecg = FALSE}).  Columns:
#'     \itemize{
#'       \item \code{Ind} — individual identifier.
#'       \item \code{ECG} — equivalent complete generations.
#'       \item \code{FullGen} — number of fully known generations.
#'       \item \code{MaxGen} — maximum traceable generation depth.
#'     }
#'   \item \code{gen_intervals}: A \code{data.table} of generation intervals
#'     (\code{NULL} if no \code{timevar} is detected or
#'     \code{genint = FALSE}).  Columns:
#'     \itemize{
#'       \item \code{Pathway} — gametic pathway label.  Seven values:
#'         \code{"SS"} (sire to son), \code{"SD"} (sire to daughter),
#'         \code{"DS"} (dam to son), \code{"DD"} (dam to daughter) —
#'         require offspring sex; \code{"SO"} (sire to offspring) and
#'         \code{"DO"} (dam to offspring) — sex-independent; and
#'         \code{"Average"} — all parent-offspring pairs combined.
#'       \item \code{N} — number of parent-offspring pairs.
#'       \item \code{Mean} — mean generation interval.
#'       \item \code{SD} — standard deviation of the interval.
#'       \item \code{GenEquiv} — \code{Mean / cycle} (only present when
#'         \code{cycle} is supplied).
#'     }
#' }
#'
#' @examples
#' \donttest{
#' # ---- Without time variable ----
#' tp <- tidyped(simple_ped)
#' ps <- pedstats(tp)
#' ps$summary
#' ps$ecg
#'
#' # ---- With annual Year column (big_family_size_ped) ----
#' tp2 <- tidyped(big_family_size_ped)
#' ps2 <- pedstats(tp2, timevar = "Year")
#' ps2$summary
#' ps2$gen_intervals
#' }
#'
#' @export
pedstats <- function(ped, timevar = NULL, unit = "year", cycle = NULL, ecg = TRUE, genint = TRUE, ...) {
  ped <- if (isTRUE(ecg) || isTRUE(genint)) {
    ensure_complete_tidyped(ped, "pedstats()")
  } else {
    ensure_tidyped(ped)
  }
  
  summ <- data.table(
    N = nrow(ped),
    NSire = data.table::uniqueN(ped$Sire, na.rm = TRUE),
    NDam = data.table::uniqueN(ped$Dam, na.rm = TRUE),
    NFounder = sum(is.na(ped$Sire) & is.na(ped$Dam)),
    MaxGen = max(ped$Gen)
  )
  
  ecg_dt <- NULL
  if (ecg) {
    ecg_dt <- pedecg(ped)
  }
  
  gen_int <- NULL
  
  target_timevar <- timevar
  if (is.null(target_timevar)) {
    candidates <- c("BirthYear", "Year", "birth_year", "year", "BirthDate", "Date")
    match_col <- intersect(names(ped), candidates)
    if (length(match_col) > 0) target_timevar <- match_col[1]
  }
  
  if (genint && !is.null(target_timevar) && target_timevar %in% names(ped)) {
    gen_int <- tryCatch({
      pedgenint(ped, timevar = target_timevar, unit = unit, cycle = cycle, ...)
    }, error = function(e) {
      warning("Failed to calculate generation intervals: ", e$message)
      NULL
    })
  }
  
  res <- list(
    summary = summ,
    ecg = ecg_dt,
    gen_intervals = gen_int
  )
  class(res) <- "pedstats"
  res
}

#' Print Pedigree Statistics
#'
#' @param x A \code{pedstats} object.
#' @param ... Additional arguments.
#'
#' @export
print.pedstats <- function(x, ...) {
  cat("Pedigree Statistics\n")
  cat("===================\n")
  
  s <- x$summary
  cat(sprintf("Total Individuals: %d\n", s$N))
  cat(sprintf("Sires: %d | Dams: %d | Founders: %d\n", s$NSire, s$NDam, s$NFounder))
  cat(sprintf("Max Generation: %d\n", s$MaxGen))
  
  if (!is.null(x$gen_intervals)) {
    unit <- attr(x$gen_intervals, "unit")
    unit_label <- switch(unit, 
                        "year" = "Years", "month" = "Months", "day" = "Days", 
                        "hour" = "Hours", "gen" = "Units", "Years")
    cat(sprintf("\nGeneration Intervals (%s):\n", unit_label))
    print(x$gen_intervals)
  }
  
  if (!is.null(x$ecg)) {
    cat("\nAncestral Depth (ECG) Summary:\n")
    print(summary(x$ecg$ECG))
  }
  
  invisible(x)
}

#' Calculate Effective Population Size
#'
#' Calculates the effective population size (Ne) based on the rate of
#' coancestry, the rate of inbreeding, or demographic parent numbers.
#'
#' @param ped A \code{tidyped} object.
#' @param method Character. The method to compute Ne. One of \code{"coancestry"} (default), \code{"inbreeding"}, or \code{"demographic"}.
#' @param by Character. The name of the column used to group cohorts (e.g., "Year", "BirthYear").
#'   If NULL, calculates overall Ne for all individuals.
#' @param reference Character vector. Optional subset of individual IDs defining the reference cohort.
#'   If NULL, uses all individuals in the pedigree.
#' @param nsamples Integer. Number of individuals to randomly sample per cohort when using the \code{"coancestry"} method. Very large cohorts will be sampled down to this size to save memory and time (default: 1000).
#' @param ncores Integer. Number of cores for parallel processing. Currently only effective for \code{method = "coancestry"} (default: 1).
#' @param seed Integer or NULL. Random seed passed to \code{set.seed()} before sampling
#'   in the coancestry method, ensuring reproducible \eqn{N_e} estimates.
#'   Default is \code{NULL} (no fixed seed).
#'
#' @return A \code{data.table} with columns:
#' \itemize{
#'   \item \code{Cohort}: Cohort or grouping variable value.
#'   \item \code{N}: Number of individuals in the cohort.
#'   \item \code{Ne}: Calculated effective population size.
#'   \item \code{...}: Additional columns depending on the selected method (e.g., \code{NSampled}, \code{DeltaC}, \code{MeanF}, \code{DeltaF}, \code{Nm}, \code{Nf}).
#' }
#' 
#' @details
#' The effective population size can be calculated using one of three methods:
#' 
#' \itemize{
#'   \item \strong{"coancestry"} (Default): Based on the rate of coancestry (\eqn{f_{ij}}) between 
#'   pairs of individuals. In this context, \eqn{f_{ij}} is the probability that two alleles 
#'   randomly sampled from individuals $i$ and $j$ are identical by descent, which is equivalent 
#'   to half the additive genetic relationship (\eqn{f_{ij} = a_{ij} / 2}). 
#'   This method is generally more robust as it accounts for full genetic drift and 
#'   bottlenecks (Cervantes et al., 2011).
#'   \deqn{\Delta c_{ij} = 1 - (1 - c_{ij})^{1/(\frac{ECG_i + ECG_j}{2})}}
#'   \deqn{N_e = \frac{1}{2 \overline{\Delta c}}}
#'   To handle large populations, this method samples \code{nsamples} individuals per cohort 
#'   and computes the mean rate of coancestry among them.
#'   
#'   \item \strong{"inbreeding"}: Based on the individual rate of inbreeding ($F_i$) (Gutiérrez et al., 2008, 2009).
#'   \deqn{\Delta F_i = 1 - (1 - F_i)^{1/(ECG_i - 1)}}
#'   \deqn{N_e = \frac{1}{2 \overline{\Delta F}}}
#'   
#'   \item \strong{"demographic"}: Based on the demographic census of breeding males and females (Wright, 1931).
#'   \deqn{N_e = \frac{4 N_m N_f}{N_m + N_f}}
#'   Where \eqn{N_m} and \eqn{N_f} are the number of unique male and female parents contributing to the cohort.
#' }
#' 
#' @references
#' Cervantes, I., Goyache, F., Molina, A., Valera, M., & Gutiérrez, J. P. (2011). Estimation of effective population size from the rate of coancestry in pedigreed populations. \emph{Journal of Animal Breeding and Genetics}, 128(1), 56-63.
#' 
#' Gutiérrez, J. P., Cervantes, I., Molina, A., Valera, M., & Goyache, F. (2008). Individual increase in inbreeding allows estimating effective sizes from pedigrees. \emph{Genetics Selection Evolution}, 40(4), 359-370.
#' 
#' Gutiérrez, J. P., Cervantes, I., & Goyache, F. (2009). Improving the estimation of realized effective population sizes in farm animals. \emph{Journal of Animal Breeding and Genetics}, 126(4), 327-332.
#' 
#' Wright, S. (1931). Evolution in Mendelian populations. \emph{Genetics}, 16(2), 97-159.
#'
#' @examples
#' \donttest{
#' # Coancestry-based Ne (default) using a simple pedigree grouped by year
#' tp_simple <- tidyped(simple_ped)
#' tp_simple$BirthYear <- 2000 + tp_simple$Gen
#' ne_coan <- suppressMessages(pedne(tp_simple, by = "BirthYear", seed = 42L))
#' ne_coan
#'
#' # Inbreeding-based Ne using an inbred pedigree
#' tp_inbred <- tidyped(inbred_ped)
#' ne_inb <- suppressMessages(pedne(tp_inbred, method = "inbreeding", by = "Gen"))
#' ne_inb
#'
#' # Demographic Ne from the number of contributing sires and dams
#' ne_demo <- suppressMessages(pedne(tp_simple, method = "demographic", by = "BirthYear"))
#' ne_demo
#' }
#' 
#' @export
pedne <- function(ped, method = c("coancestry", "inbreeding", "demographic"),
                  by = NULL, reference = NULL, nsamples = 1000, ncores = 1,
                  seed = NULL) {
  
  ped <- if (match.arg(method) %in% c("coancestry", "inbreeding")) {
    ensure_complete_tidyped(ped, sprintf("pedne(method = \"%s\")", match.arg(method)))
  } else {
    ensure_tidyped(ped)
  }
  method <- match.arg(method)
  
  # Protect the original data from being modified by reference
  ped_dt <- data.table::copy(ped)
  
  # Handle 'by' parameter
  if (is.null(by)) {
    message("No 'by' parameter specified. Calculating Ne for the entire dataset. Grouping by a cohort or year variable is often more biologically meaningful.")
    by <- ".CohortAll"
    ped_dt[[by]] <- "All"
  } else {
    if (!by %in% names(ped_dt)) {
      stop(sprintf("Column '%s' not found in pedigree.", by))
    }
  }
  
  # Configure threads
  if (ncores > 1) {
    if (cpp_openmp_available()) {
      cpp_set_num_threads(ncores)
    } else {
      warning("OpenMP not available. Using single core.")
    }
  }

  # Pre-calculate mandatory genetic parameters on FULL pedigree before subsetting
  if (method %in% c("inbreeding", "coancestry")) {
    if (method == "inbreeding" && !"f" %in% names(ped_dt)) {
      ped_dt <- inbreed(ped_dt)
    }
    if (!"ECG" %in% names(ped_dt)) {
      ecg_dt <- pedecg(ped_dt)
      ped_dt <- merge(ped_dt, ecg_dt[, .(Ind, ECG)], by = "Ind", all.x = TRUE)
      # merge() sorts by Ind (alphabetically), breaking the IndNum == row-index
      # invariant that the fast-path BFS in tidyped() depends on.
      setorder(ped_dt, IndNum)
    }
  }
  
  # Filter reference population if specified
  if (!is.null(reference)) {
    if (!all(reference %in% ped_dt$Ind)) {
      missing <- reference[!reference %in% ped_dt$Ind]
      warning(sprintf("Reference IDs not found in pedigree: %s", paste(missing, collapse = ", ")))
    }
    ped_subset <- as.data.table(ped_dt)[Ind %in% reference]
  } else {
    ped_subset <- ped_dt
  }
  
  # Dispatch to specific calculator
  res <- switch(method,
    "inbreeding"  = calc_ne_inbreeding(ped_subset, by),
    "coancestry"  = {
      raw <- calc_ne_coancestry(ped_subset, ped_dt, by, nsamples, seed = seed)
      # Strip fg/MeanCoan/NSampledCoan: exposed only via pediv(), not pedne()
      cols_to_drop <- intersect(names(raw), c("fg", "MeanCoan", "NSampledCoan"))
      if (length(cols_to_drop) > 0) raw[, (cols_to_drop) := NULL]
      raw
    },
    "demographic" = calc_ne_demographic(ped_subset, by)
  )

  return(res[])
}

calc_ne_inbreeding <- function(ped, by) {
  # Inbreeding coefficients and ECG should already be calculated on the full pedigree
  if (!"f" %in% names(ped) || !"ECG" %in% names(ped)) {
    stop("f or ECG is missing in subset tracking.")
  }
  
  ped_work <- ped[ECG >= 1]
  
  if (nrow(ped_work) == 0) {
    warning("No individuals with sufficient pedigree depth (ECG >= 1).")
    return(data.table(Cohort = numeric(0), N = integer(0), Ne = numeric(0)))
  }

  ped_work[, CohortLabel := get(by)]
  ped_work[, DeltaF_Ind := ifelse(ECG > 1, 1 - (1 - f)^(1 / (ECG - 1)), NA_real_)]
  
  # Filter valid DeltaF
  ped_valid <- ped_work[!is.na(DeltaF_Ind) & is.finite(DeltaF_Ind) & DeltaF_Ind >= 0]
  
  if (nrow(ped_valid) == 0) {
    warning("No valid DeltaF values calculated.")
    return(data.table(Cohort = numeric(0), N = integer(0), Ne = numeric(0)))
  }

  result <- ped_valid[, .(
    N = .N,
    MeanF = mean(f, na.rm = TRUE),
    DeltaF = mean(DeltaF_Ind, na.rm = TRUE),
    MeanECG = mean(ECG, na.rm = TRUE)
  ), by = .(Cohort = CohortLabel)]
  
  result[, Ne := ifelse(DeltaF > 0, 1 / (2 * DeltaF), NA_real_)]
  setorder(result, Cohort)
  
  return(result[])
}

calc_ne_demographic <- function(ped, by) {
  # Ensure numbers are present
  if (!all(c("Sire", "Dam", "Sex") %in% names(ped))) {
     stop("Pedigree must contain 'Sire', 'Dam', and 'Sex' columns for demographic Ne.")
  }

  ped <- copy(ped)
  ped[, CohortLabel := get(by)]
  
  # Need to find parents contributing to EACH cohort
  # Parent IDs are in Sire and Dam columns
  # We should count how many UNIQUE Sires and Dams produced offspring in this cohort
  
  result <- ped[, {
    sires <- unique(Sire[!is.na(Sire) & Sire != "" & Sire != "0"])
    dams <- unique(Dam[!is.na(Dam) & Dam != "" & Dam != "0"])
    
    nm <- length(sires)
    nf <- length(dams)
    
    ne_val <- if (nm > 0 && nf > 0) (4 * nm * nf) / (nm + nf) else NA_real_
    
    list(
      N = .N,
      Nm = nm,
      Nf = nf,
      Ne = ne_val
    )
  }, by = .(Cohort = CohortLabel)]
  
  setorder(result, Cohort)
  return(result[])
}

calc_ne_coancestry <- function(ped_subset, ped_full, by, nsamples,
                               seed = NULL, max_trace = 25000L) {
  # ECG is already merged into ped_subset from ped_full by the caller function.
  # Returns extended table with fg/MeanCoan/NSampledCoan columns.
  # NOTE: pedne() strips fg columns before exposing its public output;
  #       pediv() calls this function directly to access fg without duplication.

  ped_subset <- as.data.table(ped_subset)

  ped_subset[, CohortLabel := get(by)]
  cohorts <- sort(unique(ped_subset$CohortLabel))

  results_list <- list()

  for (coh in cohorts) {
    sub_ped <- ped_subset[CohortLabel == coh]
    n_total <- nrow(sub_ped)

    if (n_total == 0) next

    # Reproducible sampling
    if (!is.null(seed)) set.seed(seed)

    # Initial sampling
    if (n_total > nsamples) {
      sampled_inds <- sample(sub_ped$Ind, nsamples)
    } else {
      sampled_inds <- sub_ped$Ind
    }

    # Trace back ancestors using the FULL pedigree
    trace_ped <- tidyped(ped_full, cand = sampled_inds, trace = "up",
                         addgen = TRUE, addnum = TRUE)

    # Adaptive resampling: if traced pedigree exceeds memory limit, reduce sample
    if (nrow(trace_ped) > max_trace) {
      n_reduced <- max(2L, floor(length(sampled_inds) * max_trace / nrow(trace_ped)))
      if (!is.null(seed)) set.seed(seed)
      sampled_inds <- sample(sub_ped$Ind, min(n_reduced, n_total))
      trace_ped <- tidyped(ped_full, cand = sampled_inds, trace = "up",
                           addgen = TRUE, addnum = TRUE)
    }

    # Must be sorted by IndNum for correct C++ indexing (1..N)
    setorder(trace_ped, IndNum)

    # Re-identify target numeric IDs (1-based)
    target_nums <- trace_ped[Ind %in% sampled_inds, IndNum]

    # Ensure ECG is present (propagate from full pedigree if missing after trace)
    if (!"ECG" %in% names(trace_ped)) {
      ecg_map <- ped_full[Ind %in% trace_ped$Ind, .(Ind, ECG)]
      trace_ped <- merge(trace_ped, ecg_map, by = "Ind", all.x = TRUE)
      setorder(trace_ped, IndNum)
    }
    trace_ped[is.na(ECG), ECG := 0]

    # ---- 1. DeltaC / Ne ----
    avg_delta_c <- tryCatch({
      cpp_calculate_sampled_coancestry_delta(
        trace_ped$SireNum,
        trace_ped$DamNum,
        target_nums,
        trace_ped$ECG
      )
    }, error = function(e) {
      warning(paste("Error in coancestry calculation for cohort", coh, ":", e$message))
      NA_real_
    })

    ne_val <- if (!is.na(avg_delta_c) && avg_delta_c > 0) 1 / (2 * avg_delta_c) else NA_real_

    # ---- 2. fg: founder genome equivalents (Caballero & Toro 2000) ----
    # Reuses the same traced pedigree — zero extra tidyped() cost.
    # Corrected mean coancestry:
    #   C_bar = (N-1)/N * rbar_off/2 + (1 + mean_F_s) / (2N)
    # where N = full reference cohort size (NOT sample size).
    fg_val     <- NA_real_
    mean_coan  <- NA_real_

    tryCatch({
      # Off-diagonal mean relationship (cpp returns off-diagonal only)
      rbar_off <- cpp_mean_relationship(
        trace_ped$SireNum,
        trace_ped$DamNum,
        target_nums
      )

      # Inbreeding for diagonal correction (on traced pedigree — fast)
      f_traced  <- cpp_calculate_inbreeding(trace_ped$SireNum, trace_ped$DamNum)$f
      mean_f_s  <- mean(f_traced[target_nums], na.rm = TRUE)

      # Diagonal-corrected mean coancestry weighted by full N
      N_ref     <- n_total
      mean_coan <- (N_ref - 1) / N_ref * rbar_off / 2 +
                   (1 + mean_f_s) / (2 * N_ref)

      fg_val    <- if (!is.na(mean_coan) && mean_coan > 0) 1 / (2 * mean_coan) else NA_real_
    }, error = function(e) {
      warning(paste("Error in fg calculation for cohort", coh, ":", e$message))
    })

    results_list[[as.character(coh)]] <- data.table(
      Cohort       = coh,
      N            = n_total,
      NSampled     = length(sampled_inds),
      DeltaC       = avg_delta_c,
      Ne           = ne_val,
      MeanCoan     = mean_coan,
      fg           = fg_val,
      NSampledCoan = length(sampled_inds)
    )
  }

  result <- rbindlist(results_list)
  return(result[])
}

# Legacy wrapper/stub for documentation or compatibility if needed
# (none required as we replaced pedne in place) -> THIS LINE INTENTIONALLY LEFT BLANK
fill_phantoms <- function(ped) {
  missing_sire <- is.na(ped$Sire) & !is.na(ped$Dam)
  missing_dam <- !is.na(ped$Sire) & is.na(ped$Dam)
  
  if (!any(missing_sire) && !any(missing_dam)) {
    return(ped)
  }
  
  ped_new <- data.table::copy(ped)
  new_rows <- list()
  
  if (any(missing_sire)) {
    phantoms_sire <- paste0("*Phantom_Sire_", ped_new$Ind[missing_sire])
    ped_new$Sire[missing_sire] <- phantoms_sire
    new_rows[[1]] <- data.table::data.table(
      Ind = phantoms_sire,
      Sire = NA_character_,
      Dam = NA_character_
    )
  }
  
  if (any(missing_dam)) {
    phantoms_dam <- paste0("*Phantom_Dam_", ped_new$Ind[missing_dam])
    ped_new$Dam[missing_dam] <- phantoms_dam
    new_rows[[2]] <- data.table::data.table(
      Ind = phantoms_dam,
      Sire = NA_character_,
      Dam = NA_character_
    )
  }
  
  new_dt <- data.table::rbindlist(new_rows, fill = TRUE)
  extra_cols <- setdiff(names(ped_new), names(new_dt))
  if (length(extra_cols) > 0) {
    for (col in extra_cols) {
      data.table::set(new_dt, j = col, value = NA)
    }
  }
  
  ped_combined <- rbind(ped_new, new_dt, fill = TRUE)
  
  old_cols <- intersect(names(ped_combined), c("IndNum", "SireNum", "DamNum"))
  if (length(old_cols) > 0) {
    ped_combined[, (old_cols) := NULL]
  }
  
  return(tidyped(ped_combined, addnum = TRUE))
}

# Internal: Shannon entropy -> effective number (q=1 Hill number)
# p must be a probability vector (sums to 1); zeros are filtered out.
calc_shannon_effective <- function(p) {
  p <- p[p > 0]
  if (length(p) == 0L) return(NA_real_)
  exp(-sum(p * log(p)))
}

#' Calculate Founder and Ancestor Contributions
#'
#' Calculates genetic contributions from founders and influential ancestors.
#' Implements the gene dropping algorithm for founder contributions and Boichard's 
#' algorithm for ancestor contributions to estimate the effective number of founders ($f_e$) 
#' and ancestors ($f_a$).
#'
#' @param ped A \code{tidyped} object.
#' @param reference Character vector. Optional subset of individual IDs defining the reference population.
#'   If NULL, uses all individuals in the most recent generation.
#' @param mode Character. Type of contribution to calculate:
#' \itemize{
#'   \item \code{"founder"}: Founder contributions ($f_e$).
#'   \item \code{"ancestor"}: Ancestor contributions ($f_a$).
#'   \item \code{"both"}: Both founder and ancestor contributions.
#' }
#' @param top Integer. Number of top contributors to return. Default is 20.
#'
#' @return A list with class \code{pedcontrib} containing:
#' \itemize{
#'   \item \code{founders}: A \code{data.table} of founder contributions (if mode includes "founder", or "both").
#'   \item \code{ancestors}: A \code{data.table} of ancestor contributions (if mode includes "ancestor", or "both").
#'   \item \code{summary}: A \code{list} of summary statistics including:
#'     \itemize{
#'       \item \code{f_e}: Classical effective number of founders (\eqn{q=2}, Lacy 1989).
#'       \item \code{f_e_H}: Information-theoretic effective number of founders
#'         (\eqn{q=1}, Shannon entropy): \eqn{f_e^{(H)} = \exp(-\sum p_i \ln p_i)}.
#'       \item \code{f_a}: Classical effective number of ancestors (\eqn{q=2}, Boichard 1997).
#'       \item \code{f_a_H}: Information-theoretic effective number of ancestors
#'         (\eqn{q=1}): \eqn{f_a^{(H)} = \exp(-\sum q_k \ln q_k)}.
#'     }
#' }
#' 
#' Each contribution table contains:
#' \itemize{
#'   \item \code{Ind}: Individual ID.
#'   \item \code{Contrib}: Contribution to the reference population (0-1).
#'   \item \code{CumContrib}: Cumulative contribution.
#'   \item \code{Rank}: Rank by contribution.
#' }
#' 
#' @details
#' **Founder Contributions ($f_e$):**
#' Calculated by probabilistic gene flow from founders to the reference cohort. 
#' When individual ancestors with one unknown parent exist, "phantom" parents are temporarily injected correctly conserving the probability mass.
#' 
#' **Ancestor Contributions ($f_a$):**
#' Calculated using Boichard's iterative algorithm (1997), accounting for:
#' \itemize{
#'   \item Marginal genetic contribution of each ancestor
#'   \item Long-term contributions through multiple pathways
#' }
#' The parameter $f_a$ acts as a stringent metric since it identifies the bottlenecks of genetic variation in pedigrees. 
#' 
#' @examples
#' \donttest{
#' library(data.table)
#' # Load a sample pedigree
#' tp <- tidyped(small_ped)
#'
#' # Calculate both founder and ancestor contributions for reference population
#' ref_ids <- c("Z1", "Z2", "X", "Y")
#' contrib <- pedcontrib(tp, reference = ref_ids, mode = "both")
#'
#' # Print results including f_e, f_e(H), f_a, and f_a(H)
#' print(contrib)
#'
#' # Access Shannon-entropy effective numbers directly
#' contrib$summary$f_e_H   # Information-theoretic effective founders (q=1)
#' contrib$summary$f_e     # Classical effective founders (q=2)
#' contrib$summary$f_a_H   # Information-theoretic effective ancestors (q=1)
#' contrib$summary$f_a     # Classical effective ancestors (q=2)
#'
#' # Diversity ratio rho > 1 indicates long-tail founder value
#' contrib$summary$f_e_H / contrib$summary$f_e
#' }
#'
#' @references
#' Boichard, D., Maignel, L., & Verrier, É. (1997). The value of using probabilities 
#' of gene origin to measure genetic variability in a population. Genetics Selection 
#' Evolution, 29(1), 5-23.
#'
#' @export
pedcontrib <- function(ped, reference = NULL, mode = c("both", "founder", "ancestor"), top = 20) {
  ped <- ensure_complete_tidyped(ped, "pedcontrib()")
  
  # Inject phantom parents to conserve genetic contributions accurately
  ped <- fill_phantoms(ped)
  
  mode <- match.arg(mode)
  
  # Ensure integer indices exist
  if (!all(c("SireNum", "DamNum", "IndNum") %in% names(ped))) {
    ped <- tidyped(ped, addnum = TRUE)
  }
  
  # Define reference cohort
  if (is.null(reference)) {
    max_gen <- max(ped$Gen)
    reference <- ped[Gen == max_gen, Ind]
    message(sprintf("Using %d individuals from generation %d as reference population.",
                    length(reference), max_gen))
  } else {
    if (!all(reference %in% ped$Ind)) {
      missing <- reference[!reference %in% ped$Ind]
      warning(sprintf("Reference IDs not found in pedigree: %s", paste(head(missing, 5), collapse = ", ")))
    }
    reference <- reference[reference %in% ped$Ind]
  }
  
  if (length(reference) == 0) {
    stop("No valid reference individuals specified.")
  }
  
  n <- nrow(ped)
  n_ref <- length(reference)
  
  # Pre-compute integer arrays
  sire_num <- ped$SireNum   # 0 = unknown
  dam_num <- ped$DamNum     # 0 = unknown
  ind_ids <- ped$Ind
  
  # Map reference IDs to integer positions
  ind_to_pos <- setNames(seq_len(n), ind_ids)
  cohort_pos <- as.integer(ind_to_pos[reference])
  
  # Determine numeric mode for C++: 1=founder, 2=ancestor, 3=both
  cpp_mode <- switch(mode, founder = 1L, ancestor = 2L, both = 3L)
  
  # ---- Call Rcpp backend ----
  cpp_res <- cpp_pedcontrib(
    sire     = as.integer(sire_num),
    dam      = as.integer(dam_num),
    cohort_pos = cohort_pos,
    mode     = cpp_mode
  )
  
  result <- list()
  
  # ---- Founder Contributions ----
  fe2_sum <- NA_real_
  fe_H <- NA_real_
  n_founders_total <- 0L
  
  if (mode %in% c("founder", "both")) {
    message("Calculating founder contributions...")
    
    f_idx <- cpp_res$founder_idx
    f_val <- cpp_res$founder_contrib
    
    if (length(f_idx) == 0) {
      warning("No founders found (all individuals have at least one parent).")
      result$founders <- data.table(Ind = character(0), Contrib = numeric(0),
                                     CumContrib = numeric(0), Rank = integer(0))
    } else {
      founder_dt <- data.table(
        Ind = ind_ids[f_idx],
        Contrib = f_val
      )
      
      setorder(founder_dt, -Contrib)
      founder_dt[, CumContrib := cumsum(Contrib)]
      founder_dt[, Rank := seq_len(.N)]
      
      fe2_sum <- sum(founder_dt$Contrib^2, na.rm = TRUE)
      fe_H <- calc_shannon_effective(founder_dt$Contrib)
      n_founders_total <- nrow(founder_dt)
      
      result$founders <- if (nrow(founder_dt) > top) founder_dt[1:top] else founder_dt
    }
  }
  
  # ---- Ancestor Contributions ----
  fa2_sum <- NA_real_
  fa_H <- NA_real_
  n_ancestors_total <- 0L
  
  if (mode %in% c("ancestor", "both")) {
    message("Calculating ancestor contributions (Boichard's iterative algorithm)...")
    
    a_idx <- cpp_res$ancestor_idx
    a_val <- cpp_res$ancestor_contrib
    
    if (length(a_idx) == 0) {
      result$ancestors <- data.table(Ind = character(0), Contrib = numeric(0),
                                      CumContrib = numeric(0), Rank = integer(0))
    } else {
      ancestor_dt_full <- data.table(
        Ind = ind_ids[a_idx],
        Contrib = a_val
      )
      ancestor_dt_full[, CumContrib := cumsum(Contrib)]
      ancestor_dt_full[, Rank := seq_len(.N)]
      
      fa2_sum <- sum(ancestor_dt_full$Contrib^2, na.rm = TRUE)
      fa_H <- calc_shannon_effective(ancestor_dt_full$Contrib)
      n_ancestors_total <- nrow(ancestor_dt_full)
      
      result$ancestors <- if (nrow(ancestor_dt_full) > top) ancestor_dt_full[1:top] else ancestor_dt_full
    }
  }
  
  # ---- Summary Statistics ----
  summary_list <- list(n_ref = n_ref)
  
  if (!is.null(result$founders)) {
    summary_list$n_founder <- n_founders_total
    summary_list$n_founder_show <- nrow(result$founders)
    summary_list$f_e <- if (!is.na(fe2_sum) && fe2_sum > 0) 1 / fe2_sum else NA_real_
    summary_list$f_e_H <- fe_H
  }
  
  if (!is.null(result$ancestors)) {
    summary_list$n_ancestor <- n_ancestors_total
    summary_list$n_ancestor_show <- nrow(result$ancestors)
    summary_list$f_a <- if (!is.na(fa2_sum) && fa2_sum > 0) 1 / fa2_sum else NA_real_
    summary_list$f_a_H <- fa_H
  }
  
  result$summary <- summary_list
  class(result) <- "pedcontrib"
  
  return(result)
}

#' Calculate Ancestry Proportions
#'
#' Estimates the proportion of genes for each individual that originates from 
#' specific founder groups (e.g., breeds, source populations).
#'
#' @param ped A \code{tidyped} object.
#' @param foundervar Character. The name of the column containing founder-group
#'   labels (e.g., "Breed", "Origin").
#' @param target_labels Character vector. Specific founder-group labels to track.
#'   If NULL, all unique labels in \code{foundervar} among founders are used.
#'
#' @return A \code{data.table} with columns:
#' \itemize{
#'   \item \code{Ind}: Individual ID.
#'   \item One column per tracked label (named after each unique value in
#'     \code{foundervar} among founders, or as specified by
#'     \code{target_labels}).
#'     Each value gives the proportion of genes (0--1) originating from that
#'     founder group. Row sums across all label columns equal 1.
#' }
#'
#' @examples
#' \donttest{
#' library(data.table)
#' # Create dummy labels for founders
#' tp <- tidyped(small_ped)
#' tp_dated <- copy(tp)
#' founders <- tp_dated[is.na(Sire) & is.na(Dam), Ind]
#' # Assign 'LineA' and 'LineB'
#' tp_dated[Ind %in% founders[1:(length(founders)/2)], Origin := "LineA"]
#' tp_dated[is.na(Origin), Origin := "LineB"]
#' 
#' # Calculate ancestry proportions for all individuals
#' anc <- pedancestry(tp_dated, foundervar = "Origin")
#' print(tail(anc))
#' }
#' 
#' @export
pedancestry <- function(ped, foundervar, target_labels = NULL) {
  ped <- ensure_complete_tidyped(ped, "pedancestry()")
  if (!foundervar %in% names(ped)) stop(sprintf("Column '%s' not found.", foundervar))
  
  # Forward pass requires numbers
  if (!all(c("SireNum", "DamNum", "IndNum") %in% names(ped))) {
    ped_work <- tidyped(ped, addnum = TRUE)
  } else {
    ped_work <- copy(ped)
  }
  
  # Ensure sorted by Gen for correct forward propagation
  setorder(ped_work, Gen, IndNum)
  
  # Identify founder labels
  founders_idx <- which(is.na(ped_work$Sire) & is.na(ped_work$Dam))
  if (is.null(target_labels)) {
    target_labels <- unique(ped_work[[foundervar]][founders_idx])
    target_labels <- target_labels[!is.na(target_labels) & target_labels != ""]
  }
  
  if (length(target_labels) == 0) {
    stop("No valid founder-group labels found. Please check 'foundervar'.")
  }

  n <- nrow(ped_work)
  res_mat <- matrix(0, nrow = n, ncol = length(target_labels))
  colnames(res_mat) <- target_labels
  
  # Initialize founders with their respective labels using vectorized matrix indexing
  founder_labels <- ped_work[[foundervar]][founders_idx]
  valid_matches <- match(founder_labels, target_labels)
  valid_mask <- !is.na(valid_matches)
  
  if (any(valid_mask)) {
    res_mat[cbind(founders_idx[valid_mask], valid_matches[valid_mask])] <- 1.0
  }
  
  sire_nums <- ped_work$SireNum
  dam_nums <- ped_work$DamNum
  
  # Forward pass via Rcpp
  res_mat <- cpp_calculate_ancestry(
    sire_nums, 
    dam_nums, 
    res_mat
  )
  
  ans <- as.data.table(res_mat)
  colnames(ans) <- target_labels
  ans[, Ind := ped_work$Ind]
  setcolorder(ans, "Ind")
  return(ans[])
}

#' Summarize Inbreeding Levels
#'
#' Classifies individuals into inbreeding levels based on their inbreeding 
#' coefficients (F) according to standard or user-defined thresholds.
#'
#' @param ped A \code{tidyped} object.
#' @param breaks Numeric vector of strictly increasing positive upper bounds for
#'   inbreeding classes. Default is \code{c(0.0625, 0.125, 0.25)}, corresponding
#'   approximately to half-sib, avuncular/grandparent, and full-sib/parent-offspring
#'   mating thresholds. The class \code{"F = 0"} is always kept as a fixed first
#'   level. A final open-ended class \code{"F > max(breaks)"} is always appended
#'   automatically.
#' @param labels Optional character vector of interval labels. If \code{NULL},
#'   labels are generated automatically from \code{breaks}. When supplied, its
#'   length must equal \code{length(breaks)}, with each element naming the
#'   bounded interval \code{(breaks[i-1], breaks[i]]}. The open-ended tail
#'   class is always auto-generated and cannot be overridden.
#'
#' @return A \code{data.table} with 3 columns:
#' \describe{
#'   \item{\code{FClass}}{An ordered factor. By default it contains 5 levels:
#'   \code{"F = 0"}, \code{"0 < F <= 0.0625"}, \code{"0.0625 < F <= 0.125"},
#'   \code{"0.125 < F <= 0.25"}, and \code{"F > 0.25"}. The number of levels
#'   equals \code{length(breaks) + 2} (the fixed zero class plus one class per
#'   bounded interval plus the open-ended tail).}
#'   \item{\code{Count}}{Integer. Number of individuals in each class.}
#'   \item{\code{Percentage}}{Numeric. Percentage of individuals in each class.}
#' }
#'
#' @details
#' The default thresholds follow common pedigree interpretation rules:
#' \itemize{
#'   \item \code{F = 0.0625}: approximately the offspring of half-sib mating.
#'   \item \code{F = 0.125}: approximately the offspring of avuncular or grandparent-grandchild mating.
#'   \item \code{F = 0.25}: approximately the offspring of full-sib or parent-offspring mating.
#' }
#' Therefore, assigning \code{F = 0.25} to the class \code{"0.125 < F <= 0.25"}
#' is appropriate. If finer reporting is needed, supply custom \code{breaks},
#' for example to separate \code{0.25}, \code{0.375}, or \code{0.5}.
#'
#' @examples
#' tp <- tidyped(simple_ped, addnum = TRUE)
#' pedfclass(tp)
#'
#' # Finer custom classes (4 breaks, labels auto-generated)
#' pedfclass(tp, breaks = c(0.03125, 0.0625, 0.125, 0.25))
#'
#' # Custom labels aligned to breaks (3 labels for 3 breaks; tail is auto)
#' pedfclass(tp, labels = c("Low", "Moderate", "High"))
#'
#' \donttest{
#' tp_inbred <- tidyped(inbred_ped, addnum = TRUE)
#' pedfclass(tp_inbred)
#' }
#'
#' @export
pedfclass <- function(ped,
                      breaks = c(0.0625, 0.125, 0.25),
                      labels = NULL) {
  ped <- ensure_tidyped(ped)

  if (!is.numeric(breaks) || length(breaks) < 1L || any(!is.finite(breaks)) ||
      any(breaks <= 0) || is.unsorted(breaks, strictly = TRUE)) {
    stop("breaks must be a strictly increasing numeric vector of positive values")
  }

  # Build bounded-interval labels (one per break, no tail)
  if (is.null(labels)) {
    lower_bounds <- c(0, head(breaks, -1))
    labels <- ifelse(
      lower_bounds == 0,
      paste0("0 < F <= ", breaks),
      paste0(lower_bounds, " < F <= ", breaks)
    )
  } else {
    if (!is.character(labels) || length(labels) != length(breaks)) {
      stop("labels must be a character vector of length equal to length(breaks)")
    }
  }
  # The open-ended tail class is always auto-generated
  tail_label <- paste0("F > ", max(breaks))
  all_labels  <- c(labels, tail_label)
  
  if (!"f" %in% names(ped)) {
    ped <- ensure_complete_tidyped(ped, "pedfclass()")
    message("Calculating inbreeding coefficients...")
    ped <- inbreed(ped)
  }
  
  cut_breaks   <- c(-Inf, .Machine$double.eps, breaks, Inf)
  class_levels <- c("F = 0", all_labels)

  # Classify
  f_classes <- cut(ped$f, breaks = cut_breaks, labels = class_levels, include.lowest = TRUE)
  
  dt <- data.table(FClass = factor(f_classes, levels = class_levels, ordered = TRUE))
  summary_dt <- dt[, .(Count = .N), by = FClass]
  
  # Ensure all categories appear
  all_cats <- data.table(FClass = factor(class_levels, levels = class_levels, ordered = TRUE))
  summary_dt <- merge(all_cats, summary_dt, by = "FClass", all.x = TRUE)
  summary_dt[is.na(Count), Count := 0]
  summary_dt[, Count := as.integer(Count)]
  
  summary_dt[, Percentage := (Count / sum(Count)) * 100]
  setorder(summary_dt, FClass)
  
  return(summary_dt[])
}

#' Calculate Partial Inbreeding
#'
#' Decomposes individuals' inbreeding coefficients into marginal contributions from 
#' specific ancestors. This allows identifying which ancestors or lineages are 
#' responsible for the observed inbreeding.
#'
#' @param ped A \code{tidyped} object.
#' @param ancestors Character vector. IDs of ancestors to calculate partial 
#'   inbreeding for. If NULL, the top ancestors by marginal contribution are used.
#' @param top Integer. Number of top ancestors to include if \code{ancestors} is NULL.
#'
#' @return A \code{data.table} with the first column as \code{Ind} and subsequent 
#'   columns representing the partial inbreeding ($pF$) from each ancestor.
#'
#' @details
#' The sum of all partial inbreeding coefficients for an individual (including 
#' contributions from founders) equals $1 + f_i$, where $f_i$ is the total 
#' inbreeding coefficient. This function specifically isolates the terms in 
#' the Meuwissen & Luo (1992) decomposition that correspond to the selected ancestors.
#'
#' @references
#' Lacey, R. C. (1996). A formula for determining the partial inbreeding coefficient, 
#' \eqn{F_{ij}}. Journal of Heredity, 87(4), 337-339.
#' 
#' Meuwissen, T. H., & Luo, Z. (1992). Computing inbreeding coefficients in 
#' large populations. Genetics Selection Evolution, 24(4), 305-313.
#'
#' @examples
#' \donttest{
#' library(data.table)
#' tp <- tidyped(inbred_ped)
#' # Calculate partial inbreeding originating from specific ancestors
#' target_ancestors <- inbred_ped[is.na(Sire) & is.na(Dam), Ind]
#' pF <- pedpartial(tp, ancestors = target_ancestors)
#' print(tail(pF))
#' }
#' 
#' @export
pedpartial <- function(ped, ancestors = NULL, top = 20) {
  ped <- ensure_complete_tidyped(ped, "pedpartial()")
  
  # Need dii from inbreeding algorithm which requires integer indices
  if (!all(c("SireNum", "DamNum", "IndNum") %in% names(ped))) {
    ped_work <- tidyped(ped, addnum = TRUE)
  } else {
    ped_work <- copy(ped)
  }
  
  # Ensure sorted by IndNum for alignment with C++ output
  setorder(ped_work, IndNum)
  
  # Identify target ancestors
  if (is.null(ancestors)) {
    message("No ancestors specified. Identifying top contributors...")
    cont <- pedcontrib(ped, mode = "ancestor", top = top)
    ancestors <- cont$ancestors$Ind
    if (length(ancestors) == 0) stop("Could not identify top ancestors.")
  }
  
  # Map ancestor IDs to IndNum
  anc_nums <- ped_work[Ind %in% ancestors, IndNum]
  anc_ids <- ped_work[Ind %in% ancestors, Ind]
  
  if (length(anc_nums) == 0) stop("None of the specified ancestors found in pedigree.")
  
  message(sprintf("Calculating partial inbreeding for %d ancestors...", length(anc_nums)))
  
  # Calculate dii (Meuwissen & Luo engines)
  # We reuse the logic from cpp_calculate_inbreeding but only need dii
  res_f <- cpp_calculate_inbreeding(ped_work$SireNum, ped_work$DamNum)
  
  # Adjust dii for half-founders to match Gulisija & Crow (2007) tabular method.
  # For half-founders (one parent known, one unknown), the standard dii includes

  # 0.25 Mendelian sampling variance from the unknown parent that cannot be
  # attributed to any known ancestor. We subtract this to ensure that
  # sum(pF_j) over all founder ancestors j equals the total F_i.
  # Standard dii for half-founder: 0.75 - 0.25 * F_known
  # Adjusted dii for partial F:    0.50 - 0.25 * F_known
  dii_partial <- res_f$dii
  sire_vec <- ped_work$SireNum
  dam_vec <- ped_work$DamNum
  f_vec <- res_f$f
  half_founders <- which(xor(sire_vec > 0, dam_vec > 0))
  if (length(half_founders) > 0) {
    for (idx in half_founders) {
      known_parent <- if (sire_vec[idx] > 0) sire_vec[idx] else dam_vec[idx]
      dii_partial[idx] <- 0.5 - 0.25 * f_vec[known_parent]
    }
  }
  
  # Call C++ partial inbreeding engine
  pf_mat <- cpp_calculate_partial_inbreeding(
    ped_work$SireNum, 
    ped_work$DamNum, 
    dii_partial, 
    as.integer(anc_nums)
  )
  
  result <- as.data.table(pf_mat)
  colnames(result) <- anc_ids
  result[, Ind := ped_work$Ind]
  setcolorder(result, "Ind")
  
  return(result[])
}

#' Print Founder and Ancestor Contributions
#'
#' @param x A \code{pedcontrib} object.
#' @param ... Additional arguments.
#'
#' @export
print.pedcontrib <- function(x, ...) {
  cat("Founder and Ancestor Contributions\n")
  cat("===================================\n")
  
  s <- x$summary
  cat(sprintf("Reference population size: %d\n", s$n_ref))
  
  if (!is.null(x$founders)) {
    cat(sprintf("\nFounders: %d (reported top %d)\n", s$n_founder, s$n_founder_show))
    cat(sprintf("  f_e(H) = %.3f  |  f_e = %.3f\n",
                if (is.null(s$f_e_H) || is.na(s$f_e_H)) NA_real_ else s$f_e_H,
                s$f_e))
    cat("\nTop 10 Founder Contributions:\n")
    print(head(x$founders, 10))
  }
  
  if (!is.null(x$ancestors)) {
    cat(sprintf("\nAncestors: %d (reported top %d)\n", s$n_ancestor, s$n_ancestor_show))
    cat(sprintf("  f_a(H) = %.3f  |  f_a = %.3f\n",
                if (is.null(s$f_a_H) || is.na(s$f_a_H)) NA_real_ else s$f_a_H,
                s$f_a))
    cat("\nTop 10 Ancestor Contributions:\n")
    print(head(x$ancestors, 10))
  }
  
  invisible(x)
}

#' Calculate Genetic Diversity Indicators
#'
#' Combines founder/ancestor contributions ($f_e$, $f_a$) and effective population
#' size estimates (Ne) from three methods into a single summary object.
#'
#' @param ped A \code{tidyped} object.
#' @param reference Character vector. Optional subset of individual IDs defining the reference population.
#'   If NULL, uses all individuals in the most recent generation.
#' @param top Integer. Number of top contributors to return in founder/ancestor tables. Default is 20.
#' @param nsamples Integer. Number of individuals sampled per cohort for the coancestry
#'   Ne method and for \eqn{f_g} estimation. Very large cohorts are sampled down to this
#'   size to control memory usage (default: 1000).
#' @param ncores Integer. Number of cores for parallel processing in the coancestry method. Default is 1.
#' @param seed Integer or NULL. Random seed passed to \code{set.seed()} before sampling
#'   in the coancestry method, ensuring reproducible \eqn{f_g} and \eqn{N_e} estimates.
#'   Default is \code{NULL} (no fixed seed).
#'
#' @return A list with class \code{pediv} containing:
#' \itemize{
#'   \item \code{summary}: A single-row \code{data.table} with columns
#'     \code{NRef}, \code{NFounder}, \code{feH}, \code{fe}, \code{NAncestor},
#'     \code{faH}, \code{fa}, \code{fafe}, \code{fg}, \code{MeanCoan},
#'     \code{GeneDiv}, \code{NSampledCoan}, \code{NeCoancestry}, \code{NeInbreeding},
#'     \code{NeDemographic}.
#'     Here \code{feH} and \code{faH} are the Shannon-entropy (\eqn{q=1})
#'     effective numbers of founders and ancestors, respectively.
#'     \code{GeneDiv} is the pedigree-based retained genetic diversity,
#'     computed as \eqn{1 - \bar{C}}, where \eqn{\bar{C}} is the
#'     diagonal-corrected population mean coancestry (\code{MeanCoan}).
#'   \item \code{founders}: A \code{data.table} of top founder contributions.
#'   \item \code{ancestors}: A \code{data.table} of top ancestor contributions.
#' }
#'
#' @details
#' Internally calls \code{\link{pedcontrib}} for \eqn{f_e} and \eqn{f_a}.
#' The coancestry method is called via the internal \code{calc_ne_coancestry()}
#' function directly so that \eqn{f_g} and the Ne estimate can be obtained from
#' the same traced pedigree without duplication.
#' The inbreeding and demographic Ne methods are obtained via \code{\link{pedne}}.
#' All calculations use the same \code{reference} population.
#' If any method fails (e.g., insufficient pedigree depth), its value is \code{NA}
#' rather than stopping execution.
#'
#' \eqn{f_g} (founder genome equivalents, Caballero & Toro 2000) is estimated from
#' the diagonal-corrected mean coancestry of the reference population:
#' \deqn{\hat{\bar{C}} = \frac{N-1}{N} \cdot \frac{\bar{a}_{off}}{2} + \frac{1 + \bar{F}_s}{2N}}
#' \deqn{f_g = \frac{1}{2 \hat{\bar{C}}}}
#' where \eqn{N} is the full reference cohort size, \eqn{\bar{a}_{off}} is the
#' off-diagonal mean relationship among sampled individuals, and \eqn{\bar{F}_s}
#' is their mean inbreeding coefficient.
#'
#' @examples
#' \donttest{
#' tp <- tidyped(small_ped)
#' div <- pediv(tp, reference = c("Z1", "Z2", "X", "Y"), seed = 42L)
#' print(div)
#'
#' # Access Shannon effective numbers from summary
#' div$summary$feH   # Shannon effective founders (q=1)
#' div$summary$faH   # Shannon effective ancestors (q=1)
#'
#' # Founder diversity profile: NFounder >= feH >= fe
#' with(div$summary, c(NFounder = NFounder, feH = feH, fe = fe))
#' }
#'
#' @seealso \code{\link{pedcontrib}}, \code{\link{pedne}}, \code{\link{pedstats}}
#' @export
pediv <- function(ped, reference = NULL, top = 20, nsamples = 1000, ncores = 1,
                  seed = NULL) {
  ped <- ensure_complete_tidyped(ped, "pediv()")

  # ---- Founder and ancestor contributions ----
  message("Calculating founder and ancestor contributions...")
  contrib <- pedcontrib(ped, reference = reference, mode = "both", top = top)
  s <- contrib$summary

  # ---- Prepare pedigree with ECG (mirrors pedne() preprocessing) ----
  ped_dt <- data.table::copy(ped)
  if (!"ECG" %in% names(ped_dt)) {
    ecg_dt <- pedecg(ped_dt)
    ped_dt <- merge(ped_dt, ecg_dt[, .(Ind, ECG)], by = "Ind", all.x = TRUE)
    # merge() sorts by Ind (alphabetically), breaking the IndNum == row-index
    # invariant that the fast-path BFS in tidyped() depends on.
    setorder(ped_dt, IndNum)
  }
  by_all <- ".CohortAll"
  ped_dt[[by_all]] <- "All"

  if (!is.null(reference)) {
    ped_subset <- as.data.table(ped_dt)[Ind %in% reference]
  } else {
    ped_subset <- ped_dt
  }

  # ---- Ne (coancestry) + fg: call internal function directly ----
  # pedne(method="coancestry") strips fg from its public output; bypass it here
  # so both Ne and fg are computed from the same single traced pedigree.
  message("Calculating Ne (coancestry) and fg...")
  raw_coan <- tryCatch(
    calc_ne_coancestry(ped_subset, ped_dt, by_all, nsamples, seed = seed),
    error = function(e) NULL
  )

  # ---- Ne: inbreeding and demographic (via public pedne API) ----
  message("Calculating Ne (inbreeding)...")
  ne_i <- tryCatch(
    suppressMessages(pedne(ped, reference = reference, by = NULL,
                           method = "inbreeding")),
    error = function(e) NULL
  )

  message("Calculating Ne (demographic)...")
  ne_d <- tryCatch(
    suppressMessages(pedne(ped, reference = reference, by = NULL,
                           method = "demographic")),
    error = function(e) NULL
  )

  extract_ne <- function(dt) {
    if (is.null(dt) || nrow(dt) == 0) NA_real_ else dt$Ne[1]
  }

  # Extract fg and MeanCoan from raw coancestry result
  fg_val      <- if (!is.null(raw_coan) && nrow(raw_coan) > 0 && "fg" %in% names(raw_coan))
                   raw_coan$fg[1] else NA_real_
  mean_coan   <- if (!is.null(raw_coan) && nrow(raw_coan) > 0 && "MeanCoan" %in% names(raw_coan))
                   raw_coan$MeanCoan[1] else NA_real_
  n_samp_coan <- if (!is.null(raw_coan) && nrow(raw_coan) > 0 && "NSampledCoan" %in% names(raw_coan))
                   as.integer(raw_coan$NSampledCoan[1]) else NA_integer_

  # ---- Assemble summary ----
  summary_dt <- data.table::data.table(
    NRef          = s$n_ref,
    NFounder      = s$n_founder,
    feH           = if (!is.null(s$f_e_H)) s$f_e_H else NA_real_,
    fe            = s$f_e,
    NAncestor     = s$n_ancestor,
    faH           = if (!is.null(s$f_a_H)) s$f_a_H else NA_real_,
    fa            = s$f_a,
    fafe          = if (!is.na(s$f_a) && !is.na(s$f_e) && s$f_e > 0)
                      round(s$f_a / s$f_e, 6) else NA_real_,
    fg            = fg_val,
    MeanCoan      = mean_coan,
    GeneDiv       = if (!is.na(mean_coan)) pmax(0, pmin(1, 1 - mean_coan)) else NA_real_,
    NSampledCoan  = n_samp_coan,
    NeCoancestry  = extract_ne(raw_coan),
    NeInbreeding  = extract_ne(ne_i),
    NeDemographic = extract_ne(ne_d)
  )

  result <- list(
    summary   = summary_dt,
    founders  = contrib$founders,
    ancestors = contrib$ancestors
  )
  class(result) <- "pediv"
  return(result)
}

#' Print Genetic Diversity Summary
#'
#' @param x A \code{pediv} object.
#' @param ... Additional arguments.
#'
#' @export
print.pediv <- function(x, ...) {
  cat("Genetic Diversity Summary\n")
  cat("=========================\n")

  s <- x$summary

  cat(sprintf("Reference population size : %d\n", s$NRef))

  cat("\n-- Founder / Ancestor Contributions --\n")
  cat(sprintf("Founders  (total) : %d    fe(H) = %.3f    fe = %.3f\n",
              s$NFounder, s$feH, s$fe))
  cat(sprintf("Ancestors (total) : %d    fa(H) = %.3f    fa = %.3f\n",
              s$NAncestor, s$faH, s$fa))
  cat(sprintf("fa/fe ratio       : %.4f\n", s$fafe))
  if (!is.null(s$fg) && !is.na(s$fg)) {
    cat(sprintf("Founder genomes   : fg = %.3f  (MeanCoan = %.6f, NSampled = %d)\n",
                s$fg, s$MeanCoan, s$NSampledCoan))
    if (!is.null(s$GeneDiv) && !is.na(s$GeneDiv)) {
      cat(sprintf("Gene diversity    : GeneDiv = %.4f  (= 1 - MeanCoan)\n", s$GeneDiv))
    }
    # Hierarchy fg <= fa <= fe <= NFounder holds only when all metrics share
    # the same reference population. Display it only when the order is intact.
    if (!is.na(s$fa) && !is.na(s$fe) && s$fg <= s$fa + 1e-6 && s$fa <= s$fe + 1e-6) {
      cat(sprintf("Hierarchy: fg <= fa <= fe <= NFounder  =  %.2f <= %.3f <= %.3f <= %d\n",
                  s$fg, s$fa, s$fe, s$NFounder))
    }
  }

  cat("\n-- Effective Population Size (Ne) --\n")
  fmt_ne <- function(v) if (is.na(v)) "     NA" else sprintf("%7.1f", v)
  cat(sprintf("  Coancestry  : %s\n", fmt_ne(s$NeCoancestry)))
  cat(sprintf("  Inbreeding  : %s\n", fmt_ne(s$NeInbreeding)))
  cat(sprintf("  Demographic : %s\n", fmt_ne(s$NeDemographic)))

  if (!is.null(x$founders) && nrow(x$founders) > 0) {
    cat("\nTop Founder Contributions (top 5):\n")
    print(head(x$founders, 5))
  }

  if (!is.null(x$ancestors) && nrow(x$ancestors) > 0) {
    cat("\nTop Ancestor Contributions (top 5):\n")
    print(head(x$ancestors, 5))
  }

  invisible(x)
}

Try the visPedigree package in your browser

Any scripts or data that you put into this service are public.

visPedigree documentation built on March 30, 2026, 9:07 a.m.