R/summarizePedigree.R

Defines functions summarizeFamilies summarizePatrilines summarizeMatrilines findFounder calculateSummaryDT summarizePedigrees

Documented in calculateSummaryDT findFounder summarizeFamilies summarizeMatrilines summarizePatrilines summarizePedigrees

#' Summarize Pedigree Data
#'
#' This function summarizes pedigree data, by
#' computing key summary statistics for all numeric variables and identifying the
#' originating member (founder) for each family, maternal, and paternal lineage.
#'
#' The function calculates standard descriptive statistics, including the count of individuals in
#' each lineage, means, medians, minimum and maximum values, and standard deviations.
#' Additionally, if `five_num_summary = TRUE`, the function includes the first and third quartiles (Q1, Q3)
#' to provide a more detailed distributional summary. Users can also specify variables to exclude from
#' the analysis via `skip_var`.
#'
#' Beyond summary statistics, the function identifies the founding member of each lineage
#' based on the specified sorting variable (`founder_sort_var`), defaulting to birth year (`byr`)
#' when available or `personID` otherwise. Users can retrieve the largest and oldest
#' lineages by setting `nbiggest` and `noldest`, respectively.
#'
#' @inheritParams ped2fam
#' @inheritParams ped2maternal
#' @inheritParams ped2paternal
#' @param nbiggest Integer. Number of largest lineages to return (sorted by count).
#' @param noldest Integer. Number of oldest lineages to return (sorted by birth year).
#' @param byr Character. Optional column name for birth year. Used to determine the oldest lineages.
#' @param type Character vector. Specifies which summaries to compute.
#'   Options: `"fathers"`, `"mothers"`, `"families"`. Default includes all three.
#' @param skip_var Character vector. Variables to exclude from summary calculations.
#' @param five_num_summary Logical. If `TRUE`, includes the first quartile (Q1) and third quartile (Q3) in addition to
#'   the minimum, median, and maximum values.
#' @param include_founder Logical. If `TRUE`, includes the founder (originating member) of each lineage in the output.
#' @param founder_sort_var Character. Column used to determine the founder of each lineage.
#'   Defaults to `byr` (if available) or `personID` otherwise.
#' @param network_checks Logical. If `TRUE`, performs network checks on the pedigree data.
#' @param verbose Logical, if TRUE, print progress messages.
#' @returns A data.frame (or list) containing summary statistics for family, maternal, and paternal lines, as well as the 5 oldest and biggest lines.
#' @import data.table
#' @export
summarizePedigrees <- function(ped, famID = "famID", personID = "ID",
                               momID = "momID", dadID = "dadID",
                               matID = "matID", patID = "patID",
                               type = c("fathers", "mothers", "families"),
                               byr = NULL, include_founder = FALSE, founder_sort_var = NULL,
                               nbiggest = 5, noldest = 5, skip_var = NULL,
                               five_num_summary = FALSE, network_checks = FALSE,
                               verbose = FALSE) {
  # Fast Fails

  ## Check that the ID variables are not the same
  if (personID %in% c(famID, momID, dadID, matID, patID)) {
    stop("personID cannot be the same as any of the other ID variables.")
  }
  ## Check that neccesarry variables are present
  if (!all(c(personID, momID, dadID) %in% names(ped))) {
    stop("personID, momID, and dadID must be columns in the pedigree data.")
  }
  if (is.null(type)) {
    stop("Type must be specified. Options are 'fathers', 'mothers', and 'families'.")
  }
  if (!is.null(byr) && !byr %in% names(ped)) {
    stop("If byr is specified, byr must be a column in the pedigree data. If you do not want to sort by birth year, set byr = NULL.")
  }
  if (!is.null(founder_sort_var) && !founder_sort_var %in% names(ped)) {
    stop("If you set founder_sort_var, that variable must be a column in the pedigree data. If you want to sort by using the default, set founder_sort_var = NULL. The default is to sort by birth year if that's present and by personID otherwise.")
  }

  # user inputs
  ## what variables to skip
  if (is.null(skip_var)) {
    skip_var <- c(personID, famID, momID, dadID, matID, patID)
  } else {
    skip_var <- c(skip_var, personID, famID, momID, dadID, matID, patID)
  }
  ## How to sort/identify the founders
  ## If founder_sort_var is NULL, sort by byr if it's present, otherwise sort by personID
  if (is.null(founder_sort_var) && is.null(byr)) {
    founder_sort_var <- personID
  } else if (is.null(founder_sort_var) & !is.null(byr)) {
    founder_sort_var <- byr
  }

  # Build the pedigree using the provided functions
  if ("families" %in% type && !famID %in% names(ped)) {
    if (verbose) message("Counting families...")
    ped <- ped2fam(ped, personID = personID, momID = momID, dadID = dadID, famID = famID)
  }
  if ("mothers" %in% type && !matID %in% names(ped)) {
    if (verbose) message("Counting mothers...")
    ped <- ped2maternal(ped, personID = personID, momID = momID, dadID = dadID, matID = matID)
  }
  if ("fathers" %in% type && !patID %in% names(ped)) {
    if (verbose) message("Counting fathers...")
    ped <- ped2paternal(ped, personID = personID, momID = momID, dadID = dadID, patID = patID)
  }


  # Convert to data.table
  ped_dt <- data.table::as.data.table(ped)

  # Initialize...
  ## Output list
  output <- list()
  ## Size of families
  n_fathers <- n_mothers <- n_families <- NULL


  if (network_checks) {
    if (verbose) message("Performing network validation checks...")
    network_validation_results <- checkPedigreeNetwork(
      ped,
      personID = personID,
      momID = momID,
      dadID = dadID,
      verbose = verbose
    )
    output$network_validation <- network_validation_results
  }

  # Calculate summary statistics for families, maternal lines, and paternal lines

  if ("families" %in% type) {
    if (verbose) message("Summarizing families...")
    family_summary_dt <- calculateSummaryDT(ped_dt, famID,
      skip_var = skip_var,
      five_num_summary = five_num_summary
    )
    # Find the originating member for each line
    if (include_founder) {
      if (verbose) message("Finding originating members for families...")
      originating_member_family <- findFounder(ped_dt,
        group_var = famID,
        sort_var = founder_sort_var
      )
      # Merge summary statistics with originating members for additional information
      family_summary_dt <- merge(family_summary_dt,
        originating_member_family,
        by = famID, suffixes = c("", "_founder")
      )
    }
    output$family_summary <- family_summary_dt
    n_families <- nrow(family_summary_dt)
    if (verbose) message("Summarized ", n_families, " families.")
  }

  if ("mothers" %in% type) {
    if (verbose) message("Summarizing maternal lines...")
    maternal_summary_dt <- calculateSummaryDT(ped_dt, matID,
      skip_var = skip_var,
      five_num_summary = five_num_summary
    )
    if (include_founder) {
      if (verbose) message("Finding originating members for matrilineal lines...")
      originating_member_maternal <- findFounder(ped_dt,
        group_var = matID,
        sort_var = founder_sort_var
      )
      maternal_summary_dt <- merge(maternal_summary_dt,
        originating_member_maternal,
        by = matID, suffixes = c("", "_founder")
      )
    }
    output$maternal_summary <- maternal_summary_dt
    n_mothers <- nrow(maternal_summary_dt)
    if (verbose) message("Summarized ", n_mothers, " maternal lines.")
  }
  if ("fathers" %in% type) {
    if (verbose) message("Summarizing paternal lines...")
    paternal_summary_dt <- calculateSummaryDT(ped_dt, patID,
      skip_var = skip_var,
      five_num_summary = five_num_summary
    )
    if (include_founder) {
      if (verbose) message("Finding originating members for patrilineal lines...")
      originating_member_paternal <- findFounder(ped_dt,
        group_var = patID,
        sort_var = founder_sort_var
      )
      paternal_summary_dt <- merge(paternal_summary_dt,
        originating_member_paternal,
        by = patID, suffixes = c("", "_founder")
      )
    }

    output$paternal_summary <- paternal_summary_dt
    n_fathers <- nrow(paternal_summary_dt)
    if (verbose) message("Summarized ", n_fathers, " paternal lines.")
  }

  ## Check errors
  #  if (check_errors) {
  #   if (verbose) message("Checking for errors...")
  #    output$checkIDs <- checkIDs(ped,
  #                                repair = FALSE, verbose = verbose)
  #  }


  # Optionally find the superlative lines
  # & noldest <= unique(ped_dt[[famID]])
  # determin number of lines


  ## oldest
  if (!is.null(byr) && noldest > 0) {
    if (!is.null(n_families) && "families" %in% type) {
      if (verbose) message("Finding oldest families...")
      output$oldest_families <- try_na(family_summary_dt[order(get(byr))][1:min(c(noldest, n_families),
        na.rm = TRUE
      )])
    }
    if (!is.null(n_mothers) && "mothers" %in% type) {
      if (verbose) message("Finding oldest maternal lines...")
      output$oldest_maternal <- try_na(maternal_summary_dt[order(get(byr))][1:min(c(noldest, n_mothers),
        na.rm = TRUE
      )])
    }
    if (!is.null(n_fathers) && "fathers" %in% type) {
      if (verbose) message("Finding oldest paternal lines...")
      output$oldest_paternal <- try_na(paternal_summary_dt[order(get(byr))][1:min(c(noldest, n_fathers),
        na.rm = TRUE
      )])
    }
  }

  # biggest lines
  if (!is.null(nbiggest) && nbiggest > 0) {
    if (!is.null(n_families) && "families" %in% type) {
      output$biggest_families <- try_na(family_summary_dt[order(-get("count"))][1:min(c(nbiggest, n_families),
        na.rm = TRUE
      )])
    }
    if (!is.null(n_mothers) && "mothers" %in% type) {
      output$biggest_maternal <- try_na(maternal_summary_dt[order(-get("count"))][1:min(c(nbiggest, n_mothers),
        na.rm = TRUE
      )])
    }
    if (!is.null(n_fathers) && "fathers" %in% type) {
      output$biggest_paternal <- try_na(paternal_summary_dt[order(-get("count"))][1:min(c(nbiggest, n_fathers),
        na.rm = TRUE
      )])
    }
  }

  return(output)
}

# Function to calculate summary statistics for all numeric variables

#' This function calculates summary statistics for all numeric variables in a data.table. It is supposed to be used internally by the \code{summarize_pedigree} function.
#' @inheritParams summarizePedigrees
#' @param data A data.table containing the pedigree data.
#' @param group_var A character string specifying the column name of the grouping variable.
#' @return A data.table containing the summary statistics for all numeric variables.
#' @keywords internal
#'
calculateSummaryDT <- function(data, group_var, skip_var,
                               five_num_summary = FALSE) {
  # Identify numeric columns excluding the group_var and skip_var
  numeric_cols <- setdiff(names(data)[vapply(data, is.numeric, logical(1))], c(group_var, skip_var))
  summary_stats <- data[,
    {
      count <- .N # Calculate count once per group
      stats_list <- lapply(numeric_cols, function(colname) {
        x <- .SD[[colname]]
        stats <- list(
          #   count = .N,
          mean = as.double(base::mean(x, na.rm = TRUE)),
          median = as.double(stats::median(x, na.rm = TRUE)),
          #  mode = as.double(stats::mode(x, na.rm = TRUE)),
          min = ifelse(all(is.na(x)), as.double(NA), as.double(base::min(x, na.rm = TRUE))),
          max = ifelse(all(is.na(x)), as.double(NA), as.double(base::max(x, na.rm = TRUE))),
          sd = as.double(stats::sd(x, na.rm = TRUE))
        )
        if (five_num_summary) {
          stats <- c(stats, list(
            Q1 = as.double(stats::quantile(x, 0.25, na.rm = TRUE)),
            Q3 = as.double(stats::quantile(x, 0.75, na.rm = TRUE))
          ))
        }
        names(stats) <- paste0(colname, "_", names(stats))
        stats
      })
      stats <- unlist(stats_list, recursive = FALSE)
      c(list(count = count), stats)
    },
    by = group_var,
    .SDcols = numeric_cols
  ]
  # Flatten the nested lists
  summary_stats <- data.table::as.data.table(summary_stats[, lapply(.SD, unlist),
    by = group_var
  ])
  return(summary_stats)
}

# Function to find the originating member for each line

#' This function finds the originating member for each line in a pedigree. It is supposed to be used internally by the \code{summarize_pedigree} function.
#' @inheritParams summarizePedigrees
#' @param sort_var A character string specifying the column name to sort by.
#' @param data A data.table containing the pedigree data.
#'
#' @return A data.table containing the originating member for each line.
#' @keywords internal
#'
findFounder <- function(data, group_var, sort_var) {
  data[order(get(sort_var)), .SD[1], by = group_var]
}


#' Summarize the maternal lines in a pedigree
#' @inheritParams summarizePedigrees
#' @seealso [summarizePedigrees ()]
#' @export
#'
summarizeMatrilines <- function(ped, famID = "famID", personID = "ID",
                                momID = "momID", dadID = "dadID",
                                matID = "matID", patID = "patID",
                                byr = NULL, include_founder = FALSE,
                                founder_sort_var = NULL,
                                nbiggest = 5, noldest = 5, skip_var = NULL,
                                five_num_summary = FALSE, verbose = FALSE) {
  # Call to wrapper function
  summarizePedigrees(
    ped = ped,
    personID = personID,
    nbiggest = nbiggest,
    noldest = noldest,
    byr = byr,
    include_founder = include_founder,
    momID = momID, dadID = dadID,
    famID = famID, matID = matID, patID = patID, skip_var = skip_var,
    type = "mothers", verbose = verbose, five_num_summary = five_num_summary,
    founder_sort_var = founder_sort_var
  )
}

#' Summarize the paternal lines in a pedigree
#' @inheritParams summarizePedigrees
#' @seealso [summarizePedigrees ()]
#' @export
#'
summarizePatrilines <- function(ped, famID = "famID", personID = "ID",
                                momID = "momID", dadID = "dadID",
                                matID = "matID", patID = "patID",
                                byr = NULL, founder_sort_var = NULL,
                                include_founder = FALSE,
                                nbiggest = 5, noldest = 5, skip_var = NULL,
                                five_num_summary = FALSE, verbose = FALSE) {
  # Call to wrapper function
  summarizePedigrees(
    ped = ped,
    personID = personID,
    nbiggest = nbiggest,
    noldest = noldest,
    byr = byr,
    include_founder = include_founder,
    momID = momID, dadID = dadID,
    famID = famID, matID = matID, patID = patID, skip_var = skip_var,
    type = "fathers", verbose = verbose, five_num_summary = five_num_summary,
    founder_sort_var = founder_sort_var
  )
}

#' Summarize the families in a pedigree
#' @inheritParams summarizePedigrees
#' @seealso [summarizePedigrees ()]
#' @export

summarizeFamilies <- function(ped, famID = "famID", personID = "ID",
                              momID = "momID", dadID = "dadID",
                              matID = "matID", patID = "patID",
                              byr = NULL, founder_sort_var = NULL,
                              include_founder = FALSE,
                              nbiggest = 5, noldest = 5, skip_var = NULL,
                              five_num_summary = FALSE, verbose = FALSE) {
  # Call to wrapper function
  summarizePedigrees(
    ped = ped,
    personID = personID,
    nbiggest = nbiggest,
    noldest = noldest,
    byr = byr,
    include_founder = include_founder,
    momID = momID, dadID = dadID,
    famID = famID, matID = matID, patID = patID, skip_var = skip_var,
    type = "families", verbose = verbose, five_num_summary = five_num_summary,
    founder_sort_var = founder_sort_var
  )
}
R-Computing-Lab/BGMisc documentation built on April 3, 2025, 3:12 p.m.