R/readWikifamilytree.R
In BGmisc: An R Package for Extended Behavior Genetics Analysis

Documented in buildTreeGrid convertGrid2LongTree deduplicatePairs extractMemberTable getGridNeighbors getWikiTreeSummary parseTreeRelationships populateParents readWikifamilytree traceTreePaths

#' Read Wiki Family Tree
#'
#' @param text A character string containing the text of a family tree in wiki format.
#' @param verbose A logical value indicating whether to print messages.
#' @param file_path The path to the file containing the family tree.
#' @param ... Additional arguments (not used).
#'
#' @return A list containing the summary, members, structure, and relationships of the family tree.
#' @export
readWikifamilytree <- function(text = NULL, verbose = FALSE, file_path = NULL, ...) {
  # Checks
  if (is.null(text) && is.null(file_path)) {
    stop("Either 'text' or 'file_path' must be provided.")
  }
  # read from file if provided
  if (!is.null(file_path)) {
    if (!file.exists(file_path)) stop("File does not exist: ", file_path)

    if (verbose) {
      message(paste("Reading file:", file_path))
    }
    file <- data.frame(X1 = readLines(file_path))
    file_length <- nrow(file)
    if (verbose) {
      message(paste0("File is ", file_length, " lines long"))
    }
    text <- paste0(file$X1, collapse = "\n")
  }
  # Extract summary text

  summary_text <- getWikiTreeSummary(text)
  # Extract all lines defining the family tree
  tree_lines <- unlist(stringr::str_extract_all(text, "\\{\\{familytree.*?\\}\\}"))
  tree_lines <- tree_lines[!stringr::str_detect(tree_lines, "start|end")] # Remove start/end markers
  tree_lines <- gsub("\\{\\{familytree(.*?)\\}\\}", "\\1", tree_lines) # Remove wrapping markup

  # Convert tree structure into a coordinate grid (preserves symbols!)
  tree_df <- buildTreeGrid(tree_lines)

  # Identify columns that start with "Y"
  cols_to_pivot <- grep("^Y", names(tree_df), value = TRUE)

  # Reshape from wide to long format
  tree_long <- convertGrid2LongTree(tree_df, cols_to_pivot)

  # Extract member definitions
  members_df <- extractMemberTable(text)
  members_df$id <- paste0("P", seq_len(nrow(members_df))) # Assign unique person IDs

  # Merge names into the tree structure (keeping all symbols!)
  tree_long <- merge(tree_long, members_df, by.x = "Value", by.y = "identifier", all.x = TRUE)

  tree_long$DisplayName <- ifelse(!is.na(tree_long$name), tree_long$name, tree_long$Value) # Use name if available

  # parse relationships and infer them
  tree_paths <- traceTreePaths(tree_long, deduplicate = FALSE)

  parsedRelationships <- parseTreeRelationships(tree_long, tree_paths)
  # Return structured table of the family tree (symbols included)
  list(
    summary = summary_text,
    members = members_df,
    structure = tree_long,
    tree_paths = parsedRelationships$tree_paths,
    relationships = parsedRelationships$relationships
  )
}

#' Make Long Tree
#' @param tree_df A data frame containing the tree structure.
#' @param cols_to_pivot A character vector of column names to pivot.
#' @return A long data frame containing the tree structure.
#' @keywords internal
convertGrid2LongTree <- function(tree_df, cols_to_pivot) {
  tree_long <- stats::reshape(tree_df,
    varying = cols_to_pivot,
    v.names = "Value",
    timevar = "Column",
    times = cols_to_pivot,
    idvar = setdiff(names(tree_df), cols_to_pivot),
    direction = "long"
  )

  tree_long <- tree_long[!is.na(tree_long$Value), ]
  tree_long$Value <- stringr::str_trim(tree_long$Value)
  tree_long$Column <- as.numeric(gsub("^Y", "", tree_long$Column))
  return(tree_long)
}

#' Match Members
#' @inheritParams readWikifamilytree
#' @return A data frame containing information about the members of the family tree.
#' @keywords internal

extractMemberTable <- function(text) {
  member_matches <- stringr::str_extract_all(text, "\\|\\s*([A-Za-z0-9]+)\\s*=\\s*([^|}]*)")[[1]]
  member_matches <- gsub("\\[|\\]|'''", "", member_matches) # Remove formatting

  members_df <- data.frame(
    identifier = stringr::str_trim(stringr::str_extract(member_matches, "^[^=]+")),
    name = stringr::str_trim(stringr::str_extract(member_matches, "(?<=\\=).*")),
    stringsAsFactors = FALSE
  )

  # Remove leading pipes (`|`) from identifiers for consistency
  members_df$identifier <- gsub("^\\|\\s*", "", members_df$identifier)

  # remove summary row
  members_df <- members_df[members_df$identifier != "summary", ]

  return(members_df)
}

#' Extract Summary Text
#' @inheritParams readWikifamilytree
#' @return A character string containing the summary text.
#' @keywords internal
#' @export

getWikiTreeSummary <- function(text) {
  summary_match <- stringr::str_match(text, "\\{\\{familytree/start \\|summary=(.*?)\\}\\}")
  summary_text <- ifelse(!is.na(summary_match[, 2]), summary_match[, 2], NA)
  return(summary_text)
}

#' Parse Tree
#' @param tree_lines A character vector containing the lines of the tree structure.
#' @return A data frame containing the tree structure.
#' @keywords internal
#' @export

buildTreeGrid <- function(tree_lines) {
  tree_matrix <- base::strsplit(tree_lines, "\\|") # Split each row into columns
  max_cols <- max(sapply(tree_matrix, length)) # Find the max column count

  # Convert to a data frame (ensures correct structure)
  tree_df <- do.call(rbind, lapply(tree_matrix, function(row) {
    length(row) <- max_cols # Ensure uniform column length
    return(row)
  }))

  tree_df <- as.data.frame(tree_df, stringsAsFactors = FALSE)
  colnames(tree_df) <- paste0("Y", seq_len(ncol(tree_df))) # Assign column names
  tree_df$Row <- seq_len(nrow(tree_df)) # Assign row numbers
  return(tree_df)
}

#' infer relationship from tree template
#'
#' @param tree_long A data frame containing the tree structure in long format.
#' @param tree_paths Optional. traceTreePaths output. If NULL, it will be calculated.
#' @return A data frame containing the relationships between family members.
#' @keywords internal
#'
parseTreeRelationships <- function(tree_long, tree_paths = NULL) {
  # Check if tree_paths is NULL and call traceTreePaths if necessary
  if (is.null(tree_paths)) {
    tree_paths <- traceTreePaths(tree_long, deduplicate = FALSE)
  }
  # Initialize relationships dataframe: one row per unique person
  person_ids <- unique(tree_long$id[!is.na(tree_long$id)])

  # Initialize relationships data frame
  relationships <- data.frame(
    id = person_ids,
    momID = NA_character_,
    dadID = NA_character_,
    parent_1 = NA_character_,
    parent_2 = NA_character_,
    spouseID = NA_character_,
    stringsAsFactors = FALSE
  )

  tree_paths <- tree_paths[!is.na(tree_paths$from_id) & !is.na(tree_paths$to_id), ]

  # Fill in relationships based on the tree structure
  tree_paths$relationship <- NA_character_

  # map relationships based on the intermediate values

  tree_paths$relationship[
    grepl("\\+", tree_paths$intermediate_values) &
      !grepl("y", tree_paths$intermediate_values)
  ] <- "spouse"

  # Parent-child: + and y both present
  tree_paths$relationship[
    grepl("\\+", tree_paths$intermediate_values) &
      grepl("y", tree_paths$intermediate_values)
  ] <- "offspring"

  tree_paths$relationship[
    is.na(tree_paths$relationship) & grepl("y", tree_paths$intermediate_values)
  ] <- "offspring"

  # determine direction
  tree_paths$relationship[grepl("^\\+", tree_paths$intermediate_values) & tree_paths$relationship == "offspring"] <- "parent-child"
  tree_paths$relationship[grepl("[y\\|]$", tree_paths$intermediate_values) & tree_paths$relationship == "offspring"] <- "parent-child"
  tree_paths$relationship[grepl("\\+$", tree_paths$intermediate_values) & tree_paths$relationship == "offspring"] <- "child-parent"
  tree_paths$relationship[grepl("^[y\\|]", tree_paths$intermediate_values) & tree_paths$relationship == "offspring"] <- "child-parent"

  # Fill spouse links
  spouse_links <- tree_paths[tree_paths$relationship == "spouse", ]

  for (i in seq_len(nrow(spouse_links))) {
    a <- spouse_links$from_id[i]
    b <- spouse_links$to_id[i]
    relationships$spouseID[relationships$id == a] <- b
    relationships$spouseID[relationships$id == b] <- a
  }

  # Fill parent-child links from directional tags

  pc_links <- tree_paths[tree_paths$relationship == "parent-child", ]
  for (i in seq_len(nrow(pc_links))) {
    relationships <- populateParents(
      df = relationships,
      child = pc_links$to_id[i],
      parent = pc_links$from_id[i]
    )
  }

  # --- Child-parent (to_id = parent) ---
  cp_links <- tree_paths[tree_paths$relationship == "child-parent", ]
  for (i in seq_len(nrow(cp_links))) {
    relationships <- populateParents(
      df = relationships,
      child = cp_links$from_id[i],
      parent = cp_links$to_id[i]
    )
  }

  out <- list(
    tree_paths = tree_paths,
    relationships = relationships
  )
  return(out)
}

#' Assign Parent
#' @param df A data frame containing the relationships.
#' @param child The ID of the child.
#' @param parent The ID of the parent.
#' @return A data frame with updated parent information.
#' @keywords internal
populateParents <- function(df, child, parent) {
  idx <- which(df$id == child)
  if (length(idx) != 1) {
    return(df)
  }

  if (is.na(df$parent_1[idx])) {
    df$parent_1[idx] <- parent
  } else if (is.na(df$parent_2[idx]) && df$parent_1[idx] != parent) {
    df$parent_2[idx] <- parent
  }
  return(df)
}


#' Trace paths between individuals in a family tree grid
#'
#' @param tree_long A data.frame with columns: Row, Column, Value, id
#' @param deduplicate Logical, if TRUE, will remove duplicate paths
#' @return A data.frame with columns: from_id, to_id, direction, path_length, intermediates
#' @export
#'
traceTreePaths <- function(tree_long, deduplicate = TRUE) {
  # Keep only relevant cells (people and path symbols)
  path_symbols <- c("|", "-", "+", "v", "^", "y", ",", ".", "`", "!", "~", "x", ")", "(")
  tree_long$Value <- gsub("\\s+", "", tree_long$Value) # Remove whitespace
  active_cells <- tree_long[!is.na(tree_long$Value) &
    (tree_long$Value %in% path_symbols | !is.na(tree_long$id)), ]

  active_cells$key <- paste(active_cells$Row, active_cells$Column, sep = "_")

  edges <- do.call(rbind, lapply(seq_len(nrow(active_cells)), function(i) {
    from_key <- active_cells$key[i]
    to_keys <- getGridNeighbors(active_cells[i, ],
      active_keys = active_cells$key
    )
    if (length(to_keys) > 0) {
      data.frame(from = from_key, to = to_keys, stringsAsFactors = FALSE)
    }
  }))

  # Create graph
  g <- igraph::graph_from_data_frame(edges, directed = FALSE)

  # Map keys to IDs
  person_nodes <- active_cells[!is.na(active_cells$id), c("key", "id")]
  id_map <- stats::setNames(person_nodes$id, person_nodes$key)

  # Find all pairs of people and trace paths
  person_keys <- names(id_map)
  result <- data.frame()

  for (i in seq_along(person_keys)) {
    for (j in seq_along(person_keys)) {
      if (i == j) next
      from_key <- person_keys[i]
      to_key <- person_keys[j]

      # skip if either endpoint is not in graph
      if (!(from_key %in% igraph::V(g)$name) || !(to_key %in% igraph::V(g)$name)) {
        next
      }
      # Find the shortest path between the two keys
      sp <- suppressWarnings(igraph::shortest_paths(g, from_key, to_key, output = "vpath")$vpath[[1]])
      if (length(sp) > 1) {
        intermediate <- setdiff(names(sp), c(from_key, to_key))
        # Extract values at those intermediate keys
        intermediate_values <- sapply(intermediate, function(k) {
          cell <- active_cells[active_cells$key == k, ]
          if (nrow(cell) > 0) cell$Value else NA
        })
        result <- rbind(result, data.frame(
          from_id = id_map[[from_key]],
          to_id = id_map[[to_key]],
          path_length = length(sp) - 1,
          intermediates = paste(intermediate, collapse = ";"),
          intermediate_values = paste(intermediate_values, collapse = ""),
          stringsAsFactors = FALSE
        ))
      } else {
        # If no path found, add a row with NA values
        result <- rbind(result, data.frame(
          from_id = id_map[[from_key]],
          to_id = id_map[[to_key]],
          path_length = NA,
          intermediates = NA,
          intermediate_values = NA,
          stringsAsFactors = FALSE
        ))
      }
    }
  }

  if (deduplicate == TRUE) {
    # Deduplicate pairs
    result <- deduplicatePairs(result)
  }

  return(result)
}

#' Build adjacency list (4-way neighbors)
#'
#' @param cell A data frame with columns Row and Column
#' @return A character vector of neighboring cell keys
#' @keywords internal

getGridNeighbors <- function(cell, active_keys) {
  offsets <- list(c(1, 0), c(-1, 0), c(0, 1), c(0, -1)) # down, up, right, left
  out <- character()
  for (offset in offsets) {
    r2 <- cell$Row + offset[1]
    c2 <- cell$Column + offset[2]
    key2 <- paste(r2, c2, sep = "_")
    if (key2 %in% active_keys) {
      out <- c(out, key2)
    }
  }
  return(out)
}

#' Deduplicate pairs of IDs in a data frame
#'
#' @param df A data frame with columns from_id and to_id
#' @return A data frame with unique pairs of IDs
#' @keywords internal
deduplicatePairs <- function(df) {
  # Create a new column with sorted pairs
  df$pair <- apply(df[, c("from_id", "to_id")], 1, function(x) paste(sort(x), collapse = "_"))

  # Remove duplicates based on the pair column
  df_dedup <- df[!duplicated(df$pair), ]

  # Drop the pair column
  df_dedup$pair <- NULL

  return(df_dedup)
}

Any scripts or data that you put into this service are public.

BGmisc documentation built on June 11, 2025, 1:07 a.m.

rdrr.io home R language documentation Run R code online

CRAN packages Bioconductor packages R-Forge packages GitHub packages

Note that we can't provide technical support on individual packages. You should contact the package authors for that.

BGmisc
An R Package for Extended Behavior Genetics Analysis

R/readWikifamilytree.R
In BGmisc: An R Package for Extended Behavior Genetics Analysis

Defines functions deduplicatePairs getGridNeighbors traceTreePaths populateParents parseTreeRelationships buildTreeGrid getWikiTreeSummary extractMemberTable convertGrid2LongTree readWikifamilytree

Documented in buildTreeGrid convertGrid2LongTree deduplicatePairs extractMemberTable getGridNeighbors getWikiTreeSummary parseTreeRelationships populateParents readWikifamilytree traceTreePaths

Try the BGmisc package in your browser

R Package Documentation

Browse R Packages

We want your feedback!

BGmisc An R Package for Extended Behavior Genetics Analysis

R/readWikifamilytree.R In BGmisc: An R Package for Extended Behavior Genetics Analysis

Defines functions deduplicatePairs getGridNeighbors traceTreePaths populateParents parseTreeRelationships buildTreeGrid getWikiTreeSummary extractMemberTable convertGrid2LongTree readWikifamilytree

Documented in buildTreeGrid convertGrid2LongTree deduplicatePairs extractMemberTable getGridNeighbors getWikiTreeSummary parseTreeRelationships populateParents readWikifamilytree traceTreePaths

Try the BGmisc package in your browser

R Package Documentation

Browse R Packages

We want your feedback!

BGmisc
An R Package for Extended Behavior Genetics Analysis

R/readWikifamilytree.R
In BGmisc: An R Package for Extended Behavior Genetics Analysis