R/tree-sequences.R
In slendr: A Simulation Framework for Spatiotemporal Population Genetics

Documented in ts_eigenstrat ts_genotypes ts_load ts_metadata ts_mutate ts_recapitate ts_save ts_simplify

# tree sequence processing ------------------------------------------------

#' Load a tree sequence file produced by a given model
#'
#' This function loads a tree sequence file simulated from a given slendr model.
#' Optionally, the tree sequence can be recapitated and simplified.
#'
#' The loading, recapitation and simplification is performed using the Python
#' module pyslim which serves as a link between tree sequences generated by SLiM
#' and the tskit module for manipulation of tree sequence data. All of these
#' steps have been modelled after the official pyslim tutorial and documentation
#' available at: <https://tskit.dev/pyslim/docs/latest/tutorial.html>.
#'
#' The recapitation and simplification steps can also be performed individually
#' using the functions \code{\link{ts_recapitate}} and
#' \code{\link{ts_simplify}}.
#'
#' @param file A path to the tree-sequence file (either originating from a
#'   slendr model or a standard non-slendr tree sequence).
#' @param model Optional \code{slendr_model} object which produced the
#'   tree-sequence \code{file}. Used for adding various annotation data and
#'   metadata to the standard tskit tree-sequence object.
#'
#' @return Tree-sequence object of the class \code{slendr_ts}, which serves as
#'   an interface point for the Python module tskit using slendr functions with
#'   the \code{ts_} prefix.
#'
#' @seealso \code{\link{ts_nodes}} for extracting useful information about
#'   individuals, nodes, coalescent times and geospatial locations of nodes on a
#'   map
#'
#' @examples
#' \dontshow{check_dependencies(python = TRUE, quit = TRUE) # dependencies must be present
#' }
#' init_env()
#'
#' # load an example model with an already simulated tree sequence
#' slendr_ts <- system.file("extdata/models/introgression_slim.trees", package = "slendr")
#' model <- read_model(path = system.file("extdata/models/introgression", package = "slendr"))
#'
#' # load tree sequence generated by a given model
#' ts <- ts_load(slendr_ts, model)
#'
#' # even tree sequences generated by non-slendr models can be
#' msprime_ts <- system.file("extdata/models/msprime.trees", package = "slendr")
#' ts <- ts_load(msprime_ts)
#'
#' # load tree sequence and immediately simplify it only to sampled individuals
#' # (note that the example tree sequence is already simplified so this operation
#' # does not do anything in this case)
#' ts <- ts_load(slendr_ts, model = model) %>% ts_simplify(keep_input_roots = TRUE)
#'
#' # load tree sequence and simplify it to a subset of sampled individuals
#' ts_small <- ts_simplify(ts, simplify_to = c("CH_1", "NEA_1", "NEA_2",
#'                                             "AFR_1", "AFR_2", "EUR_1", "EUR_2"))
#'
#' # load tree sequence, recapitate it and simplify it
#' ts <- ts_load(slendr_ts, model) %>%
#'   ts_recapitate(recombination_rate = 1e-8, Ne = 10000, random_seed = 42) %>%
#'   ts_simplify()
#'
#' # load tree sequence, recapitate it, simplify it and overlay neutral mutations
#' ts <- ts_load(slendr_ts, model) %>%
#'   ts_recapitate(recombination_rate = 1e-8, Ne = 10000, random_seed = 42) %>%
#'   ts_simplify() %>%
#'   ts_mutate(mutation_rate = 1e-8)
#'
#' ts
#' @export
ts_load <- function(file, model = NULL) {
  # load the tree sequence, converting it to a SLiM tree sequence if necessary
  ts <- if (is.character(file)) tskit$load(path.expand(file)) else file

  if (length(ts$metadata) == 0 || is.null(ts$metadata$SLiM))
    type <- "generic"
  else
    type <- "SLiM"

  attr(ts, "type") <- type
  attr(ts, "model") <- model
  attr(ts, "spatial") <-
    (type != "SLiM" && nrow(ts$individual_locations) > 0 && ncol(ts$individual_locations) > 0) ||
    (type == "SLiM" && ts$metadata$SLiM$spatial_dimensionality != "")
  if (attr(ts, "spatial")) check_spatial_pkgs()

  attr(ts, "metadata") <- get_slendr_metadata(ts)

  attr(ts, "recapitated") <- FALSE
  attr(ts, "simplified") <- FALSE
  attr(ts, "mutated") <- FALSE

  class(ts) <- c("slendr_ts", class(ts))

  # Extract "raw" tree sequence tables -- these can be later accessed via
  # ts_table(ts, "<nodes|edges|individuals|mutations>") but note that these are
  # not necessary for standard slendr data analysis. For that purpose, the
  # annotated tables provided by ts_nodes() and ts_edges() are more useful.
  attr(ts, "raw_nodes") <- get_ts_raw_nodes(ts)
  attr(ts, "raw_edges") <- get_ts_raw_edges(ts)
  attr(ts, "raw_individuals") <- get_ts_raw_individuals(ts)
  attr(ts, "raw_mutations") <- get_ts_raw_mutations(ts)

  attr(ts, "nodes") <- if (type == "SLiM") get_pyslim_table_data(ts) else get_tskit_table_data(ts)

  # if the tree sequence was loaded from a file, save the path
  if (is.character(file)) attr(ts, "path") <- normalizePath(file)

  ts
}

#' Save a tree sequence to a file
#'
#' @param ts Tree sequence object loaded by \code{ts_load}
#' @param file File to which the tree sequence should be saved
#'
#' @return No return value, called for side effects
#'
#' @examples
#' \dontshow{check_dependencies(python = TRUE, quit = TRUE) # dependencies must be present
#' }
#' init_env()
#'
#' # load an example model with an already simulated tree sequence
#' slendr_ts <- system.file("extdata/models/introgression_slim.trees", package = "slendr")
#' model <- read_model(path = system.file("extdata/models/introgression", package = "slendr"))
#'
#' # load the tree sequence
#' ts <- ts_load(slendr_ts, model)
#'
#' # save the tree-sequence object to a different location
#' another_file <- paste(tempfile(), ".trees")
#' ts_save(ts, another_file)
#' @export
ts_save <- function(ts, file) {
  check_ts_class(ts)
  type <- attr(ts, "type")
  from_slendr <- !is.null(attr(ts, "model"))

  # overwrite the original list of sample names (if the tree sequence was simplified
  # down to a smaller number of individuals than originally sampled)
  if (from_slendr && nrow(ts_samples(ts)) != nrow(attr(ts, "metadata")$sampling)) {
    tables <- ts$dump_tables()
    tables$metadata_schema = tskit$MetadataSchema(list("codec" = "json"))

    sample_names <- attr(ts, "metadata")$sample_names
    pedigree_ids <- attr(ts, "metadata")$sample_ids
    if (type == "SLiM") {
      tables$metadata$SLiM$user_metadata$slendr[[1]]$sample_names <- sample_names
      tables$metadata$SLiM$user_metadata$slendr[[1]]$sample_ids <- pedigree_ids
    } else
      tables$metadata$slendr$sample_names <- sample_names

    # put the tree sequence object back together
    ts <- tables$tree_sequence()
  }

  ts$dump(path.expand(file))
}


#' Recapitate the tree sequence
#'
#' @param ts Tree sequence object loaded by \code{ts_load}
#' @param recombination_rate A constant value of the recombination rate
#' @param Ne Effective population size during the recapitation process
#' @param demography Ancestral demography to be passed internally to
#'   \code{msprime.sim_ancestry()} (see msprime's documentation for mode detail)
#' @param random_seed Random seed passed to pyslim's \code{recapitate} method
#'   (if \code{NULL}, a seed will be generated between 0 and the maximum integer
#'   number available)
#'
#' @return Tree-sequence object of the class \code{slendr_ts}, which serves as
#'   an interface point for the Python module tskit using slendr functions with
#'   the \code{ts_} prefix.
#'
#' @seealso \code{\link{ts_nodes}} for extracting useful information about
#'   individuals, nodes, coalescent times and geospatial locations of nodes on a
#'   map
#'
#' @examples
#' \dontshow{check_dependencies(python = TRUE, quit = TRUE) # dependencies must be present
#' }
#' init_env()
#'
#' # load an example model with an already simulated tree sequence
#' slendr_ts <- system.file("extdata/models/introgression_slim.trees", package = "slendr")
#' model <- read_model(path = system.file("extdata/models/introgression", package = "slendr"))
#'
#' ts <- ts_load(slendr_ts, model) %>%
#'   ts_recapitate(recombination_rate = 1e-8, Ne = 10000, random_seed = 42)
#'
#' ts
#' @export
ts_recapitate <- function(ts, recombination_rate, Ne = NULL, demography = NULL, random_seed = NULL) {
  check_ts_class(ts)

  if ((is.null(Ne) & is.null(demography)) | !is.null(Ne) & !is.null(demography))
    stop("Either ancestral Ne or demography (but not both) must be specified for\n",
         "recapitation. See documentation of pyslim.recapitate for more detail.", call. = FALSE)

  model <- attr(ts, "model")
  type <- attr(ts, "type")
  spatial <- attr(ts, "spatial")

  random_seed <- set_random_seed(random_seed)

  if (type == "SLiM") {
    if (!is.null(Ne))
      ts_new <- pyslim$recapitate(
        ts,
        recombination_rate = recombination_rate,
        ancestral_Ne = Ne,
        random_seed = random_seed
      )
    else
      ts_new <- pyslim$recapitate(
        ts,
        recombination_rate = recombination_rate,
        demography = demography,
        random_seed = random_seed
      )
  } else {
    warning("There is no need to recapitate an already coalesced msprime tree sequence",
            call. = FALSE)
    ts_new <- ts
  }

  # copy attributes over to the new tree-sequence object or generate updates
  # ones where necessary
  attr(ts_new, "model") <- model
  attr(ts_new, "type") <- type
  attr(ts_new, "spatial") <- spatial
  attr(ts_new, "metadata") <- attr(ts, "metadata")

  attr(ts_new, "recapitated") <- TRUE
  attr(ts_new, "simplified") <- attr(ts, "simplified")
  attr(ts_new, "mutated") <- attr(ts, "mutated")

  attr(ts_new, "raw_nodes") <- get_ts_raw_nodes(ts_new)
  attr(ts_new, "raw_edges") <- get_ts_raw_edges(ts_new)

  attr(ts_new, "raw_individuals") <- get_ts_raw_individuals(ts_new)
  attr(ts_new, "raw_mutations") <- get_ts_raw_mutations(ts_new)

  attr(ts_new, "path") <- attr(ts, "path")

  if (type == "SLiM") {
    # inherit the information about which individuals should be marked as
    # explicitly "sampled" from the previous tree sequence object (if that
    # was specified) -- this is only necessary for a SLiM sequence
    # TODO: no longer necessary
    old_individuals <- attr(ts, "raw_individuals")
    sampled_ids <- old_individuals[old_individuals$sampled, ]$pedigree_id
    attr(ts_new, "raw_individuals") <- attr(ts_new, "raw_individuals") %>%
      dplyr::mutate(sampled = pedigree_id %in% sampled_ids)
  }

  attr(ts_new, "nodes") <- if (type == "SLiM") get_pyslim_table_data(ts_new) else get_tskit_table_data(ts_new)

  class(ts_new) <- c("slendr_ts", class(ts_new))

  ts_new
}

#' Simplify the tree sequence down to a given set of individuals
#'
#' This function is a convenience wrapper around the \code{simplify} method
#' implemented in tskit, designed to work on tree sequence data simulated by
#' SLiM using the \pkg{slendr} R package.
#'
#' The simplification process is used to remove redundant information from the
#' tree sequence and retains only information necessary to describe the
#' genealogical history of a set of samples.
#'
#' For more information on how simplification works in pyslim and tskit, see the
#' official documentation at
#' <https://tskit.dev/tskit/docs/stable/python-api.html#tskit.TreeSequence.simplify>
#' and <https://tskit.dev/pyslim/docs/latest/tutorial.html#simplification>.
#'
#' A very clear description of the difference between remembering and retaining
#' and how to use these techniques to implement historical individuals (i.e.
#' ancient DNA samples) is in the pyslim documentation at
#' <https://tskit.dev/pyslim/docs/latest/tutorial.html#historical-individuals>.
#'
#' @param ts Tree sequence object of the class \code{slendr_ts}
#' @param simplify_to A character vector of individual names. If NULL, all
#'   explicitly remembered individuals (i.e. those specified via the
#'   \code{\link{schedule_sampling}} function will be left in the tree sequence
#'   after the simplification.
#' @param keep_input_roots Should the history ancestral to the MRCA of all
#'   samples be retained in the tree sequence? Default is \code{FALSE}.
#' @param keep_unary Should unary nodes be preserved through simplification?
#'   Default is \code{FALSE}.
#' @param keep_unary_in_individuals Should unary nodes be preserved through
#'   simplification if they are associated with an individual recorded in
#'   the table of individuals? Default is \code{FALSE}. Cannot be set to
#'   \code{TRUE} if \code{keep_unary} is also TRUE
#' @param filter_nodes Should nodes be reindexed after simplification? Default is
#'   \code{TRUE}. See tskit's documentation for the Python method \code{simplify()}
#    for more detail.
#'
#' @return Tree-sequence object of the class \code{slendr_ts}, which serves as
#'   an interface point for the Python module tskit using slendr functions with
#'   the \code{ts_} prefix.
#'
#' @seealso \code{\link{ts_nodes}} for extracting useful information about
#'   individuals, nodes, coalescent times and geospatial locations of nodes on a
#'   map
#'
#' @examples
#' \dontshow{check_dependencies(python = TRUE, quit = TRUE) # dependencies must be present
#' }
#' init_env()
#'
#' # load an example model with an already simulated tree sequence
#' slendr_ts <- system.file("extdata/models/introgression_slim.trees", package = "slendr")
#' model <- read_model(path = system.file("extdata/models/introgression", package = "slendr"))
#'
#' ts <- ts_load(slendr_ts, model)
#' ts
#'
#' # simplify tree sequence to sampled individuals
#' ts_simplified <- ts_simplify(ts)
#'
#' # simplify to a subset of sampled individuals
#' ts_small <- ts_simplify(ts, simplify_to = c("CH_1", "NEA_1", "NEA_2", "AFR_1",
#'                                             "AFR_2", "EUR_1", "EUR_2"))
#'
#' ts_small
#' @export
ts_simplify <- function(ts, simplify_to = NULL, keep_input_roots = FALSE,
                        keep_unary = FALSE, keep_unary_in_individuals = FALSE,
                        filter_nodes = TRUE) {
  check_ts_class(ts)

  model <- attr(ts, "model")
  type <- attr(ts, "type")
  spatial <- attr(ts, "spatial")

  from_slendr <- !is.null(model)

  if (!attr(ts, "recapitated") && !keep_input_roots && !ts_coalesced(ts))
    warning("Simplifying a non-recapitated tree sequence. Make sure this is what you really want",
            call. = FALSE)

  data <- attr(ts, "nodes")

  if (is.null(simplify_to)) { # no individuals/nodes were given to guide the simplification
    samples <- dplyr::filter(data, sampled)$node_id # simplify to all sampled nodes
  } else if (is.character(simplify_to)) { # a vector of slendr individual names was given
    if (!from_slendr)
      stop("Symbolic character names can only be provided for slendr-generated\n",
           "tree sequences", call. = FALSE)
    if (!all(simplify_to %in% data$name))
      stop("The following individuals are not present in the tree sequence: ",
          paste0(simplify_to[!simplify_to %in% data$name], collapse = ", "),
          call. = FALSE)
    samples <- dplyr::filter(data, name %in% simplify_to)$node_id
  } else if (is.numeric(simplify_to)) {
    if (!all(simplify_to %in% data$node_id))
      stop("The following nodes are not among sampled nodes: ",
          paste0(simplify_to[!simplify_to %in% data$node_id], collapse = ", "),
          call. = FALSE)
    samples <- simplify_to
  } else
    stop("Unknown type of simplification nodes", call. = FALSE)

  ts_new <- ts$simplify(as.integer(samples),
                        filter_populations = FALSE,
                        filter_nodes = filter_nodes,
                        keep_input_roots = keep_input_roots,
                        keep_unary = keep_unary,
                        keep_unary_in_individuals = keep_unary_in_individuals)

  # copy attributes over to the new tree-sequence object or generate updates
  # ones where necessary
  attr(ts_new, "model") <- model
  attr(ts_new, "type") <- type
  attr(ts_new, "spatial") <- spatial

  attr(ts_new, "metadata") <- attr(ts, "metadata")

  attr(ts_new, "recapitated") <- attr(ts, "recapitated")
  attr(ts_new, "simplified") <- TRUE
  attr(ts_new, "mutated") <- attr(ts, "mutated")

  attr(ts_new, "raw_nodes") <- get_ts_raw_nodes(ts_new)
  attr(ts_new, "raw_edges") <- get_ts_raw_edges(ts_new)
  attr(ts_new, "raw_individuals") <- get_ts_raw_individuals(ts_new)
  attr(ts_new, "raw_mutations") <- get_ts_raw_mutations(ts_new)

  # use pedigree IDs to cross-check the original data with simplified table
  if (type == "SLiM") {
    # mark only explicitly simplified individuals as "focal"
    sample_ids <- data[data$node_id %in% samples, ]$pedigree_id
    attr(ts_new, "raw_individuals")$sampled <-
      attr(ts_new, "raw_individuals")$pedigree_id %in% sample_ids

    # get the name and location from the original table with the pedigree_id key
    cols <- c("pedigree_id", "pop")
    if (from_slendr) cols <- c(cols, "name")
    if (spatial) cols <- c(cols, "location")
    # we need to deduplicate the rows because the table is stored in a long format
    # (but we removed the node_id column which each diploid individual has two
    # values of)
    keep_data <- data[, cols] %>% dplyr::filter(!duplicated(pedigree_id))

    # get node IDs of individuals present in the simplified tree sequence
    # (sort by individual ID and time)
    nodes_new <- get_ts_raw_nodes(ts_new) %>%
      dplyr::arrange(ind_id, time) %>%
      dplyr::select(node_id, ind_id) %>%
      .$node_id

    location_col <- if (spatial) "location" else NULL

    # get other data about individuals in the simplified tree sequence, sort them
    # also by their IDs and times, and add their node IDs extracted above
    # (this works because we sorted both in the same way)
    data_new <- get_pyslim_table_data(ts_new, simplify_to) %>%
      as.data.frame() %>%
      dplyr::arrange(ind_id, time) %>%
      dplyr::select(pop_id, ind_id, pedigree_id, time, time_tskit, sampled, remembered, retained, alive) %>%
      dplyr::inner_join(keep_data, by = "pedigree_id") %>%
      dplyr::mutate(node_id = nodes_new) %>%
      dplyr::as_tibble()

    if (spatial)
      data_new <- sf::st_as_sf(data_new, crs = sf::st_crs(data))

    name_col <- if (from_slendr) "name" else NULL
    attr(ts_new, "nodes") <- data_new[, c(name_col, "pop", "node_id",
                                         "time", "time_tskit", location_col, "sampled", "remembered",
                                         "retained", "alive", "pedigree_id", "ind_id", "pop_id")]
  } else
    attr(ts_new, "nodes") <- get_tskit_table_data(ts_new, simplify_to)

  attr(ts_new, "path") <- attr(ts, "path")

  # replace the names of sampled individuals (if simplification led to subsetting)
  if (from_slendr) {
    sampled_nodes <- attr(ts_new, "nodes") %>% dplyr::filter(sampled)
    attr(ts_new, "metadata")$sample_names <-  unique(sampled_nodes$name)
    if (type == "SLiM")
      attr(ts_new, "metadata")$sample_ids <- unique(sampled_nodes$pedigree_id)
  }

  class(ts_new) <- c("slendr_ts", class(ts_new))

  ts_new
}

#' Add mutations to the given tree sequence
#'
#' @param ts Tree sequence object of the class \code{slendr_ts}
#' @param mutation_rate Mutation rate used by msprime to simulate mutations
#' @param random_seed Random seed passed to msprime's \code{mutate} method
#'   (if \code{NULL}, a seed will be generated between 0 and the maximum integer
#'   number available)
#' @param keep_existing Keep existing mutations?
#' @param mut_type Assign SLiM mutation type to neutral mutations? If
#'   \code{NULL} (default), no special mutation type will be used. If an
#'   integer number is given, mutations of the SLiM mutation type with that
#'   integer identifier will be created.
#'
#' @return Tree-sequence object of the class \code{slendr_ts}, which serves as
#'   an interface point for the Python module tskit using slendr functions with
#'   the \code{ts_} prefix.
#'
#' @seealso \code{\link{ts_nodes}} for extracting useful information about
#'   individuals, nodes, coalescent times and geospatial locations of nodes on a
#'   map
#'
#' @examples
#' \dontshow{check_dependencies(python = TRUE, quit = TRUE) # dependencies must be present
#' }
#' init_env()
#'
#' # load an example model with an already simulated tree sequence
#' slendr_ts <- system.file("extdata/models/introgression_slim.trees", package = "slendr")
#' model <- read_model(path = system.file("extdata/models/introgression", package = "slendr"))
#'
#' ts <- ts_load(slendr_ts, model)
#' ts_mutate <- ts_mutate(ts, mutation_rate = 1e-8, random_seed = 42)
#'
#' ts_mutate
#' @export
ts_mutate <- function(ts, mutation_rate, random_seed = NULL,
                      keep_existing = TRUE, mut_type = NULL) {
  check_ts_class(ts)

  if (attr(ts, "mutated")) stop("Tree sequence already mutated", call. = FALSE)

  if (is.numeric(mut_type) && attr(ts, "type") == "SLiM")
    mut_type <- msp$SLiMMutationModel(type = as.integer(mut_type))

  random_seed <- set_random_seed(random_seed)

  ts_new <-
    msp$sim_mutations(
      ts,
      rate = mutation_rate,
      model = mut_type,
      keep = keep_existing,
      random_seed = random_seed
    )

  # copy attributes over to the new tree-sequence object or generate updates
  # ones where necessary
  attr(ts_new, "model") <- attr(ts, "model")
  attr(ts_new, "type") <- attr(ts, "type")
  attr(ts_new, "spatial") <- attr(ts, "spatial")

  attr(ts_new, "metadata") <- attr(ts, "metadata")

  attr(ts_new, "recapitated") <- attr(ts, "recapitated")
  attr(ts_new, "simplified") <- attr(ts, "simplified")
  attr(ts_new, "mutated") <- TRUE

  attr(ts_new, "raw_nodes") <- attr(ts, "raw_nodes")
  attr(ts_new, "raw_edges") <- attr(ts, "raw_edges")
  attr(ts_new, "raw_individuals") <- attr(ts, "raw_individuals")
  attr(ts_new, "raw_mutations") <- get_ts_raw_mutations(ts_new)

  attr(ts_new, "nodes") <- attr(ts, "nodes")

  attr(ts_new, "path") <- attr(ts, "path")

  class(ts_new) <- c("slendr_ts", class(ts_new))

  ts_new
}

#' Extract list with tree sequence metadata saved by SLiM
#'
#' @param ts Tree sequence object of the class \code{slendr_ts}
#'
#' @return List of metadata fields extracted from the tree-sequence object
#'
#' @examples
#' \dontshow{check_dependencies(python = TRUE, quit = TRUE) # dependencies must be present
#' }
#' init_env()
#'
#' # load an example model with an already simulated tree sequence
#' slendr_ts <- system.file("extdata/models/introgression_slim.trees", package = "slendr")
#' model <- read_model(path = system.file("extdata/models/introgression", package = "slendr"))
#'
#' # load the tree-sequence object from disk
#' ts <- ts_load(slendr_ts, model)
#'
#' # extract the list of metadata information from the tree sequence
#' ts_metadata(ts)
#' @export
ts_metadata <- function(ts) {
  check_ts_class(ts)
  attr(ts, "metadata")
}

# output formats ----------------------------------------------------------

#' Extract genotype table from the tree sequence
#'
#' @param ts Tree sequence object of the class \code{slendr_ts}
#'
#' @return Data frame object of the class \code{tibble} containing genotypes
#'   of simulated individuals in columns
#'
#' @examples
#' \dontshow{check_dependencies(python = TRUE, quit = TRUE) # dependencies must be present
#' }
#' init_env()
#'
#' # load an example model with an already simulated tree sequence
#' slendr_ts <- system.file("extdata/models/introgression_slim.trees", package = "slendr")
#' model <- read_model(path = system.file("extdata/models/introgression", package = "slendr"))
#'
#' # load the tree-sequence object from disk, recapitate it, simplify it, and mutate it
#' ts <- ts_load(slendr_ts, model) %>%
#'   ts_recapitate(Ne = 10000, recombination_rate = 1e-8) %>%
#'   ts_simplify() %>%
#'   ts_mutate(mutation_rate = 1e-8)
#'
#' # extract the genotype matrix (this could take  a long time consume lots
#' # of memory!)
#' gts <- ts_genotypes(ts)
#' @export
ts_genotypes <- function(ts) {
  if (!attr(ts, "mutated"))
    stop("Extracting genotypes from a tree sequence which has not been mutated",
         call. = FALSE)

  type <- attr(ts, "type")

  data <- ts_nodes(ts)

  gts <- ts$genotype_matrix()
  positions <- ts$tables$sites$position

  biallelic_pos <- get_biallelic_indices(ts)
  n_multiallelic <- sum(!biallelic_pos)

  if (n_multiallelic > 0) {
    message(sprintf("%i multiallelic sites (%.3f%% out of %i total) detected and removed",
                    n_multiallelic, n_multiallelic / length(positions) * 100,
                    length(positions)))
    gts <- gts[biallelic_pos, ]
    positions <- positions[biallelic_pos]
  }

  chromosomes <- ts_nodes(ts) %>%
    dplyr::filter(!is.na(name)) %>%
    dplyr::as_tibble() %>%
    dplyr::mutate(chr_name = sprintf("%s_chr%i", name, 1:2)) %>%
    dplyr::select(chr_name, node_id) %>%
    dplyr::arrange(node_id)

  colnames(gts) <- chromosomes$chr_name

  dplyr::as_tibble(gts) %>%
    dplyr::mutate(pos = as.integer(positions)) %>%
    dplyr::select(pos, dplyr::everything())
}

#' Convert genotypes to the EIGENSTRAT file format
#'
#' EIGENSTRAT data produced by this function can be used by the admixr R package
#' (<https://bodkan.net/admixr/>).
#'
#' In case an outgroup was not formally specified in a slendr model which
#' generated the tree sequence data, it is possible to artificially create an
#' outgroup sample with the name specified by the \code{outgroup} argument,
#' which will carry all ancestral alleles (i.e. value "2" in a geno file
#' for each position in a snp file).
#'
#' @param ts Tree sequence object of the class \code{slendr_ts}
#' @param prefix EIGENSTRAT trio prefix
#' @param chrom The name of the chromosome in the EIGENSTRAT snp file
#'   (default "chr1")
#' @param outgroup Should a formal, artificial outgroup be added? If \code{NULL}
#'   (default), no outgroup is added. A non-NULL character name will serve as
#'   the name of the outgroup in an ind file.
#'
#' @return Object of the class EIGENSTRAT created by the admixr package
#'
#' @export
ts_eigenstrat <- function(ts, prefix, chrom = "chr1", outgroup = NULL) {
  if (!requireNamespace("admixr", quietly = TRUE))
    message("For EIGENSTRAT conversion, please install the R package ",
            "admixr by calling `install.packages(\"admixr\")")

  if (!attr(ts, "recapitated") && !ts_coalesced(ts))
    stop("Tree sequence was not recapitated and some nodes do not ",
         "have parents over some portion of their genome. This is interpreted as ",
         "missing data, which is not currently supported. For more context, take ",
         "a look at <https://github.com/tskit-dev/tskit/issues/301#issuecomment-520990038>.",
         call. = FALSE)

  if (!attr(ts, "mutated"))
    stop("Attempting to extract genotypes from a tree sequence which has not been mutated",
         call. = FALSE)

  chrom_genotypes <- ts_genotypes(ts)
  chr1_genotypes <- dplyr::select(chrom_genotypes, dplyr::ends_with("_chr1"))
  chr2_genotypes <- dplyr::select(chrom_genotypes, dplyr::ends_with("_chr2"))

  # create a geno file table
  geno <- dplyr::as_tibble(2 - (chr1_genotypes + chr2_genotypes))
  individuals <- gsub("_chr.", "", colnames(geno))
  colnames(geno) <- individuals

  # create an ind file table
  ind <- dplyr::tibble(id = individuals, sex = "U", label = individuals)

  # create a snp file table
  positions <- chrom_genotypes$pos
  snp <- dplyr::tibble(
    id = sprintf("%s_%s", chrom, as.character(positions)),
    chrom = chrom,
    gen = 0.0,
    pos = positions,
    ref = "G",
    alt = "T"
  )

  # add an artificial outgroup individual carrying ancestral alleles only
  if (!is.null(outgroup)) {
    geno[[as.character(outgroup)]] <- 2
    ind <- data.frame(
      id = as.character(outgroup),
      sex = "U",
      label = as.character(outgroup)
    ) %>%
        dplyr::bind_rows(ind, .)
  }

  # save the EIGENSTRAT trio
  if (!dir.exists(dirname(prefix))) dir.create(dirname(prefix))
  admixr::write_geno(geno, paste0(prefix, ".geno"))
  admixr::write_snp(snp, paste0(prefix, ".snp"))
  admixr::write_ind(ind, paste0(prefix, ".ind"))

  # return the admixr eigenstrat object
  admixr::eigenstrat(prefix = prefix)
}

#' Save genotypes from the tree sequence as a VCF file
#'
#' @param ts Tree sequence object of the class \code{slendr_ts}
#' @param path Path to a VCF file
#' @param chrom Chromosome name to be written in the CHROM column of the VCF
#' @param individuals A character vector of individuals in the tree sequence. If
#'   missing, all individuals present in the tree sequence will be saved.
#'
#' @return No return value, called for side effects
#'
#' @export
ts_vcf <- function(ts, path, chrom = NULL, individuals = NULL) {
  if (!attr(ts, "recapitated") && !ts_coalesced(ts))
    stop("Tree sequence was not recapitated and some nodes do not ",
         "have parents over some portion of their genome. This is interpreted as ",
         "missing data, which is not currently supported by tskit. For more context, ",
         "take a look at <https://github.com/tskit-dev/tskit/issues/301#issuecomment-520990038>.",
         call. = FALSE)

  if (!attr(ts, "mutated"))
    stop("Attempting to extract genotypes from a tree sequence which has not been mutated",
         call. = FALSE)

  data <- ts_nodes(ts) %>%
    dplyr::filter(!is.na(name)) %>%
    dplyr::as_tibble() %>%
    dplyr::distinct(name, ind_id)

  if (is.null(individuals)) individuals <- data$name

  present <- individuals %in% unique(data$name)
  if (!all(present))
    stop("", paste(individuals[!present], collapse = ", "),
         " not present in the tree sequence", call. = FALSE)

  gzip <- reticulate::import("gzip")
  with(reticulate::`%as%`(gzip$open(path.expand(path), "wt"), vcf_file), {
    ts$write_vcf(vcf_file,
                 contig_id = chrom,
                 individuals = as.integer(data$ind_id),
                 individual_names = data$name)
  })
}

#' Convert a tree in the tree sequence to an object of the class \code{phylo}
#'
#' @inheritParams ts_tree
#' @param labels What should be stored as node labels in the final \code{phylo}
#'   object? Options are either a population name or a tskit integer node ID
#'   (which is a different thing from a \code{phylo} class node integer index).
#' @param quiet Should ape's internal phylo validity test be printed out?
#'
#' @return Standard phylogenetic tree object implemented by the R package ape
#'
#' @examples
#' \dontshow{check_dependencies(python = TRUE, quit = TRUE) # dependencies must be present
#' }
#' init_env()
#'
#' # load an example model with an already simulated tree sequence
#' slendr_ts <- system.file("extdata/models/introgression_slim.trees", package = "slendr")
#' model <- read_model(path = system.file("extdata/models/introgression", package = "slendr"))
#'
#' # load the tree-sequence object from disk
#' ts <- ts_load(slendr_ts, model) %>%
#'   ts_recapitate(Ne = 10000, recombination_rate = 1e-8) %>%
#'   ts_simplify()
#'
#' # extract the 1st tree from a given tree sequence, return ape object
#' tree <- ts_phylo(ts, i = 1, mode = "index", quiet = TRUE)
#' tree
#'
#' # extract the tree at a 42th basepair in the given tree sequence
#' tree <- ts_phylo(ts, i = 42, mode = "position", quiet = TRUE)
#'
#' # because the tree is a standard ape phylo object, we can plot it easily
#' plot(tree, use.edge.length = FALSE)
#' ape::nodelabels()
#' @export
ts_phylo <- function(ts, i, mode = c("index", "position"),
                     labels = c("tskit", "pop"), quiet = FALSE) {
  labels <- match.arg(labels)

  from_slendr <- !is.null(attr(ts, "model"))

  tree <- ts_tree(ts, i, mode)

  if (tree$num_roots > 1)
    stop("A tree sequence tree which is not fully coalesced or recapitated\n",
         "cannot be converted to an R phylo tree representation (see the help\n",
         "page of ?ts_recapitate for more details)", call. = FALSE)

  if (!attr(ts, "simplified") && attr(ts, "type") != "generic")
    stop("Please simplify your tree sequence first before converting a tree to\n",
         "an R phylo tree object format (see the help page of ?ts_simplify for\n",
         "more details)", call. = FALSE)

  # get tree sequence nodes which are present in the tskit tree object
  # (tree$preorder() just get the numerical node IDs, nothing else)
  data <- ts_nodes(ts) %>%
    dplyr::as_tibble() %>%
    dplyr::filter(node_id %in% tree$preorder())

  model <- attr(ts, "model")
  type <- attr(ts, "type")
  spatial <- attr(ts, "spatial")

  if (from_slendr)
    direction <- model$direction
  else
    direction <- "backward"

  if (direction == "forward")
    data <- dplyr::arrange(data, sampled, time)
  else
    data <- dplyr::arrange(data, sampled, -time)

  # convert the edge table to a proper ape phylo object
  # see http://ape-package.ird.fr/misc/FormatTreeR.pdf for more details
  n_tips <- sum(data$sampled, na.rm = TRUE)
  n_internal <- nrow(data) - n_tips
  n_all <- n_internal + n_tips; stopifnot(n_all == nrow(data))

  present_ids <- data$node_id
  # design a lookup table of consecutive integer numbers (reversing it because
  # in the ordered tree sequence table of nodes `data`, the sampled nodes which
  # will become the tips of the tree are at the end)
  lookup_ids <- rev(seq_along(present_ids))

  tip_labels <- dplyr::filter(data, sampled) %>%
    { if (from_slendr) sprintf("%s (%s)", .$node_id, .$name) else .$node_id } %>%
    as.character() %>%
    rev()

  # flip the index of the root in the lookup table
  lookup_ids[length(lookup_ids) - n_tips] <- lookup_ids[1]
  lookup_ids[1] <- n_tips + 1

  child_ids <- present_ids[present_ids != tree$root]
  parent_ids <- sapply(child_ids, function(i) tree$parent(i))

  children <- sapply(child_ids, function(n) lookup_ids[present_ids == n])
  parents <- sapply(parent_ids, function(n) lookup_ids[present_ids == n])

  # find which sampled nodes are not leaves:
  # - first look for those nodes in the tree sequence node IDs
  internal_ts_samples <- intersect(parent_ids, data[data$sampled, ]$node_id)
  # - then convert them to the phylo numbering
  internal_phylo_samples <- sapply(internal_ts_samples, function(n) lookup_ids[present_ids == n])

  # and then link them to dummy internal nodes, effectively turning them into
  # proper leaves
  dummies <- vector(mode = "integer", length(internal_ts_samples))
  if (length(dummies) > 0)
    warning("Some sampled nodes in the tree are internal nodes (i.e. represent ancestors\n",
            "of some sampled nodes forming the tips of the tree). This is not permitted\n",
            "by standard phylogenetic tree framework such as that implemented by the ape\n",
            "R package, which assumes that samples are present at the tips of a tree.\n",
            "To circumvent this problem, these sampled internal nodes have been\n",
            "attached to the tree via zero-length branches linking them to 'dummy' nodes.\n",
            "In total ", length(dummies), " of such nodes have been created and they are ",
            "indicated by `phylo_id`\nvalues larger than ", n_all, ".", call. = FALSE)
  for (d in seq_along(dummies)) {
    ts_node <- as.integer(internal_ts_samples[d])
    phylo_node <- as.integer(internal_phylo_samples[d])
    dummy <- n_all + d
    node_parent <- lookup_ids[present_ids == tree$parent(ts_node)]
    node_children <- sapply(unlist(tree$children(ts_node)), function(n) lookup_ids[present_ids == n])

    # replace the sampled node with a dummy node, linking to its parent and
    # children (all done in the phylo index space)
    parents[children %in% node_children] <- dummy
    children[children == phylo_node] <- dummy

    # add a new link from the dummy node to the real sample
    children <- c(children, phylo_node)
    parents <- c(parents, dummy)

    dummies[d] <- dummy
  }

  # bind the two columns back into an edge matrix
  edge <- cbind(as.integer(parents), as.integer(children))

  # create vector of edge lengths (adding zero-length branches linking the dummy
  # nodes)
  children_times <- sapply(child_ids, function(n) data[data$node_id == n, ]$time)
  parent_times <- sapply(parent_ids, function(n) data[data$node_id == n, ]$time)
  edge_lengths <- c(abs(parent_times - children_times), rep(0, length(dummies)))

  data$phylo_id <- sapply(data$node_id, function(n) lookup_ids[present_ids == n])
  columns <- c()
  if (type == "SLiM") {
    if (spatial) columns <- c(columns, "location")
    columns <- c(columns, c("sampled", "remembered", "retained", "alive", "pedigree_id"))
  } else
    columns <- "sampled"
  name_col <- if (from_slendr) "name" else NULL
  data <- dplyr::select(
    data, !!name_col, pop, node_id, phylo_id, time, time_tskit, !!columns, ind_id, pop_id
  )
  # add fake dummy information to the processed tree sequence table so that
  # the user knows what is real and what is not straight from the ts_phylo()
  # output
  if (length(dummies)) {
    data <- dplyr::bind_rows(
      data,
      data.frame(
        name = NA,
        pop = sapply(internal_ts_samples,
                     function(n) data[data$node_id == n, ]$pop),
        node_id = NA, phylo_id = dummies,
        time = sapply(internal_ts_samples,
                      function(n) data[data$node_id == n, ]$time),
        time_tskit = sapply(internal_ts_samples,
                            function(n) data[data$node_id == n, ]$time_tskit)
      )
    )
  }
  if (type == "SLiM" && spatial) {
    check_spatial_pkgs()
    data <- sf::st_as_sf(data)
  }

  class(data) <- set_class(data, "nodes")

  # generate appropriate internal node labels based on the user's choice
  elem <- if (labels == "pop") "pop" else "node_id"
  node_labels <- purrr::map_chr(unique(sort(parents)),
                                ~ as.character(data[data$phylo_id == .x, ][[elem]]))

  tree <- list(
    edge = edge,
    edge.length = edge_lengths,
    node.label = node_labels,
    tip.label = tip_labels,
    Nnode = n_internal + length(dummies)
  )
  class(tree) <- c("slendr_phylo", "phylo")

  check_log <- utils::capture.output(ape::checkValidPhylo(tree))

  # if there are fatal issues, report them and signal an error
  if (any(grepl("FATAL", check_log)))
    stop(paste(check_log, collapse = "\n"), call. = FALSE)

  if (!quiet) cat(check_log, sep = "\n")

  # subset ts_nodes result to only those nodes that are present in the phylo
  # object, adding another column with the rearranged node IDs
  attr(tree, "model") <- attr(ts, "model")
  attr(tree, "ts") <- ts
  attr(tree, "spatial") <- attr(ts, "spatial")
  attr(tree, "nodes") <- data
  attr(tree, "edges") <- get_annotated_edges(tree)
  attr(tree, "type") <- attr(ts, "type")

  tree
}

# tree sequence tables ----------------------------------------------------

#' Extract combined annotated table of individuals and nodes
#'
#' This function combines information from the table of individuals and table of
#' nodes into a single data frame which can be used in downstream analyses.
#'
#' The source of data (tables of individuals and nodes recorded in the tree
#' sequence generated by SLiM) are combined into a single data frame. If the
#' model which generated the data was spatial, coordinates of nodes (which are
#' pixel-based by default because SLiM spatial simulations occur on a raster),
#' the coordinates are automatically converted to an explicit spatial object of
#' the \code{sf} class unless \code{spatial = FALSE}. See
#' <https://r-spatial.github.io/sf/> for an extensive introduction to the sf
#' package and the ways in which spatial data can be processed, analysed, and
#' visualised.
#'
#' @seealso \code{\link{ts_table}} for accessing raw tree sequence tables
#'   without added metadata annotation. See also \code{\link{ts_ancestors}} to
#'   learn how to extract information about relationship beteween nodes in the
#'   tree sequence, and how to analysed data about distances between nodes in
#'   the spatial context.
#'
#' @param x Tree sequence object of the class \code{slendr_ts} or a \code{phylo}
#'   object extracted by \code{ts_phylo}
#' @param sf Should spatial data be returned in an sf format? If \code{FALSE},
#'   spatial geometries will be returned simply as x and y columns, instead of
#'   the standard POINT data type.
#'
#' @return Data frame with processed information from the tree sequence object.
#'   If the model which generated this data was spatial, result will be returned
#'   as a spatial object of the class \code{sf}.
#'
#' @examples
#' \dontshow{check_dependencies(python = TRUE, quit = TRUE) # dependencies must be present
#' }
#' init_env()
#'
#' # load an example model with an already simulated tree sequence
#' slendr_ts <- system.file("extdata/models/introgression_slim.trees", package = "slendr")
#' model <- read_model(path = system.file("extdata/models/introgression", package = "slendr"))
#'
#' # load the tree-sequence object from disk
#' ts <- ts_load(slendr_ts, model)
#'
#' # extract an annotated table with (spatio-)temporal node information
#' ts_nodes(ts)
#' @export
ts_nodes <- function(x, sf = TRUE) {
  if (!inherits(x, "slendr_ts") && !inherits(x, "slendr_phylo"))
    stop("Annotation data table can be only extracted for a slendr tree sequence\n",
         "object or a phylo object created by the ts_phylo function", call. = FALSE)

  data <- attr(x, "nodes")

  if (inherits(data, "sf")) check_spatial_pkgs()

  if (!sf && inherits(data, "sf")) {
    # unwrap the geometry column into separate x and y coordinates
    locations <- unlist(data$location)
    data$x <- locations[c(TRUE, FALSE)]
    data$y <- locations[c(FALSE, TRUE)]

    columns <- NULL
    if (!is.null(attr(x, "model"))) columns <- "name"
    columns <- c(columns, "pop")

    data <- sf::st_drop_geometry(data) %>% dplyr::select(
      !!columns, node_id, time, time_tskit, x, y,
      sampled, remembered, retained, alive, pedigree_id, ind_id
    )
  }

  attr(data, "model") <- attr(x, "model")
  attr(data, "type") <- attr(x, "type")

  class(data) <- set_class(data, "nodes")

  data
}

#' Get the table of individuals/nodes/edges/mutations from the tree sequence
#'
#' This function extracts data from a given tree sequence table. All times are
#' converted to model-specific time units from tskit's "generations backwards"
#' time direction.
#'
#' For further processing and analyses, the output of the function
#' \code{\link{ts_nodes}} might be more useful, as it merges the information in
#' node and individual tables into one table and further annotates it with
#' useful information from the model configuration data.
#'
#' @seealso \code{\link{ts_nodes}} and \code{\link{ts_edges}} for accessing an
#'   annotated, more user-friendly and analysis-friendly tree-sequence table
#'   data
#'
#' @param ts Tree sequence object of the class \code{slendr_ts}
#' @param table Which tree sequence table to return
#'
#' @return Data frame with the information from the give tree-sequence table
#'   (can be either a table of individuals, edges, nodes, or mutations).
#'
#' @examples
#' # load an example model with an already simulated tree sequence
#' slendr_ts <- system.file("extdata/models/introgression_slim.trees", package = "slendr")
#' model <- read_model(path = system.file("extdata/models/introgression", package = "slendr"))
#'
#' # load the tree-sequence object from disk and add mutations to it
#' ts <- ts_load(slendr_ts, model) %>% ts_mutate(mutation_rate = 1e-8, random_seed = 42)
#'
#' # get the 'raw' tskit table of individuals
#' ts_table(ts, "individuals")
#'
#' # get the 'raw' tskit table of edges
#' ts_table(ts, "edges")
#'
#' # get the 'raw' tskit table of nodes
#' ts_table(ts, "nodes")
#'
#' # get the 'raw' tskit table of mutations
#' ts_table(ts, "mutations")
#' @export
ts_table <- function(ts, table = c("individuals", "edges", "nodes", "mutations")) {
  table <- match.arg(table)
  check_ts_class(ts)
  df <- attr(ts, paste0("raw_", table))
  if (is.null(df))
    dplyr::tibble()
  else
    df
}

#' Extract spatio-temporal edge annotation table from a given tree or tree
#' sequence
#'
#' @param x Tree object generated by \code{ts_phylo} or a slendr tree sequence
#'   object produced by \code{ts_load}, \code{ts_recapitate},
#'   \code{ts_simplify}, or \code{ts_mutate}
#'
#' @return Data frame of the \code{sf} type containing the times of nodes and
#'   start-end coordinates of edges across space
#'
#' @examples
#' \dontshow{check_dependencies(python = TRUE, quit = TRUE) # dependencies must be present
#' }
#' init_env()
#'
#' # load an example model with an already simulated tree sequence
#' slendr_ts <- system.file("extdata/models/introgression_slim.trees", package = "slendr")
#' model <- read_model(path = system.file("extdata/models/introgression", package = "slendr"))
#'
#' # load the tree-sequence object from disk
#' ts <- ts_load(slendr_ts, model)
#'
#' # extract an annotated table with (spatio-)temporal edge information
#' ts_edges(ts)
#' @export
ts_edges <- function(x) {
  if (inherits(x, "slendr_phylo"))
    attr(x, "edges")
  else if (inherits(x, "slendr_ts"))
    get_annotated_edges(x)
  else
    stop("Annotation data table can be only extracted for a slendr tree sequence\n",
         "object or a phylo object created by the ts_phylo function", call. = FALSE)
}

#' Extract names and times of individuals of interest in the current tree sequence
#' (either all sampled individuals or those that the user simplified to)
#'
#' @param ts Tree sequence object of the class \code{slendr_ts}
#'
#' @return Table of individuals scheduled for sampling across space and time
#'
#' @examples
#' \dontshow{check_dependencies(python = TRUE, quit = TRUE) # dependencies must be present
#' }
#' init_env()
#'
#' # load an example model with an already simulated tree sequence
#' slendr_ts <- system.file("extdata/models/introgression_slim.trees", package = "slendr")
#' model <- read_model(path = system.file("extdata/models/introgression", package = "slendr"))
#'
#' # load the tree-sequence object from disk
#' ts <- ts_load(slendr_ts, model)
#'
#' # extract the table of individuals scheduled for simulation and sampling
#' ts_samples(ts)
#' @export
ts_samples <- function(ts) {
  if (is.null(attr(ts, "model")))
    stop("Sampling schedule can only be extracted for tree sequences\ngenerated ",
         "from a slendr model. To access information about times and\nlocations ",
         "of nodes and individuals from non-slendr tree sequences,\nuse the ",
         "function ts_nodes().\n", call. = FALSE)
  samples <- attr(ts, "metadata")$sampling %>%
    dplyr::filter(name %in% attr(ts, "metadata")$sample_names)

  samples
}

#' Extract names of individuals in a tree sequence
#'
#' @param ts Tree sequence object of the class \code{slendr_ts}
#' @param split Should sample names in the tree sequence be split by a column
#'   (a population or time column)? Default is \code{NULL} and all names of
#'   samples will be returned as a single character vector. If set to "pop" or
#'   "time", a list of character vectors will be returned, one vector for each
#'   unique "pop" or "time" grouping.
#'
#' @return A vector of character sample names. If \code{split} is specified,
#'   a list of such vectors is returned, one element of the list per population
#'   or sampling time.
#'
#' @export
ts_names <- function(ts, split = NULL) {
  df <- ts_samples(ts)

  if (is.null(split)) { # return all names if splitting not requested
    result <- df$name
  } else if (split %in% colnames(df)) { # otherwise split by a given column
    result <- df %>% split(., .[[split]]) %>% lapply(`[[`, "name")
  } else
    stop("Column '", split, "' not present in the samples table", call. = FALSE)

  result
}

#' Extract (spatio-)temporal ancestral history for given nodes/individuals
#'
#' @param ts Tree sequence object of the class \code{slendr_ts}
#' @param x Either an individual name or an integer node ID
#' @param verbose Report on the progress of ancestry path generation?
#' @param complete Does every individual in the tree sequence need to have
#'   complete metadata recorded? If \code{TRUE}, only individuals/nodes with
#'   complete metadata will be included in the reconstruction of ancestral
#'   relationships. For instance, nodes added during the coalescent recapitation
#'   phase will not be included because they don't have spatial information
#'   associated with them.
#'
#' @return A table of ancestral nodes of a given tree-sequence node all the
#'   way up to the root of the tree sequence
#'
#' @examples
#' \dontshow{check_dependencies(python = TRUE, quit = TRUE) # dependencies must be present
#' }
#' init_env()
#'
#' # load an example model with an already simulated tree sequence
#' slendr_ts <- system.file("extdata/models/introgression_slim.trees", package = "slendr")
#' model <- read_model(path = system.file("extdata/models/introgression", package = "slendr"))
#'
#' # load the tree-sequence object from disk
#' ts <- ts_load(slendr_ts, model)
#'
#' # find the complete ancestry information for a given individual
#' ts_ancestors(ts, "EUR_1", verbose = TRUE)
#' @export
ts_ancestors <- function(ts, x, verbose = FALSE, complete = TRUE) {
  check_ts_class(ts)

  model <- attr(ts, "model")
  spatial <- attr(ts, "spatial")
  if (spatial) check_spatial_pkgs()
  from_slendr <- !is.null(model)

  edges <- ts_table(ts, "edges")
  data <- ts_nodes(ts)

  if (complete) data <- dplyr::filter(data, !is.na(ind_id))

  if (is.character(x) && !any(x %in% data$name))
    stop("The given individual is ether not present in the tree sequence or it\n",
         "does not carry complete metadata information (see ?ts_ancestors)", call. = FALSE)
  if (is.numeric(x) && !any(x %in% data$node_id))
    stop("The given node is ether not present in the tree sequence or it\n",
         "does not carry complete metadata information (see ?ts_ancestors)", call. = FALSE)

  if (spatial && any(sf::st_is_empty(data$location))) {
    warning("Not all nodes have a known spatial location. Maybe you ran a neutral\n",
            "non-spatial coalescent recapitation after a spatial SLiM simulation?\n",
            "This is not a problem, but please note that the edge table encoded\n",
            "by this tree will not contain spatial information.", call. = FALSE)
    spatial <- FALSE
  }

  if (!spatial) data$location <- NA

  # collect child-parent branches starting from the "sampled nodes"
  branches <- purrr::map_dfr(x, function(.x) {
    if (verbose) message(sprintf("Collecting ancestors of %s [%d/%d]...",
                                 .x, which(.x == x), length(x)))
    ids <- get_node_ids(ts, .x)

    purrr::map_dfr(ids, function(.y) {
      if (!nrow(edges[edges$child == .y, ]))
        stop("The node specified does not have any ancestors of its own", call. = FALSE)

      result <- collect_ancestors(.y, edges) %>%
        dplyr::mutate(pop = dplyr::filter(data, node_id == .y)$pop[1],
                                          node_id = .y)

      if (from_slendr) result$name <- data[data$node_id == .y, ]$name

      result
    })
  })

  child_data  <- dplyr::select(data, child_pop  = pop, child_id  = node_id, child_time  = time, child_location = location) %>% as.data.frame()
  parent_data <- dplyr::select(data, parent_pop = pop, parent_id = node_id, parent_time = time, parent_location = location) %>% as.data.frame()

  combined <- branches %>%
    dplyr::inner_join(child_data, by = "child_id") %>%
    dplyr::inner_join(parent_data, by = "parent_id")

  if (spatial) combined <- sf::st_as_sf(combined)

  if (verbose) message("\nGenerating data about spatial relationships of nodes...")

  # perform further data processing (adding names of individuals, processing sf
  # spatial columns) if the model in question is spatial
  if (spatial) {
    location_col <- c("child_location", "parent_location", "connection")
    combined <- purrr::map2(
      combined$child_location, combined$parent_location, ~
        sf::st_union(.x, .y) %>%
        sf::st_cast("LINESTRING") %>%
        sf::st_sfc() %>%
        sf::st_sf(connection = ., crs = sf::st_crs(combined))) %>%
      dplyr::bind_rows() %>%
      dplyr::bind_cols(combined, .) %>%
        sf::st_set_geometry("connection")
  } else
    location_col <- NULL

  if (from_slendr) {
    name_col <- "name"
    pop_names <- order_pops(model$populations, model$direction)
    combined <- combined %>%
      dplyr::mutate(pop = factor(pop, levels = pop_names),
                    child_pop = factor(child_pop, levels = pop_names),
                    parent_pop = factor(parent_pop, levels = pop_names))
  } else
    name_col <- NULL

  combined <- dplyr::select(combined,
                            !!name_col, pop, node_id, level,
                            child_id, parent_id, child_time, parent_time,
                            child_pop, parent_pop, !!location_col,
                            left_pos = left, right_pos = right) %>%
    dplyr::mutate(level = as.factor(level))

  attr(combined, "model") <- model

  combined
}

#' Extract all descendants of a given tree-sequence node
#'
#' @param ts Tree sequence object of the class \code{slendr_ts}
#' @param x An integer node ID of the ancestral node
#' @param verbose Report on the progress of ancestry path generation?
#' @param complete Does every individual in the tree sequence need to have
#'   complete metadata recorded? If \code{TRUE}, only individuals/nodes with
#'   complete metadata will be included in the reconstruction of ancestral
#'   relationships. For instance, nodes added during the coalescent recapitation
#'   phase will not be included because they don't have spatial information
#'   associated with them.
#'
#' @return A table of descendant nodes of a given tree-sequence node all the
#'   way down to the leaves of the tree sequence
#'
#' @examples
#' \dontshow{check_dependencies(python = TRUE, quit = TRUE) # dependencies must be present
#' }
#' init_env()
#'
#' # load an example model with an already simulated tree sequence
#' slendr_ts <- system.file("extdata/models/introgression_slim.trees", package = "slendr")
#' model <- read_model(path = system.file("extdata/models/introgression", package = "slendr"))
#'
#' # load the tree-sequence object from disk
#' ts <- ts_load(slendr_ts, model)
#'
#' # find the complete descendancy information for a given individual
#' ts_descendants(ts, x = 62, verbose = TRUE)
#' @export
ts_descendants <- function(ts, x, verbose = FALSE, complete = TRUE) {
  check_ts_class(ts)

  model <- attr(ts, "model")
  spatial <- attr(ts, "spatial")
  if (spatial) check_spatial_pkgs()
  from_slendr <- !is.null(model)

  edges <- ts_table(ts, "edges")
  data <- ts_nodes(ts)

  if (complete) data <- dplyr::filter(data, !is.na(ind_id))

  if (is.character(x) && !x %in% data$name)
    stop("The given individual is ether not present in the tree sequence or it\n",
         "does not carry complete metadata information (see ?ts_ancestors)", call. = FALSE)
  if (is.numeric(x) && !x %in% data$node_id)
    stop("The given node is ether not present in the tree sequence or it\n",
         "does not carry complete metadata information (see ?ts_ancestors)", call. = FALSE)

  if (spatial && any(sf::st_is_empty(data$location))) {
    warning("Not all nodes have a known spatial location. Maybe you ran a neutral\n",
            "non-spatial coalescent recapitation after a spatial SLiM simulation?\n",
            "This is not a problem, but please note that the edge table encoded\n",
            "by this tree will not contain spatial information.", call. = FALSE)
    spatial <- FALSE
  }

  if (!spatial) data$location <- NA

  if (!nrow(edges[edges$parent == x, ]))
    stop("The 'ancestral' node specified does not have any children", call. = FALSE)

  # collect child-parent branches starting from the "sampled nodes"
  branches <- collect_descendants(x, edges) %>%
   dplyr::mutate(pop = dplyr::filter(data, node_id == x)$pop[1],
                 node_id = x)

  child_data  <- dplyr::select(data, child_pop  = pop, child_id  = node_id, child_time  = time, child_location = location)
  parent_data <- dplyr::select(data, parent_pop = pop, parent_id = node_id, parent_time = time, parent_location = location)

  combined <- branches %>%
    dplyr::inner_join(child_data, by = "child_id") %>%
    dplyr::inner_join(parent_data, by = "parent_id")

  if (spatial) combined <- sf::st_as_sf(combined)

  if (verbose) message("\nGenerating data about spatial relationships of nodes...")

  # perform further data processing (adding names of individuals, processing sf
  # spatial columns) if the model in question is spatial
  if (spatial) {
    location_col <- c("child_location", "parent_location", "connection")
    combined <- purrr::map2(
      combined$child_location, combined$parent_location, ~
        sf::st_union(.x, .y) %>%
        sf::st_cast("LINESTRING") %>%
        sf::st_sfc() %>%
        sf::st_sf(connection = ., crs = sf::st_crs(combined))) %>%
      dplyr::bind_rows() %>%
      dplyr::bind_cols(combined, .) %>%
      sf::st_set_geometry("connection")
  } else
    location_col <- NULL

  if (from_slendr) {
    name_col <- "name"
    pop_names <- order_pops(model$populations, model$direction)
    combined <- combined %>%
      dplyr::mutate(pop = factor(pop, levels = pop_names),
                    child_pop = factor(child_pop, levels = pop_names),
                    parent_pop = factor(parent_pop, levels = pop_names),
                    name = sapply(child_id, function(i) data[data$node_id == i, ]$name[1]))
  } else
    name_col <- NULL

  combined <- dplyr::select(combined,
                            !!name_col, pop, node_id, level,
                            child_id, parent_id, child_time, parent_time,
                            child_pop, parent_pop, !!location_col,
                            left_pos = left, right_pos = right) %>%
    dplyr::mutate(level = as.factor(level))

  attr(combined, "model") <- model

  combined
}

# tree operations ---------------------------------------------------------

#' Get a tree from a given tree sequence
#'
#' For more information about optional keyword arguments see tskit documentation:
#' <https://tskit.dev/tskit/docs/stable/python-api.html#the-treesequence-class>
#'
#' @param ts Tree sequence object of the class \code{slendr_ts}
#' @param i Position of the tree in the tree sequence. If \code{mode = "index"},
#'   an i-th tree will be returned (in zero-based indexing as in tskit), if
#'   \code{mode = "position"}, a tree covering the i-th base of the simulated genome will be
#'   returned (again, in tskit's indexing).
#' @param mode How should the \code{i} argument be interpreted? Either "index"
#'   as an i-th tree in the sequence of genealogies, or "position" along the
#'   simulated genome.
#' @param ... Additional keyword arguments accepted by
#'   \code{tskit.TreeSequence.at and tskit.TreeSequence.at_index} methods
#'
#' @return Python-reticulate-based object of the class tskit.trees.Tree
#'
#' @examples
#' \dontshow{check_dependencies(python = TRUE, quit = TRUE) # dependencies must be present
#' }
#' init_env()
#'
#' # load an example model with an already simulated tree sequence
#' slendr_ts <- system.file("extdata/models/introgression_slim.trees", package = "slendr")
#' model <- read_model(path = system.file("extdata/models/introgression", package = "slendr"))
#'
#' # load the tree-sequence object from disk
#' ts <- ts_load(slendr_ts, model)
#'
#' # extract the zero-th tree in the tree sequence
#' tree <- ts_tree(ts, i = 0)
#'
#' # extract the tree at a position in the tree sequence
#' tree <- ts_tree(ts, i = 100000, mode = "position")
#' @export
ts_tree <- function(ts, i, mode = c("index", "position"), ...) {
  check_ts_class(ts)
  mode <- match.arg(mode)
  i <- as.integer(i)
  if (mode == "index")
    tree <- ts$at_index(index = i, ...)
  else
    tree <- ts$at(position = i, ...)
  attr(tree, "tree_sequence") <- ts
  tree
}

#' Plot a graphical representation of a single tree
#'
#' This function first obtains an SVG representation of the tree by calling the
#' \code{draw_svg} method of tskit and renders it as a bitmap image in R. All of
#' the many optional keyword arguments of the \code{draw_svg} method can be
#' provided and will be automatically passed to the method behind the scenes.
#'
#' @param x A single tree extracted by \code{\link{ts_tree}}
#' @param width,height Pixel dimensions of the rendered bitmap
#' @param labels Label each node with the individual name?
#' @param sampled_only Should only individuals explicitly sampled through
#'   simplification be labeled? This is relevant in situations in which sampled
#'   individuals can themselves be among the ancestral nodes.
#' @param title Optional title for the figure
#' @param ... Keyword arguments to the tskit \code{draw_svg} function.
#'
#' @return No return value, called for side effects
#'
#' @examples
#' \dontshow{check_dependencies(python = TRUE, quit = TRUE) # dependencies must be present
#' }
#' init_env()
#'
#' # load an example model with an already simulated tree sequence
#' slendr_ts <- system.file("extdata/models/introgression_slim.trees", package = "slendr")
#' model <- read_model(path = system.file("extdata/models/introgression", package = "slendr"))
#'
#' # load the tree-sequence object from disk
#' ts <- ts_load(slendr_ts, model)
#'
#' # extract the first tree in the tree sequence and draw it
#' tree <- ts_tree(ts, i = 1)
#'
#' # ts_draw accepts various optional arguments of tskit.Tree.draw_svg
#' ts_draw(tree, time_scale = "rank")
#' @export
ts_draw <- function(x, width = 1000, height = 1000, labels = FALSE,
                    sampled_only = TRUE, title = NULL, ...) {
  # set margins to zero, save original settings
  top_margin <- if (is.null(title)) 0 else 5
  orig_par <- graphics::par(mar = c(0, 0, top_margin, 0))
  # restore original settings
  on.exit(graphics::par(orig_par))

  if (!requireNamespace("rsvg", quietly = TRUE))
    stop("For plotting trees using the native SVG tskit capabilities, please\n",
         "install the R package rsvg by calling `install.packages(\"rsvg\")")

  if (labels) {
    ts <- attr(x, "tree_sequence")
    df_labels <- ts_nodes(ts) %>%
      dplyr::select(node_id, name, sampled) %>%
      dplyr::mutate(node_label = ifelse(!is.na(name), sprintf("%s (%s)", name, node_id), node_id))
    if (sampled_only)
      df_labels$node_label <- ifelse(!df_labels$sampled, df_labels$node_id, df_labels$node_label)

    py_labels <- reticulate::py_dict(keys = df_labels$node_id,
                                     values = df_labels$node_label)
  } else
    py_labels <- NULL

  svg <- x$draw_svg(size = c(width, height), node_labels = py_labels, ...)

  # convert from a SVG representation to a PNG image
  raw <- charToRaw(as.character(svg))
  tmp_file <- paste0(tempfile(), ".png")
  rsvg::rsvg_png(svg = raw, file = tmp_file, width = width, height = height)

  # plot the PNG image, filling the entire plotting window
  img <- png::readPNG(tmp_file)
  graphics::plot.new()
  aspect_ratio <- dim(img)[1] / dim(img)[2]
  graphics::plot.window(xlim = c(0, 1), ylim = c(0, 1), asp = aspect_ratio)
  graphics::rasterImage(img, xleft = 0, ybottom = 0, xright = 1, ytop = 1)
  graphics::title(title)
}

#' Check that all trees in the tree sequence are fully coalesced
#'
#' @param ts Tree sequence object of the class \code{slendr_ts}
#' @param return_failed Report back which trees failed the coalescence
#'   check?
#'
#' @return TRUE or FALSE value if \code{return_failed = FALSE}, otherwise a vector of
#'   (tskit Python 0-based) indices of trees which failed the coalescence test
#'
#' @examples
#' \dontshow{check_dependencies(python = TRUE, quit = TRUE) # dependencies must be present
#' }
#' init_env()
#'
#' # load an example model with an already simulated tree sequence
#' slendr_ts <- system.file("extdata/models/introgression_slim.trees", package = "slendr")
#' model <- read_model(path = system.file("extdata/models/introgression", package = "slendr"))
#'
#' # load the tree-sequence object from disk
#' ts <- ts_load(slendr_ts, model)
#'
#' ts_coalesced(ts) # is the tree sequence fully coalesced? (TRUE or FALSE)
#'
#' # returns a vector of tree sequence segments which are not coalesced
#' not_coalesced <- ts_coalesced(ts, return_failed = TRUE)
#' @export
ts_coalesced <- function(ts, return_failed = FALSE) {
  # reticulate::py_run_string("def mult_roots(ts): return [not tree.has_multiple_roots for tree in ts.trees()]")

  # single_roots <- pylib$mult_roots(ts)

  single_roots <- reticulate::py$mult_roots(ts)

  if (all(single_roots))
    return(TRUE)
  else if (return_failed)
    return(which(!single_roots) - 1)
  else
    return(FALSE)
}

#' Collect Identity-by-Descent (IBD) segments (EXPERIMENTAL)
#'
#' This function iterates over a tree sequence and returns IBD tracts between
#' pairs of individuals or nodes
#'
#' This function is considered experimental. For full control over IBD segment
#' detection in tree-sequence data, users can (and perhaps, for the time being,
#' should) rely on the tskit method \code{ibd_segments}
#' (see <https://tskit.dev/tskit/docs/stable/python-api.html#tskit.TreeSequence.ibd_segments>).
#'
#' Iternally, this function leverages the tskit \code{TreeSequence} method
#' \code{ibd_segments}. However, note that the \code{ts_ibd} function always
#' returns a data frame of IBD tracts, it does not provide an option to iterate
#' over individual IBD segments as shown in the official tskit documentation
#' at <https://tskit.dev/tskit/docs/stable/ibd.html>. In general, R handles
#' heavy iteration poorly, and this function does not attempt to serve as
#' a full wrapper to \code{ibd_segments}.
#'
#' Unfortunately, the distinction between "squashed IBD" (what many would consider
#' to be the expected definition of IBD) and tskit’s IBD which is defined via
#' distinct genealogical paths (see <https://github.com/tskit-dev/tskit/issues/2459>
#' for a discussion of the topic), makes the meaning of the filtering parameter
#' of the \code{ibd_segments()} method of tskit \code{minimum_length} somewhat
#' unintuitive. As of this moment, this function argument filters on IBD segments
#' on the tskit level, not the level of the squashed IBD segments!
#'
#' @param ts Tree sequence object of the class \code{slendr_ts}
#' @param coordinates Should coordinates of all detected IBD tracts be reported?
#'   If \code{FALSE} (the default), only the total length of shared IBD segments
#'   and their numbers are reported. If \code{TRUE}, coordinates of each segment
#'   will be returned (but note that this can have a massive impact on memory
#'   usage). See details for more information.
#' @param within A character vector with individual names or an integer vector with
#'   node IDs indicating a set of nodes within which to look for IBD segments.
#' @param between A list of lists of character vectors with individual names or
#'   integer vectors with node IDs, indicating a set of nodes between which to
#'   look for shared IBD segments.
#' @param squash Should adjacent IBD segments for pairs of nodes be squashed if they
#'   only differ by their 'genealogical paths' but not by their MRCA? Default is
#'   \code{FALSE}. For more context, see <https://github.com/tskit-dev/tskit/issues/2459>.
#'   This option is EXPERIMENTAL!
#' @param minimum_length Minimum length of an IBD segment to return in results.
#'   This is useful for reducing the total amount of IBD returned (but see Details).
#' @param maximum_time Oldest MRCA of a node to be considered as an IBD ancestor
#'   to return that IBD segment in results. This is useful for reducing the total
#'   amount of IBD returned.
#' @param sf If IBD segments in a spatial tree sequence are being analyzed, should
#'   the returned table be a spatial sf object? Default is \code{TRUE}.
#'
#' @return A data frame with IBD results (either coordinates of each IBD segment
#'   shared by a pair of nodes, or summary statistics about the total IBD sharing
#'   for that pair)
#'
#' @examples
#' \dontshow{check_dependencies(python = TRUE, quit = TRUE) # dependencies must be present
#' }
#' init_env()
#'
#' # load an example model with an already simulated tree sequence
#' slendr_ts <- system.file("extdata/models/introgression_slim.trees", package = "slendr")
#' model <- read_model(path = system.file("extdata/models/introgression", package = "slendr"))
#'
#' # load the tree-sequence object from disk
#' ts <- ts_load(slendr_ts, model)
#'
#' # find IBD segments between specified Neanderthals and Europeans
#' ts_ibd(
#'   ts,
#'   coordinates = TRUE,
#'   between = list(c("NEA_1", "NEA_2"), c("EUR_1", "EUR_2")),
#'   minimum_length = 40000
#' )
#' @export
ts_ibd <- function(ts, coordinates = FALSE, within = NULL, between = NULL, squash = FALSE,
                   minimum_length = NULL, maximum_time = NULL, sf = TRUE) {
  # make sure warnings are reported immediately
  opts <- options(warn = 1)
  on.exit(options(opts))

  if (squash && !is.null(minimum_length)) {
    warning("Please note that when 'squashed' IBD segments are requested,\n",
            "the minimum IBD length cut off involves the 'distinct genealogical path'\n",
            "IBD segments at the tskit level, not the length of the squashed IBD\n",
            "segments. See the documentation of `ts_ibd()` for more detail and\n",
            "additional information.", call. = FALSE)
  }

  model <- attr(ts, "model")
  spatial <- attr(ts, "spatial")

  if (is.null(minimum_length) && is.null(maximum_time))
    warning("No minimum IBD length (minimum_length) or maximum age of an IBD\nancestor ",
            "(maximum_time) has been provided. As a result all IBD tracts will be\n",
            "reported. Depending on the size of your tree sequence, this might produce\n",
            "extremely huge amount of data.", call. = FALSE)
  if (!is.null(within))
    within <- unlist(purrr::map(within, ~ get_node_ids(ts, .x)))
  else if (!is.null(between)) {
    between <- unname(purrr::map(between, ~ get_node_ids(ts, .x)))
    # another bug in reticulate? if the list has names, Python gives us:
    #   Error in py_call_impl(callable, dots$args, dots$keywords):
    #     TypeError: '<' not supported between instances of 'numpy.ndarray' and 'int'
    # names(between) <- NULL
  }

  result <- reticulate::py$collect_ibd(
      ts,
      coordinates = coordinates,
      within = within,
      between = between,
      min_span = minimum_length,
      max_time = maximum_time,
      squash = squash
  )

  # drop a useless internal attribute (not a loss of information -- *we* are the ones
  # who created the pandas DataFrame in the first place)
  attr(result, "pandas.index") <- NULL

  if (!nrow(result)) return(result)

  nodes <- ts_nodes(ts)

  # add node times to the IBD results table
  result <- result %>%
    dplyr::inner_join(as.data.frame(nodes)[, c("node_id", "time")], by = c("node1" = "node_id")) %>%
    dplyr::rename(node1_time = time) %>%
    dplyr::inner_join(as.data.frame(nodes)[, c("node_id", "time")], by = c("node2" = "node_id")) %>%
    dplyr::rename(node2_time = time) %>%
    dplyr::tibble()

  # perform further data processing if the model in question is spatial (and if there
  # are any IBD segments at all)
  if (spatial && sf) {
    result <- result %>%
      dplyr::inner_join(nodes[, c("node_id", "location")], by = c("node1" = "node_id")) %>%
      dplyr::rename(node1_location = location) %>%
      dplyr::inner_join(nodes[, c("node_id", "location")], by = c("node2" = "node_id")) %>%
      dplyr::rename(node2_location = location)

    result <- purrr::map2(
      result$node1_location, result$node2_location, ~
        if (.x == .y)
          sf::st_sf(connection = sf::st_sfc(sf::st_linestring()), crs = sf::st_crs(nodes))
        else {
          sf::st_union(.x, .y) %>%
          sf::st_cast("LINESTRING") %>%
          sf::st_sfc() %>%
          sf::st_sf(connection = ., crs = sf::st_crs(nodes))
        }
      ) %>%
      dplyr::bind_rows() %>%
      dplyr::bind_cols(result, .) %>%
        sf::st_set_geometry("connection") %>%
      sf::st_set_crs(sf::st_crs(nodes))
  }

  if (!is.null(model)) {
    result[["name1"]] <- sapply(result$node1, function(i) nodes[nodes$node_id == i, ]$name)
    result[["name2"]] <- sapply(result$node2, function(i) nodes[nodes$node_id == i, ]$name)
    result[["pop1"]] <- sapply(result$node1, function(i) nodes[nodes$node_id == i, ]$pop)
    result[["pop2"]] <- sapply(result$node2, function(i) nodes[nodes$node_id == i, ]$pop)
  }

  if (coordinates)
    result <- dplyr::mutate(result, length = right - left) %>%
      dplyr::select(node1, node2, length, mrca, node1_time, node2_time, tmrca, dplyr::everything())
  else
    result <- dplyr::select(result, node1, node2, count, total, node1_time, node2_time,
                                    dplyr::everything())

  result
}

# f-statistics ------------------------------------------------------------

fstat <- function(ts, stat, sample_sets, mode, windows, span_normalise) {
  if (!stat %in% c("f2", "f3", "f4"))
    stop("Unknown statistic '", stat, "'", call. = FALSE)

  if (!is.null(windows)) windows <- define_windows(ts, windows)

  node_sets <- purrr::map(sample_sets, ~ get_node_ids(ts, .x))

  result <- ts[[stat]](sample_sets = node_sets, mode = mode,
                       span_normalise = TRUE, windows = windows)

  if (length(result) > 1) result <- list(result)
  result
}

#' @rdname ts_f4ratio
#' @export
ts_f2 <- function(ts, A, B, mode = c("site", "branch", "node"),
                  span_normalise = TRUE, windows = NULL) {
  mode <- match.arg(mode)
  result <- fstat(ts, "f2", list(A, B), mode, windows, span_normalise)
  dplyr::tibble(A = concat(A), B = concat(B), f2 = result)
}

#' @rdname ts_f4ratio
#'
#' @export
ts_f3 <- function(ts, A, B, C, mode = c("site", "branch", "node"),
                  span_normalise = TRUE, windows = NULL) {
  mode <- match.arg(mode)
  result <- fstat(ts, "f3", list(A, B, C), mode, windows, span_normalise)
  dplyr::tibble(A = concat(A), B = concat(B), C = concat(C), f3 = result)
}

#' @rdname ts_f4ratio
#' @export
ts_f4 <- function(ts, W, X, Y, Z, mode = c("site", "branch", "node"),
                  span_normalise = TRUE, windows = NULL) {
  mode <- match.arg(mode)
  result <- fstat(ts, "f4", list(W, X, Y, Z), mode, windows, span_normalise)
  dplyr::tibble(W = concat(W), X = concat(X), Y = concat(Y), Z = concat(Z),
                f4 = result)
}

#' Calculate the f2, f3, f4, and f4-ratio statistics
#'
#' These functions present an R interface to the corresponding f-statistics methods
#' in tskit.
#'
#' Note that the order of populations f3 statistic implemented in tskit
#' (<https://tskit.dev/tskit/docs/stable/python-api.html#tskit.TreeSequence.f3>) is
#' different from what you might expect from ADMIXTOOLS, as defined in
#' Patterson 2012 (see <https://academic.oup.com/genetics/article/192/3/1065/5935193>
#' under heading "The three-population test and introduction of f-statistics",
#' as well as ADMIXTOOLS documentation at
#' <https://github.com/DReichLab/AdmixTools/blob/master/README.3PopTest#L5>).
#' Specifically, the widely used notation introduced by Patterson assumes the
#' population triplet as f3(C; A, B), with C being the "focal" sample (i.e., either
#' the outgroup or a sample tested for admixture). In contrast, tskit implements
#' f3(A; B, C), with the "focal sample" being A.
#'
#' Although this is likely to confuse many ADMIXTOOLS users, slendr does not have
#' much choice in this, because its \code{ts_*()} functions are designed to be
#' broadly compatible with raw tskit methods.
#'
#' @param ts Tree sequence object of the class \code{slendr_ts}
#' @param W,X,Y,Z,A,B,C,O Character vectors of individual names (largely following
#'   the nomenclature of Patterson 2021, but see crucial differences between
#'   tskit and ADMIXTOOLS in Details)
#' @param span_normalise Divide the result by the span of the window? Default
#'   TRUE, see the tskit documentation for more detail.
#' @param windows Coordinates of breakpoints between windows. The first
#'   coordinate (0) and the last coordinate (equal to \code{ts$sequence_length})
#'   do not have to be specified as they are added automatically.
#' @param mode The mode for the calculation ("sites" or "branch")
#'
#' @return Data frame with statistics calculated for the given sets of individuals
#'
#' @examples
#' \dontshow{check_dependencies(python = TRUE, quit = TRUE) # dependencies must be present
#' }
#' init_env()
#'
#' # load an example model with an already simulated tree sequence
#' slendr_ts <- system.file("extdata/models/introgression_slim.trees", package = "slendr")
#' model <- read_model(path = system.file("extdata/models/introgression", package = "slendr"))
#'
#' # load the tree-sequence object from disk and add mutations to it
#' ts <- ts_load(slendr_ts, model) %>% ts_mutate(mutation_rate = 1e-8, random_seed = 42)
#'
#' # calculate f2 for two individuals in a previously loaded tree sequence
#' ts_f2(ts, A = "AFR_1", B = "EUR_1")
#'
#' # calculate f2 for two sets of individuals
#' ts_f2(ts, A = c("AFR_1", "AFR_2"), B = c("EUR_1", "EUR_3"))
#'
#' # calculate f3 for two individuals in a previously loaded tree sequence
#' ts_f3(ts, A = "EUR_1", B = "AFR_1", C = "NEA_1")
#'
#' # calculate f3 for two sets of individuals
#' ts_f3(ts, A = c("AFR_1", "AFR_2", "EUR_1", "EUR_2"),
#'           B = c("NEA_1", "NEA_2"),
#'           C = "CH_1")
#'
#' # calculate f4 for single individuals
#' ts_f4(ts, W = "EUR_1", X = "AFR_1", Y = "NEA_1", Z = "CH_1")
#'
#' # calculate f4 for sets of individuals
#' ts_f4(ts, W = c("EUR_1", "EUR_2"),
#'           X = c("AFR_1", "AFR_2"),
#'           Y = "NEA_1",
#'           Z = "CH_1")
#'
#' # calculate f4-ratio for a given set of target individuals X
#' ts_f4ratio(ts, X = c("EUR_1", "EUR_2", "EUR_4", "EUR_5"),
#'                A = "NEA_1", B = "NEA_2", C = "AFR_1", O = "CH_1")
#' @export
ts_f4ratio <- function(ts, X, A, B, C, O, mode = c("site", "branch"), span_normalise = TRUE) {
  mode <- match.arg(mode)

  purrr::map_dfr(
    X, function(.x) {
      alpha <-
        ts_f4(ts, A, O, .x, C, mode = mode)$f4 /
        ts_f4(ts, A, O, B, C, mode = mode)$f4
      dplyr::tibble(X = .x, A = concat(A), B = concat(B),
                    C = concat(C), O = concat(O), alpha)
    }
  )
}

# multiway statistics -----------------------------------------------------

multiway_stat <- function(ts, stat = c("fst", "divergence"),
                          k, sample_sets, mode, windows, span_normalise) {
  stat <- match.arg(stat)
  node_sets <- purrr::map(sample_sets, function(set) {
    get_node_ids(ts, set)
  })

  n_sets <- length(sample_sets)

  # generate all pairwise indexes required by tskit for more than
  # two sample sets
  indexes <- utils::combn(n_sets, m = k, simplify = FALSE,
                   FUN = function(x) as.integer(x - 1))

  fun <- switch(
    stat,
    "fst" = ts[["Fst"]],
    "divergence" = ts[["divergence"]]
  )
  if (is.null(fun))
    stop("Unknown statistic '", stat, "'", call. = FALSE)

  values <- fun(
    sample_sets = unname(node_sets),
    indexes = indexes,
    mode = mode,
    windows = windows,
    span_normalise = span_normalise
  )
  if (is.matrix(values))
    values <- split(values, col(values))

  if (!is.list(values)) values <- as.numeric(values) # convert 1D arrays to simple vectors

  if (is.null(names(sample_sets)))
    set_names <- paste0("set_", seq_len(n_sets))
  else
    set_names <- names(sample_sets)

  result <- purrr::map_dfr(indexes, ~ {
    set <- set_names[.x + 1]
    as.data.frame(t(matrix(set)), stringsAsFactors = FALSE)
  }) %>%
    dplyr::as_tibble() %>%
    dplyr::mutate(stat = values)

  result
}

#' Calculate pairwise statistics between sets of individuals
#'
#' For a discussion on the difference between "site", "branch", and "node"
#' options of the \code{mode} argument, please see the tskit documentation at
#' <https://tskit.dev/tskit/docs/stable/stats.html#sec-stats-mode>.
#'
#' @param ts Tree sequence object of the class \code{slendr_ts}
#' @param sample_sets A list (optionally a named list) of character vectors with
#'   individual names (one vector per set)
#' @param mode The mode for the calculation ("sites" or "branch")
#' @param windows Coordinates of breakpoints between windows. The first
#'   coordinate (0) and the last coordinate (equal to \code{ts$sequence_length})
#'   do not have to be specified as they are added automatically.
#' @param span_normalise Divide the result by the span of the window? Default
#'   TRUE, see the tskit documentation for more detail.
#'
#' @return For each pairwise calculation, either a single Fst value or a vector
#'   of Fst values (one for each window)
#'
#' @examples
#' \dontshow{check_dependencies(python = TRUE, quit = TRUE) # dependencies must be present
#' }
#' init_env()
#'
#' # load an example model with an already simulated tree sequence
#' slendr_ts <- system.file("extdata/models/introgression_slim.trees", package = "slendr")
#' model <- read_model(path = system.file("extdata/models/introgression", package = "slendr"))
#'
#' # load the tree-sequence object from disk
#' ts <- ts_load(slendr_ts, model) %>% ts_mutate(mutation_rate = 1e-8, random_seed = 42)
#'
#' # compute F_st between two sets of individuals in a given tree sequence ts
#' ts_fst(ts, sample_sets = list(afr = c("AFR_1", "AFR_2", "AFR_3"),
#'                               eur = c("EUR_1", "EUR_2")))
#' @export
ts_fst <- function(ts, sample_sets, mode = c("site", "branch", "node"),
                   windows = NULL, span_normalise = TRUE) {
  mode <- match.arg(mode)
  if (!is.list(sample_sets)) sample_sets <- as.list(sample_sets)
  if (!is.null(windows)) windows <- define_windows(ts, windows)
  multiway_stat(ts, "fst", k = 2, sample_sets, mode, windows, span_normalise) %>%
    stats::setNames(c("x", "y", "Fst"))
}

#' Calculate pairwise divergence between sets of individuals
#'
#' @inheritParams ts_fst
#' @return For each pairwise calculation, either a single divergence value or a
#'   vector of divergence values (one for each window)
#'
#' @examples
#' \dontshow{check_dependencies(python = TRUE, quit = TRUE) # dependencies must be present
#' }
#' init_env()
#'
#' # load an example model with an already simulated tree sequence
#' slendr_ts <- system.file("extdata/models/introgression_slim.trees", package = "slendr")
#' model <- read_model(path = system.file("extdata/models/introgression", package = "slendr"))
#'
#' # load the tree-sequence object from disk
#' ts <- ts_load(slendr_ts, model) %>% ts_mutate(mutation_rate = 1e-8, random_seed = 42)
#'
#' # collect sampled individuals from all populations in a list
#' sample_sets <- ts_samples(ts) %>%
#'   split(., .$pop) %>%
#'   lapply(function(pop) pop$name)
#'
#' # compute the divergence between individuals from each sample set (list of
#' # individual names generated in the previous step)
#' ts_divergence(ts, sample_sets) %>% .[order(.$divergence), ]
#' @export
ts_divergence <- function(ts, sample_sets, mode = c("site", "branch", "node"),
                   windows = NULL, span_normalise = TRUE) {
  mode <- match.arg(mode)
  if (!is.list(sample_sets)) sample_sets <- as.list(sample_sets)
  if (!is.null(windows)) windows <- define_windows(ts, windows)
  multiway_stat(ts, "divergence", k = 2, sample_sets, mode, windows, span_normalise) %>%
    stats::setNames(c("x", "y", "divergence"))
}

# oneway statistics -------------------------------------------------------

oneway_stat <- function(ts, stat, sample_sets, mode, windows, span_normalise = NULL) {
  node_sets <- purrr::map(sample_sets, function(set) {
    get_node_ids(ts, set)
  })

  n_sets <- length(sample_sets)

  fun <- switch(
    stat,
    "D" = ts[["Tajimas_D"]],
    "diversity" = ts[["diversity"]],
    "segsites" = ts[["segregating_sites"]]
  )
  if (is.null(fun))
    stop("Unknown statistic '", stat, "'", call. = FALSE)

  args <- list(sample_sets = unname(node_sets),
               mode = mode,
               windows = windows)
  if (!is.null(span_normalise)) args[["span_normalise"]] <- span_normalise

  values <- do.call(fun, args)

  if (is.matrix(values))
    values <- split(values, col(values))

  if (!is.list(values)) values <- as.numeric(values) # convert 1D arrays to simple vectors

  if (is.null(names(sample_sets)) || any(names(sample_sets) == "")) {
    if (all(sapply(sample_sets, length) == 1))
      set_names <- unlist(sample_sets)
    else
      set_names <- paste0("set_", seq_len(n_sets))
  } else
    set_names <- names(sample_sets)

  result <- dplyr::tibble(set = set_names)
  result[[stat]] <- values
  result
}

#' Calculate the density of segregating sites for the given sets of individuals
#'
#' @inheritParams ts_tajima
#' @param span_normalise Divide the result by the span of the window? Default
#'   TRUE, see the tskit documentation for more detail.
#'
#' @return For each set of individuals either a single diversity value or a
#'   vector of diversity values (one for each window)
#'
#' @examples
#' \dontshow{check_dependencies(python = TRUE, quit = TRUE) # dependencies must be present
#' }
#' init_env()
#'
#' # load an example model with an already simulated tree sequence
#' slendr_ts <- system.file("extdata/models/introgression_slim.trees", package = "slendr")
#' model <- read_model(path = system.file("extdata/models/introgression", package = "slendr"))
#'
#' # load the tree-sequence object from disk
#' ts <- ts_load(slendr_ts, model) %>% ts_mutate(mutation_rate = 1e-8, random_seed = 42)
#'
#' # collect sampled individuals from all populations in a list
#' sample_sets <- ts_samples(ts) %>%
#'   split(., .$pop) %>%
#'   lapply(function(pop) pop$name)
#'
#' ts_segregating(ts, sample_sets)
#' @export
ts_segregating <- function(ts, sample_sets, mode = c("site", "branch", "node"),
                           windows = NULL, span_normalise = FALSE) {
  mode <- match.arg(mode)
  if (!is.list(sample_sets)) sample_sets <- as.list(sample_sets)
  if (!is.null(windows)) windows <- define_windows(ts, windows)
  oneway_stat(ts, "segsites", sample_sets, mode, windows, span_normalise)
}

#' Calculate diversity in given sets of individuals
#'
#' @inheritParams ts_tajima
#' @param span_normalise Divide the result by the span of the window? Default
#'   TRUE, see the tskit documentation for more detail.
#'
#' @return For each set of individuals either a single diversity value or a
#'   vector of diversity values (one for each window)
#'
#' @examples
#' \dontshow{check_dependencies(python = TRUE, quit = TRUE) # dependencies must be present
#' }
#' init_env()
#'
#' # load an example model with an already simulated tree sequence
#' slendr_ts <- system.file("extdata/models/introgression_slim.trees", package = "slendr")
#' model <- read_model(path = system.file("extdata/models/introgression", package = "slendr"))
#'
#' # load the tree-sequence object from disk
#' ts <- ts_load(slendr_ts, model) %>% ts_mutate(mutation_rate = 1e-8, random_seed = 42)
#'
#' # collect sampled individuals from all populations in a list
#' sample_sets <- ts_samples(ts) %>%
#'   split(., .$pop) %>%
#'   lapply(function(pop) pop$name)
#'
#' # compute diversity in each population based on sample sets extracted
#' # in the previous step
#' ts_diversity(ts, sample_sets) %>% .[order(.$diversity), ]
#' @export
ts_diversity <- function(ts, sample_sets, mode = c("site", "branch", "node"),
                         windows = NULL, span_normalise = TRUE) {
  mode <- match.arg(mode)
  if (!is.list(sample_sets)) sample_sets <- as.list(sample_sets)
  if (!is.null(windows)) windows <- define_windows(ts, windows)
  oneway_stat(ts, "diversity", sample_sets, mode, windows, span_normalise)
}

#' Calculate Tajima's D for given sets of individuals
#'
#' For a discussion on the difference between "site" and "branch" options of the
#' \code{mode} argument, please see the tskit documentation at
#' <https://tskit.dev/tskit/docs/stable/stats.html#sec-stats-mode>
#'
#' @param ts Tree sequence object of the class \code{slendr_ts}
#' @param sample_sets A list (optionally a named list) of character vectors with
#'   individual names (one vector per set). If a simple vector is provided, it
#'   will be interpreted as \code{as.list(sample_sets)}, meaning that a given
#'   statistic will be calculated for each individual separately.
#' @param mode The mode for the calculation ("sites" or "branch")
#' @param windows Coordinates of breakpoints between windows. The first
#'   coordinate (0) and the last coordinate (equal to \code{ts$sequence_length})
#'   are added automatically)
#'
#' @return For each set of individuals either a single Tajima's D value or a
#'   vector of Tajima's D values (one for each window)
#'
#' @examples
#' \dontshow{check_dependencies(python = TRUE, quit = TRUE) # dependencies must be present
#' }
#' init_env()
#'
#' # load an example model with an already simulated tree sequence
#' slendr_ts <- system.file("extdata/models/introgression_slim.trees", package = "slendr")
#' model <- read_model(path = system.file("extdata/models/introgression", package = "slendr"))
#'
#' # load the tree-sequence object from disk
#' ts <- ts_load(slendr_ts, model) %>% ts_mutate(mutation_rate = 1e-8, random_seed = 42)
#'
#' # calculate Tajima's D for given sets of individuals in a tree sequence ts
#' ts_tajima(ts, list(eur = c("EUR_1", "EUR_2", "EUR_3", "EUR_4", "EUR_5"),
#'                    nea = c("NEA_1", "NEA_2")))
#' @export
ts_tajima <- function(ts, sample_sets, mode = c("site", "branch", "node"),
                      windows = NULL) {
  mode <- match.arg(mode)
  if (!is.list(sample_sets)) sample_sets <- as.list(sample_sets)
  if (!is.null(windows)) windows <- define_windows(ts, windows)
  oneway_stat(ts, "D", sample_sets, mode, windows)
}

# other statistics --------------------------------------------------------

#' Compute the allele frequency spectrum (AFS)
#'
#' This function computes the AFS with respect to the given set of individuals
#' or nodes.
#'
#' For more information on the format of the result and dimensions, in
#' particular the interpretation of the first and the last element of the AFS
#' (when \code{complete = TRUE}), please see the tskit manual at
#' <https://tskit.dev/tskit/docs/stable/python-api.html> and the example
#' section dedicated to AFS at
#' <https://tskit.dev/tutorials/analysing_tree_sequences.html#allele-frequency-spectra>.
#'
#' @param ts Tree sequence object of the class \code{slendr_ts}
#' @param sample_sets A list (optionally a named list) of character vectors with
#'   individual names (one vector per set). If NULL, allele frequency spectrum
#'   for all individuals in the tree sequence will be computed.
#' @param mode The mode for the calculation ("sites" or "branch")
#' @param windows Coordinates of breakpoints between windows. The first
#'   coordinate (0) and the last coordinate (equal to \code{ts$sequence_length})
#'   are added automatically)
#' @param polarised When TRUE (the default) the allele frequency spectrum will
#'   not be folded (i.e. the counts will assume knowledge of which allele is ancestral,
#'   and which is derived, which is known in a simulation)
#' @param span_normalise Argument passed to tskit's \code{allele_frequency_spectrum}
#'   method
#'
#' @return Allele frequency spectrum values for the given sample set. Note that the
#'   contents of the first and last elements of the AFS might surprise you. Read the
#'   links in the description for more detail on how tskit handles things.
#'
#' @examples
#' \dontshow{check_dependencies(python = TRUE, quit = TRUE) # dependencies must be present
#' }
#' init_env()
#'
#' # load an example model with an already simulated tree sequence
#' slendr_ts <- system.file("extdata/models/introgression_slim.trees", package = "slendr")
#' model <- read_model(path = system.file("extdata/models/introgression", package = "slendr"))
#'
#' # load the tree-sequence object from disk
#' ts <- ts_load(slendr_ts, model) %>% ts_mutate(mutation_rate = 1e-8, random_seed = 42)
#'
#' samples <- ts_samples(ts) %>% .[.$pop %in% c("AFR", "EUR"), ]
#'
#' # compute AFS for the given set of individuals
#' ts_afs(ts, sample_sets = list(samples$name))
#' @export
ts_afs <- function(ts, sample_sets = NULL, mode = c("site", "branch", "node"),
                   windows = NULL, span_normalise = FALSE, polarised = TRUE) {
  mode <- match.arg(mode)

  if (is.null(sample_sets))
    sample_sets <- list(ts_samples(ts)$name)
  else if (!is.list(sample_sets))
    sample_sets <- as.list(sample_sets)
  if (!is.null(windows)) windows <- define_windows(ts, windows)

  node_sets <- purrr::map(sample_sets, function(set) {
    get_node_ids(ts, set)
  })

  result <- ts$allele_frequency_spectrum(
    sample_sets = unname(node_sets),
    mode = mode,
    windows = windows,
    span_normalise = span_normalise,
    polarised = polarised
  )

  # convert 1D array to a simple vector
  if (length(sample_sets) == 1) result <- as.numeric(result)

  result
}

#' Convert an annotated \code{slendr_phylo} object to a \code{phylo} object
#'
#' This function servers as a workaround around a ggtree error:
#'   \code{Error in UseMethod("as.phylo") :
#'     no applicable method for 'as.phylo' applied to an object of class
#'     "c('phylo', 'slendr_phylo')"}
#'
#' @param x Tree object of the class \code{slendr_phylo}
#' @param ... Additional (unused) arguments of the \code{as.phylo} S3 method
#'
#' @return Standard phylogenetic tree object implemented by the R package ape
#'
#' @importFrom ape as.phylo
#' @export as.phylo.slendr_phylo
#' @export
#' @keywords internal
as.phylo.slendr_phylo <- function(x, ...) { class(x) <- "phylo"; x }


#' Print tskit's summary table of the Python tree-sequence object
#'
#' @param x Tree object of the class \code{slendr_phylo}
#' @param ... Additional arguments normally passed to \code{print} (not used in this case)
#'
#' @return No return value, simply prints the tskit summary table to the terminal
#'
#' @export print.slendr_ts
#' @export
print.slendr_ts <- function(x, ...) { summary(x, ...) }

# private tree sequence utility functions ---------------------------------

# Function for extracting numerical node IDs for various statistics
get_node_ids <- function(ts, x) {
  # slendr allows named lists to be used for easier group labelling in result tables
  if (is.list(x)) x <- x[[1]]

  if (is.null(x)) {
    ids <- ts_nodes(ts) %>% .[.$sampled, ]$node_id
  } else if (is.character(x)) {
    if (length(intersect(x, ts_samples(ts)$name)) != length(x))
      stop("Not all individual names are among those recorded in the tree sequence", call. = FALSE)
    ids <- ts_nodes(ts) %>% dplyr::filter(name %in% x) %>% .$node_id
  } else if (is.numeric(x)) {
    ids <- as.integer(x)
  } else
    stop("Unknown data given as individual names or node IDs", call. = FALSE)
  if (length(ids) == 1)
    list(ids)
  else
    ids
}

# Extract information from the nodes table
get_ts_raw_nodes <- function(ts) {
  model <- attr(ts, "model")
  from_slendr <- !is.null(model)

  table <- ts$tables$nodes

  node_table <- dplyr::tibble(
    node_id = as.integer(seq_len(table$num_rows) - 1),
    ind_id = ifelse(table$individual == -1, NA, table$individual),
    pop_id = as.integer(table$population)
  )

  # in case of slendr tree sequences, convert times to the model time units
  if (from_slendr)
    node_table$time <- as.numeric(time_fun(ts)(table$time, model))
  else
    node_table$time <- as.numeric(table$time)

  node_table$time_tskit <- as.numeric(table$time)

  # -1 as a missing value in tskit is not very R like, so let's replace it with
  # a proper NA
  node_table$pop_id[node_table$pop_id == -1] <- NA

  node_table
}

# Extract information from the table of individual table
get_ts_raw_individuals <- function(ts) {
  model <- attr(ts, "model")
  from_slendr <- !is.null(model)

  individuals <- ts$tables$individuals
  nodes <- ts$tables$nodes

  # create a data frame which will form the output table with individuals in the tree sequence
  # (unlike the raw tree-sequence tables, IDs are explicitly stored as 0-based columns)
  ind_table <- dplyr::tibble(
    ind_id = seq_len(ts$num_individuals) - 1
  )

  if (attr(ts, "type") == "SLiM") {
    ind_table <- dplyr::tibble(
      ind_table,
      # pedigree_id = pylib$get_pedigree_ids(ts), # TODO: check whether the reticulate bug is gone
      pedigree_id = reticulate::py$get_pedigree_ids(ts),
      pop_id = ts$individual_populations,
      alive = bitwAnd(individuals[["flags"]], pyslim$INDIVIDUAL_ALIVE) != 0,
      remembered = bitwAnd(individuals[["flags"]], pyslim$INDIVIDUAL_REMEMBERED) != 0,
      retained = bitwAnd(individuals[["flags"]], pyslim$INDIVIDUAL_RETAINED) != 0
    )

    if (from_slendr) {
      ind_table$time <- time_fun(ts)(ts$individual_times, model)
      # for slendr SLiM models, "sampled" nodes are those were explicitly scheduled for sampling
      # - for "original" SLiM/slendr tree sequences, sampled nodes are those that are remembered
      if (is.null(attr(ts, "metadata")$sample_ids))
        ind_table$sampled <- ind_table$remembered
      else # - for previously simplified tree sequences, use information from pedigree IDs
        ind_table$sampled <- ind_table$pedigree_id %in% attr(ts, "metadata")$sample_ids
    } else {
      ind_table$time <- ts$individual_times
      # for pure SLiM tree sequences, simply use the sampling information encoded in the data
      ind_table$sampled <- ind_table$ind_id %in% unique(nodes[(seq(ts$num_nodes) - 1) %in% as.integer(ts$samples())]$individual)
      # ind_table$sampled <- ifelse(is.na(ind_table$sampled), FALSE, TRUE)
    }
    ind_table$time_tskit <- ts$individual_times
  } else { # msprime tree-sequence table
    nodes_table <- dplyr::tibble(
      ind_id = nodes$individual,
      pop_id = nodes$population
    )

    if (from_slendr)
      nodes_table$time <- time_fun(ts)(nodes$time, model)
    else
      nodes_table$time <- nodes$time

    nodes_table$time_tskit <- nodes$time
    nodes_table <- dplyr::distinct(nodes_table)

    ind_table <- dplyr::inner_join(ind_table, nodes_table, by = "ind_id")

    # in a non-SLiM tree sequence, genomes/nodes of all individuals are "samples"
    ind_table$sampled <- TRUE
  }

  if (attr(ts, "spatial")) {
    ind_table$raster_x <- ts$individual_locations[, 1]
    ind_table$raster_y <- ts$individual_locations[, 2]
  }

  ind_table %>% dplyr::select(ind_id, time, dplyr::everything())
}

# Extract information from the edges table
get_ts_raw_edges <- function(ts) {
  table <- ts$tables$edges
  dplyr::tibble(
    id = seq_len(table$num_rows) - 1,
    child = as.vector(table[["child"]]),
    parent = as.vector(table[["parent"]]),
    left = as.vector(table[["left"]]),
    right = as.vector(table[["right"]])
  )
}

# Extract information from the muations table
get_ts_raw_mutations <- function(ts) {
  model <- attr(ts, "model")
  table <- ts$tables$mutations
  if (is.null(model))
    time <- table[["time"]]
  else
    time <- time_fun(ts)(as.vector(table[["time"]]), model)
  dplyr::tibble(
    id = seq_len(table$num_rows) - 1,
    site = as.vector(table[["site"]]),
    node = as.vector(table[["node"]]),
    time = as.numeric(time),
    time_tskit = as.numeric(table[["time"]])
  )
}

time_fun <- function(ts) {
  if (attr(ts, "type") == "SLiM")
    convert_slim_time
  else
    convert_msprime_time
}

get_pyslim_table_data <- function(ts, simplify_to = NULL) {
  model <- attr(ts, "model")
  spatial <- attr(ts, "spatial")
  from_slendr <- !is.null(model)

  # get data from the original individual table
  individuals <- attr(ts, "raw_individuals")

  # get data from the original nodes table to get node assignments for each
  # individual but also nodes which are not associated with any individuals
  # (i.e. those added through recapitation by msprime)
  nodes <- get_ts_raw_nodes(ts)

  # assign symbolic names to to the population column
  if (from_slendr) {
    individuals$pop <- model$splits$pop[individuals$pop_id + 1]
    nodes$pop <- model$splits$pop[nodes$pop_id + 1]
  } else {
    pop_table <- ts$tables$populations
    pop_present <- unlist(sapply(seq_along(pop_table), function(i) !is.null(pop_table[i - 1]$metadata)))
    pop_names <- rep("", length(pop_table))
    pop_names[pop_present] <- unlist(sapply(seq_along(pop_table), function(i) pop_table[i - 1]$metadata$name))
    individuals$pop <- pop_names[individuals$pop_id + 1]
    nodes$pop <- pop_names[nodes$pop_id + 1]
  }

  individuals <- dplyr::arrange(individuals, -time, pop)

  # load information about samples at times and from populations of remembered individuals
  if (from_slendr) {
    # when a tree sequence is being loaded from a file where it was saved in its
    # simplified form, the metadata table saved along side it won't correspond to the
    # subset of sampled nodes -- in these situations, subset the original sampling table
    # using the names of sampled individuals that are propagated through serialization
    # (this is achieved by the filter() step right below)
    samples <- attr(ts, "metadata")$sampling %>%
      dplyr::arrange(-time, pop) %>%
      dplyr::filter(name %in% attr(ts, "metadata")$sample_names)
    if (!is.null(simplify_to))
      samples <- samples %>% dplyr::filter(name %in% simplify_to)
  } else
    samples <- dplyr::filter(individuals, sampled) %>% dplyr::select(time, pop)

  # split individuals into sampled (those explicitly sampled, to which we
  # will add readable names from the sampling schedule table) and not sampled
  # (either "anonymous" individuals or also remembered individuals which should
  # not be regarded for simplification)
  sampled <- dplyr::filter(individuals, sampled) %>%
    dplyr::select(-time, -pop) %>%
    dplyr::bind_cols(samples)

  not_sampled <- dplyr::filter(individuals, !sampled)

  # add numeric node IDs to each individual
  combined <-
    dplyr::bind_rows(sampled, not_sampled) %>%
    dplyr::right_join(nodes, by = "ind_id", multiple = "all") %>% # required after dplyr v1.1.0
    dplyr::mutate(time = ifelse(is.na(ind_id), time.y, time.x),
                  time_tskit = time_tskit.y,
                  sampled = ifelse(is.na(ind_id), FALSE, sampled)) %>%
    dplyr::rename(pop = pop.y, pop_id = pop_id.y)

  if (spatial) {
    combined <- convert_to_sf(combined, model)
    location_cols <- "location"
  } else
    location_cols <- NULL

  if (from_slendr) {
    combined$pop <- factor(combined$pop, levels = order_pops(model$populations, model$direction))
    slendr_cols <- c("name", "pop")
  } else
    slendr_cols <- "pop"

  combined <- dplyr::select(
    combined, !!slendr_cols, node_id, time, time_tskit, !!location_cols,
    sampled, remembered, retained, alive, pedigree_id, pop_id, ind_id
  )

  if (spatial)
    combined
  else
    dplyr::as_tibble(combined)
}

get_tskit_table_data <- function(ts, simplify_to = NULL) {
  model <- attr(ts, "model")
  spatial <- attr(ts, "spatial")
  from_slendr <- !is.null(model)

  # get data from the original individual table
  individuals <- attr(ts, "raw_individuals")

  # get data from the original nodes table to get node assignments for each
  # individual but also nodes which are not associated with any individuals
  # (i.e. those added through recapitation by msprime)
  nodes <- get_ts_raw_nodes(ts)

  # assign symbolic names to the population column
  if (from_slendr) {
    individuals$pop <- model$splits$pop[individuals$pop_id + 1]
    nodes$pop <- model$splits$pop[nodes$pop_id + 1]
  } else {
    pop_table <- ts$tables$populations
    # pop_names <- unlist(sapply(seq_along(pop_table), function(i) pop_table[i - 1]$metadata$name))
    pop_names <- unlist(sapply(seq_along(pop_table), function(i) {
      pop_metadata <- pop_table[i - 1]$metadata
      if (all(as.character(pop_metadata) == "") || !any("name" %in% names(pop_metadata)))
        as.character(i - 1)
      else
        pop_metadata$name
    }))
    individuals$pop <- pop_names[individuals$pop_id + 1]
    nodes$pop <- pop_names[nodes$pop_id + 1]
    if (is.null(nodes[["pop"]])) nodes$pop <- NA
  }

  if (nrow(individuals)) {
    if (length(ts$tables$populations)) {
      individuals <- dplyr::arrange(individuals, -time, pop)
    } else {
      individuals <- dplyr::arrange(individuals, -time)
      individuals$pop <- NA
    }
  }

  # load information about samples at times and from populations of remembered
  # individuals
  if (from_slendr) {
    # when a tree sequence is being loaded from a file where it was saved in its
    # simplified form, the metadata table saved along side it won't correspond to the
    # subset of sampled nodes -- in these situations, subset the original sampling table
    # using the names of sampled individuals that are propagated through serialization
    samples <- attr(ts, "metadata")$sampling %>%
      dplyr::filter(name %in% attr(ts, "metadata")$sample_names)
    if (!is.null(simplify_to))
      samples <- dplyr::filter(samples, name %in% simplify_to)
    samples <- dplyr::arrange(samples, -time, pop)
    individuals <- dplyr::mutate(individuals, name = samples$name)
  }

  # some tree sequence don't have any information about individuals -- for those cases,
  # modify the temporary individuals/nodes tables accordingly so that the join operation
  # below goes through
  if (!nrow(individuals)) {
    # delete columns of the empty individuals table
    individuals$sampled <- NULL
    individuals$time_tskit <- NULL

    # add those columns to the nodes table instead (that table will also hold information
    # on which nodes are actually sampled)
    nodes$sampled <- FALSE
    nodes$sampled[ts$samples() + 1] <- TRUE

    nodes$time_tskit.y <- nodes$time_tskit
    nodes$time_tskit <- NULL

    nodes$pop.y <- nodes$pop
    nodes$pop <- individuals$pop <- NULL

    nodes$sampled <- nodes$node_id %in% as.vector(ts$samples())
  }

  # add numeric node IDs to each individual
  combined <- dplyr::select(individuals, -time, -pop_id) %>%
    dplyr::right_join(nodes, by = "ind_id", multiple = "all") %>%  # required after dplyr v1.1.0
    dplyr::rename(pop = pop.y, time_tskit = time_tskit.y)

  # for tree sequences which have some individuals/nodes marked as 'sampled', mark the
  # nodes which do not have individuals assigned to them as FALSE
  if (any(!is.na(combined$ind_id)))
    combined <- dplyr::mutate(combined, sampled = ifelse(is.na(ind_id), FALSE, sampled))

  if (spatial) {
    combined <- convert_to_sf(combined, model)
    location_cols <- "location"
  } else
    location_cols <- NULL

  if (from_slendr) {
    combined$pop <- factor(combined$pop, levels = order_pops(model$populations, model$direction))
    name_col <- "name"
  } else
    name_col <- NULL

  combined %>%
    dplyr::select(!!name_col, pop, ind_id, node_id, time, time_tskit, sampled, pop_id)
}

get_annotated_edges <- function(x) {
  data <- ts_nodes(x) %>% dplyr::as_tibble()
  source <- if (inherits(x, "slendr_phylo")) "tree" else "tskit"
  spatial <- attr(x, "spatial")
  if (spatial) check_spatial_pkgs()
  from_slendr <- !is.null(attr(x, "model"))

  if (spatial && any(sf::st_is_empty(data$location))) {
    warning("Not all nodes have a known spatial location. Maybe you ran a neutral\n",
            "non-spatial coalescent recapitation after a spatial SLiM simulation?\n",
            "This is not a problem, but please note that the edge table encoded\n",
            "by this tree will not contain spatial information.", call. = FALSE)
    spatial <- FALSE
  }

  # get the table of edges to be used as a scaffold for the full annotation table
  # (these can be either just edges of the tree phylo object, or the full tree sequence)
  if (source == "tree") {
    edges <- dplyr::tibble(parent = x$edge[, 1], child = x$edge[, 2])
    id <- "phylo_id"
    join1 <- "parent_phylo_id"
    join2 <- "child_phylo_id"
  } else {
    edges <- attr(x, "raw_edges") %>% dplyr::select(-id)
    id <- "node_id"
    join1 <- "parent_node_id"
    join2 <- "child_node_id"
  }

  if (!spatial) data$location <- NA

  # create a new table of node times/locations by running a join operation
  # against the `edges` table above
  parent_nodes <- data[data[[id]] %in% edges$parent, ]
  if (source == "tree")
    parent_nodes <- parent_nodes %>%
      dplyr::select(parent_pop = pop,
                    parent_phylo_id = phylo_id, parent_node_id = node_id,
                    parent_time = time, parent_location = location)
  else
    parent_nodes <- parent_nodes %>%
      dplyr::select(parent_pop = pop,
                    parent_node_id = node_id,
                    parent_time = time, parent_location = location)
  parent_nodes <- dplyr::left_join(
    parent_nodes, edges,
    by = stats::setNames("parent", join1), multiple = "all"
  ) %>% # multiple = "all" required after dplyr v1.1.0
    dplyr::arrange(!!join1)

  # take the `parent_nodes` able above and do another join operation, this time
  # with the table of child nodes' times/locations
  edge_nodes <- data[data[[id]] %in% edges$child, ]
  if (source == "tree")
    edge_nodes <- edge_nodes %>%
      dplyr::select(child_pop = pop,
                    child_phylo_id = phylo_id, child_node_id = node_id,
                    child_time = time, child_location = location)
  else
    edge_nodes <- edge_nodes %>%
      dplyr::select(child_pop = pop,
                    child_node_id = node_id,
                    child_time = time, child_location = location)
  edge_nodes <- dplyr::inner_join(
    edge_nodes, parent_nodes,
    by = stats::setNames("child", join2)
  ) %>%
    dplyr::arrange(!!join2)

  # transforming individual child/parent location columns (type POINT) into a
  # line (type LINESTRING)
  if (spatial) {
    connections <- purrr::map2(
      edge_nodes$child_location, edge_nodes$parent_location, ~
        sf::st_union(.x, .y) %>%
        sf::st_cast("LINESTRING") %>%
        sf::st_sfc() %>%
        sf::st_sf(connection = ., crs = sf::st_crs(edge_nodes))) %>%
      dplyr::bind_rows()
  } else
    connections <- data.frame(connection = NA)

  # create an sf table with all the locations in columns (child_location,
  # parent_location, connect)
  if (source == "tree")
    phylo_cols <- c("parent_phylo_id", "child_phylo_id")
  else
    phylo_cols <- NULL

  edges <-
    dplyr::bind_cols(edge_nodes, connections) %>%
    dplyr::select(!!phylo_cols,
                  child_node_id, parent_node_id,
                  connection,
                  child_time, parent_time,
                  child_pop, parent_pop,
                  child_location, parent_location)

  model <- attr(x, "model")

  if (spatial) {
    crs <- if (from_slendr) sf::st_crs(model$world) else NA
    edges <- sf::st_set_geometry(edges, "connection") %>%
      sf::st_set_crs(crs)
  } else
    edges[, c("connection", "parent_location", "child_location")] <- NULL

  if (from_slendr) {
    pop_names <- order_pops(model$populations, model$direction)
    edges$child_pop <- factor(edges$child_pop, levels = pop_names)
    edges$parent_pop <- factor(edges$parent_pop, levels = pop_names)
  }

  edges
}

check_ts_class <- function(x) {
  if (!inherits(x, "slendr_ts"))
    stop("Not a tree sequence object created by ts_load, ts_simplify, ts_recapitate or ts_mutate", call. = FALSE)
}

# Collect all ancestors of a given node up to the root by traversing the tree
# edges "bottom-up" using a queue
collect_ancestors <- function(x, edges) {
  # list for collecting paths (i.e. sets of edges) leading from the sampled node
  # to the root
  result <- list()

  # initialize the counter of nodes already processed by the queue
  n_nodes <- length(unique(c(edges$child, edges$parent)))
  processed_nodes <- vector(length = n_nodes + 1)

  # initialize the queue with all edges leading from the sampled node
  edge <- edges[edges$child %in% x, ] %>% dplyr::mutate(level = 1)
  queue <- split(edge, edge$child)

  # repeat until the queue is empty (this homebrew queue implementation is
  # probably horribly inefficient but it will do for now)
  i <- 0
  while (TRUE) {
    #cat("queue ", (i <- i + 1), "\n")
    # pop out the first element
    item <- queue[[1]]; queue[[1]] <- NULL

    # add it to the final list
    result[[length(result) + 1]] <- item

    # iterate over all parents of the current node
    p <- 0
    for (parent in split(item, item$parent)) {
      #cat("queue ", i, " parent ", (p <- i + 1), "\n")
      # get edges leading from the current parent to its own parent
      edge <- edges[edges$child == unique(parent$parent), ]

      # if the parent has no parent itself or its node has already been
      # processed, skip it and don't add it to the queue
      if (nrow(edge) == 0) next
      if (processed_nodes[unique(edge$child) + 1]) next

      # mark the node as processed...
      processed_nodes[unique(edge$child) + 1] <- TRUE
      # ... and add it to the queue
      edge$level <- item$level[1] + 1
      queue[[length(queue) + 1]] <- edge
    }

    if (length(queue) == 0) break
  }

  result <- dplyr::bind_rows(result) %>%
    dplyr::select(child_id = child, parent_id = parent, left, right, level)

  result
}

# Collect all descendants of a given node down to the leaves of he tree by
# traversing the tree edges "top-down" using a queue
collect_descendants <- function(x, edges) {
  # list for collecting paths (i.e. sets of edges) leading from the sampled node
  # to the root
  result <- list()

  # initialize the counter of nodes already processed by the queue
  n_nodes <- length(unique(c(edges$child, edges$parent)))
  processed_nodes <- vector(length = n_nodes + 1)

  # initialize the queue with all edges leading from the sampled ancestor
  edge <- edges[edges$parent == x, ] %>% dplyr::mutate(level = 1)
  queue <- split(edge, edge$child)

  # repeat until the queue is empty (this homebrew queue implementation is
  # probably horribly inefficient but it will do for now)
  # i <- 0
  while (TRUE) {
    # cat("queue ", (i <- i + 1), "\n")
    # pop out the first element
    item <- queue[[1]]; queue[[1]] <- NULL

    # add it to the final list
    result[[length(result) + 1]] <- item

    for (child in split(item, item$child)) {
      # cat("queue ", i, " parent ", (p <- i + 1), "\n")
      # get edges leading from the current child to its own children
      edge <- edges[edges$parent == unique(child$child), ] %>%
        dplyr::distinct(child, parent, .keep_all = TRUE)

      # if the child has no children itself or its node has already been
      # processed, skip it and don't add it to the queue
      already_processed <- processed_nodes[unique(edge$child) + 1]
      edge <- edge[!already_processed, ]
      if (nrow(edge) == 0) next

      # mark the node as processed...
      processed_nodes[unique(edge$child) + 1] <- TRUE
      # ... and add it to the queue
      edge$level <- item$level[1] + 1
      queue[[length(queue) + 1]] <- edge
    }

    if (length(queue) == 0) break
  }

  result <- dplyr::bind_rows(result) %>%
    dplyr::select(child_id = child, parent_id = parent, left, right, level)

  result
}

# Convert SLiM's dictionaries with the sampling schedule table to
# a normal R data frame
get_sampling <- function(metadata) {
  # again, because of how SLiM's dictionaries are structured, a bit of
  # data munging needs to be performed in order to get a simple data frame
  if (metadata$backend == "SLiM") {
    sampling <- purrr::transpose(metadata$sampling) %>%
      dplyr::as_tibble() %>%
      tidyr::unnest(cols = c("n", "pop", "time_gen", "time_orig"))
  } else
    sampling <- dplyr::as_tibble(metadata$sampling)

  df <- sampling %>%
    dplyr::select(-time_gen) %>%
    {
      rbind(
        dplyr::filter(., n == 1),
        dplyr::filter(., n > 1) %>% .[rep(seq_len(nrow(.)), .$n), ]
      )
    } %>%
    dplyr::group_by(pop) %>%
    dplyr::mutate(name = paste0(pop, "_", 1:dplyr::n())) %>%
    dplyr::ungroup() %>%
    dplyr::arrange(-time_orig, pop) %>%
    dplyr::rename(time = time_orig) %>%
    dplyr::select(name, time, pop)

  # if needed (i.e. after simplification to a smaller set of sampled individuals), subset
  # the full original sampling schedule table to only individuals of interest
  df %>% dplyr::filter(name %in% metadata$sample_names)
}

# Extract list with slendr metadata (created as Eidos Dictionaries by SLiM and Python
# dictionaries by the msprime simulation script)
get_slendr_metadata <- function(ts) {
  # custom slendr metadata is only present in tree sequences coming from slendr models
  if (is.null(attr(ts, "model"))) return(NULL)

  # SLiM forces metadata into a certain structure, so the slendr metadata
  # must be extracted differently for the two types
  if (attr(ts, "type") == "SLiM") {
    metadata <- ts$metadata$SLiM$user_metadata$slendr[[1]]
    arguments <- metadata$arguments[[1]]
  } else {
    metadata <- ts$metadata$slendr
    arguments <- metadata$arguments
  }

  list(
    version = metadata$version,
    description = metadata$description,
    sampling = get_sampling(metadata),
    sample_names = metadata$sample_names,
    sample_ids = metadata$sample_ids,
    map = metadata$map[[1]],
    arguments = arguments
  )
}

get_biallelic_indices <- function(ts) {
  gts <- ts$genotype_matrix()
  biallelic_pos <- rowSums(gts == 0 | gts == 1) == ncol(gts)
  biallelic_pos
}

# Convert a data frame of information extracted from a tree sequence
# table to an sf spatial object
convert_to_sf <- function(df, model) {
  from_slendr <- !is.null(model)

  crs <- if (from_slendr) sf::st_crs(model$world) else NA

  with_locations <- df[stats::complete.cases(df[, c("raster_x", "raster_y")]), ]
  without_locations <- df[!stats::complete.cases(df[, c("raster_x", "raster_y")]), ]

  if (from_slendr) {
    # reproject coordinates to the original crs
    with_locations <- reproject(
      from = "raster", to = crs, coords = with_locations, model = model,
      input_prefix = "raster_", output_prefix = "", add = TRUE
    )
  } else
    with_locations <- dplyr::rename(with_locations, x = raster_x, y = raster_y)

  result <- sf::st_as_sf(with_locations,
                         coords = c("x", "y"),
                         crs = crs) %>%
    dplyr::rename(location = geometry)

  # directly binding with without_locations which has no rows lead to an error:
  # 'Error: Assigned data `x[[all_sfc_columns[i]]]` must be compatible with
  # existing data.' (similar issue reported here:
  # https://github.com/mtennekes/tmap/issues/551)
  if (nrow(without_locations))
    result <- dplyr::bind_rows(result, without_locations)

  result
}

define_windows <- function(ts, breakpoints) {
  unique(c(0, breakpoints, ts$sequence_length))
}

concat <- function(x) {
  if (is.list(x) && !is.null(names(x)))
    return(names(x))
  else if (is.list(x) && is.null(names(x)))
    return(paste(x[[1]], collapse = "+"))
  else
    return(paste(x, collapse = "+"))
}
Any scripts or data that you put into this service are public.
slendr documentation built on June 22, 2024, 6:56 p.m.
rdrr.io home R language documentation Run R code online
CRAN packages Bioconductor packages R-Forge packages GitHub packages
Note that we can't provide technical support on individual packages. You should contact the package authors for that.
slendr
A Simulation Framework for Spatiotemporal Population Genetics

R/tree-sequences.R
In slendr: A Simulation Framework for Spatiotemporal Population Genetics

Defines functions ts_eigenstrat ts_genotypes ts_metadata ts_mutate ts_simplify ts_recapitate ts_save ts_load

Documented in ts_eigenstrat ts_genotypes ts_load ts_metadata ts_mutate ts_recapitate ts_save ts_simplify

Try the slendr package in your browser

R Package Documentation

Browse R Packages

We want your feedback!

slendr A Simulation Framework for Spatiotemporal Population Genetics

R/tree-sequences.R In slendr: A Simulation Framework for Spatiotemporal Population Genetics

Defines functions ts_eigenstrat ts_genotypes ts_metadata ts_mutate ts_simplify ts_recapitate ts_save ts_load

Documented in ts_eigenstrat ts_genotypes ts_load ts_metadata ts_mutate ts_recapitate ts_save ts_simplify

Try the slendr package in your browser

R Package Documentation

Browse R Packages

We want your feedback!

slendr
A Simulation Framework for Spatiotemporal Population Genetics

R/tree-sequences.R
In slendr: A Simulation Framework for Spatiotemporal Population Genetics