gscramble: Simulating Admixed Genotypes Without Replacement

Documented in gsp2dot

#' Write a dot file to represent a genome simulation pedigree
#'
#' This takes the tibble representation of a GSP and writes it to
#' a dot file to be rendered into a graph using the dot command
#' from the GraphViz package.  You can easily get GraphViz using Miniconda
#' or check out the GraphViz downloads page.  If you have the dot
#' executable in your PATH, then dot will be run on the dot file
#' and an SVG and a PNG image of the graph.
#' @param g a GSP tibble.
#' @param path the path to the file prefix to use (to this will be appended
#' .dot, and .png or .svg, if dot is on your system). By default these
#' paths are in a temporary directory, because packages are not allowed to
#' write to user home directories by default.  Supply a path with prefix,
#' like `my_path/myfile` to get the output file `mypath.myfile.dot`
#' @param edge_label_font_size The font size of the number annotations along the edges.
#' @param indiv_node_label_font_size the font size of the labels for the individual nodes
#' @param sample_node_label_font_size the font size of the labels for the individual nodes
#' @param haplo_origin_colors The colors for different origins of haplotypes.
#' By default there are only sixteen.  If you have more populations that founders
#' may come from, you should provide a vector with more than 16 colors.
#' @param sam_node_color The color given to the sample nodes in the GSP.
#' @param sample_edge_label_color Color for the numeric annotations along the
#' edges leading to samples.
#' @param parent_edge_label_color Color for the numeric annotations along the
#' edges leading from parents to offspring.
#' @details It can be tricky knowing whether or not R or Rstudio will read
#' your Unix rc files and populate your paths appropriately.  If you want to
#' test whether `dot` in on your PATH as it is when running under R, try:
#' `Sys.which("dot")` at your R console.  If that returns an empty string,
#' (`""`), then you need to do something else to make sure R can find `dot`
#' on your system.
#' @return A vector of file paths.  The first is the path of the dot
#' file that was produced.  The second and third are only present if
#' dot was found in the PATH.  They are the paths of the png and svg
#' files that were produced.
#' @export
#' @examples
#' gsp_file <- system.file("extdata/13-member-ped.csv", package = "gscramble")
#' g <- readr::read_csv(gsp_file)
#'
#' paths <- gsp2dot(g)
#' paths
#'
gsp2dot <- function(
    g,
    path = file.path(tempfile(), "file_prefix"),
    edge_label_font_size = 18,
    indiv_node_label_font_size = 18,
    sample_node_label_font_size = 18,
    haplo_origin_colors = c(
      "lightblue",
      "orange",
      "blue",
      "green",
      "cadetblue",
      "dodgerblue3",
      "darkolivegreen1",
      "forestgreen",
      "lightpink",
      "red2",
      "sandybrown",
      "orangered",
      "plum3",
      "purple4",
      "palegoldenrod",
      "peru"
    ),
    sam_node_color = "violet",
    sample_edge_label_color = "purple",
    parent_edge_label_color = "red"
) {

  # get the path to the output files and make the directory if necessary
  DIR <- dirname(path)
  BASE <- basename(path)
  file <- path
  dir.create(DIR, recursive = TRUE, showWarnings = FALSE)

  #### GET THE NODE SPECIFICATIONS ALL SET UP IN A TIBBLE ####
  # define shapes and such for different nodes
  shape_stuff <- tibble(
    node_type = c("hap", "ind", "sam"),
    shape_text = c("shape=invtriangle, regular=1, height=0.56, fixedsize=true",
                   glue("shape=box, regular=1, height=0.86, fixedsize=true, fontsize={indiv_node_label_font_size}"),
                   glue("shape=hexagon, regular=1, height=0.86, fixedsize=true, style=filled, fillcolor={sam_node_color}, fontsize={sample_node_label_font_size}"))
  )

  # get a tibble of all the different types of nodes, and add the
  # labels that they should have
  node_tib <- list(
    hap = tibble(id = as.character(unique(c(g$hap1, g$hap2)))),
    ind = tibble(id = as.character(unique(c(g$ind, g$par1, g$par2)))),
    sam = tibble(id = as.character(g$sample))
  ) %>% bind_rows(.id = "node_type") %>%
    filter(!is.na(id)) %>%
    left_join(shape_stuff, by = "node_type")

  # now, pull stuff together to make the labels for the haplos
  tmp <- g %>%
    filter(!(is.na(hap1) & is.na(hap2)))
  haplos <- tibble(
    node_type = "hap",
    id = c(tmp$hap1, tmp$hap2),
    pop = c(tmp$hpop1, tmp$hpop2)
  )

  node_tib2 <- node_tib %>%
    left_join(haplos, by = c("node_type", "id")) %>%
    mutate(pop_f = factor(pop)) %>%
    mutate(label_text = case_when(
      node_type == "hap" ~ glue("label={pop}, style=filled, fillcolor={haplo_origin_colors[pop_f]}"),
      TRUE ~ str_c("label=", id)
    )) %>%
    mutate(node_text = glue("\"{id}\" [{shape_text}, {label_text}];"))

  #### NOW, GET THE EDGE SPECS ALL SET UP IN A TIBBLE ####
  hap_edges <- g %>%
    filter(!(is.na(hap1) & is.na(hap2))) %>%
    mutate(edges = glue("\"{hap1}\" -> \"{ind}\" [dir=none, style=solid];
                         \"{hap2}\" -> \"{ind}\" [dir=none, style=solid];"))

  par_edges <- g %>%
    filter(!is.na(par1) & !is.na(par2)) %>%
    mutate(edges = glue("\"{par1}\" -> \"{ind}\" [dir=none, style=solid, label = \" {ipar1}\", fontsize={edge_label_font_size}, fontcolor={parent_edge_label_color}];
                         \"{par2}\" -> \"{ind}\" [dir=none, style=solid, label = \" {ipar2}\", fontsize={edge_label_font_size}, fontcolor={parent_edge_label_color}];"))


  sam_edges <- g %>%
    filter(!is.na(sample)) %>%
    mutate(edges = glue("\"{ind}\" -> \"{sample}\" [dir=none, style=solid, label = \" {osample}\", fontsize={edge_label_font_size}, fontcolor={sample_edge_label_color}];"))

  #### NOW, JUST WRITE THAT OUT ####
  ret <- paste0(file, ".dot")
  names(ret) <- "dot"
  cat(
    "digraph xxx {\nlabel = \"  \"\nranksep = 1\nnodesep = 1\ncompress = false",
    node_tib2$node_text,
    hap_edges$edges,
    par_edges$edges,
    sam_edges$edges,
    file = ret,
    sep = "\n",
    "}"
  )

  ### Now, if dot is on the system make the png and the svg
  if(Sys.which("dot") != "") {
    PNG <- paste0(BASE, ".png")
    SVG <- paste0(BASE, ".svg")
    dot <- basename(ret)
    CALL <- paste(
      "cd ", DIR,
      "; dot -Tpng ", dot, " > ", PNG,
      "; dot -Tsvg ", dot, " > ", SVG
    )
    system(CALL)

    ret[2] <- paste0(file, ".png")
    ret[3] <- paste0(file, ".svg")
    names(ret) <- c("dot", "png", "svg")
  } else {
    message("Cannot find dot on the system path. Returning the path to just the dot file, but not the rendered png and svg files.")
  }

  return(ret)

}