R/gggenomes.R

Defines functions theme_gggenomes_clean infer_seqs_from_links infer_seqs_from_feats print.gggenomes_layout dim.gggenomes_layout layout_genomes ggplot.gggenomes_layout gggenomes

Documented in dim.gggenomes_layout gggenomes ggplot.gggenomes_layout layout_genomes theme_gggenomes_clean

#' Plot genomes, features and synteny maps
#'
#' @description
#' `gggenomes()` initializes a gggenomes-flavored ggplot object.
#' It is used to declare the input data for gggenomes' track system.
#'
#' (*See for more details on the track system, gggenomes vignette or the Details/Arguments section*)
#'
#'
#' @details
#' `gggenomes::gggenomes()` resembles the functionality of `ggplot2::ggplot()`.
#' It is used to construct the initial plot object, and is often followed by "+" to add components to the plot (*e.g. "+ geom_gene()"*).
#'
#' A big difference between the two is that gggenomes has a multi-track setup (*`'seqs'`, `'feats'`, `'genes'` and `'links'`*).
#' `gggenomes()` pre-computes a layout and adds coordinates (`y,x,xend`) to each data frame prior to the actual plot construction.
#' This has some implications for the usage of gggenomes:
#' - **Data frames for tracks have required variables.** These predefined variables are used during import
#' to compute x/y coordinates (*see arguments*).
#' - **gggenomes' geoms can often be used without explicit `aes()` mappings**  This works because
#' we always know the names of the plot variables ahead of time: they originate from the pre-computed layout,
#' and we can use that information to set sensible default aesthetic mappings for most cases.
#'
#' @param genes,feats A data.frame, a list of data.frames, or a character vector
#'   with paths to files containing gene data. Each item is added as feature
#'   track.
#'
#'   For a single data.frame the track_id will be "genes" and "feats",
#'   respectively. For a list, track_ids are parsed from the list names, or if
#'   names are missing from the name of the variable containing each data.frame.
#'   Data columns:
#'
#'   - required: `seq_id,start,end`
#'   - recognized: `strand,bin_id,feat_id,introns`
#'
#' @param seqs A data.frame or a character vector with paths to files containing
#'   sequence data. Data columns:
#'
#'   - required: `seq_id,length`
#'   - recognized: `bin_id,start,end,strand`
#'
#' @param links A data.frame or a character vector with paths to files
#'   containing link data. Each item is added as links track. Data columns:
#'
#'   - required: `seq_id,seq_id2`
#'   - recognized: `start,end,bin_id,start2,end2,bin_id2,strand`
#'
#' @param .id The name of the column for file labels that are created when
#'   reading directly from files. Defaults to "file_id". Set to "bin_id" if
#'   every file represents a different bin.
##' @param infer_length,infer_start,infer_end,infer_bin_id used to infer pseudo
#' seqs if only feats or links are provided, or if no bin_id column was
#' provided. The expressions are evaluated in the context of the first feat
#' or link track.
#'
#' By default subregions of sequences from the first to the last feat/link
#' are generated. Set `infer_start` to 0 to show all sequences from their
#' true beginning.
#' @param adjacent_only Indicates whether links should be created between adjacent sequences/chromosomes only.
#' By default it is set to `adjacent_only = TRUE`. If `FALSE`, links will be created between all sequences
#'
#' (*not recommended for large data sets*)
#' @inheritParams layout_seqs
#' @param theme choose a gggenomes default theme, NULL to omit.
#' @param .layout a pre-computed layout from [layout_genomes()]. Useful for
#'   developmental purposes.
#' @param ... additional parameters, passed to layout
#' @import rlang
#' @import ggplot2 dplyr tidyr stringr
#' @import grid
#' @export
#' @return gggenomes-flavored ggplot object
#' @examples
#' # Compare the genomic organization of three viral elements
#' # EMALEs: endogenous mavirus-like elements (example data shipped with gggenomes)
#' gggenomes(emale_genes, emale_seqs, emale_tirs, emale_ava) +
#'   geom_seq() + geom_bin_label() + # chromosomes and labels
#'   geom_feat(size = 8) + # terminal inverted repeats
#'   geom_gene(aes(fill = strand), position = "strand") + # genes
#'   geom_link(offset = 0.15) # synteny-blocks
#'
#' # with some more information
#' gggenomes(emale_genes, emale_seqs, emale_tirs, emale_ava) %>%
#'   add_feats(emale_ngaros, emale_gc) %>%
#'   add_clusters(emale_cogs) %>%
#'   sync() +
#'   geom_link(offset = 0.15, color = "white") + # synteny-blocks
#'   geom_seq() + geom_bin_label() + # chromosomes and labels
#'   # thistle4, salmon4, burlywood4
#'   geom_feat(size = 6, position = "identity") + # terminal inverted repeats
#'   geom_feat(
#'     data = feats(emale_ngaros), color = "turquoise4", alpha = .3,
#'     position = "strand", size = 16
#'   ) +
#'   geom_feat_note(aes(label = type),
#'     data = feats(emale_ngaros),
#'     position = "strand", nudge_y = .3
#'   ) +
#'   geom_gene(aes(fill = cluster_id), position = "strand") + # genes
#'   geom_wiggle(aes(z = score, linetype = "GC-content"), feats(emale_gc),
#'     fill = "lavenderblush4", position = position_nudge(y = -.2), height = .2
#'   ) +
#'   scale_fill_brewer("Conserved genes", palette = "Dark2", na.value = "cornsilk3")
#'
#' # initialize plot directly from files
#' gggenomes(
#'   ex("emales/emales.gff"),
#'   ex("emales/emales.gff"),
#'   ex("emales/emales-tirs.gff"),
#'   ex("emales/emales.paf")
#' ) + geom_seq() + geom_gene() + geom_feat() + geom_link()
#'
#' # multi-contig genomes wrap to fixed width
#' s0 <- read_seqs(list.files(ex("cafeteria"), "Cr.*\\.fa.fai$", full.names = TRUE))
#' s1 <- s0 %>% dplyr::filter(length > 5e5)
#' gggenomes(seqs = s1, infer_bin_id = file_id, wrap = 5e6) +
#'   geom_seq() + geom_bin_label() + geom_seq_label()
gggenomes <- function(
    genes = NULL, seqs = NULL, feats = NULL, links = NULL,
    .id = "file_id", spacing = 0.05, wrap = NULL, adjacent_only = TRUE,
    infer_bin_id = seq_id, infer_start = min(start, end),
    infer_end = max(start, end), infer_length = max(start, end),
    theme = c("clean", NULL), .layout = NULL, ...) {
  # parse track_args to tracks - some magic for a convenient api
  genes_exprs <- enexpr(genes)
  feats_exprs <- enexpr(feats)
  links_exprs <- enexpr(links)

  genes <- as_tracks(genes, genes_exprs, "seqs", context = "feats")
  feats <- as_tracks(feats, feats_exprs, c("seqs", names2(genes)), context = "feats")
  feats <- c(genes, feats) # genes are just feats
  links <- as_tracks(links, links_exprs, c("seqs", names2(feats)), context = "links")

  if (is_character(seqs)) {
    seqs <- read_seqs(seqs, .id = .id)
  }

  layout <- .layout %||% layout_genomes(
    seqs = seqs, genes = genes, feats = feats,
    links = links, spacing = spacing, wrap = wrap, adjacent_only = adjacent_only,
    infer_bin_id = {{ infer_bin_id }}, infer_start = {{ infer_start }},
    infer_end = {{ infer_end }}, infer_length = {{ infer_length }}, ...
  )

  p <- ggplot(data = layout)
  class(p) <- c("gggenomes", class(p))

  p <- p + scale_y_continuous(expand = expansion(add = .7, mult = 0.01))

  theme_name <- theme[[1]] %||% match.arg(theme[[1]], c("clean"))
  if (!is.null(theme_name)) { # add theme
    theme_args <- if (is.list(theme) && length(theme) > 1) theme[-1] else list()
    p <- p + do.call(paste0("theme_gggenomes_", theme), theme_args)
  }

  p <- p + geom_blank(data = seqs())

  p
}

#' ggplot.default tries to `fortify(data)` and we don't want that here
#'
#' @export
#' @return ggplot object
#' @keywords internal
ggplot.gggenomes_layout <- function(data, mapping = aes(), ...,
                                    environment = parent.frame()) {
  if (!missing(mapping) && !inherits(mapping, "uneval")) {
    stop("Mapping should be created with `aes() or `aes_()`.", call. = FALSE)
  }

  p <- structure(list(
    data = data,
    layers = list(),
    scales = ggplot2__scales_list(),
    guides = ggplot2__guides_list(),
    mapping = mapping,
    theme = list(),
    coordinates = coord_cartesian(default = TRUE),
    facet = facet_null(),
    plot_env = environment,
    layout = ggplot2::ggproto(NULL, Layout)
  ), class = c("gg", "ggplot"))

  p$labels <- ggplot2__make_labels(mapping)

  ggplot2::set_last_plot(p)
  p
}
#' Layout genomes
#' @inheritParams gggenomes
#' @keywords internal
#' @return gggenomes_layout object
#' @export
layout_genomes <- function(
    seqs = NULL, genes = NULL, feats = NULL, links = NULL,
    infer_bin_id = seq_id, infer_start = min(start, end), infer_end = max(start, end),
    infer_length = max(start, end), adjacent_only = TRUE, ...) {
  # check seqs / infer seqs if not provided
  if (!is.null(seqs)) {
    if (!has_name(seqs, "bin_id")) {
      seqs <- mutate(seqs, bin_id = {{ infer_bin_id }})
    }
  } else {
    if (is.null(feats) & is.null(links)) {
      abort("Need at least one of: seqs, genes, feats or links")
    }

    # infer dummy seqs
    if (!is.null(feats)) {
      inform("No seqs provided, inferring seqs from feats")
      seqs <- infer_seqs_from_feats(
        feats[[1]], {{ infer_bin_id }}, {{ infer_start }},
        {{ infer_end }}, {{ infer_length }}
      )
    } else if (!is.null(links)) {
      inform("No seqs or feats provided, inferring seqs from links")
      seqs <- infer_seqs_from_links(
        links[[1]], {{ infer_bin_id }}, {{ infer_start }},
        {{ infer_end }}, {{ infer_length }}
      )
    }
  }

  # init the gggenomes_layout object
  x <- list(
    seqs = NULL, feats = list(), links = list(), orig_links = list(),
    args_seqs = list(...), args_links = list(adjacent_only = adjacent_only)
  )
  x %<>% set_class("gggenomes_layout", "prepend")

  # add track data to layout
  x %<>% add_seqs(seqs, ...) # layout seqs
  if (!is.null(feats)) x <- add_feat_tracks(x, feats)
  if (!is.null(links)) x <- add_link_tracks(x, links, adjacent_only = adjacent_only)

  x
}

#' `ggplot2::facet_null` checks data with `empty(df)` using `dim`. This causes
#' an error because dim(gggenome_layout) is undefined. Return dim of primary
#' table instead
#' @export
#' @return dim of primary table
#' @keywords internal
dim.gggenomes_layout <- function(x) dim(get_seqs(x))

#' @export
print.gggenomes_layout <- function(x, ...) track_info(x)

infer_seqs_from_feats <- function(
    feats, infer_bin_id = seq_id, infer_start = min(start, end),
    infer_end = max(start, end), infer_length = max(start, end)) {
  if (!has_name(feats, "bin_id")) {
    feats <- mutate(feats, bin_id = {{ infer_bin_id }})
  } else {
    warn("bin_id found in feats, won't overwrite")
  }

  seqs <- feats %>%
    dplyr::group_by(.data$bin_id, .data$seq_id) %>%
    dplyr::summarize(
      length = {{ infer_length }},
      .start = {{ infer_start }},
      .end = {{ infer_end }}
    ) %>%
    dplyr::rename(start = .data$.start, end = .data$.end) # this is necessary, so {{ infer_end }} does
  # not already use the "start" from {{ infer_start }}

  dplyr::ungroup(seqs)
}

infer_seqs_from_links <- function(
    links, infer_bin_id = seq_id, infer_start = min(start, end),
    infer_end = max(start, end), infer_length = max(start, end)) {
  seqs <- bind_rows(
    select_at(links, vars(ends_with("1")), str_replace, "1", ""),
    select_at(links, vars(ends_with("2")), str_replace, "2", "")
  )

  if (!has_name(seqs, "bin_id")) {
    seqs <- mutate(seqs, bin_id = {{ infer_bin_id }})
  }

  seqs %<>%
    dplyr::mutate(bin_id = {{ infer_bin_id }}) %>%
    dplyr::group_by(.data$seq_id, .data$bin_id) %>%
    dplyr::summarize(
      length = {{ infer_length }},
      .start = {{ infer_start }},
      .end = {{ infer_end }}
    ) %>%
    dplyr::rename(start = .data$.start, end = .data$.end)

  dplyr::ungroup(seqs)
}

#' gggenomes default theme
#' @importFrom ggplot2 theme_bw
#' @importFrom ggplot2 theme
#' @inheritParams ggplot2::theme_bw
#' @return ggplot2 theme with gggenomes defaults
#' @export
theme_gggenomes_clean <- function(base_size = 12, base_family = "", base_line_size = base_size / 30, base_rect_size = base_size / 30) {
  theme_bw(
    base_size = base_size, base_family = base_family,
    base_line_size = base_line_size, base_rect_size = base_rect_size
  ) + theme(
    panel.border = element_blank(),
    panel.grid.major = element_blank(),
    panel.grid.minor = element_blank(),
    panel.background = element_rect(fill = "white"),
    # x-axis
    axis.line.x = element_line(color = "black", size = .4),
    axis.title.x = element_blank(),
    axis.text.x = element_text(color = "black", size = 7),
    axis.ticks.length.x = unit(.7, "mm"),
    # y-axis
    axis.title.y = element_blank(),
    axis.ticks.y = element_blank(),
    axis.text.y = element_blank()
  )
}
thackl/gggenomes documentation built on Aug. 31, 2024, 9:47 p.m.