R/write.R

Defines functions unchop_cds write_gff3

Documented in write_gff3

#' Write a gff3 file from a tidy table
#'
#' @importFrom stats na.omit
#' @param feats tidy feat table
#' @param file name of output file
#' @param seqs a tidy sequence table to generate optional `##sequence-region`
#'   directives in the header
#' @param type if no type column exists, use this as the default type
#' @param source if no source column exists, use this as the default source
#' @param score if no score column exists, use this as the default score
#' @param strand if no strand column exists, use this as the default strand
#' @param phase if no phase column exists, use this as the default phase
#' @param id_var the name of the column to use as the GFF3 `ID` tag
#' @param parent_var the name of the column to use as GFF3 `Parent` tag
#' @param ignore_attr attributes not to be included in GFF3 tag list. Defaults
#'   to internals: `introns, geom_id`
#' @param head additional information to add to the header section
#' @return No return value, writes to file
#' @examples
#' filename <- tempfile(fileext = ".gff")
#' write_gff3(emale_genes, filename, emale_seqs, id_var = "feat_id")
#' @export
write_gff3 <- function(
    feats, file, seqs = NULL, type = NULL, source = ".", score = ".", strand = ".", phase = ".",
    id_var = "feat_id", parent_var = "parent_ids", head = "##gff-version 3", ignore_attr = c("introns", "geom_id")) {
  if (!is.null(seqs)) {
    if (!all(has_name(seqs, c("start", "end")))) {
      if (!has_name(seqs, "length")) {
        rlang::abort("eigher start/end or length required")
      }
      seqs <- mutate(seqs, start = 1, end = length)
    }
    seqs <- mutate(seqs, directive = "##sequence-region", end = .data$end - .data$start + 1, start = 1) %>%
      select("directive", "seq_id", "start", "end") %>%
      unite("seq_reg", 1:4, sep = " ")
  }

  require_vars(feats, c("seq_id", "start", "end"))
  if (!has_name(feats, "type")) {
    if (is.null(type)) rlang::abort("type required")
    feats$type <- type
  }
  if (!has_name(feats, "score")) feats$score <- score
  if (!has_name(feats, "phase")) feats$phase <- phase
  if (!has_name(feats, "source")) feats$source <- source
  if (!has_name(feats, "strand")) {
    feats$strand <- strand
  } else {
    feats$strand <- strand_chr(feats$strand, na = ".")
  }

  # arrange so that predefined gff attributes come first and in fixed order
  if (id_var %in% names(feats)) {
    names(feats)[names(feats) == id_var] <- "ID"
    id_tag <- "ID"
  } else {
    id_tag <- NULL
    rlang::warn(paste("No ID variable detected. An ID tag is not required, but recommended.",
      "Use `id_var` to specify an ID column with a different name",
      sep = "\n"
    ))
  }

  if (parent_var %in% names(feats)) {
    names(feats)[names(feats) == parent_var] <- "Parent"
  }

  gff3_attr <- c("ID", "Parent", "Name", "Alias", "Target", "Gap", "Derives_from", "Note", "Ontology_term")
  gff3_cols <- c("seq_id", "source", "type", "start", "end", "score", "strand", "phase")

  # compute starts/ends for CDS and cDNA_match from introns
  feats <- unchop_cds(feats)
  feats <- select(feats, -all_of(ignore_attr))
  feats <- feats %>% mutate(
    across(all_of(gff3_cols), ~ replace_na(.x, ".")),
    across(where(is_list), function(l) purrr::map_chr(l, stringr::str_c, collapse = ","))
  )

  attr <- setdiff(names(feats), c(gff3_cols, id_tag))

  if (!is.null(id_tag) || length(attr)) {
    # convert attributes tags to title case, gff convention
    attr_predef <- attr[na.omit(match(gff3_attr, str_to_title(attr)))]
    attr_predef_ii <- names(feats) %in% attr_predef
    names(feats)[attr_predef_ii] <- str_to_title(names(feats)[attr_predef_ii])
    attr_custom <- setdiff(attr, attr_predef)
    attr <- c(id_tag, str_to_title(attr_predef), attr_custom)

    for (att in attr) {
      feats[[att]] <- ifelse(
        is.na(feats[[att]]) | !length(feats[[att]]),
        NA, paste0(att, "=", feats[[att]])
      )
    }

    feats <- unite(feats, "attr", all_of(attr), sep = ";", na.rm = TRUE)
    body <- feats[, c(gff3_cols, "attr")]
  } else {
    body <- feats[, gff3_cols]
  }

  write(head, file)
  if (!is.null(seqs)) readr::write_tsv(seqs, file, append = T, col_names = F, quote = "none", escape = "none")
  readr::write_tsv(body, file, append = T, col_names = F, quote = "none", escape = "none")
}

unchop_cds <- function(x) {
  # expand collapsed CDS from start/end/introns to starts/ends list cols
  coords <- purrr::pmap_df(x, function(type, start, end, introns, ...) {
    if (!type %in% c("CDS", "cDNA_match")) {
      tibble(start = list(start), end = list(end))
    } else {
      se <- c(start, start + introns - c(2, 0), end)
      tibble(
        start = list(se[c(TRUE, FALSE)]),
        end = list(se[c(FALSE, TRUE)])
      )
    }
  })

  mutate(x, coords) %>% tidyr::unchop(c(start, end))
}

Try the gggenomes package in your browser

Any scripts or data that you put into this service are public.

gggenomes documentation built on Sept. 11, 2024, 8:55 p.m.