tidyorthogroups: A Grammar of Genome Data Manipulation

Documented in read_pangenome read_phylip_distmat

#' Read pangenome from a file or folder
#'
#' This function reads a file or folder containing a pangenome, as created by
#' tools such as OrthoFinder, and returns a genes tibble with the genome and
#' orthogroup of each gene.
#' 
#' The `path` can currently only be an OrthoFinder Orthogroups folder (for
#' OrthoFinder >= v2.3.1), or Results folder (older versions). In the future,
#' this can be extended to output folders or files from other pangenome tools.
#'
#' @param path Path to a file or folder containing the pangenome
#' 
#' @return A tibble with the variables gene, genome and orthogroup
#'
#' @examples
#' \dontrun{
#' genes <- read_pangenome("Results_Sep06")
#' }
#' 
#' @export
read_pangenome <- function(path) {
  
  path_orthogroups <- 
    paste0(path, "/Orthogroups.?sv") %>% 
    Sys.glob()
  path_unassigned <- 
    paste0(path, "/Orthogroups_UnassignedGenes.?sv") %>% 
    Sys.glob()
  
  if (file.exists(path_orthogroups) & file.exists(path_unassigned)) {
    message("OrthoFinder pagenome files detected")
  } else {
    stop("No pangenome file(s) found")
  }
  
  genes_assigned <- 
    path_orthogroups %>%
    readr::read_tsv(col_names = T, col_types = cols(.default = "c")) %>%
    rename(orthogroup = 1) %>%
    gather(key = "genome", value = "gene", na.rm = T, - orthogroup) %>%
    separate_rows(gene, sep = ", ")
  
  genes_unassigned <- 
    path_unassigned %>%
    readr::read_tsv(col_names = T, col_types = cols(.default = "c")) %>%
    rename(orthogroup = 1) %>%
    gather(key = "genome", value = "gene", na.rm = T, - orthogroup)
  
  genes <- bind_rows(genes_assigned, genes_unassigned)
  
  genes
  
}

#' Read a phylip formatted distance matrix
#'
#' This function reads a phylip formatted distance matrix, as generated by e.g.
#' the EMBOSS tool distmat.
#' 
#' The number of lines to skip can be different for different phylip distance
#' matrix files.
#' 
#' For each row, the value of sequence_1 will be before the value of sequence_2
#' in sorting order.
#'
#' @param path Path to a phylip formatted distance matrix file
#' @param skip Number of lines before the actual distance values start
#' @param include_diagonal Should the diagonal of the distance matrix be included? 
#' 
#' @return A tibble with the variables sequence_1, sequence_2 and distance
#' 
#' @export
read_phylip_distmat <- function(path, skip = 8, include_diagonal = T) {
  
  distances_raw <- 
    readr::read_tsv(path, col_names = F, skip = skip) %>%
    separate(ncol(.), into = c("name", "number"), sep = " ")
  
  names <- distances_raw$name
  
  n <- length(names)
  
  distances <- 
    distances_raw %>%
    select_if(~ ! all(is.na(.))) %>%
    select(1:(!! n)) %>%
    `names<-`(names) %>%
    mutate(sequence_1 = !! names) %>%
    gather(key = "sequence_2", value = "distance", - sequence_1, na.rm = T) %>%
    mutate_at("distance", as.double)
  
  distances_1 <- filter(distances, sequence_1 >= sequence_2)
  
  distances_2 <- 
    distances %>%
    filter(sequence_1 < sequence_2) %>%
    mutate(sequence_1_temp = sequence_2, sequence_2_temp = sequence_1) %>%
    select(sequence_1 = sequence_1_temp, sequence_2 = sequence_2_temp, distance)
  
  distances <- bind_rows(distances_1, distances_2)
  
  if (! include_diagonal) {
    distances <- filter(distances, sequence_1 != sequence_2)
  }
  
  distances
  
}