#' Read pangenome from a file or folder
#'
#' This function reads a file or folder containing a pangenome, as created by
#' tools such as OrthoFinder, and returns a genes tibble with the genome and
#' orthogroup of each gene.
#'
#' The `path` can currently only be an OrthoFinder Orthogroups folder (for
#' OrthoFinder >= v2.3.1), or Results folder (older versions). In the future,
#' this can be extended to output folders or files from other pangenome tools.
#'
#' @param path Path to a file or folder containing the pangenome
#'
#' @return A tibble with the variables gene, genome and orthogroup
#'
#' @examples
#' \dontrun{
#' genes <- read_pangenome("Results_Sep06")
#' }
#'
#' @export
read_pangenome <- function(path) {
path_orthogroups <-
paste0(path, "/Orthogroups.?sv") %>%
Sys.glob()
path_unassigned <-
paste0(path, "/Orthogroups_UnassignedGenes.?sv") %>%
Sys.glob()
if (file.exists(path_orthogroups) & file.exists(path_unassigned)) {
message("OrthoFinder pagenome files detected")
} else {
stop("No pangenome file(s) found")
}
genes_assigned <-
path_orthogroups %>%
readr::read_tsv(col_names = T, col_types = cols(.default = "c")) %>%
rename(orthogroup = 1) %>%
gather(key = "genome", value = "gene", na.rm = T, - orthogroup) %>%
separate_rows(gene, sep = ", ")
genes_unassigned <-
path_unassigned %>%
readr::read_tsv(col_names = T, col_types = cols(.default = "c")) %>%
rename(orthogroup = 1) %>%
gather(key = "genome", value = "gene", na.rm = T, - orthogroup)
genes <- bind_rows(genes_assigned, genes_unassigned)
genes
}
#' Read a phylip formatted distance matrix
#'
#' This function reads a phylip formatted distance matrix, as generated by e.g.
#' the EMBOSS tool distmat.
#'
#' The number of lines to skip can be different for different phylip distance
#' matrix files.
#'
#' For each row, the value of sequence_1 will be before the value of sequence_2
#' in sorting order.
#'
#' @param path Path to a phylip formatted distance matrix file
#' @param skip Number of lines before the actual distance values start
#' @param include_diagonal Should the diagonal of the distance matrix be included?
#'
#' @return A tibble with the variables sequence_1, sequence_2 and distance
#'
#' @export
read_phylip_distmat <- function(path, skip = 8, include_diagonal = T) {
distances_raw <-
readr::read_tsv(path, col_names = F, skip = skip) %>%
separate(ncol(.), into = c("name", "number"), sep = " ")
names <- distances_raw$name
n <- length(names)
distances <-
distances_raw %>%
select_if(~ ! all(is.na(.))) %>%
select(1:(!! n)) %>%
`names<-`(names) %>%
mutate(sequence_1 = !! names) %>%
gather(key = "sequence_2", value = "distance", - sequence_1, na.rm = T) %>%
mutate_at("distance", as.double)
distances_1 <- filter(distances, sequence_1 >= sequence_2)
distances_2 <-
distances %>%
filter(sequence_1 < sequence_2) %>%
mutate(sequence_1_temp = sequence_2, sequence_2_temp = sequence_1) %>%
select(sequence_1 = sequence_1_temp, sequence_2 = sequence_2_temp, distance)
distances <- bind_rows(distances_1, distances_2)
if (! include_diagonal) {
distances <- filter(distances, sequence_1 != sequence_2)
}
distances
}
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.