knitr::opts_chunk$set( collapse = TRUE, comment = "#>", fig.path = "man/figures/README-", out.width = "100%" )
cat( badger::badge_devel("cparsania/tidywrappers" , color = "blue"), badger::badge_lifecycle() )
library(magrittr) library(dplyr)
phyloR
is an R package to deal with NCBI-BLAST output. It helps to pre-process BLAST tabular output for downstream phylogenetic analysis.
if(require("devtools")){ devtools::install_github("cparsania/phyloR") } else{ install.packages("devtools") devtools::install_github("cparsania/phyloR") }
blast_out_file <- system.file("extdata" ,"blast_output_01.txt", package = "phyloR") blast_out_tbl <- readr::read_delim(blast_out_file , delim = "\t" , comment = "#" ,col_names = F) colnames(blast_out_tbl) <- phyloR::get_blast_outformat_7_colnames() colnames(blast_out_tbl)
One can filter blast hits by individual or combinations of these variables.
## evalue <= 1e-5 blast_out_tbl %>% phyloR::filter_blast_hits(evalue = 1e-5) ## Filter blast hits : bitscore >= 200 blast_out_tbl %>% phyloR::filter_blast_hits(bit_score = 200) ## Filter blast hits : query coverage >= 90 blast_out_tbl %>% phyloR::filter_blast_hits(query_cov = 90, query_length = 253) ## Filter blast hits : percent identity >= 90 blast_out_tbl %>% phyloR::filter_blast_hits(identity = 90) ## Filter blast hits : evalue <= 1e-5, bitscore >= 200, query coverage >= 90, percent identity >= 60, blast_out_tbl %>% phyloR::filter_blast_hits(evalue = 1e-5, bit_score = 200, query_cov = 90, query_length = 253, identity = 60)
Fasta file downloaded as an output of blast result has fasta headers of two types
1) Headers with alignment co-ordinates - if only aligned sequences downloaded.
KAE8371401.1:1-253 trypsin-like cysteine/serine peptidase domain-containing protein [Aspergillus bertholletius]
2) Headers without alignment co-ordinates - if full sequences downloaded.
KAE8371401.1 trypsin-like cysteine/serine peptidase domain-containing protein [Aspergillus bertholletius]
By default, phyloR::format_fasta_headers()
divides headers in three (header type 2) or four (header type 1) groups delimited by 2 underscores (__
).
1) subject id 2) alignment coordinates (Optional for header type 1) 3) subject description 4) subject species
One can drop alignment coordinates from sequence headers by keeping argument keep_alignemnt_coord = FALSE
fa_file <- system.file("extdata" ,"blast_output_01.fasta", package = "phyloR") fa <- Biostrings::readBStringSet(fa_file) ## existing headers names(fa) %>% head() ## new headers fa_file %>% phyloR::format_fasta_headers() %>% names() %>% head() ## drop alignment co-ordinates fa_file %>% phyloR::format_fasta_headers(keep_alignemnt_coord = FALSE) %>% names() %>% head()
BLAST reports same subject hit multiple time with different alignment co-ordinates. For a same subjet id, phyloR::remove_redundant_hits()
select longest subject hit out of given multiples.
blast_out_tbl %>% phyloR::remove_redundant_hits()
After filtering hits from tabular blast output, next obvious question is to filter same sequences from a fasta file and write them to new file. phyloR::subset_bstringset()
subset the sequences from a fasta using a vector of fasta headers.
fa_file <- system.file("extdata" ,"blast_output_01.fasta", package = "phyloR") fa <- Biostrings::readBStringSet(fa_file) query_headers <- fa %>% names() %>% sample(100) seq_filtered <- phyloR::subset_bstringset(x = query_headers , y = fa, partial_match = F) # One can use function `Biostrings::writeXStringSet()` to write filterd sequences to new fasta file.
phyloR allows you to assign NCBI taxonomy levels to NCBI protein accession using function phyloR::add_taxonomy_columns()
. One of the applications of this function is to assign the NCBI taxonomy levels to blast output, which is required for downstream phylogenetic analysis. For example, a phylogenetic tree generated from blast hits often required to color resultant tree branches either by kingdom, family or any other NCBI taxonomy level. One can easily identify these taxonomy levels using function phyloR::add_taxonomy_columns()
## add kingdom with_kingdom <- blast_out_tbl %>% slice(1:10) %>% phyloR::add_taxonomy_columns(ncbi_accession_colname = "subject_acc_ver", taxonomy_level = "kingdom") with_kingdom %>% dplyr::select(query_acc_ver, subject_acc_ver, kingdom) ## add family with_kingdom_and_family <- with_kingdom %>% phyloR::add_taxonomy_columns(ncbi_accession_colname = "subject_acc_ver" , taxonomy_level = "family") with_kingdom_and_family %>% dplyr::select(query_acc_ver, subject_acc_ver, kingdom, family ) ## add species with_kingdom_family_and_species <- with_kingdom_and_family %>% phyloR::add_taxonomy_columns(ncbi_accession_colname = "subject_acc_ver" , taxonomy_level = "species") with_kingdom_family_and_species %>% dplyr::select(query_acc_ver, subject_acc_ver, kingdom, family ,species)
An object of class phylo
is widely used object to visualize the phylogenetic tree in R. To gain more insights from the resultant phylogenetic tree, often tree tips required to color by different levels of taxonomy, for instance species, kingdom or family. Obtaining taxonomy data to use them for tree annotations is not straight forward task and require lots of data wrangling in R. phyloR provides handy way to map such a taxonomy data to the phylogenetic tree. For example, one can convert an object of class phylo
to an object of class tibble
and vice versa. Once the tibble obtained, phyloR allows to add columns of ncbi taxonomy levels. Resultant tibble can be passed to ggtree::ggtree()
to visualize the tree and taxonomy levels can be used, for instance, to color the tree tips.
tree_string <- "((XP_005187699_1__Musca_domestica:0.070627277,(XP_019893806_1__Musca_domestica:0.071069674,((XP_013113221_1__Stomoxys_calcitrans:0.1494662042,ACB98719_1__Glossina_morsitans_morsitans:0.3489851076)67.4/100:0.0470213767,XP_013102958_1__Stomoxys_calcitrans:0.1794878827)98.1/100:0.0959227604)88.2/99:0.0323598861)93/99:0.0435291148,((XP_017472861_1__Rhagoletis_zephyria:0.0049337059,XP_017472862_1__Rhagoletis_zephyria:0.0112391294)97.3/100:0.0860969479,(XP_020713236_1__Ceratitis_capitata:0.2642805176,(XP_014102010_1__Bactrocera_oleae:0.1183517872,XP_018784523_1__Bactrocera_latifrons:0.1137567198)29.6/88:0.0758551876)99.9/100:0.247740081)92/100:0.0716529011)34.3/66:2.487103817;" tree_objct <- treeio::read.tree(text = tree_string) tree_tbl <- tree_objct %>% ggtree::fortify() tree_tbl <- tree_tbl %>% dplyr::mutate( seqid = dplyr::case_when(isTip ~ stringr::str_replace(label , pattern = "__.*","" ) %>% ## split by '__' stringr::str_replace(pattern = "_\\d$" , ""), ## remove trailing digits from seqid TRUE ~ label ) ) ## add taxonomy tree_tbl_with_taxonomy <- tree_tbl %>% phyloR::tidy_taxonomy_tree(ncbi_accession_colname = "seqid",taxonomy_levels = c("species" ,"kingdom","family")) ## visualize tree # tips colored by species tree_tbl_with_taxonomy %>% ggtree::ggtree() + ggtree::geom_tiplab(ggplot2::aes(color = species, label = seqid),size = 20) + ggplot2::theme(legend.position = "bottom" , legend.text = ggplot2::element_text(size= 50) , legend.title = ggplot2::element_text(size= 50)) # tips colored by family tree_tbl_with_taxonomy %>% ggtree::ggtree() + ggtree::geom_tiplab(ggplot2::aes(color = family,label = seqid),size = 20) + ggplot2::theme(legend.position = "bottom" , legend.text = ggplot2::element_text(size= 50) , legend.title = ggplot2::element_text(size= 50)) # tips colored by kingdom tree_tbl_with_taxonomy %>% ggtree::ggtree() + ggtree::geom_tiplab(ggplot2::aes(color = kingdom,label = seqid ),size =20) + ggplot2::theme(legend.position = "bottom" , legend.text = ggplot2::element_text(size= 50) , legend.title = ggplot2::element_text(size= 50))
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.