In cparsania/phyloR: Clean blast results for downstream phyloegentic analysis

knitr::opts_chunk$set(
  collapse = TRUE,
  comment = "#>",
  fig.path = "man/figures/README-",
  out.width = "100%"
)

phyloR

cat(
badger::badge_devel("cparsania/tidywrappers" , color = "blue"),
badger::badge_lifecycle()
)

library(magrittr)
library(dplyr)

phyloR is an R package to deal with NCBI-BLAST output. It helps to pre-process BLAST tabular output for downstream phylogenetic analysis.

Install

if(require("devtools")){
        devtools::install_github("cparsania/phyloR")
} else{
        install.packages("devtools")
        devtools::install_github("cparsania/phyloR")
}

Assign column names to blast tabular output

blast_out_file <- system.file("extdata" ,"blast_output_01.txt", package = "phyloR")
blast_out_tbl <- readr::read_delim(blast_out_file , delim = "\t" , comment = "#" ,col_names = F)

colnames(blast_out_tbl) <- phyloR::get_blast_outformat_7_colnames()
colnames(blast_out_tbl)

Filter blast hits

One can filter blast hits by individual or combinations of these variables.

e-value
bitscore
query coverage
percent identity

## evalue <= 1e-5

blast_out_tbl %>% phyloR::filter_blast_hits(evalue = 1e-5)

##  Filter blast hits : bitscore >= 200

blast_out_tbl %>% phyloR::filter_blast_hits(bit_score = 200)

##  Filter blast hits : query coverage >= 90

blast_out_tbl %>% phyloR::filter_blast_hits(query_cov = 90, query_length = 253)

##  Filter blast hits : percent identity >= 90

blast_out_tbl %>% phyloR::filter_blast_hits(identity = 90)

##  Filter blast hits : evalue <= 1e-5, bitscore >= 200, query coverage >= 90, percent identity >= 60, 

blast_out_tbl %>% phyloR::filter_blast_hits(evalue = 1e-5, bit_score = 200, query_cov = 90, query_length = 253, identity = 60)

Format fasta headers

Fasta file downloaded as an output of blast result has fasta headers of two types

1) Headers with alignment co-ordinates - if only aligned sequences downloaded.

E.g. KAE8371401.1:1-253 trypsin-like cysteine/serine peptidase domain-containing protein [Aspergillus bertholletius]

2) Headers without alignment co-ordinates - if full sequences downloaded.

E.g. KAE8371401.1 trypsin-like cysteine/serine peptidase domain-containing protein [Aspergillus bertholletius]

By default, phyloR::format_fasta_headers() divides headers in three (header type 2) or four (header type 1) groups delimited by 2 underscores (__).

1) subject id 2) alignment coordinates (Optional for header type 1) 3) subject description 4) subject species

One can drop alignment coordinates from sequence headers by keeping argument keep_alignemnt_coord = FALSE

fa_file <- system.file("extdata" ,"blast_output_01.fasta", package = "phyloR")
fa <- Biostrings::readBStringSet(fa_file)

## existing headers 

names(fa) %>% head()

## new headers 

fa_file %>% phyloR::format_fasta_headers() %>% names() %>% head()

## drop alignment co-ordinates

fa_file %>% phyloR::format_fasta_headers(keep_alignemnt_coord = FALSE) %>% names() %>% head()

Remove redundant hits from blast tabular ouput

BLAST reports same subject hit multiple time with different alignment co-ordinates. For a same subjet id, phyloR::remove_redundant_hits() select longest subject hit out of given multiples.

blast_out_tbl %>% phyloR::remove_redundant_hits()

Subset fasta sequences

After filtering hits from tabular blast output, next obvious question is to filter same sequences from a fasta file and write them to new file. phyloR::subset_bstringset() subset the sequences from a fasta using a vector of fasta headers.

fa_file <- system.file("extdata" ,"blast_output_01.fasta", package = "phyloR")
fa <- Biostrings::readBStringSet(fa_file)
query_headers <- fa %>% names() %>% sample(100)
seq_filtered <- phyloR::subset_bstringset(x = query_headers , y = fa, partial_match = F)

# One can use function `Biostrings::writeXStringSet()` to write filterd sequences to new fasta file.

Assign taxonomy columns to NCBI protein acession

phyloR allows you to assign NCBI taxonomy levels to NCBI protein accession using function phyloR::add_taxonomy_columns(). One of the applications of this function is to assign the NCBI taxonomy levels to blast output, which is required for downstream phylogenetic analysis. For example, a phylogenetic tree generated from blast hits often required to color resultant tree branches either by kingdom, family or any other NCBI taxonomy level. One can easily identify these taxonomy levels using function phyloR::add_taxonomy_columns()

## add kingdom

with_kingdom <- blast_out_tbl %>% slice(1:10) %>% 
        phyloR::add_taxonomy_columns(ncbi_accession_colname = "subject_acc_ver", taxonomy_level = "kingdom")


with_kingdom %>% dplyr::select(query_acc_ver, subject_acc_ver, kingdom)

## add family  

with_kingdom_and_family <- with_kingdom %>% phyloR::add_taxonomy_columns(ncbi_accession_colname = "subject_acc_ver" ,
                                                                  taxonomy_level = "family")

with_kingdom_and_family %>% dplyr::select(query_acc_ver, subject_acc_ver, kingdom, family )

## add species 

with_kingdom_family_and_species <- with_kingdom_and_family %>% phyloR::add_taxonomy_columns(ncbi_accession_colname = "subject_acc_ver" ,
                                                                  taxonomy_level = "species")

with_kingdom_family_and_species %>% dplyr::select(query_acc_ver, subject_acc_ver, kingdom, family ,species)

Use NCBI taxonomy levels to color the phyloegenetic tree

An object of class phylo is widely used object to visualize the phylogenetic tree in R. To gain more insights from the resultant phylogenetic tree, often tree tips required to color by different levels of taxonomy, for instance species, kingdom or family. Obtaining taxonomy data to use them for tree annotations is not straight forward task and require lots of data wrangling in R. phyloR provides handy way to map such a taxonomy data to the phylogenetic tree. For example, one can convert an object of class phylo to an object of class tibble and vice versa. Once the tibble obtained, phyloR allows to add columns of ncbi taxonomy levels. Resultant tibble can be passed to ggtree::ggtree() to visualize the tree and taxonomy levels can be used, for instance, to color the tree tips.

tree_string <- "((XP_005187699_1__Musca_domestica:0.070627277,(XP_019893806_1__Musca_domestica:0.071069674,((XP_013113221_1__Stomoxys_calcitrans:0.1494662042,ACB98719_1__Glossina_morsitans_morsitans:0.3489851076)67.4/100:0.0470213767,XP_013102958_1__Stomoxys_calcitrans:0.1794878827)98.1/100:0.0959227604)88.2/99:0.0323598861)93/99:0.0435291148,((XP_017472861_1__Rhagoletis_zephyria:0.0049337059,XP_017472862_1__Rhagoletis_zephyria:0.0112391294)97.3/100:0.0860969479,(XP_020713236_1__Ceratitis_capitata:0.2642805176,(XP_014102010_1__Bactrocera_oleae:0.1183517872,XP_018784523_1__Bactrocera_latifrons:0.1137567198)29.6/88:0.0758551876)99.9/100:0.247740081)92/100:0.0716529011)34.3/66:2.487103817;"

 tree_objct <- treeio::read.tree(text = tree_string)

 tree_tbl <- tree_objct %>% 
   ggtree::fortify()

 tree_tbl <- tree_tbl  %>%
         dplyr::mutate( seqid =  dplyr::case_when(isTip ~ stringr::str_replace(label , pattern = "__.*","" ) %>%  ## split by '__'
                                                         stringr::str_replace(pattern = "_\\d$" , ""), ## remove trailing digits from seqid
                                                  TRUE ~ label
         )
        )
 ## add taxonomy
 tree_tbl_with_taxonomy <- tree_tbl %>%
        phyloR::tidy_taxonomy_tree(ncbi_accession_colname = "seqid",taxonomy_levels = c("species" ,"kingdom","family"))

 ## visualize  tree

 # tips colored by species
 tree_tbl_with_taxonomy %>% ggtree::ggtree() + ggtree::geom_tiplab(ggplot2::aes(color = species, label = seqid),size = 20) + 
   ggplot2::theme(legend.position = "bottom" , legend.text = ggplot2::element_text(size= 50) , legend.title = ggplot2::element_text(size= 50))
 # tips colored by family
 tree_tbl_with_taxonomy %>% ggtree::ggtree() + ggtree::geom_tiplab(ggplot2::aes(color = family,label = seqid),size = 20) +
   ggplot2::theme(legend.position = "bottom" , legend.text = ggplot2::element_text(size= 50) , legend.title = ggplot2::element_text(size= 50))
 # tips colored by kingdom
 tree_tbl_with_taxonomy %>% ggtree::ggtree() + ggtree::geom_tiplab(ggplot2::aes(color = kingdom,label = seqid ),size =20) +
   ggplot2::theme(legend.position = "bottom" , legend.text = ggplot2::element_text(size= 50) , legend.title = ggplot2::element_text(size= 50))