palmID
is a contained analysis suite for viral RNA-dependent RNA polymerases
(RdRP) based on the "Palmprint" RNA virus barcodes described by
Babaian and Edgar, 2022..
# RMarkdown Setting Initialization # Command Line Interface ---- # Rscript -e "rmarkdown::render( \ # input = 'palmid_dev.Rmd', \ # output_file = 'amexnv.html', \ # output_format = 'html_notebook', \ # params=list( input.path = 'amexnv/amexnv'))" # --------------------------- knitr::opts_chunk$set(warning = FALSE, message = FALSE)
```{css, echo=FALSE} / Move code folding buttons to the left / div.col-md-12 .pull-right { float: left !important }
```r # Run as Production Pipeline or Development production.version = params$prod.run if (production.version) { # Load stable palmid package library('palmid', quietly = T) data("palmdb") # Disable warnings defaultW <- getOption("warn") options(warn = -1) } else { # Warnings variable defaultW <- getOption("warn") options(warn = -1) # Compile a new palmid from source roxygen2::roxygenise() load("data/palmdb.RData") }
# Print out version cat( paste0("palmID Version: ", params$palmid.version) ) # INITIALIZE PALMID WORKSPACE ------------------- # Establish Serratus server connection con <- SerratusConnect() # Input files # Generated within `palmid` container # input.path parameter is defined in YAML header # which is exposed to CLI input.path <- params$input.path if (is.null(input.path)) { stop("Error: No input provided.") } input.fa <- paste0(input.path, '.input.fa')# palmscan-palmprint sequence input.pp <- paste0(input.path, '.trim.fa') # palmscan-palmprint sequence input.fev <- paste0(input.path, '.fev') # palmscan .fev report input.rep <- paste0(input.path, '.txt') # palmscan text motif-report input.pro <- paste0(input.path, '.pro') # diamond palmDB-alignment file input.msa <- paste0(input.path, '.msa.fa') # muscle msa input.phy <- paste0(input.path, '.phy') # tree file in newick format # Output HTML Report output.html <- paste0(input.path, '.html') save.plots <- FALSE # save individual plots as png # Parameters id_threshold <- 0 # Minimum AA% to retain a hit max_palmdb_hits <- 200 # Maximum number of alignment hits in PalmDB hits to return run.time <- Sys.time() # IMPORT DATASETS ------------------------------- # Import a palmprint-analysis pp.in <- read.fev(input.fev, FIRST = TRUE) # Import a diamond-aligned pro file pro.df <- read.pro(input.pro) # Set backstop when too many similar pro.df hits come up palmdb.hits <- length(pro.df$qseqid) if (palmdb.hits > max_palmdb_hits) { pro.df <- pro.df[ 1:max_palmdb_hits, ] print.hitn <- paste0("Reporting Top ", max_palmdb_hits, "/", palmdb.hits, " matches") } else { print.hitn <- paste0("Reporting all ", palmdb.hits, " matches") } # Populate with Nickname/Taxonomy-data pro.df$nickname <- get.nickname(pro.df$sseqid, con, ordinal = T) pro.df <- get.proTax2(pro.df, con) # Import a phylogenetic tree file tree.phy <- ggtree::read.tree(input.phy) # Get a dataframe from the tree phy object tree.df <- get.proPhy(pro.df, tree.phy) # SQL-Import of palmprint/sra meta-data # parent/child sOTU lookup, sra, biosample, date, organism, geo # optimization needed here palm.sra <- get.palmSra(pro.df, con) # SQL-import of STAT kmer taxonomic analysis of the retrieved # matching SRA libraries stat.sra <- get.sraSTAT(palm.sra$run_id, con) # Populate stat.sra with percent identity from palm.sra stat.sra$pident <- palm.sra$pident[ match(stat.sra$run_id, palm.sra$run_id) ] # GENERATE REPORT-PLOTS ------------------------- # Palmprint Report pp.report <- PlotReport(pp.in, palmdb) # Diamond-palmDB Alignment Report pro.report <- PlotProReport(pro.df, html = T) # PalmDB Viral Taxonomy Report tax.report <- PlotTaxReport(pro.df) # Phylogeny Report phy.report <- PlotPhyReport(input.msa, tree.df, tree.phy) # Geospatial distribution Report geo.report <- PlotGeo2(palm.sra) date.report <- PlotTimeline(palm.sra) # SRA Expression Report expr.report <- PlotSraExpr(palm.sra) proj.report <- PlotProjExpr(palm.sra) # Host/Library organism Report orgn.report <- PlotOrgn(palm.sra, freq = FALSE) orgn.reportXY <- PlotOrgnScatter(palm.sra) stat.report <- PlotSTAT(stat.sra)
# Generate text-based summary of palmID data palm.summary <- text_summary(pp.in, pro.df, palm.sra, palmdb.hits) cat(palm.summary)
options(warn = -1) pro.df %>% downloadthis::download_this( output_name = paste0(input.path, '.pro'), output_extension = ".csv", button_label = "Download palmdb_alignment.csv", button_type = "success", has_icon = TRUE, icon = "fa fa-file-export" ) palm.sra %>% downloadthis::download_this( output_name = paste0(input.path, '.sra.metadata'), output_extension = ".csv", button_label = "Download sra_metadata.csv", button_type = "success", has_icon = TRUE, icon = "fa fa-file-export" )
Identification of the the core catalytic motifs A,B,C within the input sequence and reporting the "palmprint" RNA-virus barcode.
# Input Sequence cat("Input FASTA sequence: \n\n") cat(paste0(readLines(input.fa), collapse = "\n"), "\n") cat("\n") # Palmscan-generated reports cat("palmprint sequence: \n\n") cat(paste0(readLines(input.pp), collapse = "\n"), "\n") cat("\n") cat("catalytic-motifs: \n") cat(paste0(readLines(input.rep), collapse = "\n"), "\n") cat("\n") # lapply(scan(input.rep, 'character'),cat)[[1]]
Quality control metrics on the motif-scores and size distribution of the "palmprint" informs the confidence that the input sequence contains a viral RdRP. Input scores are shown compared to score distribution from 15,010 canonical viral RdRP from GenBank. Palmscan v1.0
(the motif-detection algorithm) has a sensitivity of 87%, and a positive predictive value of 99.89%. This is a conservative algorithm by design. While an RdRp score below 20
is deemed "Low Confidence", if you know your input is an RdRp it likely will retrieve the correct motifs. In general this tool is designed to "detect" RdRp motifs with a known RdRp input.
# Palmscan QC-plot plot(pp.report)
A MUSCLE
-generated multiple sequence alignment of the input palmprint and
the top-10 matching palmprint in palmdb
.
Header is palmprint_id and percent identity (aa%) alignment to input sequence.
# Multiple Sequence Alignment of Top-10 hits in palmDB cat(paste0(readLines(input.msa), collapse = "\n"))
A rough sketch (MUSCLE
UPGMA tree) of the local evolutionary neighborhood of
the input palmprint and it's homologs in PalmDB. To generate an accurate tree,
use full-length, or as long as possible, RdRp sequences. Palmprints are not
sufficient to resolve phylogeny accurately.
# Viral Taxonomy of palmDB Hits plot(phy.report)
The extracted "palmprint" from input sequence is aligned against
palmdb
using diamond
to retrieve
related palmprints (upto 200 hits).
Each palmprint is also aligned against GenBank from which the top-hit
taxonomy and aa% identity is reported.
A "species" is not in palmDB if no point is right of the red vertical line.
A "species" is not in GenBank when point is below the gray horizontal dotted line.
# Protein-alignment of Input vs. palmDB # Taxonomic Demarcations (~) # Species >90% - Red # Genus 70-90% - Orange # Family 45-70% - Green # Phylum <45% - Purple plotly::ggplotly(pro.report)
Species-, Genus- and Phylum- level taxonomy of the matching hits in PalmDB allow for taxonomic-inference of the input virus.
# Viral Taxonomy of palmDB Hits aligned against GenBank (top hit) plot(tax.report)
### Data-Table: `palmDB.csv` # Data table for palmDB Alignment blast.col <- linkBLAST(header = pro.df$sseqid, aa.seq = pro.df$full_sseq) gbacc.col <- linkGB( pro.df$gbacc , prefix_text = pro.df$tspe) cbind( pro.df[, c("sseqid", "nickname", "pident", "evalue")], gbacc.col, pro.df[, c( "gbid", "cigar")], blast.col) %>% DT::datatable( colnames = c('palmprint', 'nickname', 'id%_to_input','input_evalue', 'genbank_tax', 'gb_id', 'CIGAR', 'BLAST'), rownames = FALSE, filter = "top", escape = FALSE, options = list(pageLength = 10, scrollX = TRUE) )
Download palmDB Alignment Table
RNA viruses matching input are cross-referenced against 5.7M SRA sequencing libraries to identify sra-virus matches and their associated meta-data.
Distribution of matching RNA viruses. Click to explore each matching record. Species, genus, family and phylum level matches are red, orange, green and purple, respectively.
# Geo-distribution of palmprint-containing SRA # Taxonomic Demarcations (~) # Species >90% - Red # Genus 70-90% - Orange # Family 45-70% - Green # Phylum <45% - Purple geo.report
# Timeline of "Release Dates" for matching datsets # click on legend to unselect certain rank_matches plotly::ggplotly(date.report)
From each SRA dataset, RdRp-mapping reads are "micro-assembled" into contigs. The fold read-coverage of the contigs is an indirect measurement of the expression level of the virus in that datasets. Here the per-SRA expression for each palmprint sOTU are shown. Expression of high-divergence (<60% identity to known) RdRp are under-estimated due to lower sensitivity for recovering RdRp-reads.
# SRA Expression Report plotly::ggplotly(expr.report)
# BioProject Level Variance in Expression plotly::ggplotly( proj.report )
Organism meta-data from the SRA sequencing libraries matching the input-virus. Word size and color is scaled by proximity of the input-virus to its match in the library. Wordclouds show a) Researcher provided annotations of the libraries or b) STAT automated taxonomic (orders) analysis of the reads contained within the library (Katz et al.,2021).
# Count of each Organism (Scientific_Name, taken from SRA meta-data) # plotted against the maximum id% to input sequence for that organism plotly::ggplotly(orgn.reportXY)
# Plot SRA Wordcloud - id # scaled by AA% proximity to input palmprint plot(orgn.report)
# Plot SRA-Organism Wordcloud - freq # scaled by frequency in the SRA # alpha scaled by the abundance of the organism in the target datasets plot(stat.report)
### Data-Table: `sra.metadata.csv` # Data table for palmDB Alignment blast.col <- linkBLAST(header = paste0(palm.sra$run_id,"_",palm.sra$palm_id), aa.seq = palm.sra$sra_sequence ) sra.col <- linkDB(palm.sra$run_id) biosample.col <- linkDB(palm.sra$biosample_id, DB = "biosample") cbind( sra.col, biosample.col, palm.sra[, c('sOTU', 'nickname', 'pident', 'coverage', 'evalue', 'scientific_name')], blast.col) %>% DT::datatable( colnames = c("sra_run", "biosample_id", "sOTU", 'nickname', "id%", "coverage", "evalue", "sra_tag", "BLAST"), rownames = FALSE, filter = "top", escape = F, options = list(pageLength = 10, scrollX = T) )
To save this report: File
--> Save Page As
.
if (save.plots) { # SAVE PP-REPORT png(filename = output.report, width = 800, height = 400) plot(pp.report) dev.off() # SAVE PRO-REPORT png(filename = output.pro, width = 800, height = 400) plot(pro.report) dev.off() # SAVE TAX-REPORT ggsave(output.tax, plot(tax.report), width = 16, height = 10, dpi = 72 ) # SAVE PHYLOGENY-REPORT ggsave(output.phy, plot(phy.report), width = 16, height = 10, dpi = 72 ) # SAVE GEO-REPORT htmlwidgets::saveWidget(geo.report, file = output.geo) # SAVE ORGN-REPORT png(filename = output.orgn, width = 800, height = 400, res = 100) plot(orgn.report) dev.off() }
# Development Code # rmarkdown::render('palmid_dev.Rmd', # output_file = output.html) # # Generate /data files # # based on waxsys example # waxsys.palm.sra <- palm.sra # waxsys.palmprint <- pp.in # waxsys.pro.df <- pro.df # waxsys.stat.sra <- stat.sra # waxsys.tree.phy <- tree.phy # waxsys.tree.df <- tree.df # waxsys.input.msa <- input.msa # save( waxsys.palm.sra, file = "data/waxsys.palm.sra.RData", compress = "xz") # save( waxsys.palmprint, file = "data/waxsys.palmprint.RData", compress = "xz") # save( waxsys.pro.df, file = "data/waxsys.pro.df.RData", compress = "xz") # save( waxsys.stat.sra, file = "data/waxsys.stat.sra.RData", compress = "xz") # save( waxsys.tree.phy, file = "data/waxsys.tree.phy.RData", compress = "xz") # save( waxsys.tree.df, file = "data/waxsys.tree.df.RData", compress = "xz") # save( waxsys.input.msa, file = "data/waxsys.input.msa.RData", compress = "xz")
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.