library(phylostratr) library(reshape2) library(taxizedb) library(dplyr) library(readr) library(magrittr) library(onekp) library(knitr)
if(file.exists('strata.rds')){ strata <- readRDS('strata.rds') } if(file.exists('results.rds')){ results <- readRDS('results.rds') }
# First get core uniprot proteomes weights=uniprot_weight_by_ref() focal_taxid <- '3702' strata <- # Get stratified relatives represented in UniProt uniprot_strata(focal_taxid, from=2L) %>% # Select a diverse subset of 5 or fewer representatives from each stratum. strata_apply(f=diverse_subtree, n=5L, weights=weights) %>% # Use a prebuilt set of prokaryotic species use_recommended_prokaryotes %>% add_taxa(c('4932', '9606')) %>% # Download genomes, storing the filenames uniprot_fill_strata # Next add a few supplementary proteomes from 1KP kp <- onekp::retrieve_onekp() # get the ginkgo proteome from 1KP ginkgo_taxid <- taxizedb::name2taxid("Ginkgo biloba") ginkgo_proteome <- onekp::filter_by_species(kp, ginkgo_taxid) %>% onekp::download_peptides(dir="custom-seqs") strata <- add_taxa(strata, ginkgo_taxid) strata@data$faa[[as.character(ginkgo_taxid)]] <- as.vector(ginkgo_proteome) # Remove Klebsormidium nitens strata <- prune(strata, '105231', type='name') ## Replace the UniProt Arabidopsis thaliana proteome with the Araport11 proteome. ## The path below is specific to my pipeline, you can just use the UniProt genes, ## if you want. The problem with them is that they include all previous variants ## of the gene models, raising to 89135 genes. Since `A. thaliana` is the focal ## species, it is important to be very explicit about version of the proteome. strata@data$faa[['3702']] <- 'Araport11_genes.201606.pep.fasta'
The next step is to BLAST the A. thaliana proteins against each target proteome.
This will take a long time to finish. We recommend running it on a cluster with
the search against each target proteome as a separate job or using GNU
Parallel to speed up the process (see
guide).
phylostratr
expects one tabular file of BLAST results for each comparison
against each species. The files should head headers and should contain the
following fields: qseqid
, sseqid
, qstart
, qend
, sstart
, send
,
evalue
, score
The name of the BLAST result files should have the form
<target_name>.tab
, where <target_name>
is the name stored in the
phylostratr
Strata
object.
While the details will depend on your needs and the particulars of your system, for each target you will want something of this general form:
echo -e 'qseqid\tsseqid\tqstart\tqend\tsstart\tsend\tevalue\tscore' > <target>.tab blastp \ -query <focal_species> \ -db <target_database> \ -outfmt '6 qseqid sseqid qstart qend sstart send evalue score' >> <target>.tab
Then move the resulting files into the working directory where you are running
phylostratr
. Then you may proceed with the following code (if everything went
well, it will automatically read these BLAST results).
# replace 8 with the number of threads on your system, if you running locally strata <- strata_blast(strata, blast_args=list(nthreads=8L)) %>% strata_besthits # Merge results into a single hittable results <- merge_besthits(strata) # Get metadata. Note, this will take awhile strata <- strata %>% # for each species, add map of UniProt IDs to PFAM domains strata_uniprot_pfam_map %>% # for each species, add proteome summary data add_proteome_stats %>% # for each species, add list of organelle encoded UniProt proteins add_organelle_proteins
prot <- proteome_stats_table(strata) strata2 <- strata_convert(strata, target='all', to='name')
proteome_report_table(strata2) %>% merge(get_phylostrata_map(strata2)) %>% dplyr::arrange(-ps) %>% dplyr::select(species, ps, N, min, q25, median, q75, max)
proteome_report_table(strata2) %>% merge(get_phylostrata_map(strata2)) %>% dplyr::arrange(-ps) %>% dplyr::select(species, ps, N, min, q25, median, q75, max) %>% knitr::kable(format='pandoc', caption="Proteome statistics for all proteomes in the *A. thaliana* analysis. **N**: total number of proteins in the UniProt proteome (these may be redundant). **ps**: phylostratum level, where 1 is *cellular organisms*. **min**, **q25**, **median**, **q75**, **max**: summaries of the protein lengths in each proteome.")
# Select just the plant species subtree(strata, name2taxid('Viridiplantae')) %>% # replace the taxon IDs with scientific names strata_convert(target='all', to='name') %>% # summarize the numbers of organelle genes present per species organelle_table %>% # sort by ascending phylostratum age dplyr::arrange(-.data$ps, .data$species)
subtree(strata, name2taxid('Viridiplantae')) %>% strata_convert(target='all', to='name') %>% organelle_table %>% dplyr::arrange(-.data$ps, .data$species) %>% knitr::kable(format='pandoc', caption="Number of organelle-encoded proteins appearing in each plant target species. Currently, organelle proteins can only be counted for sequeces retrieved from UniProt (which have UniProt IDs in their FASTA headers). Thus *A. thaliana* and *Ginkgo biloba* are each listed as having no organellar sequences.")
# Set the classifier for determining statistical significance default_classifier <- classify_by_adjusted_pvalue(threshold=0.001) # Find all genes that are missing in one or more phylostrata younger than their # oldest match. revenants <- find_revenants(results, classifier=default_classifier)
# The number of genes with a skipped stratum immediately after # the basal stratum (subset(revenants, ps - 1 == basal_ps)$qseqid %>% unique %>% length)
# Summarize of the number of genes that skip $n\_skipped$ strata. revenants %>% dplyr::distinct(qseqid, n) %>% dplyr::group_by(n) %>% dplyr::count() %>% dplyr::select(n_skipped=n, n_total=nn)
revenants %>% dplyr::distinct(qseqid, n) %>% dplyr::group_by(n) %>% dplyr::count() %>% dplyr::select(n_skipped=n, n_total=nn) %>% knitr::kable(format='pandoc', caption="A summary of the number of *A. thaliana* genes (**n_total**) where a given number (**n_skipped**) of strata are passed before the oldest stratum with a predicted homolog is reached.")
# Show the skipped phylostrata for genes skipping more than 4 revenants %>% dplyr::filter(n > 4) %>% dplyr::select(qseqid, ps, basal_ps, n) %>% dplyr::arrange(-ps) %>% dplyr::group_by(qseqid) %>% dplyr::summarize(basal_ps=basal_ps[1], skipped_ps = paste(ps, collapse=",")) %>% dplyr::arrange(-basal_ps)
# Show the skipped phylostrata for genes skipping more than 4 revenants %>% dplyr::filter(n > 4) %>% dplyr::select(qseqid, ps, basal_ps, n) %>% dplyr::arrange(-ps) %>% dplyr::group_by(qseqid) %>% dplyr::summarize(basal_ps=basal_ps[1], skipped_ps = paste(ps, collapse=",")) %>% dplyr::arrange(-basal_ps) %>% knitr::kable(format='pandoc', caption="Here we identify which strata were skipped for each of the *A. thaliana* genes that skipped more than 4 strata.")
revenants %>% dplyr::group_by(mrca) %>% dplyr::summarize(missing_genes = n(), ps=ps[1]) %>% dplyr::arrange(-ps) %>% dplyr::mutate(mrca = partial_id_to_name(mrca)) %>% dplyr::select(Phylostratum = mrca, Level = ps, missing_genes)
revenants %>% dplyr::group_by(mrca) %>% dplyr::summarize(missing_genes = n(), ps=ps[1]) %>% dplyr::arrange(-ps) %>% dplyr::mutate(mrca = partial_id_to_name(mrca)) %>% dplyr::select(Phylostratum = mrca, Level = ps, missing_genes) %>% knitr::kable(format='pandoc', caption="The number of *A. thaliana* genes that have no hits to any species in a given phylostratum but do have hits to older strata. Strata may have unexpectedly high number of \"missing genes\" for interesting biological reasons, for example, the branch may include degenerate intracellular parasites. But more likely, it is due to poor annotations of the stratum representatives; this is the case for the Spermatophyta stratum, which is represented only by *Picea glauca*.")
Here I will load the phylostratigraphy results from (Arendsee et al., 2014) and
compare them to the new results from the phylostratr
algorithm. The
algorithms and cutoffs are the same, but the search space is different. Also,
in the new study, I use the Araport11 anotation of Arabidopsis thaliana,
whereas in (Arendsee et al., 2014) TAIR10 was used.
# Get the strata from the current analysis a2018 <- stratify(results, classifier=default_classifier) # Get and clean the (Arendsee et al., 2014) data (stored in phylostratr package) a2014 <- read_tsv(system.file('extdata', 'arendsee2014_strata.tab', package='phylostratr')) # Add NCBI taxonomy ID to tips data a2014$mrca <- taxizedb::name2taxid(a2014$name) # match column naming conventions a2014 <- dplyr::select(a2014, qseqid, mrca, ps, mrca_name=name) # underscores to spaces a2014$mrca_name <- sub('_', ' ', a2014$mrca_name) # get a lineage map for Arabidopsis strata_map <- taxizedb::classification('3702')[[1]] # factor the backbone a2014$mrca_name <- droplevels(factor(a2014$mrca_name, levels=strata_map$name)) # convert TAIR IDs from models (transcripts) to loci (genes) to_locus <- function(d){ d$qseqid <- sub("\\.\\d+$", "", d$qseqid) dplyr::group_by(d, qseqid) %>% # select the model from the oldest stratum dplyr::filter(ps == min(ps)) %>% dplyr::distinct() } a2018 <- to_locus(a2018) a2014 <- to_locus(a2014) ps <- standardize_strata(list( a2014 = a2014, a2018 = a2018 )) ps <- lapply(ps, function(x) x[c('qseqid', 'mrca_name')]) # 2014 to 2018 study comparison matrix (2014 on x-axis) versus_TIPS <- merge(ps[[1]], ps[[2]], by="qseqid", suffixes=paste0(".", names(ps))) %>% dplyr::select(-qseqid) %>% make_matrix_from_two_strata()
Now, since the two phylostratigraphy experiments used different annotations of Arabidopsis, I will take the intersection. The model IDs shared between the annotations refer to the same models, phylostratr has a dedicated function for this:
# # Compare the counts between the two studies: data.frame( a2014 = ps[[1]]$mrca_name %>% summary , a2018 = ps[[2]]$mrca_name %>% summary )
data.frame( a2014 = ps[[1]]$mrca_name %>% summary , a2018 = ps[[2]]$mrca_name %>% summary ) %>% knitr::kable(format='pandoc', caption="A comparison between (Arendsee et al., 2014, 'a2014') and this study's ('a2018') stratification of *A. thaliana* genes. The gene counts include all gene models that are shared between the Araport11 (2018) and older TAIR10 (2014) annotations of the *A.thaliana* genome. In cases where one study has representatives in a phylostratum and other does not (e.g. the 2014 study had a representative from the Brassicales stratum), the genes are merged into the older strata as needed.")
is_present <- default_classifier(results) strata_table <- stratify(results, classifier=default_classifier) strata_levels <- levels(strata_table$mrca_name) evalue2 <- stratify(results, classifier=classify_by_adjusted_pvalue(threshold=1e-2)) evalue5 <- stratify(results, classifier=classify_by_adjusted_pvalue(threshold=1e-5)) sigresults = merge(evalue2, evalue5, by='qseqid')[, c(4,7)] names(sigresults) <- c("evalue2", "evalue5")
# To get a proper phylostrata table for this 2018 a2018$qseqid <- sub("\\.\\d+$", "", a2018$qseqid) a2018 %>% unique %$% mrca_name %>% table %>% knitr::kable(format='pandoc')
d <- list() d$sigmats <- make_significance_matrices(sigresults) d$versus_TIPS <- versus_TIPS d$jmpmat <- make_jump_matrix(results[is_present, c('qseqid', 'ps')], labels=strata_levels) d$sigmats[[1]][[1]]@xlab = "Phylostratum (E-value < 1e-5)" d$sigmats[[1]][[1]]@ylab = "Phylostratum (E-value < 0.01)"
plot(d$sigmats[[1]][[1]])
plot(t(d$versus_TIPS)) + ggplot2::ylab("Classification with phylostratr dataset") + ggplot2::xlab("Classification with 2014 dataset")
plot(t(d$jmpmat)) + ggplot2::ylab("Phylostratum with homolog (E-value = 0.001)") + ggplot2::xlab("Next youngest with homolog (E-value = 0.001)")
saveRDS(results, 'results.rds') saveRDS(strata, 'strata.rds')
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.