Arabidopsis case study

Build the tree and collect the proteomes

library(phylostratr)
library(reshape2)
library(taxizedb)
library(dplyr)
library(readr)
library(magrittr)
library(onekp)
library(knitr)
if(file.exists('strata.rds')){
    strata <- readRDS('strata.rds')
}
if(file.exists('results.rds')){
    results <- readRDS('results.rds')
}
# First get core uniprot proteomes 
weights=uniprot_weight_by_ref()
focal_taxid <- '3702'
strata <-
  # Get stratified relatives represented in UniProt
  uniprot_strata(focal_taxid, from=2L) %>%
  # Select a diverse subset of 5 or fewer representatives from each stratum.
  strata_apply(f=diverse_subtree, n=5L, weights=weights) %>%
  # Use a prebuilt set of prokaryotic species
  use_recommended_prokaryotes %>%
  add_taxa(c('4932', '9606')) %>%
  # Download genomes, storing the filenames
  uniprot_fill_strata

# Next add a few supplementary proteomes from 1KP
kp <- onekp::retrieve_onekp()
# get the ginkgo proteome from 1KP
ginkgo_taxid <- taxizedb::name2taxid("Ginkgo biloba")
ginkgo_proteome <- onekp::filter_by_species(kp, ginkgo_taxid) %>%
  onekp::download_peptides(dir="custom-seqs")
strata <- add_taxa(strata, ginkgo_taxid)
strata@data$faa[[as.character(ginkgo_taxid)]] <- as.vector(ginkgo_proteome)

# Remove Klebsormidium nitens
strata <- prune(strata, '105231', type='name')

## Replace the UniProt Arabidopsis thaliana proteome with the Araport11 proteome.
## The path below is specific to my pipeline, you can just use the UniProt genes,
## if you want. The problem with them is that they include all previous variants
## of the gene models, raising to 89135 genes. Since `A. thaliana` is the focal
## species, it is important to be very explicit about version of the proteome.
strata@data$faa[['3702']] <- 'Araport11_genes.201606.pep.fasta'

BLAST focal species against all target species

The next step is to BLAST the A. thaliana proteins against each target proteome.

This will take a long time to finish. We recommend running it on a cluster with the search against each target proteome as a separate job or using GNU Parallel to speed up the process (see guide). phylostratr expects one tabular file of BLAST results for each comparison against each species. The files should head headers and should contain the following fields: qseqid, sseqid, qstart, qend, sstart, send, evalue, score The name of the BLAST result files should have the form <target_name>.tab, where <target_name> is the name stored in the phylostratr Strata object.

While the details will depend on your needs and the particulars of your system, for each target you will want something of this general form:

echo -e 'qseqid\tsseqid\tqstart\tqend\tsstart\tsend\tevalue\tscore' > <target>.tab
blastp \
    -query <focal_species> \
    -db <target_database> \
    -outfmt '6 qseqid sseqid qstart qend sstart send evalue score' >> <target>.tab

Then move the resulting files into the working directory where you are running phylostratr. Then you may proceed with the following code (if everything went well, it will automatically read these BLAST results).

# replace 8 with the number of threads on your system, if you running locally
strata <- strata_blast(strata, blast_args=list(nthreads=8L)) %>%
  strata_besthits

# Merge results into a single hittable
results <- merge_besthits(strata)

# Get metadata. Note, this will take awhile
strata <- strata %>%
    # for each species, add map of UniProt IDs to PFAM domains
    strata_uniprot_pfam_map %>%
    # for each species, add proteome summary data
    add_proteome_stats %>%
    # for each species, add list of organelle encoded UniProt proteins
    add_organelle_proteins

Table S1: Proteome stats tables

prot <- proteome_stats_table(strata)
strata2 <- strata_convert(strata, target='all', to='name')
proteome_report_table(strata2) %>%
  merge(get_phylostrata_map(strata2)) %>%
  dplyr::arrange(-ps) %>%
  dplyr::select(species, ps, N, min, q25, median, q75, max)
proteome_report_table(strata2) %>%
  merge(get_phylostrata_map(strata2)) %>%
  dplyr::arrange(-ps) %>%
  dplyr::select(species, ps, N, min, q25, median, q75, max) %>%
  knitr::kable(format='pandoc', caption="Proteome statistics for all proteomes in the *A. thaliana* analysis. **N**: total number of proteins in the UniProt proteome (these may be redundant). **ps**: phylostratum level, where 1 is *cellular organisms*. **min**, **q25**, **median**, **q75**, **max**: summaries of the protein lengths in each proteome.")

Table S2: Identification of organelle proteins

# Select just the plant species 
subtree(strata, name2taxid('Viridiplantae')) %>%
    # replace the taxon IDs with scientific names
    strata_convert(target='all', to='name') %>%
    # summarize the numbers of organelle genes present per species
    organelle_table %>%
    # sort by ascending phylostratum age
    dplyr::arrange(-.data$ps, .data$species)
subtree(strata, name2taxid('Viridiplantae')) %>%
    strata_convert(target='all', to='name') %>%
    organelle_table %>%
    dplyr::arrange(-.data$ps, .data$species) %>%
    knitr::kable(format='pandoc', caption="Number of organelle-encoded proteins appearing in each plant target species. Currently, organelle proteins can only be counted for sequeces retrieved from UniProt (which have UniProt IDs in their FASTA headers). Thus *A. thaliana* and *Ginkgo biloba* are each listed as having no organellar sequences.")

Table S3-5: Finding genes that skip strata

# Set the classifier for determining statistical significance
default_classifier <- classify_by_adjusted_pvalue(threshold=0.001)
# Find all genes that are missing in one or more phylostrata younger than their
# oldest match.
revenants <- find_revenants(results, classifier=default_classifier)
# The number of genes with a skipped stratum immediately after
# the basal stratum
(subset(revenants, ps - 1 == basal_ps)$qseqid %>% unique %>% length)
# Summarize of the number of genes that skip $n\_skipped$ strata.
revenants %>%
    dplyr::distinct(qseqid, n) %>%
    dplyr::group_by(n) %>%
    dplyr::count() %>%
    dplyr::select(n_skipped=n, n_total=nn)
revenants %>%
    dplyr::distinct(qseqid, n) %>%
    dplyr::group_by(n) %>%
    dplyr::count() %>%
    dplyr::select(n_skipped=n, n_total=nn) %>%
    knitr::kable(format='pandoc', caption="A summary of the number of *A. thaliana* genes (**n_total**) where a given number (**n_skipped**) of strata are passed before the oldest stratum with a predicted homolog is reached.")
# Show the skipped phylostrata for genes skipping more than 4 
revenants %>%
    dplyr::filter(n > 4) %>%
    dplyr::select(qseqid, ps, basal_ps, n) %>%
    dplyr::arrange(-ps) %>%
    dplyr::group_by(qseqid) %>%
    dplyr::summarize(basal_ps=basal_ps[1], skipped_ps = paste(ps, collapse=",")) %>%
    dplyr::arrange(-basal_ps)
# Show the skipped phylostrata for genes skipping more than 4 
revenants %>%
    dplyr::filter(n > 4) %>%
    dplyr::select(qseqid, ps, basal_ps, n) %>%
    dplyr::arrange(-ps) %>%
    dplyr::group_by(qseqid) %>%
    dplyr::summarize(basal_ps=basal_ps[1], skipped_ps = paste(ps, collapse=",")) %>%
    dplyr::arrange(-basal_ps) %>%
    knitr::kable(format='pandoc', caption="Here we identify which strata were skipped for each of the *A. thaliana* genes that skipped more than 4 strata.")
revenants %>%
    dplyr::group_by(mrca) %>%
    dplyr::summarize(missing_genes = n(), ps=ps[1]) %>%
    dplyr::arrange(-ps) %>%
    dplyr::mutate(mrca = partial_id_to_name(mrca)) %>%
    dplyr::select(Phylostratum = mrca, Level = ps, missing_genes)
revenants %>%
    dplyr::group_by(mrca) %>%
    dplyr::summarize(missing_genes = n(), ps=ps[1]) %>%
    dplyr::arrange(-ps) %>%
    dplyr::mutate(mrca = partial_id_to_name(mrca)) %>%
    dplyr::select(Phylostratum = mrca, Level = ps, missing_genes) %>%
    knitr::kable(format='pandoc', caption="The number of *A. thaliana* genes that have no hits to any species in a given phylostratum but do have hits to older strata. Strata may have unexpectedly high number of \"missing genes\" for interesting biological reasons, for example, the branch may include degenerate intracellular parasites. But more likely, it is due to poor annotations of the stratum representatives; this is the case for the Spermatophyta stratum, which is represented only by *Picea glauca*.")

Table S6: Compare to phylostratigraphy results from 2014

Here I will load the phylostratigraphy results from (Arendsee et al., 2014) and compare them to the new results from the phylostratr algorithm. The algorithms and cutoffs are the same, but the search space is different. Also, in the new study, I use the Araport11 anotation of Arabidopsis thaliana, whereas in (Arendsee et al., 2014) TAIR10 was used.

# Get the strata from the current analysis
a2018 <- stratify(results, classifier=default_classifier)
# Get and clean the (Arendsee et al., 2014) data (stored in phylostratr package)
a2014 <- read_tsv(system.file('extdata', 'arendsee2014_strata.tab', package='phylostratr'))
# Add NCBI taxonomy ID to tips data
a2014$mrca <- taxizedb::name2taxid(a2014$name)
# match column naming conventions
a2014 <- dplyr::select(a2014, qseqid, mrca, ps, mrca_name=name)
# underscores to spaces 
a2014$mrca_name <- sub('_', ' ', a2014$mrca_name)
# get a lineage map for Arabidopsis
strata_map <- taxizedb::classification('3702')[[1]]
# factor the backbone
a2014$mrca_name <- droplevels(factor(a2014$mrca_name, levels=strata_map$name))

# convert TAIR IDs from models (transcripts) to loci (genes)
to_locus <- function(d){
    d$qseqid <- sub("\\.\\d+$", "", d$qseqid)
    dplyr::group_by(d, qseqid) %>%
        # select the model from the oldest stratum
        dplyr::filter(ps == min(ps)) %>%
        dplyr::distinct()
}
a2018 <- to_locus(a2018)
a2014 <- to_locus(a2014)

ps <- standardize_strata(list(
  a2014 = a2014,
  a2018 = a2018
))
ps <- lapply(ps, function(x) x[c('qseqid', 'mrca_name')])

# 2014 to 2018 study comparison matrix (2014 on x-axis)
versus_TIPS <-
    merge(ps[[1]], ps[[2]], by="qseqid", suffixes=paste0(".", names(ps))) %>%
    dplyr::select(-qseqid) %>%
    make_matrix_from_two_strata()

Now, since the two phylostratigraphy experiments used different annotations of Arabidopsis, I will take the intersection. The model IDs shared between the annotations refer to the same models, phylostratr has a dedicated function for this:

# # Compare the counts between the two studies:
data.frame(
    a2014 = ps[[1]]$mrca_name %>% summary
  , a2018 = ps[[2]]$mrca_name %>% summary
)
data.frame(
    a2014 = ps[[1]]$mrca_name %>% summary
  , a2018 = ps[[2]]$mrca_name %>% summary
) %>% knitr::kable(format='pandoc', caption="A comparison between (Arendsee et al., 2014, 'a2014') and this study's ('a2018') stratification of *A. thaliana* genes. The gene counts include all gene models that are shared between the Araport11 (2018) and older TAIR10 (2014) annotations of the *A.thaliana* genome. In cases where one study has representatives in a phylostratum and other does not (e.g.  the 2014 study had a representative from the Brassicales stratum), the genes are merged into the older strata as needed.")
is_present    <- default_classifier(results)
strata_table  <- stratify(results, classifier=default_classifier)
strata_levels <- levels(strata_table$mrca_name)

evalue2 <- stratify(results, classifier=classify_by_adjusted_pvalue(threshold=1e-2))
evalue5 <- stratify(results, classifier=classify_by_adjusted_pvalue(threshold=1e-5))
sigresults = merge(evalue2, evalue5, by='qseqid')[, c(4,7)]
names(sigresults) <- c("evalue2", "evalue5")

Figure S1: E-value cutoff comparisons

# To get a proper phylostrata table for this 2018
a2018$qseqid <- sub("\\.\\d+$", "", a2018$qseqid)
a2018 %>% unique %$% mrca_name %>% table %>% knitr::kable(format='pandoc')
d <- list()
d$sigmats <- make_significance_matrices(sigresults)
d$versus_TIPS <- versus_TIPS
d$jmpmat <- make_jump_matrix(results[is_present, c('qseqid', 'ps')], labels=strata_levels)
d$sigmats[[1]][[1]]@xlab = "Phylostratum (E-value < 1e-5)"
d$sigmats[[1]][[1]]@ylab = "Phylostratum (E-value < 0.01)"
plot(d$sigmats[[1]][[1]])

Figure S2: phylostrata classifications comparison to 2014 study

plot(t(d$versus_TIPS)) +
  ggplot2::ylab("Classification with phylostratr dataset") +
  ggplot2::xlab("Classification with 2014 dataset")

Figure S3: skipped strata comparisons

plot(t(d$jmpmat)) +
  ggplot2::ylab("Phylostratum with homolog (E-value = 0.001)") +
  ggplot2::xlab("Next youngest with homolog (E-value = 0.001)")
saveRDS(results, 'results.rds')
saveRDS(strata, 'strata.rds')


arendsee/phylostratr documentation built on Dec. 31, 2022, 10:22 a.m.