Yeast case study

Build tree

require(phylostratr)
require(magrittr)
require(dplyr)
require(readr)
require(knitr)
require(taxizedb)
require(ape)
require(reshape2)
# set weights on species selection
weights <- c(
  '1355160' = 0,   # these three are deprecated
  '245562'  = 0,
  '1094981' = 0,
  '284813'  = 1.1, # these are the reference replacements
  '766039'  = 1.1,
  '237561'  = 1.1
)
focal_taxid <- '4932'
strata <- readRDS('strata.Rda')
strata <- focal_taxid %>%
  # Get all UniProt proteomes
  uniprot_strata %>%
  # build a tree of all UniProt genomes
  # Select a diverse subset of 5 or fewer representatives from each stratum.
  # Only do this above the Saccharomyces genus, since we will later replace
  # Saccharomyces with out own tree.
  strata_apply(f=diverse_subtree, n=5, weights=weights) %>%
  # add prokaryote stratum
  use_recommended_prokaryotes %>%
  # download UniProt sequences (this may take 10+ minutes)
  uniprot_fill_strata

Next we replace the Saccharomyces genus with our own tree describing the phylogenetic relationships between the species. We also provide our own proteome sequence.

Figure S4: Saccharomyces family tree

saccharomyces <- Strata(
    tree = ape::read.tree(system.file('extdata', 'yeast', 'tree', package='phylostratr')),
    data = list(faa=list(
             # The user is responsible for collecting these data 
             Saccharomyces_cerevisiae   = 'yeast/cerevisiae.faa',
             Saccharomyces_paradoxus    = 'yeast/paradoxus.faa',
             Saccharomyces_mikatae      = 'yeast/mikatae.faa',
             Saccharomyces_kudriavzevii = 'yeast/kudriavzevii.faa',
             Saccharomyces_arboricola   = 'yeast/arboricola.faa',
             Saccharomyces_eubayanus    = 'yeast/eubayanus.faa',
             Saccharomyces_uvarum       = 'yeast/uvarum.faa'
           )),
    focal_species = 'Saccharomyces_cerevisiae'
) 
plot(saccharomyces@tree, show.node.label=TRUE)
# Convert names to NCBI taxon IDs
saccharomyces <- strata_convert(saccharomyces, target='tip', to='id')
# Replace the NCBI common tree Saccharomyces branch with our custom tree 
strata <- replace_branch(strata, y=saccharomyces, node='4930')

Run BLAST

Now we can BLAST the focal species genome against the other species. This step is exactly the same as the one in the Arabidopsis study.

results <- readRDS('results.Rda')
strata  <- strata_blast(strata, blast_args=list(nthreads=2)) %>% strata_besthits
results <- merge_besthits(strata)
strata  <- strata %>%
    strata_uniprot_pfam_map %>%
    add_proteome_stats

Table S7: Proteome stats tables

strata2 <- strata_convert(strata, target='all', FUN=partial_id_to_name)
proteome_report_table(strata2) %>%
  merge(get_phylostrata_map(strata2)) %>%
  dplyr::arrange(-ps) %>%
  dplyr::select(species, ps, N, min, q25, median, q75, max) %>%
  knitr::kable(format='pandoc', caption="Proteome statistics for all proteomes in the *S. cerevisiae* analysis.  **N**: total number of proteins in the UniProt proteome (these may be redundant). **ps**: phylostratum level, where 1 is *cellular organisms*.  **min**, **q25**, **median**, **q75**, **max**: summaries of the protein lengths in each proteome.")

Figure S5: Plotting top scores across target species

The top hits across target species for each gene in the focal species can be plotted as a heat map. For example:

plot_heatmaps(results, "heatmap.pdf")

phylostratr gene by clade tree-heat map for *Saccharomyces cerevisiae*.  The figure (page **6** of a total of **234** pages) shows the degree of similarly of each protein encoded by the focal species (far left column) to that of its closest homolog in each species in the clade tree.  Proteins encoded by consecutive *S. cereviciae* genes (NM_001178687.1 - NM_001178743.1), sorted by phylostratum membership, are compared to the proteomes of 133 species. The designated phylostratum membership (1-15) is indicated on the far left.  Red wedge: This gene has homologs in the fungal phylostrata, but not in other eukaryotes, and also has bacterial homologs; it is potentially associated with a horizontal transfer event. Orange wedge: A gene with skiped strata, possibly due to lateral transfer or false positive matches to prokaryotes.  Magenta wedge: a gene present in eukaryotes and archaea but not in bacteria.  Black arrowhead: The nine "missing" homologs in *S. arboricola* could be absent due to an interesting biological phenomenon such as a rapidly evolving genome or massive gene loss or due to an incomplete genome annotation; the gene annotation statistics for the *S. arboricola* genome assembly at NCBI support the latter explanation.

Table S8-10: Finding genes that skip strata

Since the code is identical to that in the Arabidopsis study, it is not printed.

revenants <- find_revenants(results)
revenants %>%
    dplyr::distinct(qseqid, n) %>%
    dplyr::group_by(n) %>%
    dplyr::count() %>%
    dplyr::select(n_skipped=n, n_total=nn) %>%
    knitr::kable(format='pandoc', caption="A summary of the number of *S. cerevisiae* genes (**n_total**) where a given number (**n_skipped**) of strata are passed before the oldest stratum with a predicted homolog is reached.")
revenants %>%
    dplyr::filter(n > 4) %>%
    dplyr::select(qseqid, ps, basal_ps, n) %>%
    dplyr::arrange(-ps) %>%
    dplyr::group_by(qseqid) %>%
    dplyr::summarize(basal_ps=basal_ps[1], skipped_ps = paste(ps, collapse=",")) %>%
    dplyr::arrange(-basal_ps) %>%
    knitr::kable(format='pandoc', caption="Here we identify the strata that where skipped for all *S. cerevisiae* genes skipping more than 4 strata. The genes with names other than 'NM_XXXXXXXXX.X' are all mitochondrial genes.")
revenants %>%
    dplyr::arrange(-ps) %>%
    dplyr::group_by(mrca) %>%
    dplyr::summarize(missing_genes = n()) %>%
    dplyr::mutate(mrca = partial_id_to_name(mrca)) %>%
    knitr::kable(format='pandoc', caption="The number of *S. cerevisiae* genes that have no hits to any species in a given stratum ('mrca' but do have hits to older strata. In the table below, we see that 's2', a stratum represented by *S. arboricola*, has skipped far more often than the surrounding strata.")

Figure S6-8: Steps in building the tree

# Get the full taxonomic order
sc_order <-
  focal_taxid %>%
  uniprot_strata %>%
  subtree(name2taxid('Saccharomycetales'))

# Filter for a balanced diverse representation
sc_filtered <- sc_order %>% 
  strata_apply(f=diverse_subtree, n=5, weights=weights) %>%
  uniprot_fill_strata

# Replace the Saccharomyces branch with the users tree 
sc_final <- replace_branch(sc_filtered, saccharomyces, '4930')
plot(
    make_tree_relative_to(sc_order@tree, '4932'),
    show.node.label=TRUE,
    cex=0.45,
    type='cladogram'
)
plot(
    make_tree_relative_to(sc_filtered@tree, '4932'),
    show.node.label=TRUE,
    cex=0.45,
    type='cladogram'
)
plot(
    make_tree_relative_to(sc_final@tree, '4932'),
    show.node.label=TRUE,
    cex=0.45,
    type='cladogram'
)

Comparison to (Carvunis et al., 2012)

The (Carvunis et al., 2012, c2012) phylostratigraphic analysis was based on far less data than is now available. We compare these results to the current study (a2018).

strata_names <- levels(get_phylostrata_map(strata)$mrca)
strata_names <- factor(strata_names, levels=strata_names)
a2018 <- stratify(results, strata_names = strata_names)

# c2012 youngest 4 strata are stored in the `phylostratr` package:
c2012_file <- system.file('extdata', 'carvunis2012_strata.tab', package='phylostratr')
c2012 <- read_tsv(c2012_file)

# The oldest stratum uses S. bayanus, a species not included in the current
# study, so we drop this stratum. 
c2012 <- subset(c2012, conservation_level < 4)

common_ids <- intersect(c2012$refseq_id, a2018$qseqid)

c2012 <- subset(c2012, refseq_id %in% common_ids)
a2018 <- subset(a2018, qseqid %in% common_ids)

# The c2012 `conservation_level`s 1-3 correspond to phylostrata 15,14, and 13,
# respectively. 
c2012$ps <- c('1'=15L, '2'=14L, '3'=13L)[c2012$conservation_level]
c2012 <- select(c2012, qseqid=refseq_id, ps)
c2012$group <- 'c2012'
a2018 <- select(a2018, qseqid, ps)
a2018$group <- 'a2018'

Table S11: Phylostrata comparison to 2012 results

map <- get_phylostrata_map(strata) %>% select(mrca, ps) %>% distinct
c2012_a2018_comp <- rbind(c2012, a2018) %>%
  dplyr::group_by(group, ps) %>%
  dplyr::summarize(count = n()) %>%
  dcast(ps ~ group, value.var='count') %>%
  merge(map) %>%
  mutate(
    c2012 = ifelse(is.na(c2012), '-', c2012),
    mrca = partial_id_to_name(as.character(mrca))
  ) %>%
  select(Phylostratum = mrca, Level = ps, a2018, c2012) %>%
  knitr::kable(format='pandoc', caption="Distribution across strata of genes common to the current *S. cerevisiae* analysis and the youngest 3 strata from (Carvunis et al. 2012).")

Figure S9: E-value threshold classification comparison

default_classifier <- classify_by_adjusted_pvalue(threshold=0.001)
is_present         <- default_classifier(results)
strata_table       <- stratify(results, classifier=default_classifier)
strata_levels      <- levels(strata_table$mrca_name)

ll <- list(
  bon2  = stratify(results, classifier=classify_by_adjusted_pvalue(threshold=1e-2)),
  bon5  = stratify(results, classifier=classify_by_adjusted_pvalue(threshold=1e-5))
)
sigresults <- Reduce(init=ll[[1]], x=ll[-1],
  function(a,b){
    merge(a, b, by='qseqid')
  }
)
sigresults <- sigresults[, c(4,7)]
names(sigresults) <- names(ll)

d <- list()
d$sigmats <- make_significance_matrices(sigresults)
d$jmpmat <- make_jump_matrix(results[is_present, c('qseqid', 'ps')], labels=strata_levels)

d$sigmats[[1]][[1]]@xlab = "Phylostratum (E-value < 1e-5)"
d$sigmats[[1]][[1]]@ylab = "Phylostratum (E-value < 0.01)"
plot(d$sigmats[[1]][[1]])
saveRDS(results, 'results.rds')
saveRDS(strata, 'strata.rds')


arendsee/phylostratr documentation built on Dec. 31, 2022, 10:22 a.m.