In arendsee/phylostratr: Analyze phylostratigraphic data

Yeast case study

Build tree

require(phylostratr)
require(magrittr)
require(dplyr)
require(readr)
require(knitr)
require(taxizedb)
require(ape)
require(reshape2)

# set weights on species selection
weights <- c(
  '1355160' = 0,   # these three are deprecated
  '245562'  = 0,
  '1094981' = 0,
  '284813'  = 1.1, # these are the reference replacements
  '766039'  = 1.1,
  '237561'  = 1.1
)
focal_taxid <- '4932'

strata <- readRDS('strata.Rda')

strata <- focal_taxid %>%
  # Get all UniProt proteomes
  uniprot_strata %>%
  # build a tree of all UniProt genomes
  # Select a diverse subset of 5 or fewer representatives from each stratum.
  # Only do this above the Saccharomyces genus, since we will later replace
  # Saccharomyces with out own tree.
  strata_apply(f=diverse_subtree, n=5, weights=weights) %>%
  # add prokaryote stratum
  use_recommended_prokaryotes %>%
  # download UniProt sequences (this may take 10+ minutes)
  uniprot_fill_strata

Next we replace the Saccharomyces genus with our own tree describing the phylogenetic relationships between the species. We also provide our own proteome sequence.

Figure S4: Saccharomyces family tree

saccharomyces <- Strata(
    tree = ape::read.tree(system.file('extdata', 'yeast', 'tree', package='phylostratr')),
    data = list(faa=list(
             # The user is responsible for collecting these data 
             Saccharomyces_cerevisiae   = 'yeast/cerevisiae.faa',
             Saccharomyces_paradoxus    = 'yeast/paradoxus.faa',
             Saccharomyces_mikatae      = 'yeast/mikatae.faa',
             Saccharomyces_kudriavzevii = 'yeast/kudriavzevii.faa',
             Saccharomyces_arboricola   = 'yeast/arboricola.faa',
             Saccharomyces_eubayanus    = 'yeast/eubayanus.faa',
             Saccharomyces_uvarum       = 'yeast/uvarum.faa'
           )),
    focal_species = 'Saccharomyces_cerevisiae'
) 
plot(saccharomyces@tree, show.node.label=TRUE)

# Convert names to NCBI taxon IDs
saccharomyces <- strata_convert(saccharomyces, target='tip', to='id')

# Replace the NCBI common tree Saccharomyces branch with our custom tree 
strata <- replace_branch(strata, y=saccharomyces, node='4930')

Run BLAST

Now we can BLAST the focal species genome against the other species. This step is exactly the same as the one in the Arabidopsis study.

results <- readRDS('results.Rda')

strata  <- strata_blast(strata, blast_args=list(nthreads=2)) %>% strata_besthits
results <- merge_besthits(strata)
strata  <- strata %>%
    strata_uniprot_pfam_map %>%
    add_proteome_stats

Table S7: Proteome stats tables

strata2 <- strata_convert(strata, target='all', FUN=partial_id_to_name)
proteome_report_table(strata2) %>%
  merge(get_phylostrata_map(strata2)) %>%
  dplyr::arrange(-ps) %>%
  dplyr::select(species, ps, N, min, q25, median, q75, max) %>%
  knitr::kable(format='pandoc', caption="Proteome statistics for all proteomes in the *S. cerevisiae* analysis.  **N**: total number of proteins in the UniProt proteome (these may be redundant). **ps**: phylostratum level, where 1 is *cellular organisms*.  **min**, **q25**, **median**, **q75**, **max**: summaries of the protein lengths in each proteome.")

Figure S5: Plotting top scores across target species

The top hits across target species for each gene in the focal species can be plotted as a heat map. For example:

plot_heatmaps(results, "heatmap.pdf")

Table S8-10: Finding genes that skip strata

Since the code is identical to that in the Arabidopsis study, it is not printed.

revenants <- find_revenants(results)

revenants %>%
    dplyr::distinct(qseqid, n) %>%
    dplyr::group_by(n) %>%
    dplyr::count() %>%
    dplyr::select(n_skipped=n, n_total=nn) %>%
    knitr::kable(format='pandoc', caption="A summary of the number of *S. cerevisiae* genes (**n_total**) where a given number (**n_skipped**) of strata are passed before the oldest stratum with a predicted homolog is reached.")

revenants %>%
    dplyr::filter(n > 4) %>%
    dplyr::select(qseqid, ps, basal_ps, n) %>%
    dplyr::arrange(-ps) %>%
    dplyr::group_by(qseqid) %>%
    dplyr::summarize(basal_ps=basal_ps[1], skipped_ps = paste(ps, collapse=",")) %>%
    dplyr::arrange(-basal_ps) %>%
    knitr::kable(format='pandoc', caption="Here we identify the strata that where skipped for all *S. cerevisiae* genes skipping more than 4 strata. The genes with names other than 'NM_XXXXXXXXX.X' are all mitochondrial genes.")

revenants %>%
    dplyr::arrange(-ps) %>%
    dplyr::group_by(mrca) %>%
    dplyr::summarize(missing_genes = n()) %>%
    dplyr::mutate(mrca = partial_id_to_name(mrca)) %>%
    knitr::kable(format='pandoc', caption="The number of *S. cerevisiae* genes that have no hits to any species in a given stratum ('mrca' but do have hits to older strata. In the table below, we see that 's2', a stratum represented by *S. arboricola*, has skipped far more often than the surrounding strata.")

Figure S6-8: Steps in building the tree

# Get the full taxonomic order
sc_order <-
  focal_taxid %>%
  uniprot_strata %>%
  subtree(name2taxid('Saccharomycetales'))

# Filter for a balanced diverse representation
sc_filtered <- sc_order %>% 
  strata_apply(f=diverse_subtree, n=5, weights=weights) %>%
  uniprot_fill_strata

# Replace the Saccharomyces branch with the users tree 
sc_final <- replace_branch(sc_filtered, saccharomyces, '4930')

plot(
    make_tree_relative_to(sc_order@tree, '4932'),
    show.node.label=TRUE,
    cex=0.45,
    type='cladogram'
)

plot(
    make_tree_relative_to(sc_filtered@tree, '4932'),
    show.node.label=TRUE,
    cex=0.45,
    type='cladogram'
)

plot(
    make_tree_relative_to(sc_final@tree, '4932'),
    show.node.label=TRUE,
    cex=0.45,
    type='cladogram'
)

Comparison to (Carvunis et al., 2012)

The (Carvunis et al., 2012, c2012) phylostratigraphic analysis was based on far less data than is now available. We compare these results to the current study (a2018).

strata_names <- levels(get_phylostrata_map(strata)$mrca)
strata_names <- factor(strata_names, levels=strata_names)
a2018 <- stratify(results, strata_names = strata_names)

# c2012 youngest 4 strata are stored in the `phylostratr` package:
c2012_file <- system.file('extdata', 'carvunis2012_strata.tab', package='phylostratr')
c2012 <- read_tsv(c2012_file)

# The oldest stratum uses S. bayanus, a species not included in the current
# study, so we drop this stratum. 
c2012 <- subset(c2012, conservation_level < 4)

common_ids <- intersect(c2012$refseq_id, a2018$qseqid)

c2012 <- subset(c2012, refseq_id %in% common_ids)
a2018 <- subset(a2018, qseqid %in% common_ids)

# The c2012 `conservation_level`s 1-3 correspond to phylostrata 15,14, and 13,
# respectively. 
c2012$ps <- c('1'=15L, '2'=14L, '3'=13L)[c2012$conservation_level]
c2012 <- select(c2012, qseqid=refseq_id, ps)
c2012$group <- 'c2012'
a2018 <- select(a2018, qseqid, ps)
a2018$group <- 'a2018'

Table S11: Phylostrata comparison to 2012 results

map <- get_phylostrata_map(strata) %>% select(mrca, ps) %>% distinct
c2012_a2018_comp <- rbind(c2012, a2018) %>%
  dplyr::group_by(group, ps) %>%
  dplyr::summarize(count = n()) %>%
  dcast(ps ~ group, value.var='count') %>%
  merge(map) %>%
  mutate(
    c2012 = ifelse(is.na(c2012), '-', c2012),
    mrca = partial_id_to_name(as.character(mrca))
  ) %>%
  select(Phylostratum = mrca, Level = ps, a2018, c2012) %>%

  knitr::kable(format='pandoc', caption="Distribution across strata of genes common to the current *S. cerevisiae* analysis and the youngest 3 strata from (Carvunis et al. 2012).")

Figure S9: E-value threshold classification comparison

default_classifier <- classify_by_adjusted_pvalue(threshold=0.001)
is_present         <- default_classifier(results)
strata_table       <- stratify(results, classifier=default_classifier)
strata_levels      <- levels(strata_table$mrca_name)

ll <- list(
  bon2  = stratify(results, classifier=classify_by_adjusted_pvalue(threshold=1e-2)),
  bon5  = stratify(results, classifier=classify_by_adjusted_pvalue(threshold=1e-5))
)
sigresults <- Reduce(init=ll[[1]], x=ll[-1],
  function(a,b){
    merge(a, b, by='qseqid')
  }
)
sigresults <- sigresults[, c(4,7)]
names(sigresults) <- names(ll)

d <- list()
d$sigmats <- make_significance_matrices(sigresults)
d$jmpmat <- make_jump_matrix(results[is_present, c('qseqid', 'ps')], labels=strata_levels)

d$sigmats[[1]][[1]]@xlab = "Phylostratum (E-value < 1e-5)"
d$sigmats[[1]][[1]]@ylab = "Phylostratum (E-value < 0.01)"
plot(d$sigmats[[1]][[1]])

saveRDS(results, 'results.rds')
saveRDS(strata, 'strata.rds')

arendsee/phylostratr documentation built on Dec. 31, 2022, 10:22 a.m.

rdrr.io home R language documentation Run R code online

CRAN packages Bioconductor packages R-Forge packages GitHub packages

Note that we can't provide technical support on individual packages. You should contact the package authors for that.

arendsee/phylostratr
Analyze phylostratigraphic data

In arendsee/phylostratr: Analyze phylostratigraphic data

Yeast case study

Build tree

Figure S4: Saccharomyces family tree

Run BLAST

Table S7: Proteome stats tables

Figure S5: Plotting top scores across target species

Table S8-10: Finding genes that skip strata

Figure S6-8: Steps in building the tree

Comparison to (Carvunis et al., 2012)

Table S11: Phylostrata comparison to 2012 results

Figure S9: E-value threshold classification comparison

R Package Documentation

Browse R Packages

We want your feedback!

arendsee/phylostratr Analyze phylostratigraphic data

In arendsee/phylostratr: Analyze phylostratigraphic data

Yeast case study

Build tree

Figure S4: Saccharomyces family tree

Run BLAST

Table S7: Proteome stats tables

Figure S5: Plotting top scores across target species

Table S8-10: Finding genes that skip strata

Figure S6-8: Steps in building the tree

Comparison to (Carvunis et al., 2012)

Table S11: Phylostrata comparison to 2012 results

Figure S9: E-value threshold classification comparison

R Package Documentation

Browse R Packages

We want your feedback!

arendsee/phylostratr
Analyze phylostratigraphic data