require(phylostratr) require(magrittr) require(dplyr) require(readr) require(knitr) require(taxizedb) require(ape) require(reshape2)
# set weights on species selection weights <- c( '1355160' = 0, # these three are deprecated '245562' = 0, '1094981' = 0, '284813' = 1.1, # these are the reference replacements '766039' = 1.1, '237561' = 1.1 ) focal_taxid <- '4932'
strata <- readRDS('strata.Rda')
strata <- focal_taxid %>% # Get all UniProt proteomes uniprot_strata %>% # build a tree of all UniProt genomes # Select a diverse subset of 5 or fewer representatives from each stratum. # Only do this above the Saccharomyces genus, since we will later replace # Saccharomyces with out own tree. strata_apply(f=diverse_subtree, n=5, weights=weights) %>% # add prokaryote stratum use_recommended_prokaryotes %>% # download UniProt sequences (this may take 10+ minutes) uniprot_fill_strata
Next we replace the Saccharomyces genus with our own tree describing the phylogenetic relationships between the species. We also provide our own proteome sequence.
saccharomyces <- Strata( tree = ape::read.tree(system.file('extdata', 'yeast', 'tree', package='phylostratr')), data = list(faa=list( # The user is responsible for collecting these data Saccharomyces_cerevisiae = 'yeast/cerevisiae.faa', Saccharomyces_paradoxus = 'yeast/paradoxus.faa', Saccharomyces_mikatae = 'yeast/mikatae.faa', Saccharomyces_kudriavzevii = 'yeast/kudriavzevii.faa', Saccharomyces_arboricola = 'yeast/arboricola.faa', Saccharomyces_eubayanus = 'yeast/eubayanus.faa', Saccharomyces_uvarum = 'yeast/uvarum.faa' )), focal_species = 'Saccharomyces_cerevisiae' ) plot(saccharomyces@tree, show.node.label=TRUE)
# Convert names to NCBI taxon IDs saccharomyces <- strata_convert(saccharomyces, target='tip', to='id')
# Replace the NCBI common tree Saccharomyces branch with our custom tree strata <- replace_branch(strata, y=saccharomyces, node='4930')
Now we can BLAST the focal species genome against the other species. This step is exactly the same as the one in the Arabidopsis study.
results <- readRDS('results.Rda')
strata <- strata_blast(strata, blast_args=list(nthreads=2)) %>% strata_besthits results <- merge_besthits(strata) strata <- strata %>% strata_uniprot_pfam_map %>% add_proteome_stats
strata2 <- strata_convert(strata, target='all', FUN=partial_id_to_name) proteome_report_table(strata2) %>% merge(get_phylostrata_map(strata2)) %>% dplyr::arrange(-ps) %>% dplyr::select(species, ps, N, min, q25, median, q75, max) %>% knitr::kable(format='pandoc', caption="Proteome statistics for all proteomes in the *S. cerevisiae* analysis. **N**: total number of proteins in the UniProt proteome (these may be redundant). **ps**: phylostratum level, where 1 is *cellular organisms*. **min**, **q25**, **median**, **q75**, **max**: summaries of the protein lengths in each proteome.")
The top hits across target species for each gene in the focal species can be plotted as a heat map. For example:
plot_heatmaps(results, "heatmap.pdf")
Since the code is identical to that in the Arabidopsis study, it is not printed.
revenants <- find_revenants(results)
revenants %>% dplyr::distinct(qseqid, n) %>% dplyr::group_by(n) %>% dplyr::count() %>% dplyr::select(n_skipped=n, n_total=nn) %>% knitr::kable(format='pandoc', caption="A summary of the number of *S. cerevisiae* genes (**n_total**) where a given number (**n_skipped**) of strata are passed before the oldest stratum with a predicted homolog is reached.")
revenants %>% dplyr::filter(n > 4) %>% dplyr::select(qseqid, ps, basal_ps, n) %>% dplyr::arrange(-ps) %>% dplyr::group_by(qseqid) %>% dplyr::summarize(basal_ps=basal_ps[1], skipped_ps = paste(ps, collapse=",")) %>% dplyr::arrange(-basal_ps) %>% knitr::kable(format='pandoc', caption="Here we identify the strata that where skipped for all *S. cerevisiae* genes skipping more than 4 strata. The genes with names other than 'NM_XXXXXXXXX.X' are all mitochondrial genes.")
revenants %>% dplyr::arrange(-ps) %>% dplyr::group_by(mrca) %>% dplyr::summarize(missing_genes = n()) %>% dplyr::mutate(mrca = partial_id_to_name(mrca)) %>% knitr::kable(format='pandoc', caption="The number of *S. cerevisiae* genes that have no hits to any species in a given stratum ('mrca' but do have hits to older strata. In the table below, we see that 's2', a stratum represented by *S. arboricola*, has skipped far more often than the surrounding strata.")
# Get the full taxonomic order sc_order <- focal_taxid %>% uniprot_strata %>% subtree(name2taxid('Saccharomycetales')) # Filter for a balanced diverse representation sc_filtered <- sc_order %>% strata_apply(f=diverse_subtree, n=5, weights=weights) %>% uniprot_fill_strata # Replace the Saccharomyces branch with the users tree sc_final <- replace_branch(sc_filtered, saccharomyces, '4930')
plot( make_tree_relative_to(sc_order@tree, '4932'), show.node.label=TRUE, cex=0.45, type='cladogram' )
plot( make_tree_relative_to(sc_filtered@tree, '4932'), show.node.label=TRUE, cex=0.45, type='cladogram' )
plot( make_tree_relative_to(sc_final@tree, '4932'), show.node.label=TRUE, cex=0.45, type='cladogram' )
The (Carvunis et al., 2012, c2012) phylostratigraphic analysis was based on far less data than is now available. We compare these results to the current study (a2018).
strata_names <- levels(get_phylostrata_map(strata)$mrca) strata_names <- factor(strata_names, levels=strata_names) a2018 <- stratify(results, strata_names = strata_names) # c2012 youngest 4 strata are stored in the `phylostratr` package: c2012_file <- system.file('extdata', 'carvunis2012_strata.tab', package='phylostratr') c2012 <- read_tsv(c2012_file) # The oldest stratum uses S. bayanus, a species not included in the current # study, so we drop this stratum. c2012 <- subset(c2012, conservation_level < 4) common_ids <- intersect(c2012$refseq_id, a2018$qseqid) c2012 <- subset(c2012, refseq_id %in% common_ids) a2018 <- subset(a2018, qseqid %in% common_ids) # The c2012 `conservation_level`s 1-3 correspond to phylostrata 15,14, and 13, # respectively. c2012$ps <- c('1'=15L, '2'=14L, '3'=13L)[c2012$conservation_level] c2012 <- select(c2012, qseqid=refseq_id, ps) c2012$group <- 'c2012' a2018 <- select(a2018, qseqid, ps) a2018$group <- 'a2018'
map <- get_phylostrata_map(strata) %>% select(mrca, ps) %>% distinct c2012_a2018_comp <- rbind(c2012, a2018) %>% dplyr::group_by(group, ps) %>% dplyr::summarize(count = n()) %>% dcast(ps ~ group, value.var='count') %>% merge(map) %>% mutate( c2012 = ifelse(is.na(c2012), '-', c2012), mrca = partial_id_to_name(as.character(mrca)) ) %>% select(Phylostratum = mrca, Level = ps, a2018, c2012) %>%
knitr::kable(format='pandoc', caption="Distribution across strata of genes common to the current *S. cerevisiae* analysis and the youngest 3 strata from (Carvunis et al. 2012).")
default_classifier <- classify_by_adjusted_pvalue(threshold=0.001) is_present <- default_classifier(results) strata_table <- stratify(results, classifier=default_classifier) strata_levels <- levels(strata_table$mrca_name) ll <- list( bon2 = stratify(results, classifier=classify_by_adjusted_pvalue(threshold=1e-2)), bon5 = stratify(results, classifier=classify_by_adjusted_pvalue(threshold=1e-5)) ) sigresults <- Reduce(init=ll[[1]], x=ll[-1], function(a,b){ merge(a, b, by='qseqid') } ) sigresults <- sigresults[, c(4,7)] names(sigresults) <- names(ll) d <- list() d$sigmats <- make_significance_matrices(sigresults) d$jmpmat <- make_jump_matrix(results[is_present, c('qseqid', 'ps')], labels=strata_levels) d$sigmats[[1]][[1]]@xlab = "Phylostratum (E-value < 1e-5)" d$sigmats[[1]][[1]]@ylab = "Phylostratum (E-value < 0.01)" plot(d$sigmats[[1]][[1]])
saveRDS(results, 'results.rds') saveRDS(strata, 'strata.rds')
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.