# Load the package (after installation, see above).
library(ape)
library(optimx) # optimx seems better than R's default optim()
library(GenSA) # GenSA seems better than optimx (but slower) on 5+ parameters,
# seems to sometimes fail on simple problems (2-3 parameters)
library(rexpokit)
library(cladoRcpp)
library(snow) # (if you want to use multicore functionality; some systems/R versions prefer library(parallel), try either)
library(parallel)
library(BioGeoBEARS)
#######################################################
# SETUP: YOUR WORKING DIRECTORY
#######################################################
# You will need to set your working directory to match your local system
# Note these very handy functions!
# Command "setwd(x)" sets your working directory
# Command "getwd()" gets your working directory and tells you what it is.
# Command "list.files()" lists the files in your working directory
# To get help on any command, use "?". E.g., "?list.files"
# Set your working directory for output files
# default here is your home directory ("~")
# Change this as you like
wd = np("~")
setwd(wd)
# Double-check your working directory with getwd()
getwd()
#######################################################
# SETUP: Extension data directory
#######################################################
# When R packages contain extra files, they are stored in the "extdata" directory
# inside the installed package.
#
# BioGeoBEARS contains various example files and scripts in its extdata directory.
#
# Each computer operating system might install BioGeoBEARS in a different place,
# depending on your OS and settings.
#
# However, you can find the extdata directory like this:
extdata_dir = np(system.file("extdata", package="BioGeoBEARS"))
extdata_dir
list.files(extdata_dir)
# "system.file" looks in the directory of a specified package (in this case BioGeoBEARS)
# The function "np" is just a shortcut for normalizePath(), which converts the
# path to the format appropriate for your system (e.g., Mac/Linux use "/", but
# Windows uses "\\", if memory serves).
# Even when using your own data files, you should KEEP these commands in your
# script, since the plot_BioGeoBEARS_results function needs a script from the
# extdata directory to calculate the positions of "corners" on the plot. This cannot
# be made into a straight up BioGeoBEARS function because it uses C routines
# from the package APE which do not pass R CMD check for some reason.
#######################################################
# SETUP: YOUR TREE FILE AND GEOGRAPHY FILE
#######################################################
# Example files are given below. To run your own data,
# make the below lines point to your own files, e.g.
# trfn = "/mydata/frogs/frogBGB/tree.newick"
# geogfn = "/mydata/frogs/frogBGB/geog.data"
#######################################################
# Phylogeny file
# Notes:
# 1. Must be binary/bifurcating: no polytomies
# 2. No negative branchlengths (e.g. BEAST MCC consensus trees sometimes have negative branchlengths)
# 3. Be careful of very short branches, as BioGeoBEARS will interpret ultrashort branches as direct ancestors
# 4. You can use non-ultrametric trees, but BioGeoBEARS will interpret any tips significantly below the
# top of the tree as fossils! This is only a good idea if you actually do have fossils in your tree,
# as in e.g. Wood, Matzke et al. (2013), Systematic Biology.
# 5. The default settings of BioGeoBEARS make sense for trees where the branchlengths are in units of
# millions of years, and the tree is 1-1000 units tall. If you have a tree with a total height of
# e.g. 0.00001, you will need to adjust e.g. the max values of d and e, or (simpler) multiply all
# your branchlengths to get them into reasonable units.
# 6. DON'T USE SPACES IN SPECIES NAMES, USE E.G. "_"
#######################################################
# This is the example Newick file for Hawaiian Psychotria
# (from Ree & Smith 2008)
# "trfn" = "tree file name"
trfn = np(paste(addslash(extdata_dir), "Psychotria_5.2.newick", sep=""))
# Look at the raw Newick file:
moref(trfn)
# Look at your phylogeny (plots to a PDF, which avoids issues with multiple graphics in same window):
pdffn = "tree.pdf"
pdf(file=pdffn, width=9, height=12)
tr = read.tree(trfn)
tr
plot(tr)
title("Example Psychotria phylogeny from Ree & Smith (2008)")
axisPhylo() # plots timescale
dev.off()
cmdstr = paste0("open ", pdffn)
system(cmdstr)
#######################################################
# Geography file
# Notes:
# 1. This is a PHYLIP-formatted file. This means that in the
# first line,
# - the 1st number equals the number of rows (species)
# - the 2nd number equals the number of columns (number of areas)
# - after a tab, put the areas in parentheses, with spaces: (A B C D)
#
# 1.5. Example first line:
# 10 4 (A B C D)
#
# 2. The second line, and subsequent lines:
# speciesA 0110
# speciesB 0111
# speciesC 0001
# ...
#
# 2.5a. This means a TAB between the species name and the area 0/1s
# 2.5b. This also means NO SPACE AND NO TAB between the area 0/1s.
#
# 3. See example files at:
# http://phylo.wikidot.com/biogeobears#files
#
# 4. Make you understand what a PLAIN-TEXT EDITOR is:
# http://phylo.wikidot.com/biogeobears#texteditors
#
# 3. The PHYLIP format is the same format used for C++ LAGRANGE geography files.
#
# 4. All names in the geography file must match names in the phylogeny file.
#
# 5. DON'T USE SPACES IN SPECIES NAMES, USE E.G. "_"
#
# 6. Operational taxonomic units (OTUs) should ideally be phylogenetic lineages,
# i.e. genetically isolated populations. These may or may not be identical
# with species. You would NOT want to just use specimens, as each specimen
# automatically can only live in 1 area, which will typically favor DEC+J
# models. This is fine if the species/lineages really do live in single areas,
# but you wouldn't want to assume this without thinking about it at least.
# In summary, you should collapse multiple specimens into species/lineages if
# data indicates they are the same genetic population.
######################################################
# This is the example geography file for Hawaiian Psychotria
# (from Ree & Smith 2008)
geogfn = np(paste(addslash(extdata_dir), "Psychotria_geog.data", sep=""))
# Look at the raw geography text file:
moref(geogfn)
# Look at your geographic range data:
tipranges = getranges_from_LagrangePHYLIP(lgdata_fn=geogfn)
tipranges
# Maximum range size observed:
max(rowSums(dfnums_to_numeric(tipranges@df)))
# Set the maximum number of areas any species may occupy; this cannot be larger
# than the number of areas you set up, but it can be smaller.
max_range_size = 4
####################################################
####################################################
# KEY HINT: The number of states (= number of different possible geographic ranges)
# depends on (a) the number of areas and (b) max_range_size.
# If you have more than about 500-600 states, the calculations will get REALLY slow,
# since the program has to exponentiate a matrix of e.g. 600x600. Often the computer
# will just sit there and crunch, and never get through the calculation of the first
# likelihood.
#
# (this is also what is usually happening when LAGRANGE hangs: you have too many states!)
#
# To check the number of states for a given number of ranges, try:
numstates_from_numareas(numareas=4, maxareas=4, include_null_range=TRUE)
numstates_from_numareas(numareas=4, maxareas=4, include_null_range=FALSE)
numstates_from_numareas(numareas=4, maxareas=3, include_null_range=TRUE)
numstates_from_numareas(numareas=4, maxareas=2, include_null_range=TRUE)
# Large numbers of areas have problems:
numstates_from_numareas(numareas=10, maxareas=10, include_null_range=TRUE)
# ...unless you limit the max_range_size:
numstates_from_numareas(numareas=10, maxareas=2, include_null_range=TRUE)
####################################################
####################################################
#######################################################
#######################################################
# DEC AND DEC+J ANALYSIS
#######################################################
#######################################################
# NOTE: The BioGeoBEARS "DEC" model is identical with
# the Lagrange DEC model, and should return identical
# ML estimates of parameters, and the same
# log-likelihoods, for the same datasets.
#
# Ancestral state probabilities at nodes will be slightly
# different, since BioGeoBEARS is reporting the
# ancestral state probabilities under the global ML
# model, and Lagrange is reporting ancestral state
# probabilities after re-optimizing the likelihood
# after fixing the state at each node. These will
# be similar, but not identical. See Matzke (2014),
# Systematic Biology, for discussion.
#
# Also see Matzke (2014) for presentation of the
# DEC+J model.
#######################################################
#######################################################
#######################################################
#######################################################
#######################################################
# Run DEC
#######################################################
# Intitialize a default model (DEC model)
BioGeoBEARS_run_object = define_BioGeoBEARS_run()
# Give BioGeoBEARS the location of the phylogeny Newick file
BioGeoBEARS_run_object$trfn = trfn
# Give BioGeoBEARS the location of the geography text file
BioGeoBEARS_run_object$geogfn = geogfn
# Input the maximum range size
BioGeoBEARS_run_object$max_range_size = max_range_size
BioGeoBEARS_run_object$min_branchlength = 0.000001 # Min to treat tip as a direct ancestor (no speciation event)
BioGeoBEARS_run_object$include_null_range = TRUE # set to FALSE for e.g. DEC* model, DEC*+J, etc.
# (For DEC* and other "*" models, please cite: Massana, Kathryn A.; Beaulieu,
# Jeremy M.; Matzke, Nicholas J.; O’Meara, Brian C. (2015). Non-null Effects of
# the Null Range in Biogeographic Models: Exploring Parameter Estimation in the
# DEC Model. bioRxiv, http://biorxiv.org/content/early/2015/09/16/026914 )
# Also: search script on "include_null_range" for other places to change
# Set up a time-stratified analysis:
# 1. Here, un-comment ONLY the files you want to use.
# 2. Also un-comment "BioGeoBEARS_run_object = section_the_tree(...", below.
# 3. For example files see (a) extdata_dir,
# or (b) http://phylo.wikidot.com/biogeobears#files
# and BioGeoBEARS Google Group posts for further hints)
#
# Uncomment files you wish to use in time-stratified analyses:
#BioGeoBEARS_run_object$timesfn = "timeperiods.txt"
#BioGeoBEARS_run_object$dispersal_multipliers_fn = "manual_dispersal_multipliers.txt"
#BioGeoBEARS_run_object$areas_allowed_fn = "areas_allowed.txt"
#BioGeoBEARS_run_object$areas_adjacency_fn = "areas_adjacency.txt"
#BioGeoBEARS_run_object$distsfn = "distances_matrix.txt"
# See notes on the distances model on PhyloWiki's BioGeoBEARS updates page.
# Speed options and multicore processing if desired
BioGeoBEARS_run_object$on_NaN_error = -1e50 # returns very low lnL if parameters produce NaN error (underflow check)
BioGeoBEARS_run_object$speedup = TRUE # shorcuts to speed ML search; use FALSE if worried (e.g. >3 params)
BioGeoBEARS_run_object$use_optimx = TRUE # if FALSE, use optim() instead of optimx();
# if "GenSA", use Generalized Simulated Annealing, which seems better on high-dimensional
# problems (5+ parameters), but seems to sometimes fail to optimize on simple problems
BioGeoBEARS_run_object$num_cores_to_use = 1
# (use more cores to speed it up; this requires
# library(parallel) and/or library(snow). The package "parallel"
# is now default on Macs in R 3.0+, but apparently still
# has to be typed on some Windows machines. Note: apparently
# parallel works on Mac command-line R, but not R.app.
# BioGeoBEARS checks for this and resets to 1
# core with R.app)
# Sparse matrix exponentiation is an option for huge numbers of ranges/states (600+)
# I have experimented with sparse matrix exponentiation in EXPOKIT/rexpokit,
# but the results are imprecise and so I haven't explored it further.
# In a Bayesian analysis, it might work OK, but the ML point estimates are
# not identical.
# Also, I have not implemented all functions to work with force_sparse=TRUE.
# Volunteers are welcome to work on it!!
BioGeoBEARS_run_object$force_sparse = FALSE # force_sparse=TRUE causes pathology & isn't much faster at this scale
# This function loads the dispersal multiplier matrix etc. from the text files into the model object. Required for these to work!
# (It also runs some checks on these inputs for certain errors.)
BioGeoBEARS_run_object = readfiles_BioGeoBEARS_run(BioGeoBEARS_run_object)
# Divide the tree up by timeperiods/strata (uncomment this for stratified analysis)
#BioGeoBEARS_run_object = section_the_tree(inputs=BioGeoBEARS_run_object, make_master_table=TRUE, plot_pieces=FALSE, fossils_older_than=0.001, cut_fossils=FALSE)
# The stratified tree is described in this table:
#BioGeoBEARS_run_object$master_table
# Good default settings to get ancestral states
BioGeoBEARS_run_object$return_condlikes_table = TRUE
BioGeoBEARS_run_object$calc_TTL_loglike_from_condlikes_table = TRUE
BioGeoBEARS_run_object$calc_ancprobs = TRUE # get ancestral states from optim run
# Set up DEC model
# (nothing to do; defaults)
# Look at the BioGeoBEARS_run_object; it's just a list of settings etc.
BioGeoBEARS_run_object
# This contains the model object
BioGeoBEARS_run_object$BioGeoBEARS_model_object
# This table contains the parameters of the model
BioGeoBEARS_run_object$BioGeoBEARS_model_object@params_table
# Run this to check inputs. Read the error messages if you get them!
BioGeoBEARS_run_object = fix_BioGeoBEARS_params_minmax(BioGeoBEARS_run_object=BioGeoBEARS_run_object)
check_BioGeoBEARS_run(BioGeoBEARS_run_object)
# For a slow analysis, run once, then set runslow=FALSE to just
# load the saved result.
runslow = TRUE
resfn = "Psychotria_DEC_M0_unconstrained_v1.Rdata"
if (runslow)
{
res = bears_optim_run(BioGeoBEARS_run_object)
res
save(res, file=resfn)
resDEC = res
} else {
# Loads to "res"
load(resfn)
resDEC = res
}
# Added to package
#source("/Users/nmat471/HD/GitHub/BioGeoBEARS/R/aa_tree_basics_v1.R")
#source("/Users/nmat471/HD/GitHub/BioGeoBEARS/R/BioGeoBEARS_plots_v1.R")
# Get some info:
results_object = resDEC
# Basic areas info
areas = getareas_from_tipranges_object(tipranges)
areas
if (!is.na(results_object$inputs$max_range_size))
{
max_range_size = results_object$inputs$max_range_size
} else {
max_range_size = length(areas)
}
if (is.null(results_object$inputs$states_list))
{
numstates = numstates_from_numareas(numareas=length(areas), maxareas=max_range_size, include_null_range=results_object$inputs$include_null_range)
numstates
states_list_areaLetters = areas_list_to_states_list_new(areas, maxareas=max_range_size, include_null_range=results_object$inputs$include_null_range)
states_list_0based_index = rcpp_areas_list_to_states_list(areas, maxareas=max_range_size, include_null_range=results_object$inputs$include_null_range)
} else {
states_list_0based_index = results_object$inputs$states_list
}
if (is.null(states_list_0based_index))
{
statenames = areas_list_to_states_list_new(areas, maxareas=max_range_size, include_null_range=results_object$inputs$include_null_range, split_ABC=FALSE)
ranges_list = as.list(statenames)
statenames
} else {
ranges_list = states_list_0based_to_ranges_txt_list(state_indices_0based=states_list_0based_index, areanames=areas)
ranges_list
statenames = unlist(ranges_list)
statenames
}
colors_list_for_states=NULL
# Set up colors for each state
if (is.null(colors_list_for_states))
{
# Fix plot_null_range to FALSE (don't want to plot that color)
colors_matrix = get_colors_for_numareas(length(areas))
colors_list_for_states = mix_colors_for_states(colors_matrix, states_list_0based_index, plot_null_range=results_object$inputs$include_null_range)
colors_list_for_states
} # END if (is.null(colors_list_for_states))
# Set up colors by possible ranges
if (is.null(ranges_list))
{
possible_ranges_list_txt = areas_list_to_states_list_new(areas, maxareas=max_range_size, split_ABC=FALSE, include_null_range=results_object$inputs$include_null_range)
} else {
possible_ranges_list_txt = ranges_list
} # if (is.null(ranges_list))
# Just tip states, by hand
tipnodes = tnn(tr)
intnodes = inn(tr)
relprobs_matrix = results_object$ML_marginal_prob_each_state_at_branch_top_AT_node
MLprobs = get_ML_probs(relprobs_matrix)
MLprobs
MLstates = get_ML_states_from_relprobs(relprobs_matrix, statenames, returnwhat="states", if_ties="takefirst")
MLstates
states_list_0based = states_list_0based_index
probs_each_area = infprobs_to_probs_of_each_area(relprobs_matrix, states_list=states_list_0based)
cols_byNode = rangestxt_to_colors(possible_ranges_list_txt, colors_list_for_states, MLstates)
tips = tipnodes
statecex = 0.6
tiplabel_adj=c(0.5)
#######################################################
# PDF plots
#######################################################
pdffn = "Psychotria_DEC_vs_DEC+J_M0_unconstrained_v1.pdf"
pdf(pdffn, height=6, width=6)
#######################################################
# Plot ancestral states - DEC
#######################################################
analysis_titletxt ="BioGeoBEARS DEC on Psychotria M0_unconstrained"
# Setup
results_object = resDEC
scriptdir = np(system.file("extdata/a_scripts", package="BioGeoBEARS"))
# States, tips and internal
res2 = plot_BioGeoBEARS_results(results_object, analysis_titletxt, addl_params=list("j"), plotwhat="text", label.offset=0.45, tipcex=0.7, statecex=0.7, splitcex=0.6, titlecex=0.8, plotsplits=TRUE, cornercoords_loc=scriptdir, include_null_range=TRUE, tr=tr, tipranges=tipranges)
dev.off()
cmdstr = paste0("open ", pdffn)
system(cmdstr)
#######################################################
# Plot just node labels
#######################################################
pdffn = "nodelabels.pdf"
pdf(file=pdffn, width=6, height=6)
plot_tree_nodenums_tipnums(tr)
dev.off()
cmdstr = paste0("open ", pdffn)
system(cmdstr)
pdffn = "Psychotria_DEC_per-area_probs1.pdf"
pdf(pdffn, height=6, width=6)
plot(tr, label.offset=0.1)
axisPhylo()
mtext("Millions of years ago (Ma)", side=1, line=2)
tiplabels(text=MLstates[tips], tip=tips, bg=cols_byNode[tips], cex=statecex, adj=tiplabel_adj)
# Add per-area probs internally
add_per_area_probs_to_nodes(tr, probs_each_area, nodes_to_plot=intnodes)
plot(tr, label.offset=0.1)
axisPhylo()
mtext("Millions of years ago (Ma)", side=1, line=2)
tiplabels(text=MLstates[tips], tip=tips, bg=cols_byNode[tips], cex=statecex, adj=tiplabel_adj)
# Add per-area probs internally
cols_each_area = c("blue", "green", "yellow", "red")
add_per_area_probs_to_nodes(tr, probs_each_area, cols_each_area=cols_each_area, barwidth_proportion=0.03, barheight_proportion=0.025, offset_nodenums=c(34,35), offset_xvals=c(-0.0,-0.0), offset_yvals=c(-0.0,-0.0), nodes_to_plot=intnodes)
# Move node labels
plot(tr, label.offset=0.1)
axisPhylo()
mtext("Millions of years ago (Ma)", side=1, line=2)
tiplabels(text=MLstates[tips], tip=tips, bg=cols_byNode[tips], cex=statecex, adj=tiplabel_adj)
# Add per-area probs internally
cols_each_area = c("blue", "green", "yellow", "red")
add_per_area_probs_to_nodes(tr, probs_each_area, cols_each_area=cols_each_area, barwidth_proportion=0.03, barheight_proportion=0.025, offset_nodenums=c(24,29), offset_xvals=c(-2,1), offset_yvals=c(-1,0.0), nodes_to_plot=intnodes)
dev.off() # Turn off PDF
cmdstr = paste("open ", pdffn, sep="")
system(cmdstr) # Plot it
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.