#######################################################
# DETECTION PROBABILITIES
#######################################################
# source("/Dropbox/_njm/__packages/BioGeoBEARS_setup/BioGeoBEARS_detection_v1.R")
#require("ape")
#require("rexpokit")
#require("cladoRcpp")
# Get tipranges from a detection counts matrix
#' Read a table of detection counts into a tipranges_object
#'
#' Reads a table of detection counts into a \code{tipranges_object}, coding any
#' nonzero counts as present (1), and any zero counts as absent (0).
#'
#' The function can read the table either from a \code{BioGeoBEARS_run_object$detects_df}
#' item, or from an input detections \code{data.frame} \code{detects_df} directly.
#'
#' @param BioGeoBEARS_run_object The inputs list (typically a BioGeoBEARS_run_object),
#' derived from e.g. \code{define_BioGeoBEARS_run()}. If used, needs to have a
#' \code{BioGeoBEARS_run_object$detects_df} available.
#' @param detects_df A \code{data.frame} for the counts of detections of OTUs of
#' interest. This can be read from a text file with
#' \code{\link{read_detections}}. The format of the text file is
#' similar to a geography file, but with counts in tab-delimited columns
#' instead of the collapsed 00110 format. See PhyloWiki.
#' @return tipranges A \code{tipranges_object} with presence (1) / absence (0) coding per taxon.
#' @export
#' @author Nicholas J. Matzke \email{matzke@@berkeley.edu}
#' @examples
#' test=1
get_tipranges_from_detects_df <- function(BioGeoBEARS_run_object=NULL, detects_df=NULL)
{
# Resolve the detections table: prefer the copy stored in the run object,
# otherwise fall back to the detects_df argument; error if neither exists.
if (is.null(BioGeoBEARS_run_object$detects_df) == FALSE)
	{
	detects_df = BioGeoBEARS_run_object$detects_df
	} else {
	if (is.null(detects_df))
		{
		stoptxt = "STOP ERROR in get_tipranges_from_detects_df(): either BioGeoBEARS_run_object$detects_df or detects_df must be non-null!"
		cat("\n\n")
		cat(stoptxt)
		cat("\n\n")
		stop(stoptxt)
		} # END if (is.null(detects_df))
	} # END if (is.null(BioGeoBEARS_run_object$detects_df) == FALSE)

# Convert counts to binary presence/absence:
# blank cells become 0; any positive count becomes 1
counts_mat = as.matrix(detects_df)
counts_mat[isblank_TF(counts_mat)] = 0
counts_mat[counts_mat > 0] = 1

# Build the tipranges object and restore the taxon row names
tipranges_object = define_tipranges_object(tmpdf=adf2(counts_mat))
tipranges_object@df = adf2(tipranges_object@df)
rownames(tipranges_object@df) = rownames(counts_mat)
return(tipranges_object)
} # END get_tipranges_from_detects_df <- function(BioGeoBEARS_run_object=NULL, detects_df=NULL)
# Get geographic ranges at tips, from the detections file rather than a geog file
#' Read a file of detection counts into a tipranges_object
#'
#' Reads a file of detection counts into a \code{tipranges_object}, coding any
#' nonzero counts as present (1), and any zero counts as absent (0).
#'
#' @param detects_fn Filename for the counts of detections of OTUs of interest. This is
#' similar to a geography file, but with counts in tab-delimited columns
#' instead of the collapsed 00110 format. See PhyloWiki.
#' @return tipranges A \code{tipranges_object} with presence (1) / absence (0) coding per taxon.
#' @export
#' @author Nicholas J. Matzke \email{matzke@@berkeley.edu}
#' @examples
#' test=1
tipranges_from_detects_fn <- function(detects_fn)
{
# Guard clause: a character filename is required; anything else is a setup error
if (is.character(detects_fn) == FALSE)
	{
	txt = paste0("STOP ERROR in tipranges_from_detects_fn(): you are trying to get tipranges from a detections file, so an input file is required for input detects_fn. This is required to set up the tipranges internally.")
	cat("\n\n")
	cat(txt)
	cat("\n\n")
	stop(txt)
	} # END if (is.character(detects_fn) == FALSE)

# Read the detection counts table from the file
detects_df = read_detections(detects_fn, OTUnames=NULL, areanames=NULL, tmpskip=0, phy=NULL)

# Convert counts to binary presence/absence:
# blank cells become 0; any positive count becomes 1
counts_mat = as.matrix(detects_df)
counts_mat[isblank_TF(counts_mat)] = 0
counts_mat[counts_mat > 0] = 1

# Build the tipranges object and restore the taxon row names
tipranges_object = define_tipranges_object(tmpdf=adf2(counts_mat))
tipranges_object@df = adf2(tipranges_object@df)
rownames(tipranges_object@df) = rownames(counts_mat)
return(tipranges_object)
} # END tipranges_from_detects_fn <- function(detects_fn)
#' Read a file of detection counts from a BioGeoBEARS_run_object into a tipranges_object
#'
#' Reads a file of detection counts from a BioGeoBEARS_run_object into
#' a \code{tipranges_object}, coding any
#' nonzero counts as present (1), and any zero counts as absent (0).
#'
#' @param BioGeoBEARS_run_object The inputs list (typically a \code{BioGeoBEARS_run_object}),
#' derived from e.g. \code{define_BioGeoBEARS_run()}. Here, a
#' \code{BioGeoBEARS_run_object$detects_fn} is needed, specifying the filename
#' for the counts of detections of OTUs of interest. This is
#' similar to a geography file, but with counts in tab-delimited columns
#' instead of the collapsed 00110 format. See PhyloWiki.
#' @return tipranges A \code{tipranges_object} with presence (1) / absence (0) coding per taxon.
#' @export
#' @author Nicholas J. Matzke \email{matzke@@berkeley.edu}
#' @examples
#' test=1
tipranges_from_BGB_run_object <- function(BioGeoBEARS_run_object)
{
# Get geographic ranges at tips: the source depends on whether the
# detection model is in use for this run.
if (BioGeoBEARS_run_object$use_detection_model == TRUE)
	{
	# Detection model on: build 0/1 ranges from the detections file
	tipranges = tipranges_from_detects_fn(detects_fn=BioGeoBEARS_run_object$detects_fn)
	} else if (BioGeoBEARS_run_object$use_detection_model == FALSE) {
	# Detection model off: read a standard LAGRANGE-style geography file
	tipranges = getranges_from_LagrangePHYLIP(lgdata_fn=np(BioGeoBEARS_run_object$geogfn))
	} # END if (BioGeoBEARS_run_object$use_detection_model == TRUE)
return(tipranges)
} # END tipranges_from_BGB_run_object <- function(BioGeoBEARS_run_object)
#' Calculate geographic ancestral state probabilities from a traits+geography analysis
#'
#' This function takes the ancestral state probabilities from a traits+geography analysis
#' and returns just the ancestral state probabilities of the geographic states.
#'
#' In a trait-dependent dispersal model, the state space consists of all of the
#' geographic states, combined with all of the trait states. For example, if there
#' are 4 geographic areas, there are 16 possible geographic ranges (geographic states),
#' and if there are 2 trait states, then there are a total of 16x2=32 trait+geography
#' combined states.
#'
#' The geography-only ancestral state probabilities can be calculated by
#' summing the trait+geography state probabilities across each trait state. The
#' trait-only ancestral state probabilities can be calculated by
#' summing the trait+geography state probabilities across each geographic range/state.
#'
#' @param res A \code{BioGeoBEARS_results_object} that is the result of a
#' \code{\link{bears_optim_run}}. (typically \code{res},
#' \code{resDEC}, \code{resDECj}, etc.)
#' @param num_trait_states The number of trait states. Default: 2.
#' @return newres, a new results object with ancestral state probabilities for just
#' the geographic ranges/states
#' @export
#' @author Nicholas J. Matzke \email{matzke@@berkeley.edu}
#' @examples
#' test=1
geog_ancstates_from_both <- function(res, num_trait_states=2)
{
# Collapse combined trait+geography ancestral state probabilities to
# geography-only probabilities by summing, for each geographic state, the
# probabilities across all trait states.
#
# BUGFIX/GENERALIZATION: the previous implementation hard-coded exactly two
# trait blocks (states1 + states2), so any num_trait_states other than 2
# produced a non-conformable-array error. The per-block loop below works
# for any number of trait states that evenly divides the column count.
newres = res
numcols = dim(res$ML_marginal_prob_each_state_at_branch_top_AT_node)[2]
# Check if even: the combined state space must be
# (num geog states) x (num trait states)
even_TF = ((numcols %% num_trait_states) == 0)
if (even_TF == FALSE)
	{
	txt = paste0("STOP ERROR in geog_ancstates_from_both(): Your num_trait_states does not divide evenly into the number of columns in res$ML_marginal_prob_each_state_at_branch_top_AT_node")
	stop(txt)
	}
num_geog_states = numcols / num_trait_states

# Helper: sum the consecutive blocks of num_geog_states columns (one block
# per trait state), yielding an nrow x num_geog_states matrix.
sum_trait_blocks = function(state_probs)
	{
	geog_probs = matrix(0, nrow=nrow(state_probs), ncol=num_geog_states)
	startcol = 0
	for (i in 1:num_trait_states)
		{
		cols_to_sum = (startcol+1):(startcol+num_geog_states)
		# drop=FALSE keeps a matrix even when num_geog_states is 1
		geog_probs = geog_probs + state_probs[, cols_to_sum, drop=FALSE]
		startcol = startcol + num_geog_states
		}
	return(geog_probs)
	}

newres$ML_marginal_prob_each_state_at_branch_top_AT_node = sum_trait_blocks(res$ML_marginal_prob_each_state_at_branch_top_AT_node)
newres$ML_marginal_prob_each_state_at_branch_bottom_below_node = sum_trait_blocks(res$ML_marginal_prob_each_state_at_branch_bottom_below_node)
return(newres)
} # END geog_ancstates_from_both <- function(res, num_trait_states=2)
#' Calculate trait ancestral state probabilities from a traits+geography analysis
#'
#' This function takes the ancestral state probabilities from a traits+geography analysis
#' and returns just the ancestral state probabilities of the trait states.
#'
#' In a trait-dependent dispersal model, the state space consists of all of the
#' geographic states, combined with all of the trait states. For example, if there
#' are 4 geographic areas, there are 16 possible geographic ranges (geographic states),
#' and if there are 2 trait states, then there are a total of 16x2=32 trait+geography
#' combined states.
#'
#' The geography-only ancestral state probabilities can be calculated by
#' summing the trait+geography state probabilities across each trait state. The
#' trait-only ancestral state probabilities can be calculated by
#' summing the trait+geography state probabilities across each geographic range/state.
#'
#' @param res A \code{BioGeoBEARS_results_object} that is the result of a
#' \code{\link{bears_optim_run}}. (typically \code{res},
#' \code{resDEC}, \code{resDECj}, etc.)
#' @param num_trait_states The number of trait states. Default: 2.
#' @return newres, a new results object with ancestral state probabilities for just
#' the geographic ranges/states
#' @export
#' @author Nicholas J. Matzke \email{matzke@@berkeley.edu}
#' @examples
#' test=1
trait_ancstates_from_both <- function(res, num_trait_states=2)
{
# Collapse combined trait+geography ancestral state probabilities to
# trait-only probabilities by summing, within each trait state, the
# probabilities over all geographic ranges.
newres = res
top_probs = res$ML_marginal_prob_each_state_at_branch_top_AT_node
bot_probs = res$ML_marginal_prob_each_state_at_branch_bottom_below_node
numcols = dim(top_probs)[2]
# num_trait_states must divide the combined state count evenly
if ((numcols %% num_trait_states) != 0)
	{
	txt = paste0("STOP ERROR in trait_ancstates_from_both(): Your num_trait_states does not divide evenly into the number of columns in res$ML_marginal_prob_each_state_at_branch_top_AT_node")
	stop(txt)
	}
num_geog_states = numcols / num_trait_states
numrows = nrow(top_probs)
trait_probs_tops = matrix(data=NA, nrow=numrows, ncol=num_trait_states)
trait_probs_bots = matrix(data=NA, nrow=numrows, ncol=num_trait_states)
for (trait_i in seq_len(num_trait_states))
	{
	# Columns of the trait_i-th block of geographic states
	block_cols = ((trait_i - 1) * num_geog_states + 1):(trait_i * num_geog_states)
	# matrix() keeps a single-geog-state block as a 1-column matrix for rowSums()
	trait_probs_tops[, trait_i] = rowSums(matrix(top_probs[, block_cols], ncol=num_geog_states))
	trait_probs_bots[, trait_i] = rowSums(matrix(bot_probs[, block_cols], ncol=num_geog_states))
	}
newres$ML_marginal_prob_each_state_at_branch_top_AT_node = trait_probs_tops
newres$ML_marginal_prob_each_state_at_branch_bottom_below_node = trait_probs_bots
return(newres)
} # END trait_ancstates_from_both <- function(res, num_trait_states=2)
#######################################################
# read_detections
#######################################################
#' Read a file with detection counts per area
#'
#' This function reads in a tab-delimited text file containing counts of detections of
#' each OTU in each region. These could be from database records or some other source.
#'
#' @param detects_fn The filename of the detections file.
#' @param OTUnames Default \code{NULL}, in which case the first column of the text file is used as row names/OTU names.
#' @param areanames Default \code{NULL}, in which case the text file column headings are used.
#' @param tmpskip How many lines should be skipped before reading the text file? Default \code{0}.
#' @param phy An ape phylo object. If included, the rows will be sorted to match the order of tree tip labels.
#' @return \code{dtf}
#' @export
#' @seealso \code{\link[cladoRcpp]{rcpp_calc_anclikes_sp_COOweights_faster}}
#' @note Go BEARS!
#' @author Nicholas J. Matzke \email{matzke@@berkeley.edu}
#' @references
#' \url{http://phylo.wikidot.com/matzke-2013-international-biogeography-society-poster}
#' @bibliography /Dropbox/_njm/__packages/BioGeoBEARS_setup/BioGeoBEARS_refs.bib
#' @cite Matzke_2012_IBS
#' @cite Bottjer_Jablonski_1988
#' @examples
#' testval=1
#'
#' # soft-coded input files
#' extdata_dir = np(system.file("extdata", package="BioGeoBEARS"))
#' detects_fn = np(paste(extdata_dir, "/Psychotria_detections_v1.txt", sep=""))
#' controls_fn = np(paste(extdata_dir, "/Psychotria_controls_v1.txt", sep=""))
#'
#' OTUnames=NULL
#' areanames=NULL
#' tmpskip=0
#' phy=NULL
#'
#' dtf = read_detections(detects_fn, OTUnames=NULL, areanames=NULL, tmpskip=0, phy=NULL)
#' dtf
#'
#' trfn = np(paste(extdata_dir, "/Psychotria_5.2.newick", sep=""))
#' phy = read.tree(trfn)
#'
#' dtf = read_detections(detects_fn, OTUnames=NULL, areanames=NULL, tmpskip=0, phy=phy)
#' dtf
#'
#'
#'
#'
#' # soft-coded input files
#' extdata_dir = np(system.file("extdata", package="BioGeoBEARS"))
#' detects_fn = np(paste(extdata_dir, "/Psychotria_detections_v1.txt", sep=""))
#' controls_fn = np(paste(extdata_dir, "/Psychotria_controls_v1.txt", sep=""))
#'
#' OTUnames=NULL
#' areanames=NULL
#' tmpskip=0
#' phy=NULL
#'
#' dtf = read_controls(controls_fn, OTUnames=NULL, areanames=NULL, tmpskip=0, phy=NULL)
#' dtf
#'
#' trfn = np(paste(extdata_dir, "/Psychotria_5.2.newick", sep=""))
#' phy = read.tree(trfn)
#'
#' dtf = read_controls(controls_fn, OTUnames=NULL, areanames=NULL, tmpskip=0, phy=phy)
#' dtf
#'
#'
read_detections <- function(detects_fn, OTUnames=NULL, areanames=NULL, tmpskip=0, phy=NULL)
{
# Developer scratch notes kept in an unused string constant (never executed)
runjunk='
detects_fn = "/Library/Frameworks/R.framework/Versions/2.15/Resources/library/BioGeoBEARS/extdata/Psychotria_detections_v1.txt"
detects_fn = "/Dropbox/_njm/__packages/BioGeoBEARS_setup/inst/extdata/Psychotria_detections_v1.txt"
OTUnames=NULL
areanames=NULL
tmpskip=0
phy=NULL
trfn = "/Dropbox/_njm/__packages/BioGeoBEARS_setup/inst/extdata/Psychotria_5.2.newick"
phy = read.tree(trfn)
'
# Fail early with a detailed, user-friendly message if the file is missing
if (file.exists(detects_fn) == FALSE)
{
txt = paste0("STOP ERROR in read_detections(): file detects_fn='", detects_fn, "' was not found. Check your inputs and your working directory (wd). Use commands like:\n - 'getwd()' to get your current R working directory (wd)\n - '?setwd' to see how to set your R working directory\n - 'list.files()' to list the files in your current R working directory\n - and use 'system('open .')' to open your file browsing program from the R command line (this works on macs, at least). ")
cat("\n\n")
cat(txt)
cat("\n\n")
stop(txt)
} # END if (file.exists(detects_fn) == FALSE)
# If OTUnames is NULL, the first file column supplies the row/OTU names
if (is.null(OTUnames) == TRUE)
{
# Tab-delimited read; row names taken from column 1
dtf = utils::read.table(file=detects_fn, header=TRUE, skip=tmpskip, sep="\t", quote="", stringsAsFactors = FALSE, strip.white=TRUE, fill=TRUE, row.names=1)
dtf
OTUnames = row.names(dtf)
} else {
# Assumes no OTU names
# NOTE(review): this branch reads with sep=" " while the branch above uses
# sep="\t" -- confirm which delimiter is intended for files lacking an
# OTU-name column (the roxygen says the file is tab-delimited).
dtf = utils::read.table(file=detects_fn, header=TRUE, skip=tmpskip, sep=" ", quote="", stringsAsFactors = FALSE, strip.white=TRUE, fill=TRUE, row.names=OTUnames)
dtf
}
# Override column (area) names only if the caller supplied them
if (is.null(areanames) == TRUE)
{
pass=1
} else {
names(dtf) = areanames
}
# Order tip labels
# Reorder rows to match the tree's tip-label order; tips absent from the
# table produce NA rows (match() returns NA for them)
if (!is.null(phy))
{
tipranges_df_order = match(phy$tip.label, rownames(dtf))
dtf = dtf[tipranges_df_order, ]
OTUnames = OTUnames[tipranges_df_order]
}
# Standardize and return
dtf = adf2(dtf)
dtf = dfnums_to_numeric(dtf)
row.names(dtf) = OTUnames
return(dtf)
}
#######################################################
# read_controls
#######################################################
#' Read a file with the total number of detections in a taphonomic control
#'
#' This function reads in a tab-delimited text file containing counts of detections of
#' the taphonomic controls in each region. These numbers should always be equal to or
#' larger than the counts in the detections file.
#'
#' The idea of taphonomic controls dates back at least to work of Bottjer & Jablonski (1988). The basic
#' idea is that if you have taxa of roughly similar detectability, then detections of other
#' taxa give some idea of overall detection effort. Obviously this is a very simple
#' model that can be criticized in any number of ways (different alpha diversity in each
#' region, different detectability of individual taxa, etc.), but it is a useful starting
#' point as there has been no implementation of any detection model in historical/phylogenetic
#' biogeography to date.
#'
#' One could imagine (a) every OTU and area has a different count of detections and taphonomic control
#' detections, or (b) the taphonomic control detections are specified by area, and shared across all OTUs.
#' Situation (b) is likely more common, but this function implements (a). Behavior (b)
#' could be reproduced by summing each column, and/or copying this sum to all cells for a
#' particular area.
#'
#' @param controls_fn The filename of the file containing the counts of taphonomic control detections.
#' @param OTUnames Default \code{NULL}, in which case the first column of the text file is used as row names/OTU names.
#' @param areanames Default \code{NULL}, in which case the text file column headings are used.
#' @param tmpskip How many lines should be skipped before reading the text file? Default \code{0}.
#' @param phy An ape phylo object. If included, the rows will be sorted to match the order of tree tip labels.
#' @return \code{dtf}
#' @export
#' @seealso \code{\link[cladoRcpp]{rcpp_calc_anclikes_sp_COOweights_faster}}
#' @note Go BEARS!
#' @author Nicholas J. Matzke \email{matzke@@berkeley.edu}
#' @references
#' \url{http://phylo.wikidot.com/matzke-2013-international-biogeography-society-poster}
#' @bibliography /Dropbox/_njm/__packages/BioGeoBEARS_setup/BioGeoBEARS_refs.bib
#' @cite Matzke_2012_IBS
#' @cite Bottjer_Jablonski_1988
#' @examples
#' testval=1
#'
#' # soft-coded input files
#' extdata_dir = np(system.file("extdata", package="BioGeoBEARS"))
#' detects_fn = np(paste(extdata_dir, "/Psychotria_detections_v1.txt", sep=""))
#' controls_fn = np(paste(extdata_dir, "/Psychotria_controls_v1.txt", sep=""))
#'
#' OTUnames=NULL
#' areanames=NULL
#' tmpskip=0
#' phy=NULL
#'
#' dtf = read_detections(detects_fn, OTUnames=NULL, areanames=NULL, tmpskip=0, phy=NULL)
#' dtf
#'
#' trfn = np(paste(extdata_dir, "/Psychotria_5.2.newick", sep=""))
#' phy = read.tree(trfn)
#'
#' dtf = read_detections(detects_fn, OTUnames=NULL, areanames=NULL, tmpskip=0, phy=phy)
#' dtf
#'
#'
#'
#'
#' # soft-coded input files
#' extdata_dir = np(system.file("extdata", package="BioGeoBEARS"))
#' detects_fn = np(paste(extdata_dir, "/Psychotria_detections_v1.txt", sep=""))
#' controls_fn = np(paste(extdata_dir, "/Psychotria_controls_v1.txt", sep=""))
#'
#' OTUnames=NULL
#' areanames=NULL
#' tmpskip=0
#' phy=NULL
#'
#' dtf = read_controls(controls_fn, OTUnames=NULL, areanames=NULL, tmpskip=0, phy=NULL)
#' dtf
#'
#' trfn = np(paste(extdata_dir, "/Psychotria_5.2.newick", sep=""))
#' phy = read.tree(trfn)
#'
#' dtf = read_controls(controls_fn, OTUnames=NULL, areanames=NULL, tmpskip=0, phy=phy)
#' dtf
#'
#'
read_controls <- function(controls_fn, OTUnames=NULL, areanames=NULL, tmpskip=0, phy=NULL)
{
# Developer scratch notes kept in an unused string constant (never executed)
runjunk='
controls_fn = "/Library/Frameworks/R.framework/Versions/2.15/Resources/library/BioGeoBEARS/extdata/Psychotria_controls_v1.txt"
controls_fn = "/Dropbox/_njm/__packages/BioGeoBEARS_setup/inst/extdata/Psychotria_controls_v1.txt"
OTUnames=NULL
areanames=NULL
tmpskip=0
'
# Fail early with a detailed, user-friendly message if the file is missing
if (file.exists(controls_fn) == FALSE)
{
txt = paste0("STOP ERROR in read_controls(): file controls_fn='", controls_fn, "' was not found. Check your inputs and your working directory (wd). Use commands like:\n - 'getwd()' to get your current R working directory (wd)\n - '?setwd' to see how to set your R working directory\n - 'list.files()' to list the files in your current R working directory\n - and use 'system('open .')' to open your file browsing program from the R command line (this works on macs, at least). ")
cat("\n\n")
cat(txt)
cat("\n\n")
stop(txt)
} # END if (file.exists(controls_fn) == FALSE)
# If OTUnames is NULL, the first file column supplies the row/OTU names
if (is.null(OTUnames) == TRUE)
{
# NOTE(review): both branches here read with sep=" ", although the roxygen
# describes the controls file as tab-delimited (and read_detections() uses
# sep="\t" in its corresponding branch) -- confirm the intended delimiter.
dtf = utils::read.table(file=controls_fn, header=TRUE, skip=tmpskip, sep=" ", quote="", stringsAsFactors = FALSE, strip.white=TRUE, fill=TRUE, row.names=1)
dtf
OTUnames = row.names(dtf)
} else {
# Assumes no OTU names
dtf = utils::read.table(file=controls_fn, header=TRUE, skip=tmpskip, sep=" ", quote="", stringsAsFactors = FALSE, strip.white=TRUE, fill=TRUE, row.names=OTUnames)
dtf
}
# Override column (area) names only if the caller supplied them
if (is.null(areanames) == TRUE)
{
pass=1
} else {
names(dtf) = areanames
}
# Order tip labels
# Reorder rows to match the tree's tip-label order; tips absent from the
# table produce NA rows (match() returns NA for them)
if (!is.null(phy))
{
tipranges_df_order = match(phy$tip.label, rownames(dtf))
dtf = dtf[tipranges_df_order, ]
OTUnames = OTUnames[tipranges_df_order]
}
# Standardize and return
dtf = adf2(dtf)
dtf = dfnums_to_numeric(dtf)
row.names(dtf) = OTUnames
return(dtf)
}
# Likelihood model
# Calculate P(obs | presence)
# Source:
# "/Users/nickm/Desktop/__projects/___phylokriging_example/__scripts/_ocean_data_predict_v6.R', chdir = TRUE)
#######################################################
# calc_obs_like
#######################################################
#' Calculate likelihood of count data given true presence/absence and parameters
#'
#' This function calculates P(data|presence,parameters), i.e. the probability of some detection and
#' taphonomic control counts, given the true geographic range/state, and parameters such as \code{dp}, a
#' detection probability (and, optionally, a false detection probability, \code{fdp}).
#'
#' The idea of taphonomic controls dates back at least to work of Bottjer & Jablonski (1988). The basic
#' idea is that if you have taxa of roughly similar detectability, then detections of other
#' taxa give some idea of overall detection effort. Obviously this is a very simple
#' model that can be criticized in any number of ways (different alpha diversity in each
#' region, different detectability of individual taxa, etc.), but it is a useful starting
#' point as there has been no implementation of any detection model in historical/phylogenetic
#' biogeography to date.
#'
#' One could imagine (a) every OTU and area has a different count of detections and taphonomic control
#' detections, or (b) the taphonomic control detections are specified by area, and shared across all OTUs.
#' Situation (b) is likely more common, but this function assumes (a) as this is the more thorough case.
#' Behavior (b) could be reproduced by summing each column, and/or copying this sum to all cells for a
#' particular area.
#'
#' @param truly_present Is the OTU of interest known/conditionally assumed to be truly present (\code{TRUE}) or truly absent (\code{FALSE})?
#' @param obs_target_species A count of detections of your OTU of interest, e.g. as produced from a cell of the matrix output from \code{\link{read_detections}}.
#' @param obs_all_species A count of detections of your taphonomic controls, e.g. as produced from a cell of the output from \code{\link{read_controls}}.
#' @param mean_frequency This is the proportion of samples from the taphonomic control group that will truly be from this OTU, GIVEN that the OTU is present.
#' This could be estimated, but a decent first guess is (total # samples of OTU of interest / total # of samples in the taphonomic control group where
#' the OTU is known to be present). All that is really needed is some reasonable value, such that more sampling without detection lowers the
#' likelihood of the data on the hypothesis of true presence, and vice versa. This value can only be 1 when the number of detections = the number
#' of taphonomic control detections, for every OTU and area. This is the implicit assumption in e.g. standard historical biogeography analyses in
#' LAGRANGE or BioGeoBEARS.
#' @param dp The detection probability. This is the per-sample probability that you will correctly detect the OTU in question,
#' when you are looking at it. Default is 1, which is the implicit assumption in standard analyses.
#' @param fdp The false detection probability. This is probability of falsely concluding a detection of the OTU of interest occurred, when in
#' fact the specimen was of something else. The default is 0, which assumes zero error rate,
#' i.e. the assumption being made in all historical biogeography analyses that do not take into account detection
#' probability. These options are being included for completeness, but it may not be wise to try to infer \code{mean_frequency},
#' \code{dp} and \code{fdp} all at once due to identifiability issues (and estimation of fdp may take a very large amount of data). However,
#' fixing some of these parameters to reasonable values can allow the user to effectively include beliefs about the uncertainty of the
#' input data into the analysis, if desired.
#' @return \code{lnlike_allobs_given_absence} The natural log-likelihood of the data, given the model & assumption of true presence or absence.
#' @export
#' @seealso \code{\link{mapply_calc_post_prob_presence}}, \code{\link{calc_post_prob_presence}}, \code{\link{mapply_calc_obs_like}}
#' @note Go BEARS!
#' @author Nicholas J. Matzke \email{matzke@@berkeley.edu}
#' @references
#' \url{http://phylo.wikidot.com/matzke-2013-international-biogeography-society-poster}
#' @bibliography /Dropbox/_njm/__packages/BioGeoBEARS_setup/BioGeoBEARS_refs.bib
#' @cite Matzke_2012_IBS
#' @cite Bottjer_Jablonski_1988
#' @examples
#'
#' # Example: 10 observations of the species mean dramatically higher likelihood of the
#' # data on the hypothesis that it is truly present.
#'
#' # With zero error rate
#' obs_target_species = 10
#' obs_all_species = 100
#' mean_frequency=0.1
#' dp=1
#' fdp=0
#' LnL_under_presence = calc_obs_like(truly_present=TRUE, obs_target_species,
#' obs_all_species, mean_frequency, dp, fdp)
#' LnL_under_absence = calc_obs_like(truly_present=FALSE, obs_target_species,
#' obs_all_species, mean_frequency, dp, fdp)
#' LnL_under_presence
#' LnL_under_absence
#' # Note that the probability of getting detections, under the hypothesis of
#' # true absence, is -Inf
#'
#'
#' # With a small error rate, there is some small but positive probability of
#' # falsely getting 10 detections
#' obs_target_species = 10
#' obs_all_species = 100
#' mean_frequency=0.1
#' dp=0.99
#' fdp=0.001
#' LnL_under_presence = calc_obs_like(truly_present=TRUE, obs_target_species,
#' obs_all_species, mean_frequency, dp, fdp)
#' LnL_under_absence = calc_obs_like(truly_present=FALSE, obs_target_species,
#' obs_all_species, mean_frequency, dp, fdp)
#' LnL_under_presence
#' LnL_under_absence
#' # i.e. the prob. of the data is 1 under the hypothesis of presence, and 0
#' # under the hypothesis of absence (ln(prob) = 0 & -Inf, respectively)
#'
#'
#' # Note that with very high error rates, your conclusion could reverse
#' obs_target_species = 10
#' obs_all_species = 100
#' mean_frequency=0.1
#' dp=0.5
#' fdp=0.3
#' LnL_under_presence = calc_obs_like(truly_present=TRUE, obs_target_species,
#' obs_all_species, mean_frequency, dp, fdp)
#' LnL_under_absence = calc_obs_like(truly_present=FALSE, obs_target_species,
#' obs_all_species, mean_frequency, dp, fdp)
#' LnL_under_presence
#' LnL_under_absence
#'
#'
#' # Example #2 -- what if you have ZERO detections, but lots of detections
#' # of your taphonomic control?
#' obs_target_species = 0
#' obs_all_species = 1
#' mean_frequency=0.1
#' dp=1
#' fdp=0
#' LnL_under_presence = calc_obs_like(truly_present=TRUE, obs_target_species,
#' obs_all_species, mean_frequency, dp, fdp)
#' LnL_under_absence = calc_obs_like(truly_present=FALSE, obs_target_species,
#' obs_all_species, mean_frequency, dp, fdp)
#' LnL_under_presence
#' LnL_under_absence
#'
#' # With a slight error rate
#' obs_target_species = 0
#' obs_all_species = 1
#' mean_frequency=0.1
#' dp=0.99
#' fdp=0.001
#' LnL_under_presence = calc_obs_like(truly_present=TRUE, obs_target_species,
#' obs_all_species, mean_frequency, dp, fdp)
#' LnL_under_absence = calc_obs_like(truly_present=FALSE, obs_target_species,
#' obs_all_species, mean_frequency, dp, fdp)
#' LnL_under_presence
#' LnL_under_absence
#'
#'
#' obs_target_species = 0
#' obs_all_species = 2
#' mean_frequency=0.1
#' dp=1
#' fdp=0
#' LnL_under_presence = calc_obs_like(truly_present=TRUE, obs_target_species,
#' obs_all_species, mean_frequency, dp, fdp)
#' LnL_under_absence = calc_obs_like(truly_present=FALSE, obs_target_species,
#' obs_all_species, mean_frequency, dp, fdp)
#' LnL_under_presence
#' LnL_under_absence
#'
#' # With a slight error rate
#' obs_target_species = 0
#' obs_all_species = 2
#' mean_frequency=0.1
#' dp=0.99
#' fdp=0.001
#' LnL_under_presence = calc_obs_like(truly_present=TRUE, obs_target_species,
#' obs_all_species, mean_frequency, dp, fdp)
#' LnL_under_absence = calc_obs_like(truly_present=FALSE, obs_target_species,
#' obs_all_species, mean_frequency, dp, fdp)
#' LnL_under_presence
#' LnL_under_absence
#'
#'
#'
#'
#'
#' # Example #3 -- what if you have ZERO detections, but only a few
#' # detections of your taphonomic control?
#' obs_target_species = 0
#' obs_all_species = 100
#' mean_frequency=0.1
#' dp=1
#' fdp=0
#' LnL_under_presence = calc_obs_like(truly_present=TRUE, obs_target_species,
#' obs_all_species, mean_frequency, dp, fdp)
#' LnL_under_absence = calc_obs_like(truly_present=FALSE, obs_target_species,
#' obs_all_species, mean_frequency, dp, fdp)
#' LnL_under_presence
#' LnL_under_absence
#'
#' # With a slight error rate
#' obs_target_species = 0
#' obs_all_species = 100
#' mean_frequency=0.1
#' dp=0.99
#' fdp=0.001
#' LnL_under_presence = calc_obs_like(truly_present=TRUE, obs_target_species,
#' obs_all_species, mean_frequency, dp, fdp)
#' LnL_under_absence = calc_obs_like(truly_present=FALSE, obs_target_species,
#' obs_all_species, mean_frequency, dp, fdp)
#' LnL_under_presence
#' LnL_under_absence
#'
#'
#'
#' # Special cases -- e.g., no data
#' # Prob(data)=1, ln(prob)=0
#' obs_target_species = 0
#' obs_all_species = 0
#' mean_frequency=0.1
#' dp=0.99
#' fdp=0.001
#' LnL_under_presence = calc_obs_like(truly_present=TRUE, obs_target_species,
#' obs_all_species, mean_frequency, dp, fdp)
#' LnL_under_absence = calc_obs_like(truly_present=FALSE, obs_target_species,
#' obs_all_species, mean_frequency, dp, fdp)
#' LnL_under_presence
#' LnL_under_absence
#'
#' obs_target_species = 0
#' obs_all_species = 0
#' mean_frequency=0.1
#' dp=1
#' fdp=0
#' LnL_under_presence = calc_obs_like(truly_present=TRUE, obs_target_species,
#' obs_all_species, mean_frequency, dp, fdp)
#' LnL_under_absence = calc_obs_like(truly_present=FALSE, obs_target_species,
#' obs_all_species, mean_frequency, dp, fdp)
#' LnL_under_presence
#' LnL_under_absence
#'
#'
#' # What if, for some reason, you put in identical detections and taphonomic control
#' # counts? (e.g., you load in a standard tipranges file)
#' obs_target_species = 1
#' obs_all_species = 1
#' mean_frequency=1
#' dp=1
#' fdp=0
#' LnL_under_presence = calc_obs_like(truly_present=TRUE, obs_target_species,
#' obs_all_species, mean_frequency, dp, fdp)
#' LnL_under_absence = calc_obs_like(truly_present=FALSE, obs_target_species,
#' obs_all_species, mean_frequency, dp, fdp)
#' LnL_under_presence
#' LnL_under_absence
#'
#' # What if, for some reason, you put in identical detections and taphonomic control
#' # counts? (e.g., you load in a standard tipranges file)
#' obs_target_species = 1
#' obs_all_species = 1
#' mean_frequency=1
#' dp=0.99
#' fdp=0.001
#' LnL_under_presence = calc_obs_like(truly_present=TRUE, obs_target_species,
#' obs_all_species, mean_frequency, dp, fdp)
#' LnL_under_absence = calc_obs_like(truly_present=FALSE, obs_target_species,
#' obs_all_species, mean_frequency, dp, fdp)
#' LnL_under_presence
#' LnL_under_absence
#'
#'
calc_obs_like <- function(truly_present=TRUE, obs_target_species, obs_all_species, mean_frequency=0.1, dp=1, fdp=0)
{
	# Calculate the natural log-likelihood of the detection-count data,
	# conditional on the OTU being truly present (truly_present=TRUE) or truly
	# absent (truly_present=FALSE) in the area.
	#
	# obs_target_species: count of detections of the target OTU
	# obs_all_species:    count of detections of the taphonomic control group
	# mean_frequency:     P(a control-group sample is the target | true presence)
	# dp:                 detection probability, P(accurate ID | sample is target)
	# fdp:                false detection probability, P(target ID | sample is NOT target)
	#
	# Returns: a scalar natural log-likelihood (may be -Inf; never NaN).
	
	# Error check -- if you have NO taphonomic controls, you have no information
	# to calculate the likelihood. Therefore just return ln(prob=1)=0 for any
	# hypothesis (implicit flat prior).
	if (obs_all_species == 0)
	{
		return(0)
	}
	
	# Guarded per-term log-likelihood: when the count is 0, the probability of
	# "those data" is 1 regardless of the per-observation probability, so the
	# log-likelihood is 0. This traps BOTH -Inf*0 and log(0)*0 (= NaN in R),
	# which the per-branch traps previously covered for only two of the four
	# terms.
	lnlike_for_count <- function(per_obs_prob, count)
	{
		if (count == 0)
		{
			return(0)
		}
		return(log(per_obs_prob) * count)
	}
	
	#########################################
	# Hard-coded model parameters
	#########################################
	# Sampling probability:
	# given a single sample, the probability that it is the target species
	Psample_is_the_species_given_presence = mean_frequency
	Psample_is_NOT_species_given_presence = 1-mean_frequency
	Psample_is_the_species_given_absence = 0
	Psample_is_NOT_species_given_absence = 1-Psample_is_the_species_given_absence
	# Identification probability:
	# given an ID, the probability that it is accurate
	P_ID_is_accurate_given_species_is_target = dp
	P_ID_is_NOT_accurate_given_species_is_target = 1-P_ID_is_accurate_given_species_is_target
	P_ID_is_accurate_given_NONtarget_species = 1-fdp
	P_ID_is_NOT_accurate_given_NONtarget_species = 1-P_ID_is_accurate_given_NONtarget_species	# false detection prob
	#########################################
	
	# Count of observations of the other (non-target) species
	obs_NONtarget_species = obs_all_species - obs_target_species
	
	# Calculate P(obs | true_presence)
	if (truly_present == TRUE)
	{
		# Prob of a single detection observation (given presence):
		# true positive + false positive
		p1 = Psample_is_the_species_given_presence * P_ID_is_accurate_given_species_is_target
		p2 = Psample_is_NOT_species_given_presence * P_ID_is_NOT_accurate_given_NONtarget_species
		lnlike_presencedata_given_presence = lnlike_for_count(p1 + p2, obs_target_species)
		
		# Prob of a single non-detection observation (still given presence):
		# true negative + false negative
		p1 = Psample_is_NOT_species_given_presence * P_ID_is_accurate_given_NONtarget_species
		p2 = Psample_is_the_species_given_presence * P_ID_is_NOT_accurate_given_species_is_target
		lnlike_absencedata_given_presence = lnlike_for_count(p1 + p2, obs_NONtarget_species)
		
		return(lnlike_presencedata_given_presence + lnlike_absencedata_given_presence)
	}
	
	# Calculate P(obs | true_absence)
	if (truly_present == FALSE)
	{
		# Prob of a single detection observation (given absence):
		# only false positives are possible
		p1 = Psample_is_the_species_given_absence * P_ID_is_accurate_given_species_is_target
		p2 = Psample_is_NOT_species_given_absence * P_ID_is_NOT_accurate_given_NONtarget_species
		lnlike_presencedata_given_absence = lnlike_for_count(p1 + p2, obs_target_species)
		
		# Prob of a single non-detection observation (given absence):
		# true negative + false negative
		p1 = Psample_is_NOT_species_given_absence * P_ID_is_accurate_given_NONtarget_species
		p2 = Psample_is_the_species_given_absence * P_ID_is_NOT_accurate_given_species_is_target
		lnlike_absencedata_given_absence = lnlike_for_count(p1 + p2, obs_NONtarget_species)
		
		return(lnlike_presencedata_given_absence + lnlike_absencedata_given_absence)
	}
}
#######################################################
# calc_post_prob_presence
#######################################################
#' Calculate posterior probability of presence, given count data and parameters
#'
#' This function calculates P(presence|count data,parameters), i.e. the posterior
#' probability of presence in an area, given data on detection counts and
#' taphonomic control counts, and a detection model with the parameters mean_frequency, \code{dp}, a
#' detection probability (and, optionally, a false detection probability, \code{fdp}).
#'
#' Essentially, this function combines a prior probability, with the likelihood function
#' (coded in \code{\link{calc_obs_like}}) to produce a posterior probability of presence
#' given Bayes' Theorem (Bayes & Price, 1763).
#'
#' The idea of taphonomic controls dates back at least to work of Bottjer & Jablonski (1988). The basic
#' idea is that if you have taxa of roughly similar detectability, then detections of other
#' taxa give some idea of overall detection effort. Obviously this is a very simple
#' model that can be criticized in any number of ways (different alpha diversity in each
#' region, different detectability of individual taxa, etc.), but it is a useful starting
#' point as there has been no implementation of any detection model in historical/phylogenetic
#' biogeography to date.
#'
#' One could imagine (a) every OTU and area has a different count of detections and taphonomic control
#' detections, or (b) the taphonomic control detections are specified by area, and shared across all OTUs.
#' Situation (b) is likely more common, but this function assumes (a) as this is the more thorough case.
#' Behavior (b) could be reproduced by summing each column, and/or copying this sum to all cells for a
#' particular area.
#'
#' @param prior_prob_presence The prior probability of presence, i.e. when no detection or taphonomic control data
#' whatsoever is available. Default is set to 0.01 which expresses my totally uninformed bias that
#' in whatever your data is, your species of interest probably doesn't live in the typical area you are
#' looking at.
#' @param obs_target_species A count of detections of your OTU of interest, e.g. as produced from a cell of the matrix output from \code{\link{read_detections}}.
#' @param obs_all_species A count of detections of your taphonomic controls, e.g. as produced from a cell of the output from \code{\link{read_controls}}.
#' @param mean_frequency This is the proportion of samples from the taphonomic control group that will truly be from this OTU, GIVEN that the OTU is present.
#' This could be estimated, but a decent first guess is (total # samples of OTU of interest / total # of samples in the taphonomic control group where
#' the OTU is known to be present). All that is really needed is some reasonable value, such that more sampling without detection lowers the
#' likelihood of the data on the hypothesis of true presence, and vice versa. This value can only be 1 when the number of detections = the number
#' of taphonomic control detections, for every OTU and area. This is the implicit assumption in e.g. standard historical biogeography analyses in
#' LAGRANGE or BioGeoBEARS.
#' @param dp The detection probability. This is the per-sample probability that you will correctly detect the OTU in question,
#' when you are looking at it. Default is 1, which is the implicit assumption in standard analyses.
#' @param fdp The false detection probability. This is the probability of falsely concluding a detection of the OTU of interest occurred, when in
#' fact the specimen was of something else. The default is 0, which assumes zero error rate,
#' i.e. the assumption being made in all historical biogeography analyses that do not take into account detection
#' probability. These options are being included for completeness, but it may not be wise to try to infer \code{mean_frequency},
#' \code{dp} and \code{fdp} all at once due to identifiability issues (and estimation of fdp may take a very large amount of data). However,
#' fixing some of these parameters to reasonable values can allow the user to effectively include beliefs about the uncertainty of the
#' input data into the analysis, if desired.
#' @param print_progress If not the default (\code{""}), print whatever is in print_progress, followed by a space (for error checking/surveying results).
#' @return \code{post_prob} The posterior probability of presence, given the prior probability, the model parameters, and the data.
#' @export
#' @seealso \code{\link{calc_obs_like}}, \code{\link{mapply_calc_post_prob_presence}}, \code{\link{mapply_calc_obs_like}}
#' @note Go BEARS!
#' @author Nicholas J. Matzke \email{matzke@@berkeley.edu}
#' @references
#' \url{http://phylo.wikidot.com/matzke-2013-international-biogeography-society-poster}
#' \url{http://en.wikipedia.org/wiki/Bayes'_theorem}
#' @bibliography /Dropbox/_njm/__packages/BioGeoBEARS_setup/BioGeoBEARS_refs.bib
#' @cite Matzke_2012_IBS
#' @cite Bottjer_Jablonski_1988
#' @cite Bayes_1763
#' @examples
#'
#' # Calculate posterior probability of presence in an area,
#' # given a dp (detection probability) and detection model.
#'
#' # With zero error rate
#' obs_target_species = 10
#' obs_all_species = 100
#' mean_frequency=0.1
#' dp=1
#' fdp=0
#' prior_prob_presence = 0.01
#' post_prob = calc_post_prob_presence(prior_prob_presence, obs_target_species,
#' obs_all_species, mean_frequency, dp, fdp)
#' post_prob
#' # i.e., with perfect detection, the prob. of presence is 1 under the
#' # hypothesis of presence, and 0 under the hypothesis of absence.
#' # (This is because the likelihood of the data under
#' # presence and absence are ln(prob) = 0 & -Inf, respectively.)
#'
#'
#' # Note that with very high error rates, your conclusion could reverse
#' obs_target_species = 10
#' obs_all_species = 100
#' mean_frequency=0.1
#' dp=0.5
#' fdp=0.3
#' prior_prob_presence = 0.01
#' post_prob = calc_post_prob_presence(prior_prob_presence, obs_target_species,
#' obs_all_species, mean_frequency, dp, fdp)
#' post_prob
#'
#'
#' # With 0 error rate, even 1 observation makes P(presence) = 1
#' obs_target_species = 1
#' obs_all_species = 100
#' mean_frequency=0.1
#' dp=1
#' fdp=0
#' prior_prob_presence = 0.01
#' post_prob = calc_post_prob_presence(prior_prob_presence, obs_target_species,
#' obs_all_species, mean_frequency, dp, fdp)
#' post_prob
#'
#'
#' # With a small error rate, there is some small but positive probability of
#' # falsely getting 10 detections; but it may be effectively 0
#' obs_target_species = 10
#' obs_all_species = 100
#' mean_frequency=0.1
#' dp=0.99
#' fdp=0.001
#' prior_prob_presence = 0.01
#' post_prob = calc_post_prob_presence(prior_prob_presence, obs_target_species,
#' obs_all_species, mean_frequency, dp, fdp)
#' post_prob
#'
#'
#' # If you have only 1 detection, and you have 100 taphonomic controls and
#' # a mean_frequency of sampling the OTU of interest of 0.1, then there is
#' # still a very low probability of presence (since, under your model,
#' # you should expect to see about 10 detections, not 1)
#' obs_all_species = 100
#' mean_frequency=0.1
#' dp=0.99
#' fdp=0.001
#' prior_prob_presence = 0.01
#'
#' obs_target_species = 0
#' calc_post_prob_presence(prior_prob_presence, obs_target_species,
#' obs_all_species, mean_frequency, dp, fdp)
#'
#' obs_target_species = 1
#' calc_post_prob_presence(prior_prob_presence, obs_target_species,
#' obs_all_species, mean_frequency, dp, fdp)
#'
#' obs_target_species = 2
#' calc_post_prob_presence(prior_prob_presence, obs_target_species,
#' obs_all_species, mean_frequency, dp, fdp)
#'
#' obs_target_species = 3
#' calc_post_prob_presence(prior_prob_presence, obs_target_species,
#' obs_all_species, mean_frequency, dp, fdp)
#'
#'
#'
#' # Note how quickly this changes if you drop the mean_frequency from 0.1
#' # to 0.01. This means that if you want single detections to count for
#' # a lot, you need either a low mean_frequency which matches the observed
#' # frequency, or an extremely high/perfect detection probability (dp).
#' obs_all_species = 100
#' mean_frequency=0.01
#' dp=0.99
#' fdp=0.001
#' prior_prob_presence = 0.01
#'
#' obs_target_species = 0
#' calc_post_prob_presence(prior_prob_presence, obs_target_species,
#' obs_all_species, mean_frequency, dp, fdp)
#'
#' obs_target_species = 1
#' calc_post_prob_presence(prior_prob_presence, obs_target_species,
#' obs_all_species, mean_frequency, dp, fdp)
#'
#' obs_target_species = 2
#' calc_post_prob_presence(prior_prob_presence, obs_target_species,
#' obs_all_species, mean_frequency, dp, fdp)
#'
#' obs_target_species = 3
#' calc_post_prob_presence(prior_prob_presence, obs_target_species,
#' obs_all_species, mean_frequency, dp, fdp)
#'
#'
#' # Changing mean_frequency from 0.01 to 0.001 actually LOWERS the posterior
#' # probability of presence based on 1 detection, as we have a somewhat
#' # significant false detection rate:
#' obs_target_species = 1
#' obs_all_species = 100
#' mean_frequency=0.001
#' dp=0.99
#' fdp=0.001
#' prior_prob_presence = 0.01
#'
#' obs_target_species = 0
#' calc_post_prob_presence(prior_prob_presence, obs_target_species,
#' obs_all_species, mean_frequency, dp, fdp)
#'
#' obs_target_species = 1
#' calc_post_prob_presence(prior_prob_presence, obs_target_species,
#' obs_all_species, mean_frequency, dp, fdp)
#'
#' obs_target_species = 2
#' calc_post_prob_presence(prior_prob_presence, obs_target_species,
#' obs_all_species, mean_frequency, dp, fdp)
#'
#' obs_target_species = 3
#' calc_post_prob_presence(prior_prob_presence, obs_target_species,
#' obs_all_species, mean_frequency, dp, fdp)
#'
#'
#' # Change false detection probability to a much lower value
#' obs_all_species = 100
#' mean_frequency=0.001
#' dp=0.99
#' fdp=0.00001
#' prior_prob_presence = 0.01
#'
#' obs_target_species = 0
#' calc_post_prob_presence(prior_prob_presence, obs_target_species,
#' obs_all_species, mean_frequency, dp, fdp)
#'
#' obs_target_species = 1
#' calc_post_prob_presence(prior_prob_presence, obs_target_species,
#' obs_all_species, mean_frequency, dp, fdp)
#'
#' obs_target_species = 2
#' calc_post_prob_presence(prior_prob_presence, obs_target_species,
#' obs_all_species, mean_frequency, dp, fdp)
#'
#' obs_target_species = 3
#' calc_post_prob_presence(prior_prob_presence, obs_target_species,
#' obs_all_species, mean_frequency, dp, fdp)
#'
#'
#'
#'
#' # Change false detection probability to 0
#' obs_all_species = 100
#' mean_frequency=0.001
#' dp=0.99
#' fdp=0.0
#' prior_prob_presence = 0.01
#'
#'
#' obs_target_species = 0
#' calc_post_prob_presence(prior_prob_presence, obs_target_species,
#' obs_all_species, mean_frequency, dp, fdp)
#'
#' obs_target_species = 1
#' calc_post_prob_presence(prior_prob_presence, obs_target_species,
#' obs_all_species, mean_frequency, dp, fdp)
#'
#' obs_target_species = 2
#' calc_post_prob_presence(prior_prob_presence, obs_target_species,
#' obs_all_species, mean_frequency, dp, fdp)
#'
#' obs_target_species = 3
#' calc_post_prob_presence(prior_prob_presence, obs_target_species,
#' obs_all_species, mean_frequency, dp, fdp)
#'
#'
#'
#'
#'
#'
#'
#'
#'
#' # Change mean_frequency to 0.001
#' obs_all_species = 100
#' mean_frequency=0.001
#' dp=0.99
#' fdp=0.0
#' prior_prob_presence = 0.01
#'
#' obs_target_species = 0
#' calc_post_prob_presence(prior_prob_presence, obs_target_species,
#' obs_all_species, mean_frequency, dp, fdp)
#'
#' obs_target_species = 1
#' calc_post_prob_presence(prior_prob_presence, obs_target_species,
#' obs_all_species, mean_frequency, dp, fdp)
#'
#' obs_target_species = 2
#' calc_post_prob_presence(prior_prob_presence, obs_target_species,
#' obs_all_species, mean_frequency, dp, fdp)
#'
#' obs_target_species = 3
#' calc_post_prob_presence(prior_prob_presence, obs_target_species,
#' obs_all_species, mean_frequency, dp, fdp)
#'
#'
#'
#'
#'
#'
#'
#' # Example #2 -- what if you have ZERO detections, but lots of detections
#' # of your taphonomic control?
#' obs_target_species = 0
#' obs_all_species = 100
#' mean_frequency=0.1
#' dp=1
#' fdp=0
#' prior_prob_presence = 0.01
#' post_prob = calc_post_prob_presence(prior_prob_presence, obs_target_species,
#' obs_all_species, mean_frequency, dp, fdp)
#' post_prob
#'
#' # With a slight error rate
#' obs_target_species = 0
#' obs_all_species = 100
#' mean_frequency=0.1
#' dp=0.99
#' fdp=0.001
#' prior_prob_presence = 0.01
#' post_prob = calc_post_prob_presence(prior_prob_presence, obs_target_species,
#' obs_all_species, mean_frequency, dp, fdp)
#' post_prob
#'
#'
#' obs_target_species = 0
#' obs_all_species = 2
#' mean_frequency=0.1
#' dp=1
#' fdp=0
#' prior_prob_presence = 0.01
#' post_prob = calc_post_prob_presence(prior_prob_presence, obs_target_species,
#' obs_all_species, mean_frequency, dp, fdp)
#' post_prob
#'
#' # With a slight error rate
#' obs_target_species = 0
#' obs_all_species = 2
#' mean_frequency=0.1
#' dp=0.99
#' fdp=0.001
#' prior_prob_presence = 0.01
#' post_prob = calc_post_prob_presence(prior_prob_presence, obs_target_species,
#' obs_all_species, mean_frequency, dp, fdp)
#' post_prob
#'
#'
#'
#'
#'
#' # Example #3 -- what if you have ZERO detections, but only a few
#' # detections of your taphonomic control?
#' obs_target_species = 0
#' obs_all_species = 1
#' mean_frequency=0.1
#' dp=1
#' fdp=0
#' prior_prob_presence = 0.01
#' post_prob = calc_post_prob_presence(prior_prob_presence, obs_target_species,
#' obs_all_species, mean_frequency, dp, fdp)
#' post_prob
#'
#' # With a slight error rate
#' obs_target_species = 0
#' obs_all_species = 1
#' mean_frequency=0.1
#' dp=0.99
#' fdp=0.001
#' prior_prob_presence = 0.01
#' post_prob = calc_post_prob_presence(prior_prob_presence, obs_target_species,
#' obs_all_species, mean_frequency, dp, fdp)
#' post_prob
#'
#'
#'
#' # Special cases -- e.g., no data
#' # Prob(data)=1, ln(prob)=0
#' obs_target_species = 0
#' obs_all_species = 0
#' mean_frequency=0.1
#' dp=0.99
#' fdp=0.001
#' prior_prob_presence = 0.01
#' post_prob = calc_post_prob_presence(prior_prob_presence, obs_target_species,
#' obs_all_species, mean_frequency, dp, fdp)
#' post_prob
#'
#' obs_target_species = 0
#' obs_all_species = 0
#' mean_frequency=0.1
#' dp=1
#' fdp=0
#' prior_prob_presence = 0.01
#' post_prob = calc_post_prob_presence(prior_prob_presence, obs_target_species,
#' obs_all_species, mean_frequency, dp, fdp)
#' post_prob
#'
#'
#' # What if, for some reason, you put in identical detections and taphonomic control
#' # counts? (e.g., you load in a standard tipranges file)
#' obs_target_species = 1
#' obs_all_species = 1
#' mean_frequency=1
#' dp=1
#' fdp=0
#' prior_prob_presence = 0.01
#' post_prob = calc_post_prob_presence(prior_prob_presence, obs_target_species,
#' obs_all_species, mean_frequency, dp, fdp)
#' post_prob
#'
#' # What if, for some reason, you put in identical detections and taphonomic control
#' # counts? (e.g., you load in a standard tipranges file)
#' obs_target_species = 1
#' obs_all_species = 1
#' mean_frequency=1
#' dp=0.99
#' fdp=0.001
#' prior_prob_presence = 0.01
#' post_prob = calc_post_prob_presence(prior_prob_presence, obs_target_species,
#' obs_all_species, mean_frequency, dp, fdp)
#' post_prob
#'
#'
calc_post_prob_presence <- function(prior_prob_presence=0.01, obs_target_species, obs_all_species, mean_frequency=0.1, dp=1, fdp=0, print_progress="")
{
	# Calculate P(presence | count data, parameters) via Bayes' Theorem, using
	# calc_obs_like() for the likelihood of the data under true presence and
	# under true absence.
	#
	# prior_prob_presence: prior probability of presence (default 0.01)
	# obs_target_species:  count of detections of the target OTU
	# obs_all_species:     count of taphonomic control detections
	# mean_frequency, dp, fdp: detection-model parameters (see calc_obs_like)
	# print_progress:      if not "", cat() this string plus a space
	#
	# Returns: post_prob, the posterior probability of presence.
	
	# Priors
	prior_prob_absence = 1 - prior_prob_presence
	
	# Log-likelihood of data assuming true presence / true absence
	lnlike_allobs_given_presence = calc_obs_like(truly_present=TRUE, obs_target_species, obs_all_species, mean_frequency=mean_frequency, dp=dp, fdp=fdp)
	lnlike_allobs_given_absence = calc_obs_like(truly_present=FALSE, obs_target_species, obs_all_species, mean_frequency=mean_frequency, dp=dp, fdp=fdp)
	
	# Log of (likelihood * prior) for each hypothesis
	d1 = lnlike_allobs_given_presence + log(prior_prob_presence)
	d2 = lnlike_allobs_given_absence + log(prior_prob_absence)
	numerator = d1
	
	# Denominator ln(exp(d1) + exp(d2)) via the log-sum-exp trick. The naive
	# form log(exp(d1) + exp(d2)) underflows to log(0) = -Inf when both d1 and
	# d2 are very negative (large counts), producing NaN posteriors.
	maxd = max(d1, d2)
	if (is.finite(maxd))
	{
		denominator = maxd + log(exp(d1 - maxd) + exp(d2 - maxd))
	} else {
		# Both terms are -Inf (or maxd is otherwise non-finite); match the
		# naive computation's result
		denominator = maxd
	}
	
	ln_post_prob = numerator - denominator
	post_prob = exp(ln_post_prob)
	
	# Optionally print a progress marker (for error checking/surveying results)
	if (print_progress != "")
	{
		cat(print_progress, " ", sep="")
	}
	
	return(post_prob)
}
#######################################################
# mapply_calc_post_prob_presence
#######################################################
#' Mapply version of calc_post_prob_presence()
#'
#' This function applies \code{\link{calc_post_prob_presence}} to all cells of
#' the input matrices \code{obs_target_species} and \code{obs_all_species}. These matrices
#' obviously must have the same dimensions.
#'
#' The inputs are the same as for \code{\link{calc_post_prob_presence}}, except that
#' \code{obs_target_species} and \code{obs_all_species} can be matrices.
#'
#' @param prior_prob_presence The prior probability of presence, i.e. when no detection or taphonomic control data
#' whatsoever is available. Default is set to 0.01 which expresses my totally uninformed bias that
#' in whatever your data is, your species of interest probably doesn't live in the typical area you are
#' looking at.
#' @param obs_target_species A scalar or column/vector/matrix of detection counts, e.g. as produced from the output from \code{\link{read_detections}}.
#' @param obs_all_species A scalar or column/vector/matrix of detection counts, e.g. as produced from the output from \code{\link{read_controls}}.
#' @param mean_frequency This is the proportion of samples from the taphonomic control group that will truly be from this OTU, GIVEN that the OTU is present.
#' This could be estimated, but a decent first guess is (total # samples of OTU of interest / total # of samples in the taphonomic control group where
#' the OTU is known to be present). All that is really needed is some reasonable value, such that more sampling without detection lowers the
#' likelihood of the data on the hypothesis of true presence, and vice versa. This value can only be 1 when the number of detections = the number
#' of taphonomic control detections, for every OTU and area. This is the implicit assumption in e.g. standard historical biogeography analyses in
#' LAGRANGE or BioGeoBEARS.
#' @param dp The detection probability. This is the per-sample probability that you will correctly detect the OTU in question,
#' when you are looking at it. Default is 1, which is the implicit assumption in standard analyses.
#' @param fdp The false detection probability. This is probability of falsely concluding a detection of the OTU of interest occurred, when in
#' fact the specimen was of something else. The default is 0, which assumes zero error rate,
#' i.e. the assumption being made in all historical biogeography analyses that do not take into account detection
#' probability. These options are being included for completeness, but it may not be wise to try to infer \code{mean_frequency},
#' \code{dp} and \code{fdp} all at once due to identifiability issues (and estimation of fdp may take a very large amount of data). However,
#' fixing some of these parameters to reasonable values can allow the user to effectively include beliefs about the uncertainty of the
#' input data into the analysis, if desired.
#' @param print_progress If not the default (\code{""}), print whatever is in print_progress, followed by a space (for error checking/surveying results).
#' @return \code{pp_df} A matrix of the posterior probability of presence, given the prior probability, the model parameters, and the data.
#' @export
#' @seealso \code{\link{calc_obs_like}}, \code{\link{calc_post_prob_presence}}, \code{\link{mapply_calc_obs_like}}
#' \code{\link{Pdata_given_rangerow}}, \code{\link[base]{mapply}}, \code{\link{tiplikes_wDetectionModel}}
#' @note Go BEARS!
#' @author Nicholas J. Matzke \email{matzke@@berkeley.edu}
#' @references
#' \url{http://phylo.wikidot.com/matzke-2013-international-biogeography-society-poster}
#' \url{http://en.wikipedia.org/wiki/Bayes'_theorem}
#' @bibliography /Dropbox/_njm/__packages/BioGeoBEARS_setup/BioGeoBEARS_refs.bib
#' @cite Matzke_2012_IBS
#' @cite Bottjer_Jablonski_1988
#' @cite Bayes_1763
#' @examples
#'
#' # Calculate posterior probability of presence in an area,
#' # given a dp (detection probability) and detection model.
#'
#' # soft-coded input files
#' extdata_dir = np(system.file("extdata", package="BioGeoBEARS"))
#' detects_fn = np(paste(extdata_dir, "/Psychotria_detections_v1.txt", sep=""))
#' controls_fn = np(paste(extdata_dir, "/Psychotria_controls_v1.txt", sep=""))
#'
#' OTUnames=NULL
#' areanames=NULL
#' tmpskip=0
#'
#' detects_df = read_detections(detects_fn, OTUnames=NULL, areanames=NULL, tmpskip=0)
#' controls_df = read_controls(controls_fn, OTUnames=NULL, areanames=NULL, tmpskip=0)
#'
#' detects_df
#' controls_df
#' detects_df / controls_df
#'
#'
#' # Calculate data likelihoods, and posterior probability of presence=TRUE
#' mean_frequency=0.1
#' dp=1
#' fdp=0
#'
#' mapply_calc_obs_like(truly_present=TRUE, obs_target_species=detects_df,
#' obs_all_species=controls_df, mean_frequency, dp, fdp)
#'
#' mapply_calc_obs_like(truly_present=FALSE, obs_target_species=detects_df,
#' obs_all_species=controls_df, mean_frequency, dp, fdp)
#'
#' mapply_calc_post_prob_presence(prior_prob_presence=0.01,
#' obs_target_species=detects_df,
#' obs_all_species=controls_df, mean_frequency, dp, fdp)
#'
mapply_calc_post_prob_presence <- function(prior_prob_presence=0.01, obs_target_species, obs_all_species, mean_frequency=0.1, dp=1, fdp=0, print_progress="")
{
	# Apply calc_post_prob_presence() to every matching cell of the detection
	# count and taphonomic control count matrices/data.frames. The two inputs
	# must have identical dimensions.
	shared_params = list(prior_prob_presence=prior_prob_presence, mean_frequency=mean_frequency, dp=dp, fdp=fdp)
	flat_post_probs = mapply(FUN=calc_post_prob_presence, obs_target_species=unlist(obs_target_species), obs_all_species=unlist(obs_all_species), MoreArgs=shared_params, USE.NAMES=TRUE)
	
	# Reshape the flat result back into the input's dimensions, convert to a
	# data.frame, and restore the row/column names
	pp_df = adf2(matrix(data=flat_post_probs, nrow=nrow(obs_target_species), ncol=ncol(obs_target_species), byrow=FALSE))
	names(pp_df) = names(obs_target_species)
	rownames(pp_df) = rownames(obs_target_species)
	return(pp_df)
}
#######################################################
# mapply_calc_obs_like
#######################################################
#' Mapply version of calc_obs_like()
#'
#' This function applies \code{\link{calc_obs_like}} to all cells of
#' the input matrices \code{obs_target_species} and \code{obs_all_species}. These matrices
#' obviously must have the same dimensions.
#'
#' The inputs are the same as for \code{\link{calc_obs_like}}, except that
#' \code{obs_target_species} and \code{obs_all_species} can be matrices.
#'
#' @param truly_present Is the OTU of interest known/conditionally assumed to be truly present (\code{TRUE}) or truly absent (\code{FALSE})?
#' @param obs_target_species A scalar or column/vector/matrix of detection counts, e.g. as produced from the output from \code{\link{read_detections}}.
#' @param obs_all_species A scalar or column/vector/matrix of detection counts, e.g. as produced from the output from \code{\link{read_controls}}.
#' @param mean_frequency This is the proportion of samples from the taphonomic control group that will truly be from this OTU, GIVEN that the OTU is present.
#' This could be estimated, but a decent first guess is (total # samples of OTU of interest / total # of samples in the taphonomic control group where
#' the OTU is known to be present). All that is really needed is some reasonable value, such that more sampling without detection lowers the
#' likelihood of the data on the hypothesis of true presence, and vice versa. This value can only be 1 when the number of detections = the number
#' of taphonomic control detections, for every OTU and area. This is the implicit assumption in e.g. standard historical biogeography analyses in
#' LAGRANGE or BioGeoBEARS.
#' @param dp The detection probability. This is the per-sample probability that you will correctly detect the OTU in question,
#' when you are looking at it. Default is 1, which is the implicit assumption in standard analyses.
#' @param fdp The false detection probability. This is probability of falsely concluding a detection of the OTU of interest occurred, when in
#' fact the specimen was of something else. The default is 0, which assumes zero error rate,
#' i.e. the assumption being made in all historical biogeography analyses that do not take into account detection
#' probability. These options are being included for completeness, but it may not be wise to try to infer \code{mean_frequency},
#' \code{dp} and \code{fdp} all at once due to identifiability issues (and estimation of fdp may take a very large amount of data). However,
#' fixing some of these parameters to reasonable values can allow the user to effectively include beliefs about the uncertainty of the
#' input data into the analysis, if desired.
#' @return \code{pp_df} A matrix of the natural log-likelihood of the data, given the model & assumption of true presence or absence.
#' @export
#' @seealso \code{\link{calc_obs_like}}, \code{\link{calc_post_prob_presence}}, \code{\link{mapply_calc_post_prob_presence}},
#' \code{\link{Pdata_given_rangerow}}, \code{\link[base]{mapply}}, \code{\link{tiplikes_wDetectionModel}}
#' @note Go BEARS!
#' @author Nicholas J. Matzke \email{matzke@@berkeley.edu}
#' @references
#' \url{http://phylo.wikidot.com/matzke-2013-international-biogeography-society-poster}
#' \url{http://en.wikipedia.org/wiki/Bayes'_theorem}
#' @bibliography /Dropbox/_njm/__packages/BioGeoBEARS_setup/BioGeoBEARS_refs.bib
#' @cite Matzke_2012_IBS
#' @cite Bottjer_Jablonski_1988
#' @cite Bayes_1763
#' @examples
#' test=1
#' # Calculate likelihood of data, given presence in an area,
#' # given a dp (detection probability) and detection model.
#'
#' # soft-coded input files
#' extdata_dir = np(system.file("extdata", package="BioGeoBEARS"))
#' detects_fn = np(paste(extdata_dir, "/Psychotria_detections_v1.txt", sep=""))
#' controls_fn = np(paste(extdata_dir, "/Psychotria_controls_v1.txt", sep=""))
#'
#' OTUnames=NULL
#' areanames=NULL
#' tmpskip=0
#'
#' detects_df = read_detections(detects_fn, OTUnames=NULL, areanames=NULL, tmpskip=0)
#' controls_df = read_controls(controls_fn, OTUnames=NULL, areanames=NULL, tmpskip=0)
#'
#' detects_df
#' controls_df
#' detects_df / controls_df
#'
#'
#' # Calculate data likelihoods, and posterior probability of presence=TRUE
#' mean_frequency=0.1
#' dp=1
#' fdp=0
#'
#' mapply_calc_obs_like(truly_present=TRUE, obs_target_species=detects_df,
#' obs_all_species=controls_df, mean_frequency, dp, fdp)
#'
#' mapply_calc_obs_like(truly_present=FALSE, obs_target_species=detects_df,
#' obs_all_species=controls_df, mean_frequency, dp, fdp)
#'
#' mapply_calc_post_prob_presence(prior_prob_presence=0.01,
#' obs_target_species=detects_df,
#' obs_all_species=controls_df, mean_frequency, dp, fdp)
#'
mapply_calc_obs_like <- function(truly_present=TRUE, obs_target_species, obs_all_species, mean_frequency=0.1, dp=1, fdp=0)
{
	# Apply calc_obs_like() to every matching cell of the detection count and
	# taphonomic control count matrices/data.frames, returning the natural
	# log-likelihood of the data in each cell given presence/absence. The two
	# inputs must have identical dimensions.
	shared_params = list(truly_present=truly_present, mean_frequency=mean_frequency, dp=dp, fdp=fdp)
	flat_lnls = mapply(FUN=calc_obs_like, obs_target_species=unlist(obs_target_species), obs_all_species=unlist(obs_all_species), MoreArgs=shared_params, USE.NAMES=TRUE)
	
	# Reshape the flat result back into the input's dimensions, convert to a
	# data.frame, and restore the row/column names
	like_df = adf2(matrix(data=flat_lnls, nrow=nrow(obs_target_species), ncol=ncol(obs_target_species), byrow=FALSE))
	names(like_df) = names(obs_target_species)
	rownames(like_df) = rownames(obs_target_species)
	return(like_df)
}
#######################################################
# Pdata_given_range_dp
#######################################################
#' Calculate probability of detection data given a true geographic range and a detection probability
#'
#' This function calculates P(data|range,dp), i.e. the probability of some detection and
#' taphonomic control counts, given the true geographic range/state, and \code{dp}, a
#' detection probability (and, optionally, a false detection probability, \code{fdp}).
#'
#' The idea of taphonomic controls dates back at least to work of Bottjer & Jablonski (1988). The basic
#' idea is that if you have taxa of roughly similar detectability, then detections of other
#' taxa give some idea of overall detection effort. Obviously this is a very simple
#' model that can be criticized in any number of ways (different alpha diversity in each
#' region, different detectability of individual taxa, etc.), but it is a useful starting
#' point as there has been no implementation of any detection model in historical/phylogenetic
#' biogeography to date.
#'
#' One could imagine (a) every OTU and area has a different count of detections and taphonomic control
#' detections, or (b) the taphonomic control detections are specified by area, and shared across all OTUs.
#' Situation (b) is likely more common, but this function assumes (a) as this is the more thorough case.
#' Behavior (b) could be reproduced by summing each column, and/or copying this sum to all cells for a
#' particular area.
#'
#' @param truerange_areas The list of areas (as 0-based numbers) in this geographic range/state.
#' @param numareas The function needs to know the total number of areas in the analysis.
#' @param detects_df A column/vector of detection counts, as produced from a column of the output from \code{\link{read_detections}}.
#' @param controls_df A column/vector of detection counts, as produced from a column of the output from \code{\link{read_controls}}.
#' @param dp The detection probability. This is the per-sample probability that you will detect the OTU in question.
#' In other words, the model assumes that each specimen from the taphonomic control group has a chance of being
#' a representative of the OTU you are looking for. The default is 1, which assumes perfect detection, i.e. the assumption
#' being made in all historical biogeography analyses that do not take into account detection probability. A value of 1
#' will only work when the taphonomic control count equals the detection count; any other data would have likelihood=0.
#' @param fdp The false detection probability. This is probability of falsely concluding a detection occurred, when in
#' fact the specimen was of something else. The default is 0, which assumes zero error rate,
#' i.e. the assumption being made in all historical biogeography analyses that do not take into account detection
#' probability. This option is being included for completeness, but it may not be wise to try to infer both
#' \code{dp} and \code{fdp} at once due to identifiability issues (and estimation of fdp may take a very large amount of data).
#' @param mean_frequency This is the proportion of samples from the taphonomic control group that will truly be from this OTU, GIVEN that the OTU is present.
#' This could be estimated, but a decent first guess is (total # samples of OTU of interest / total # of samples in the taphonomic control group where
#' the OTU is known to be present). All that is really needed is some reasonable value, such that more sampling without detection lowers the
#' likelihood of the data on the hypothesis of true presence, and vice versa. This value can only be 1 when the number of detections = the number
#' of taphonomic control detections, for every OTU and area. This is the implicit assumption in e.g. standard historical biogeography analyses in
#' LAGRANGE or BioGeoBEARS.
#' @return \code{dtf}
#' @export
#' @seealso \code{\link[cladoRcpp]{rcpp_calc_anclikes_sp_COOweights_faster}}
#' @note Go BEARS!
#' @author Nicholas J. Matzke \email{matzke@@berkeley.edu}
#' @references
#' \url{http://phylo.wikidot.com/matzke-2013-international-biogeography-society-poster}
#' @bibliography /Dropbox/_njm/__packages/BioGeoBEARS_setup/BioGeoBEARS_refs.bib
#' @cite Matzke_2012_IBS
#' @cite Bottjer_Jablonski_1988
#' @examples
#' testval=1
#'
Pdata_given_rangerow_dp <- function(truerange_areas, numareas, detects_df, controls_df, mean_frequency=0.1, dp=1, fdp=0)
{
# Calculate, for ONE assumed-true state/geographic range, the log-likelihood
# of the detection data of EVERY OTU (every row of detects_df).
#
# truerange_areas: 0-based indices of the areas occupied in this range
# numareas: total number of areas in the analysis
# detects_df, controls_df: OTUs-by-areas count tables, as from
# read_detections() / read_controls()
# mean_frequency, dp, fdp: detection-model parameters, passed through
# unchanged to calc_obs_like()
#
# Returns: LnLs_of_this_range_for_each_OTU, one LnL per OTU (row)
#
# Likelihood of a state = the probability of the data at ALL areas (occupied
# and not) given the conditionally assumed true presences/absences.
# Build a TRUE/FALSE matrix (one row per OTU) flagging the occupied areas.
range_as_areas_TF = matrix(data=FALSE, nrow=nrow(detects_df), ncol=numareas)
# BUGFIX: index the occupied-area COLUMNS over ALL rows, matching
# tiplikes_wDetectionModel(). The previous form,
# range_as_areas_TF[truerange_areas+1] = TRUE, used linear (column-major)
# indexing, so with >1 OTU it flagged cells near the start of column 1
# instead of the occupied-area columns of every row.
range_as_areas_TF[, truerange_areas+1] = TRUE
# OK, calculate likelihood OF THE DATA ON THAT RANGE, cell-by-cell
LnLs_of_data_in_each_area = mapply(FUN=calc_obs_like, truly_present=range_as_areas_TF, obs_target_species=detects_df,
obs_all_species=controls_df, MoreArgs=list(mean_frequency=mean_frequency, dp=dp, fdp=fdp), USE.NAMES=TRUE)
# LnL of the data at each tip, for this state (per-area LnLs summed per row)
LnLs_of_this_range_for_each_OTU = rowSums(LnLs_of_data_in_each_area)
return(LnLs_of_this_range_for_each_OTU)
}
#######################################################
# Pdata_given_rangerow
#######################################################
#' Calculate probability of detection data given a true geographic range and a detection probability
#'
#' This function calculates P(data|range,dp), i.e. the probability of some detection and
#' taphonomic control counts, given the true geographic range/state, and \code{dp}, a
#' detection probability (and, optionally, a false detection probability, \code{fdp}).
#'
#' The idea of taphonomic controls dates back at least to work of Bottjer & Jablonski (1988). The basic
#' idea is that if you have taxa of roughly similar detectability, then detections of other
#' taxa give some idea of overall detection effort. Obviously this is a very simple
#' model that can be criticized in any number of ways (different alpha diversity in each
#' region, different detectability of individual taxa, etc.), but it is a useful starting
#' point as there has been no implementation of any detection model in historical/phylogenetic
#' biogeography to date.
#'
#' One could imagine (a) every OTU and area has a different count of detections and taphonomic control
#' detections, or (b) the taphonomic control detections are specified by area, and shared across all OTUs.
#' Situation (b) is likely more common, but this function assumes (a) as this is the more thorough case.
#' Behavior (b) could be reproduced by summing each column, and/or copying this sum to all cells for a
#' particular area.
#'
#' @param range_as_areas_TF The list of areas (as TRUE/FALSE) in this geographic range/state.
#' @param detects_df_row A column/vector of detection counts, as produced from a row of the output from \code{\link{read_detections}}.
#' @param controls_df_row A column/vector of detection counts, as produced from a row of the output from \code{\link{read_controls}}.
#' @param mean_frequency This is the proportion of samples from the taphonomic control group that will truly be from this OTU, GIVEN that the OTU is present.
#' This could be estimated, but a decent first guess is (total # samples of OTU of interest / total # of samples in the taphonomic control group where
#' the OTU is known to be present). All that is really needed is some reasonable value, such that more sampling without detection lowers the
#' likelihood of the data on the hypothesis of true presence, and vice versa. This value can only be 1 when the number of detections = the number
#' of taphonomic control detections, for every OTU and area. This is the implicit assumption in e.g. standard historical biogeography analyses in
#' LAGRANGE or BioGeoBEARS.
#' @param dp The detection probability. This is the per-sample probability that you will correctly detect the OTU in question,
#' when you are looking at it. Default is 1, which is the implicit assumption in standard analyses.
#' @param fdp The false detection probability. This is probability of falsely concluding a detection of the OTU of interest occurred, when in
#' fact the specimen was of something else. The default is 0, which assumes zero error rate,
#' i.e. the assumption being made in all historical biogeography analyses that do not take into account detection
#' probability. These options are being included for completeness, but it may not be wise to try to infer \code{mean_frequency},
#' \code{dp} and \code{fdp} all at once due to identifiability issues (and estimation of fdp may take a very large amount of data). However,
#' fixing some of these parameters to reasonable values can allow the user to effectively include beliefs about the uncertainty of the
#' input data into the analysis, if desired.
#' @param return_LnLs If \code{FALSE}, return exp(sum(LnLs of data in each area)), i.e. the likelihood of the data,
#' non-logged. If \code{TRUE} (default), return the sum of the LnLs of the data across the areas.
#' @return \code{likelihood_of_data_given_range} The (non-logged!) likelihood of the data given the input range, and the
#' detection model parameters. If \code{return_LnLs=TRUE}, returns the summed log-likelihood of the data across all areas.
#' @export
#' @seealso \code{\link{calc_obs_like}}, \code{\link[base]{mapply}}, \code{\link{tiplikes_wDetectionModel}}
#' @note Go BEARS!
#' @author Nicholas J. Matzke \email{matzke@@berkeley.edu}
#' @references
#' \url{http://phylo.wikidot.com/matzke-2013-international-biogeography-society-poster}
#' @bibliography /Dropbox/_njm/__packages/BioGeoBEARS_setup/BioGeoBEARS_refs.bib
#' @cite Matzke_2012_IBS
#' @cite Bottjer_Jablonski_1988
#' @examples
#' testval=1
#'
#' # soft-coded input files
#' extdata_dir = np(system.file("extdata", package="BioGeoBEARS"))
#' detects_fn = np(paste(extdata_dir, "/Psychotria_detections_v1.txt", sep=""))
#' controls_fn = np(paste(extdata_dir, "/Psychotria_controls_v1.txt", sep=""))
#'
#' OTUnames=NULL
#' areanames=NULL
#' tmpskip=0
#'
#' detects_df = read_detections(detects_fn, OTUnames=NULL, areanames=NULL, tmpskip=0)
#' controls_df = read_controls(controls_fn, OTUnames=NULL, areanames=NULL, tmpskip=0)
#'
#' detects_df
#' controls_df
#' detects_df / controls_df
#'
#' mean_frequency=0.1
#' dp=1
#' fdp=0
#'
#' mapply_calc_obs_like(truly_present=TRUE, obs_target_species=detects_df,
#' obs_all_species=controls_df, mean_frequency, dp, fdp)
#'
#' mapply_calc_obs_like(truly_present=FALSE, obs_target_species=detects_df,
#' obs_all_species=controls_df, mean_frequency, dp, fdp)
#'
#' mapply_calc_post_prob_presence(prior_prob_presence=0.01,
#' obs_target_species=detects_df,
#' obs_all_species=controls_df, mean_frequency, dp, fdp)
#'
#'
#'
#' # Now, calculate the likelihood of the data given a geographic range
#' numareas = 4
#' tmpranges = list(c(0), c(1), c(0,1))
#' truerange_areas = tmpranges[[3]]
#' truerange_areas
#'
#'
#' # Build a TRUE/FALSE row specifying the ranges in this assumed true
#' # state/geographic range
#' range_as_areas_TF = matrix(data=FALSE, nrow=1, ncol=numareas)
#' range_as_areas_TF[truerange_areas+1] = TRUE
#' range_as_areas_TF
#'
#' detects_df_row = detects_df[1,]
#' controls_df_row = controls_df[1,]
#'
#' # Manual method, superseded by Pdata_given_rangerow():
#' # LnLs_of_data_in_each_area = mapply(FUN=calc_obs_like,
#' # obs_target_species=detects_df_row,
#' # obs_all_species=controls_df_row, truly_present=range_as_areas_TF,
#' # MoreArgs=list(mean_frequency=mean_frequency, dp=dp, fdp=fdp),
#' # USE.NAMES=TRUE)
#'
#'
#' # Calculate data likelihoods on for this geographic range
#' mean_frequency=0.1
#' dp=1
#' fdp=0
#'
#' # Get the likelihood (the probability of the data, given this range)
#' likelihood_of_data_given_range = Pdata_given_rangerow(
#' range_as_areas_TF=range_as_areas_TF,
#' detects_df_row=detects_df_row,
#' controls_df_row=controls_df_row, mean_frequency=mean_frequency, dp=dp, fdp=fdp)
#' likelihood_of_data_given_range
#'
#' # Return the raw log-likelihoods:
#' LnLs_of_data_in_each_area = Pdata_given_rangerow(range_as_areas_TF=range_as_areas_TF,
#' detects_df_row=detects_df_row,
#' controls_df_row=controls_df_row, mean_frequency=mean_frequency, dp=dp, fdp=fdp,
#' return_LnLs=TRUE)
#'
#' detects_df_row
#' controls_df_row
#' LnLs_of_data_in_each_area
#'
#' # The likelihood: the probability of the data in each area:
#' exp(LnLs_of_data_in_each_area)
#'
Pdata_given_rangerow <- function(range_as_areas_TF, detects_df_row, controls_df_row, mean_frequency=0.1, dp=1, fdp=0, return_LnLs=TRUE)
{
# Calculate the likelihood of ONE OTU's detection data, given one
# assumed-true geographic range.
#
# range_as_areas_TF: logical vector (length = number of areas); TRUE for the
# areas occupied in the assumed-true range
# detects_df_row, controls_df_row: one row of the detection / taphonomic
# control count tables, as from read_detections() / read_controls()
# mean_frequency, dp, fdp: detection-model parameters for calc_obs_like()
# return_LnLs: if TRUE (the default), return the SUM of the per-area
# log-likelihoods (a single number); if FALSE, return
# exp(sum(LnLs)), i.e. the non-logged likelihood
#
# (Dead interactive-debugging scaffold removed; it was an unused string.)
#
# Per-area LnL of the observed counts, conditional on presence/absence in
# each area as specified by range_as_areas_TF
LnLs_of_data_in_each_area = mapply(FUN=calc_obs_like, truly_present=range_as_areas_TF, obs_target_species=detects_df_row,
obs_all_species=controls_df_row, MoreArgs=list(mean_frequency=mean_frequency, dp=dp, fdp=fdp), USE.NAMES=TRUE)
# The data in different areas are treated as independent, so the range's
# total LnL is the sum of the per-area LnLs
LnL_range = sum(LnLs_of_data_in_each_area)
if (return_LnLs == FALSE)
{
# Convert back to a regular (non-logged) likelihood; note underflow is
# possible when many areas contribute strongly negative LnLs
likelihood_of_data_given_range = exp(LnL_range)
return(likelihood_of_data_given_range)
}
return(LnL_range)
}
#######################################################
# tiplikes_wDetectionModel
#######################################################
#' Calculate probability of detection data for each OTU at each range in a list of states/geographic ranges
#'
#' This function calculates P(data|range,dp), i.e. the probability of some detection and
#' taphonomic control counts, given the true geographic range/state, and \code{dp}, a
#' detection probability (and, optionally, a false detection probability, \code{fdp}).
#'
#' This function performs the operation for all states/ranges for all tips.
#'
#' The idea of taphonomic controls dates back at least to work of Bottjer & Jablonski (1988). The basic
#' idea is that if you have taxa of roughly similar detectability, then detections of other
#' taxa give some idea of overall detection effort. Obviously this is a very simple
#' model that can be criticized in any number of ways (different alpha diversity in each
#' region, different detectability of individual taxa, etc.), but it is a useful starting
#' point as there has been no implementation of any detection model in historical/phylogenetic
#' biogeography to date.
#'
#' One could imagine (a) every OTU and area has a different count of detections and taphonomic control
#' detections, or (b) the taphonomic control detections are specified by area, and shared across all OTUs.
#' Situation (b) is likely more common, but this function assumes (a) as this is the more thorough case.
#' Behavior (b) could be reproduced by summing each column, and/or copying this sum to all cells for a
#' particular area.
#'
#' @param states_list_0based_index A states_list, 0-based, e.g. from \code{\link[cladoRcpp]{rcpp_areas_list_to_states_list}}.
#' @param phy A phylogeny object is required by default. This ensures that the tip likelihoods that are produced are in the same order as the tips. The
#' user can skip this by setting \code{phy="none"}.
#' @param numareas The number of areas being considered in the analysis. If \code{NULL} (default), this is calculated to be the maximum range length, or
#' one plus the maximum 0-based index in any of the ranges.
#' @param detects_df A matrix/data.frame of detection counts, as produced from the output from \code{\link{read_detections}}.
#' @param controls_df A matrix/data.frame of detection counts, as produced from the output from \code{\link{read_controls}}.
#' @param mean_frequency This is the proportion of samples from the taphonomic control group that will truly be from this OTU, GIVEN that the OTU is present.
#' This could be estimated, but a decent first guess is (total # samples of OTU of interest / total # of samples in the taphonomic control group where
#' the OTU is known to be present). All that is really needed is some reasonable value, such that more sampling without detection lowers the
#' likelihood of the data on the hypothesis of true presence, and vice versa. This value can only be 1 when the number of detections = the number
#' of taphonomic control detections, for every OTU and area. This is the implicit assumption in e.g. standard historical biogeography analyses in
#' LAGRANGE or BioGeoBEARS.
#' @param dp The detection probability. This is the per-sample probability that you will correctly detect the OTU in question,
#' when you are looking at it. Default is 1, which is the implicit assumption in standard analyses.
#' @param fdp The false detection probability. This is probability of falsely concluding a detection of the OTU of interest occurred, when in
#' fact the specimen was of something else. The default is 0, which assumes zero error rate,
#' i.e. the assumption being made in all historical biogeography analyses that do not take into account detection
#' probability. These options are being included for completeness, but it may not be wise to try to infer \code{mean_frequency},
#' \code{dp} and \code{fdp} all at once due to identifiability issues (and estimation of fdp may take a very large amount of data). However,
#' fixing some of these parameters to reasonable values can allow the user to effectively include beliefs about the uncertainty of the
#' input data into the analysis, if desired.
#' @param null_range_gets_0_like If \code{TRUE} (default), then the data is given zero probability on the hypothesis that the
#' range is a null range (i.e., no areas occupied). This is equivalent to saying that you are sure/are willing to assume that
#' the OTU exists somewhere in your study area, at the timepoint being considered. Null ranges are identified by length=1,
#' containing NULL, NA, "", "_", etc.
#' @param return_LnLs Passed to \code{\link{Pdata_given_rangerow}}. If \code{FALSE}, \code{\link{Pdata_given_rangerow}} returns
#' \code{exp(sum(LnLs of data in each area))}, i.e. the likelihood of the data,
#' non-logged. If \code{TRUE} (default), \code{\link{Pdata_given_rangerow}} returns the summed LnL of the data across areas. \code{FALSE} is
#' handy for examples, \code{TRUE} is safer for large computations/functions where underflow is a possibility.
#' @param relative_LnLs Default \code{TRUE}. If \code{return_LnLs==TRUE}, then \code{relative_LnLs} says whether or not to subtract the maximum LnL from
#' each row of the (logged) tip conditional likelihoods. This effectively sets the state that confers the highest likelihood on the data to
#' LnL=0, which translates to likelihood 1 of the data under that state in normal space (note that this is different than the probability of states;
#' see Felsenstein 2004). This relative scaling should not affect results, but may help avoid underflow.
#' @param exp_LnLs Default \code{TRUE}. If \code{return_LnLs==TRUE}, then, if \code{exp_LnLs==TRUE}, the final tip log-likelihoods are exponentiated to
#' convert back to normal probability space before returning to the user.
#' @return \code{tip_condlikes_of_data_on_each_state} The (logged, or non-logged!) likelihood of the data for each tip, given each possible range, and the
#' detection model parameters.
#' @param error_check If \code{TRUE} (default), then the tip data conditional likelihoods are summed for each row, and if any row/tip has a total likelihood of
#' 0, an error is thrown. Relevant only if the function is returning regular likelihoods, not log-likelihoods.
#' @export
#' @seealso \code{\link{Pdata_given_rangerow}}, \code{\link{calc_obs_like}}, \code{\link[base]{mapply}}, \code{\link{read_detections}}, \code{\link{read_controls}}
#' @note Go BEARS!
#' @author Nicholas J. Matzke \email{matzke@@berkeley.edu}
#' @references
#' \url{http://phylo.wikidot.com/matzke-2013-international-biogeography-society-poster}
#' @bibliography /Dropbox/_njm/__packages/BioGeoBEARS_setup/BioGeoBEARS_refs.bib
#' @cite Matzke_2012_IBS
#' @cite Bottjer_Jablonski_1988
#' @examples
#' testval=1
#'
#' # soft-coded input files
#' extdata_dir = np(system.file("extdata", package="BioGeoBEARS"))
#' detects_fn = np(paste(extdata_dir, "/Psychotria_detections_v1.txt", sep=""))
#' controls_fn = np(paste(extdata_dir, "/Psychotria_controls_v1.txt", sep=""))
#' trfn = np(paste(extdata_dir, "/Psychotria_5.2.newick", sep=""))
#'
#' detects_df = read_detections(detects_fn, OTUnames=NULL, areanames=NULL, tmpskip=0)
#' controls_df = read_controls(controls_fn, OTUnames=NULL, areanames=NULL, tmpskip=0)
#' phy = read.tree(trfn)
#'
#' # Calculate the likelihood of the data at each tip, for each possible geographic range
#' numareas = 4
#' states_list_0based_index = rcpp_areas_list_to_states_list(areas=LETTERS[1:numareas], maxareas=numareas, include_null_range=TRUE)
#'
#' mean_frequency=0.1
#' dp=1
#' fdp=0
#'
#' # This is returning the raw, un-scaled, un-logged likelihoods; it is safer to use the log likelihoods
#' # (from which the max is subtracted, and exp(LnL) is performed) in actual code, due to
#' # underflow issues.
#' null_range_gets_0_like=TRUE; return_LnLs=FALSE; relative_LnLs=FALSE; exp_LnLs=FALSE; error_check=TRUE
#'
#'
#' tip_condlikes_of_data_on_each_state =
#' tiplikes_wDetectionModel(states_list_0based_index=states_list_0based_index, phy=phy, numareas=numareas,
#' detects_df=detects_df, controls_df=controls_df, mean_frequency=mean_frequency, dp=dp, fdp=fdp,
#' null_range_gets_0_like=TRUE, return_LnLs=FALSE, relative_LnLs=FALSE, exp_LnLs=FALSE, error_check=TRUE)
#'
#' tip_condlikes_of_data_on_each_state
#'
#'
#'
#' # Do it in log space, with scale so that max LnL = 0, max likelihood = 1
#'
#' null_range_gets_0_like=TRUE; return_LnLs=TRUE; relative_LnLs=TRUE; exp_LnLs=TRUE; error_check=TRUE
#'
#' tip_condlikes_of_data_on_each_state_scaled =
#' tiplikes_wDetectionModel(states_list_0based_index=states_list_0based_index, phy=phy, numareas=numareas,
#' detects_df=detects_df, controls_df=controls_df, mean_frequency=mean_frequency, dp=dp, fdp=fdp,
#' null_range_gets_0_like=TRUE, return_LnLs=TRUE, relative_LnLs=TRUE, exp_LnLs=TRUE, error_check=TRUE)
#'
#' tip_condlikes_of_data_on_each_state_scaled
#'
#'
#' tip_condlikes_of_data_on_each_state_scaled
#' tip_condlikes_of_data_on_each_state
#' tip_condlikes_of_data_on_each_state_scaled - tip_condlikes_of_data_on_each_state
#'
#'
tiplikes_wDetectionModel <- function(states_list_0based_index, phy, numareas=NULL, detects_df, controls_df, mean_frequency=0.1, dp=1, fdp=0, null_range_gets_0_like=TRUE, return_LnLs=TRUE, relative_LnLs=TRUE, exp_LnLs=TRUE, error_check=TRUE)
{
# Compute, for every tip (row of detects_df) and every state/geographic range
# in states_list_0based_index, the (log-)likelihood of the observed detection
# and taphonomic-control counts under the detection model (mean_frequency,
# dp, fdp). Per-tip/per-state work is delegated to Pdata_given_rangerow().
# Returns a numtips x numstates matrix, reordered to match phy$tip.label.
#
# Leftover interactive-debugging scaffold: an unused string literal, no effect.
runjunk='
controls_fn = "/Library/Frameworks/R.framework/Versions/2.15/Resources/library/BioGeoBEARS/extdata/Psychotria_controls_v1.txt"
controls_fn = "/Dropbox/_njm/__packages/BioGeoBEARS_setup/inst/extdata/Psychotria_controls_v1.txt"
OTUnames=NULL
areanames=NULL
tmpskip=0
' # end runjunk
# Get the number of areas, if needed: the larger of (biggest range size) and
# (1 + largest 0-based area index seen in any range). NOTE: FUN=max with
# na.rm=TRUE yields -Inf (with a warning) for an all-NA null-range entry;
# the outer max() then ignores it.
if (is.null(numareas))
{
maxlength = max(unlist(lapply(X=states_list_0based_index,FUN=length)), na.rm=TRUE)
maxnum = 1+max(unlist(lapply(X=states_list_0based_index,FUN=max, na.rm=TRUE)), na.rm=TRUE)
numareas = max(c(maxnum, maxlength), na.rm=TRUE)
}
# Likelihood of a state = the probability of data at ALL areas (occupied, and not, in the state/geographic range) given the
# conditionally assumed true presences/absences in the specified range
# Go through the states
#states_list_0based_index = tmpstates
numstates = length(states_list_0based_index)
numtips = nrow(detects_df)
# print("numtips")
# print(numtips)
# print("numstates")
# print(numstates)
# Output: one row per tip, one column per state/range
tip_condlikes_of_data_on_each_state = matrix(data=0, nrow=numtips, ncol=numstates)
for (statenum in 1:numstates)
{
# Get the current state/geographic range (0-based area indices)
truerange_areas = states_list_0based_index[[statenum]]
# Check for null range, if you're supposed to, and the length is 1
# (null ranges are coded as a length-1 NULL, NA, "", or "_")
if ((null_range_gets_0_like==TRUE) && (length(truerange_areas)==1))
{
if ( (truerange_areas=="_") || (is.null(truerange_areas)==TRUE) || (is.na(truerange_areas)==TRUE) || (truerange_areas=="") )
{
# Then you have a null range, and you have decided to give the data 0 likelihood on all states
if (return_LnLs == FALSE)
{
# Regular likelihood space: probability 0
likes_vector = rep(0, times=numtips)
} else {
# In log space, it has to be -Inf
likes_vector = rep(-Inf, times=numtips)
}
tip_condlikes_of_data_on_each_state[, statenum] = likes_vector
# skip to next loop
next()
}
}
# Build a TRUE/FALSE row specifying the ranges in this assumed true
# state/geographic range (+1 converts 0-based area indices to R's 1-based columns)
range_as_areas_TF = matrix(data=FALSE, nrow=nrow(detects_df), ncol=numareas)
range_as_areas_TF[, truerange_areas+1] = TRUE # MUST BE OVER ALL ROWS
range_as_areas_TF
# iterate over the rows: one Pdata_given_rangerow() call per tip, each
# returning a single number (summed LnL, or exp() of it, per return_LnLs)
likes_vector = rep(0, times=numtips)
for (i in 1:nrow(range_as_areas_TF))
{
likes_vector[i] = Pdata_given_rangerow(range_as_areas_TF=range_as_areas_TF[i,], detects_df_row=detects_df[i,],
controls_df_row=controls_df[i,], mean_frequency=mean_frequency, dp=dp, fdp=fdp, return_LnLs=return_LnLs)
}
tip_condlikes_of_data_on_each_state[, statenum] = likes_vector
}
tip_condlikes_of_data_on_each_state
# Post-calculation edits
# Make log-likelihoods relative to the maximum? (I.e., set the maximum to 0?)
if ((return_LnLs==TRUE) && (relative_LnLs==TRUE))
{
tip_condlikes_of_data_on_each_state_LnL = tip_condlikes_of_data_on_each_state
# Row-wise maxima; subtracting a length-numtips vector from the matrix
# recycles down columns (column-major), i.e. subtracts each ROW's max --
# which is exactly the intended per-tip rescaling.
maxlike_each_row = apply(X=tip_condlikes_of_data_on_each_state_LnL, MARGIN=1, max)
tip_condlikes_of_data_on_each_state_LnL = tip_condlikes_of_data_on_each_state_LnL - maxlike_each_row
tip_condlikes_of_data_on_each_state = tip_condlikes_of_data_on_each_state_LnL
}
if ((return_LnLs==TRUE) && (exp_LnLs==TRUE))
{
# Back-transform to regular probability space (after relative scaling, the
# best state per row has likelihood 1)
tip_condlikes_of_data_on_each_state = exp(tip_condlikes_of_data_on_each_state)
# > rowSums(tip_condlikes_of_data_on_each_state)
# [1] 1 1 1 1 1 1 1 1 1 1 1 1 1 64 64 64 64 64 1 1 1 1 1
}
# May be kinda slow, so make optional
if (error_check == TRUE)
{
# Only do, if likelihoods in regular space (the check is meaningless on LnLs)
if ( ((return_LnLs==TRUE) && (exp_LnLs==TRUE)) || (return_LnLs==FALSE) )
{
# Error check: every tip must confer nonzero likelihood on at least one state
sums = rowSums(tip_condlikes_of_data_on_each_state)
tiplike_sums_to_0_TF = sums <= 0
if (sum(tiplike_sums_to_0_TF) > 0)
{
stoptxt = "\nFATAL ERROR in tiplikes_wDetectionModel(): Some tiplikes sum to 0!!!\n"
cat(stoptxt)
print((1:length(tiplike_sums_to_0_TF))[tiplike_sums_to_0_TF])
print(tip_condlikes_of_data_on_each_state[tiplike_sums_to_0_TF, ])
stop(stoptxt)
}
}
}
#######################################################
# MAKE SURE THE TIP LIKELIHOODS ARE IN TREE TIP ORDER!!!!
#######################################################
# Make sure the letter code ranges are in the same order as the
# phylogeny tips
# NOTE(review): this assumes every phy$tip.label appears in
# rownames(detects_df); an unmatched label yields an NA row -- confirm
# upstream that the names agree.
tipranges_df_order = match(phy$tip.label, rownames(detects_df))
tip_condlikes_of_data_on_each_state = tip_condlikes_of_data_on_each_state[tipranges_df_order, ]
return(tip_condlikes_of_data_on_each_state)
}
#######################################################
# prob_of_states_from_prior_prob_areas
#######################################################
#' Calculate probability of detection data for each OTU at each range in a list of states/geographic ranges
#'
#' This function calculates P(data|range,dp), i.e. the probability of some detection and
#' taphonomic control counts, given the true geographic range/state, and \code{dp}, a
#' detection probability (and, optionally, a false detection probability, \code{fdp}).
#'
#' This function performs the operation for all states/ranges for all tips.
#'
#' The idea of taphonomic controls dates back at least to work of Bottjer & Jablonski (1988). The basic
#' idea is that if you have taxa of roughly similar detectability, then detections of other
#' taxa give some idea of overall detection effort. Obviously this is a very simple
#' model that can be criticized in any number of ways (different alpha diversity in each
#' region, different detectability of individual taxa, etc.), but it is a useful starting
#' point as there has been no implementation of any detection model in historical/phylogenetic
#' biogeography to date.
#'
#' One could imagine (a) every OTU and area has a different count of detections and taphonomic control
#' detections, or (b) the taphonomic control detections are specified by area, and shared across all OTUs.
#' Situation (b) is likely more common, but this function assumes (a) as this is the more thorough case.
#' Behavior (b) could be reproduced by summing each column, and/or copying this sum to all cells for a
#' particular area.
#'
#' @param states_list_0based_index A states_list, 0-based, e.g. from \code{\link[cladoRcpp]{rcpp_areas_list_to_states_list}}.
#' @param numareas The number of areas being considered in the analysis. If \code{NULL} (default), this is calculated to be the maximum range length, or
#' one plus the maximum 0-based index in any of the ranges.
#' @param prior_prob_presence The prior probability of presence, i.e. when no detection or taphonomic control data
#' whatsoever is available. Default is set to 0.01 which expresses my totally uninformed bias that
#' in whatever your data is, your species of interest probably doesn't live in the typical area you are
#' looking at.
#' @param null_range_gets_0_prob If \code{TRUE} (default), then the null range is given zero probability.
#' A null range has no areas occupied. This is equivalent to saying that you are sure/are willing to assume that
#' the OTU exists somewhere in your study area, at the timepoint being considered. Null ranges are identified by length=1,
#' containing NULL, NA, "", "_", etc.
#' @param normalize_probs If \code{TRUE}, the probabilities of each range will be normalized so that they sum to 1. Otherwise, they
#' won't.
#' @return \code{prob_of_each_range} The probability of each range, given the prior probability of presence in each area.
#' @export
#' @seealso \code{\link[cladoRcpp]{rcpp_areas_list_to_states_list}}, \code{\link{Pdata_given_rangerow}},
#' \code{\link{calc_obs_like}}, \code{\link[base]{mapply}}, \code{\link{read_detections}}, \code{\link{read_controls}}
#' @note Go BEARS!
#' @author Nicholas J. Matzke \email{matzke@@berkeley.edu}
#' @references
#' \url{http://phylo.wikidot.com/matzke-2013-international-biogeography-society-poster}
#' @bibliography /Dropbox/_njm/__packages/BioGeoBEARS_setup/BioGeoBEARS_refs.bib
#' @cite Matzke_2012_IBS
#' @cite Bottjer_Jablonski_1988
#' @examples
#' testval=1
#'
#' prior_prob_presence = 0.01
#'
#' areas = c("K", "O", "M", "H")
#' numareas = length(areas)
#' states_list_0based_index =
#' rcpp_areas_list_to_states_list(areas=areas, maxareas=4, include_null_range=TRUE)
#' states_list_0based_index
#' numareas = 4
#'
#' mean_frequency=0.1
#' dp=1
#' fdp=0
#'
#' prob_of_states_from_prior_prob_areas(states_list_0based_index, numareas=numareas,
#' prior_prob_presence=prior_prob_presence, null_range_gets_0_prob=TRUE,
#' normalize_probs=TRUE)
#'
#' prob_of_states_from_prior_prob_areas(states_list_0based_index, numareas=numareas,
#' prior_prob_presence=prior_prob_presence, null_range_gets_0_prob=TRUE,
#' normalize_probs=FALSE)
#'
#' prob_of_states_from_prior_prob_areas(states_list_0based_index, numareas=numareas,
#' prior_prob_presence=prior_prob_presence, null_range_gets_0_prob=FALSE,
#' normalize_probs=TRUE)
#'
#' prob_of_states_from_prior_prob_areas(states_list_0based_index, numareas=numareas,
#' prior_prob_presence=prior_prob_presence, null_range_gets_0_prob=FALSE,
#' normalize_probs=FALSE)
#'
prob_of_states_from_prior_prob_areas <- function(states_list_0based_index, numareas=NULL, prior_prob_presence=0.01, null_range_gets_0_prob=TRUE, normalize_probs=TRUE)
{
# Calculate the prior probability of each state/geographic range from a
# per-area prior probability of presence. Areas are treated as independent:
# an occupied area contributes prior_prob_presence, an unoccupied area
# contributes (1 - prior_prob_presence); a range's probability is the product.
#
# states_list_0based_index: list of ranges, each a vector of 0-based area
# indices; a null range is a length-1 NULL, NA, "", or "_"
# numareas: total number of areas; inferred from the states list when NULL
# prior_prob_presence: per-area prior probability of presence
# null_range_gets_0_prob: if TRUE (default), the null range gets probability 0
# normalize_probs: if TRUE (default), rescale the probabilities to sum to 1
#
# Returns: prob_of_each_range, one probability per state
#
# Get the number of areas, if needed: the larger of (biggest range size) and
# (1 + largest 0-based area index in any range)
if (is.null(numareas))
{
maxlength = max(unlist(lapply(X=states_list_0based_index,FUN=length)), na.rm=TRUE)
maxnum = 1+max(unlist(lapply(X=states_list_0based_index,FUN=max, na.rm=TRUE)), na.rm=TRUE)
numareas = max(c(maxnum, maxlength), na.rm=TRUE)
}
numstates = length(states_list_0based_index)
# Occupancy matrix: one row per state, one column per area.
# Null-range rows are flagged with NA across the whole row.
range_as_areas_TF = matrix(data=FALSE, nrow=numstates, ncol=numareas)
for (statenum in seq_len(numstates))
{
# Get the current state/geographic range
truerange_areas = states_list_0based_index[[statenum]]
# Check for the null range, if you're supposed to, and the length is 1
if ((null_range_gets_0_prob==TRUE) && (length(truerange_areas)==1))
{
if ( (truerange_areas=="_") || (is.null(truerange_areas)==TRUE) || (is.na(truerange_areas)==TRUE) || (truerange_areas=="") )
{
# Null range: flag the row with NA; see below for how this becomes prob 0
range_as_areas_TF[statenum, ] = NA
next()
}
}
# Mark the occupied areas (+1 converts 0-based indices to 1-based columns)
range_as_areas_TF[statenum, truerange_areas+1] = TRUE
}
# Per-cell probabilities. The NA cells of null-range rows select nothing in
# these logical subassignments (an NA logical index is skipped when the
# replacement value has length 1), so those rows keep their initial 0s --
# giving rowSums(log(0)) = -Inf and thus probability 0 after exp().
# (The old post-hoc "set NA to 0" line ran after the matrix's last use and
# had no effect; it has been removed.)
prob_of_TF = matrix(data=0, nrow=numstates, ncol=numareas)
prob_of_TF[range_as_areas_TF == TRUE] = prior_prob_presence
prob_of_TF[range_as_areas_TF == FALSE] = 1-prior_prob_presence
# Probability of each range = product of its per-area probabilities,
# computed in log space to reduce underflow
prob_of_each_range = exp(rowSums(log(prob_of_TF)))
# Optionally normalize so the state probabilities sum to 1
if (normalize_probs == TRUE)
{
prob_of_each_range = prob_of_each_range / sum(prob_of_each_range)
}
return(prob_of_each_range)
}
#######################################################
# post_prob_states
#######################################################
#' Calculate posterior probability of each states/geographic ranges, given prior probabilities and data likelihoods
#'
#' This function calculates P(range|data,detection model), i.e. the probability of each possible
#' range, given a prior probability of each range, and the likelihood of each range.
#'
#' The prior probability of each range should be considered by the user. Note that putting the same prior on
#' the probability of occurrence in each individual range does NOT mean a flat prior on each
#' state/geographic range. This fact is demonstrated in the function \code{\link{prob_of_states_from_prior_prob_areas}}.
#'
#' @param prob_of_each_range The probability of each range, given the prior probability of presence in each area.
#' @param condlikes_of_data_on_each_range The probability of the data, conditional on each range (i.e., the likelihood), as
#' found in e.g. a row of the output from \code{\link{tiplikes_wDetectionModel}}.
#'
#' @return \code{posterior_probs} The posterior probability of each range.
#' @export
#' @seealso \code{\link{prob_of_states_from_prior_prob_areas}}, \code{\link{tiplikes_wDetectionModel}},
#' \code{\link[cladoRcpp]{rcpp_areas_list_to_states_list}}, \code{\link{Pdata_given_rangerow}},
#' \code{\link{calc_obs_like}}, \code{\link[base]{mapply}}, \code{\link{read_detections}}, \code{\link{read_controls}}
#' @note Go BEARS!
#' @author Nicholas J. Matzke \email{matzke@@berkeley.edu}
#' @references
#' \url{http://phylo.wikidot.com/matzke-2013-international-biogeography-society-poster}
#' \url{http://en.wikipedia.org/wiki/Log_probability}
#' @bibliography /Dropbox/_njm/__packages/BioGeoBEARS_setup/BioGeoBEARS_refs.bib
#' @cite Matzke_2012_IBS
#' @cite Bottjer_Jablonski_1988
#' @examples
#' testval=1
#'
#' # soft-coded input files
#' extdata_dir = np(system.file("extdata", package="BioGeoBEARS"))
#' detects_fn = np(paste(extdata_dir, "/Psychotria_detections_v1.txt", sep=""))
#' controls_fn = np(paste(extdata_dir, "/Psychotria_controls_v1.txt", sep=""))
#'
#' detects_df = read_detections(detects_fn, OTUnames=NULL, areanames=NULL, tmpskip=0)
#' controls_df = read_controls(controls_fn, OTUnames=NULL, areanames=NULL, tmpskip=0)
#'
#'
#' # Setup
#' prior_prob_presence = 0.01
#' areas = c("K", "O", "M", "H")
#' numareas = length(areas)
#' maxareas = length(areas)
#' states_list_0based_index =
#' rcpp_areas_list_to_states_list(areas=areas, maxareas=maxareas,
#' include_null_range=TRUE)
#' states_list_0based_index
#'
#' mean_frequency=0.1
#' dp=1
#' fdp=0
#'
#' trfn = np(paste(extdata_dir, "/Psychotria_5.2.newick", sep=""))
#' phy = read.tree(trfn)
#'
#' # This is returning the raw, un-scaled, un-logged likelihoods; it is safer to use the log likelihoods
#' # (from which the max is subtracted, and exp(LnL) is performed) in actual code, due to
#' # underflow issues.
#' null_range_gets_0_like=TRUE; return_LnLs=FALSE; relative_LnLs=FALSE; exp_LnLs=FALSE; error_check=TRUE
#'
#'
#' tip_condlikes_of_data_on_each_state =
#' tiplikes_wDetectionModel(states_list_0based_index=states_list_0based_index, phy=phy, numareas=numareas,
#' detects_df=detects_df, controls_df=controls_df, mean_frequency=mean_frequency, dp=dp, fdp=fdp,
#' null_range_gets_0_like=TRUE, return_LnLs=FALSE, relative_LnLs=FALSE, exp_LnLs=FALSE, error_check=TRUE)
#'
#' tip_condlikes_of_data_on_each_state
#'
#'
#'
#' # To get denominator, just iterate over all the states
#' # Prior probability
#' prob_of_each_range = prob_of_states_from_prior_prob_areas(states_list_0based_index,
#' numareas=numareas,
#' prior_prob_presence=prior_prob_presence, null_range_gets_0_prob=TRUE,
#' normalize_probs=TRUE)
#'
#' # Likelihoods of the data on each range
#' condlikes_of_data_on_each_range = tip_condlikes_of_data_on_each_state[1,]
#'
#' posterior_probs = post_prob_states(prob_of_each_range,
#' condlikes_of_data_on_each_range)
#' posterior_probs
#'
#' # Should sum to 1
#' sum(posterior_probs)
#'
post_prob_states <- function(prob_of_each_range, condlikes_of_data_on_each_range)
{
  # Bayes' Theorem over the state space:
  #   P(range_i | data) = P(data | range_i) * P(range_i) / P(data)
  # where P(data), the denominator, is the sum over all ranges of
  # prior * likelihood.
  #
  # All products are formed in log space. The original loop computed
  # log(prior * like), whose linear-space product can underflow to 0
  # (giving a spurious -Inf) even when both factors are representable
  # in log space; log(prior) + log(like) avoids that.
  # http://en.wikipedia.org/wiki/Log_probability
  #
  # prob_of_each_range: prior probability of each range.
  # condlikes_of_data_on_each_range: likelihood of the data on each range.
  #
  # Returns: posterior_probs, posterior probability of each range (sums to 1).

  # Log of (prior * likelihood) for each range: these are the log-numerators,
  # and their log-sum is the log-denominator
  log_numerators = log(prob_of_each_range) + log(condlikes_of_data_on_each_range)
  log_denominator = log(sum(exp(log_numerators)))

  # Posterior probabilities; these sum to 1
  posterior_probs = exp(log_numerators - log_denominator)

  return(posterior_probs)
}
#######################################################
# post_prob_states_matrix
#######################################################
#' Calculate posterior probability of each states/geographic ranges, given prior probabilities and data likelihoods
#'
#' This function calculates P(range|data,detection model), i.e. the probability of each possible
#' range, given a prior probability of each range, and the likelihood of each range.
#'
#' The prior probability of each range should be considered by the user. Note that putting the same prior on
#' the probability of occurrence in each individual range does NOT mean a flat prior on each
#' state/geographic range. This fact is demonstrated in the function \code{\link{prob_of_states_from_prior_prob_areas}}.
#'
#' @param prob_of_each_range The probability of each range, given the prior probability of presence in each area.
#' @param tip_condlikes_of_data_on_each_state The probability of the data, conditional on each range (i.e., the likelihood), as
#' found in e.g. a row of the output from \code{\link{tiplikes_wDetectionModel}}.
#' @return \code{posterior_probs} The posterior probability of each range.
#' @export
#' @seealso \code{\link{prob_of_states_from_prior_prob_areas}}, \code{\link{tiplikes_wDetectionModel}},
#' \code{\link[cladoRcpp]{rcpp_areas_list_to_states_list}}, \code{\link{Pdata_given_rangerow}},
#' \code{\link{calc_obs_like}}, \code{\link[base]{mapply}}, \code{\link{read_detections}}, \code{\link{read_controls}}
#' @note Go BEARS!
#' @author Nicholas J. Matzke \email{matzke@@berkeley.edu}
#' @references
#' \url{http://phylo.wikidot.com/matzke-2013-international-biogeography-society-poster}
#' \url{http://en.wikipedia.org/wiki/Log_probability}
#' @bibliography /Dropbox/_njm/__packages/BioGeoBEARS_setup/BioGeoBEARS_refs.bib
#' @cite Matzke_2012_IBS
#' @cite Bottjer_Jablonski_1988
#' @examples
#' testval=1
#'
#' # soft-coded input files
#' extdata_dir = np(system.file("extdata", package="BioGeoBEARS"))
#' #extdata_dir = "/Dropbox/_njm/__packages/BioGeoBEARS_setup/inst/extdata/"
#' detects_fn = np(paste(extdata_dir, "/Psychotria_detections_v1.txt", sep=""))
#' controls_fn = np(paste(extdata_dir, "/Psychotria_controls_v1.txt", sep=""))
#'
#' detects_df = read_detections(detects_fn, OTUnames=NULL, areanames=NULL, tmpskip=0)
#' controls_df = read_controls(controls_fn, OTUnames=NULL, areanames=NULL, tmpskip=0)
#'
#'
#' # Setup
#' prior_prob_presence = 0.01
#' areas = c("K", "O", "M", "H")
#' numareas = length(areas)
#' maxareas = length(areas)
#' states_list_0based_index =
#' rcpp_areas_list_to_states_list(areas=areas, maxareas=maxareas,
#' include_null_range=TRUE)
#' states_list_0based_index
#'
#' mean_frequency=0.1
#' dp=1
#' fdp=0
#'
#' trfn = np(paste(extdata_dir, "/Psychotria_5.2.newick", sep=""))
#' phy = read.tree(trfn)
#'
#' # This is returning the raw, un-scaled, un-logged likelihoods; it is safer to use the log likelihoods
#' # (from which the max is subtracted, and exp(LnL) is performed) in actual code, due to
#' # underflow issues.
#' null_range_gets_0_like=TRUE; return_LnLs=FALSE; relative_LnLs=FALSE; exp_LnLs=FALSE; error_check=TRUE
#'
#'
#' tip_condlikes_of_data_on_each_state =
#' tiplikes_wDetectionModel(states_list_0based_index=states_list_0based_index, phy=phy, numareas=numareas,
#' detects_df=detects_df, controls_df=controls_df, mean_frequency=mean_frequency, dp=dp, fdp=fdp,
#' null_range_gets_0_like=TRUE, return_LnLs=FALSE, relative_LnLs=FALSE, exp_LnLs=FALSE, error_check=TRUE)
#'
#' tip_condlikes_of_data_on_each_state
#'
#'
#'
#' # To get denominator, just iterate over all the states
#' # Prior probability
#' prob_of_each_range = prob_of_states_from_prior_prob_areas(states_list_0based_index,
#' numareas=numareas,
#' prior_prob_presence=prior_prob_presence, null_range_gets_0_prob=TRUE,
#' normalize_probs=TRUE)
#'
#' posterior_probs_matrix = post_prob_states_matrix(prob_of_each_range,
#' tip_condlikes_of_data_on_each_state)
#' posterior_probs_matrix
#'
#' # Should sum to 1
#' rowSums(posterior_probs_matrix)
#'
#' # How does posterior probability correlate with likelihood and prior probability?
#' par(mfrow=c(1,2))
#' plot(x=jitter(log(tip_condlikes_of_data_on_each_state)),
#' y=jitter(log(posterior_probs_matrix)))
#' title("Correlation of data likelihoods\nand posterior probabilities")
#'
#' prob_of_each_range_matrix = matrix(data=prob_of_each_range,
#' nrow=nrow(posterior_probs_matrix), ncol=length(prob_of_each_range))
#' plot(x=jitter(log(prob_of_each_range_matrix)),
#' y=jitter(log(posterior_probs_matrix)))
#' title("Correlation of prior probability\nand posterior probabilities")
#'
post_prob_states_matrix <- function(prob_of_each_range, tip_condlikes_of_data_on_each_state)
{
  # Bayes' Theorem applied row-by-row: each row of
  # tip_condlikes_of_data_on_each_state holds the likelihood of one tip's
  # data conditional on each possible range. For each tip,
  #   P(range_i | data) = P(data | range_i) * P(range_i) / P(data).
  #
  # All products are formed in log space. The original inner loop computed
  # log(prior * like), whose linear-space product can underflow to 0
  # (giving a spurious -Inf); log(prior) + log(like) avoids that.
  # http://en.wikipedia.org/wiki/Log_probability
  #
  # prob_of_each_range: prior probability of each range.
  # tip_condlikes_of_data_on_each_state: matrix (tips x states) of likelihoods.
  #
  # Returns: posterior_probs_matrix, matrix (tips x states); rows sum to 1.

  numtips = nrow(tip_condlikes_of_data_on_each_state)
  posterior_probs_matrix = matrix(NA, nrow=numtips, ncol=ncol(tip_condlikes_of_data_on_each_state))

  # Log-priors are the same for every tip; compute once outside the loop
  log_priors = log(prob_of_each_range)

  for (tipnum in seq_len(numtips))
    {
    # log(prior * likelihood) per range, for this tip
    log_numerators = log_priors + log(tip_condlikes_of_data_on_each_state[tipnum, ])

    # log P(data) for this tip (denominator of Bayes' Theorem)
    log_denominator = log(sum(exp(log_numerators)))

    # Posterior probabilities for this tip; they sum to 1
    posterior_probs_matrix[tipnum, ] = exp(log_numerators - log_denominator)
    }

  return(posterior_probs_matrix)
}
# add_probs_as_logs <- function(lnP_list)
# {
#
# # Complex:
# # num_lnPs_to_add = length(lnP_list)
# # sum_lnPs = lnP_list[1]
# # for (i in 2:num_lnPs_to_add)
# # {
# # lnP = lnP_list[i]
# #
# # # Sum probabilities in log space
# # # http://en.wikipedia.org/wiki/Log_probability
# # #sum_lnPs = -1 * log( exp(-sum_lnPs) + exp(-lnP) )
# #
# # # Simpler:
# # sum_lnPs = 1 * log( exp(sum_lnPs) + exp(lnP) )
# # }
#
# # Simple:
# sum_lnPs = log(sum(exp(lnP_list)))
#
# return(sum_lnPs)
# }
#' Get the size distribution of range sizes from tipranges
#'
#' This function gets the size distribution of range sizes
#' from a tipranges \code{data.frame}, e.g. from \code{tipranges@df}.
#'
#' Some range sizes will be unobserved. If include_null_range=TRUE, the
#' function will put in 1s for null_range (0 areas). Other unobserved sizes
#' will copy the count value above for intermediate range sizes, or will
#' get a count of 1 for range sizes with no larger observed range sizes.
#'
#' This function can be used to set up a rough prior on
#' range sizes, in cases where the frequency of different range
#' sizes in the state space is substantially different from the
#' observed distribution of range sizes. (This is very frequently
#' the case, and can cause a major bias towards inferring
#' widespread tips and ancestors in imperfect detection models.)
#'
#' @param dtf A data.frame, e.g. from tipranges@df
#' @param max_range_size The maximum allowed range size.
#' @param include_null_range Should the null range be included in the
#' state space? Default is \code{TRUE}.
#' @return res, a list object containing \code{res$observed_range_sizes},
#' the observed counts in each allowed range size (the result of
#' \code{table(rowSums(dfnums_to_numeric(dtf)))}); and \code{res$new_weighting_by_range_size}
#' the modified list of counts by range size, modified as described above.
#' @export
#' @author Nicholas J. Matzke \email{matzke@@berkeley.edu}
#' @examples
#' test=1
#' living_tipranges = structure(list(N = c("0", "0", "0", "0", "1", "1", "0", "1",
#' "0", "0", "0", "0", "0", "0", "0", "0", "0", "0", "0", "0", "0",
#' "0", "1", "1", "1", "0", "0", "1", "1", "1", "1", "0", "0", "0",
#' "0", "0", "0"), S = c("1", "0", "0", "0", "0", "0", "0", "0",
#' "0", "1", "1", "0", "1", "1", "0", "0", "0", "1", "1", "1", "1",
#' "1", "1", "1", "0", "0", "0", "0", "0", "0", "0", "0", "0", "0",
#' "0", "0", "0"), R = c("0", "0", "0", "1", "0", "0", "0", "0",
#' "0", "0", "0", "1", "0", "0", "0", "1", "0", "0", "0", "0", "0",
#' "0", "0", "0", "0", "1", "1", "1", "0", "0", "1", "0", "0", "0",
#' "0", "0", "0"), E = c("0", "0", "0", "1", "0", "0", "0", "0",
#' "0", "0", "0", "1", "0", "0", "0", "0", "0", "0", "0", "0", "0",
#' "0", "0", "0", "0", "0", "0", "1", "0", "0", "1", "0", "0", "0",
#' "1", "0", "1"), M = c("0", "0", "0", "1", "0", "0", "1", "0",
#' "1", "0", "0", "0", "0", "0", "1", "0", "1", "0", "0", "0", "0",
#' "0", "0", "0", "0", "0", "0", "0", "0", "0", "1", "1", "1", "0",
#' "1", "0", "1"), F = c("0", "1", "1", "1", "0", "0", "1", "0",
#' "0", "0", "0", "0", "0", "0", "1", "0", "1", "0", "0", "0", "0",
#' "0", "0", "0", "0", "0", "0", "0", "0", "0", "0", "0", "1", "0",
#' "0", "1", "0"), I = c("0", "0", "0", "1", "0", "0", "0", "0",
#' "0", "0", "0", "0", "0", "0", "0", "0", "0", "0", "0", "0", "0",
#' "0", "0", "0", "0", "1", "1", "0", "0", "0", "1", "0", "0", "1",
#' "1", "0", "1")), row.names = c("Atelocynus_microtis", "Lupulella_adustus",
#' "Canis_anthus", "Canis_aureus", "Canis_latrans", "Canis_lupus",
#' "Lupulella_mesomelas", "Canis_rufus", "Canis_simensis", "Cerdocyon_thous",
#' "Chrysocyon_brachyurus", "Cuon_alpinus", "Dusicyon_australis",
#' "Lycalopex_fulvipes", "Lycaon_pictus", "Nyctereutes_procyonoides",
#' "Otocyon_megalotis", "Lycalopex_culpaeus", "Lycalopex_griseus",
#' "Lycalopex_gymnocercus", "Lycalopex_sechurae", "Pseudalopex_vetulus",
#' "Speothos_venaticus", "Urocyon_cinereoargenteus", "Urocyon_littoralis",
#' "Vulpes_corsac", "Vulpes_ferrilata", "Vulpes_lagopus", "Vulpes_macrotis",
#' "Vulpes_velox", "Vulpes_vulpes", "Vulpes_zerda", "Vulpes_pallida",
#' "Vulpes_bengalensis", "Vulpes_cana", "Vulpes_chama", "Vulpes_rueppellii"
#' ), class = "data.frame")
#' dtf = living_tipranges
#' max_range_size = 7
#' include_null_range=TRUE
#' get_distribution_of_range_sizes_in_tipranges(dtf=living_tipranges, max_range_size=max_range_size, include_null_range=TRUE)
#'
get_distribution_of_range_sizes_in_tipranges <- function(dtf, max_range_size, include_null_range=TRUE)
{
  # Tabulate the distribution of range sizes (number of occupied areas per
  # taxon) observed in a tipranges data.frame, then fill in counts for the
  # unobserved range sizes: the null range (size 0) gets a pseudocount of 1;
  # unobserved sizes larger than any observed size get 1; other unobserved
  # sizes copy the count of the next-larger size.
  #
  # dtf: a data.frame (e.g. tipranges@df) of 0/1 presences, or a tipranges
  #   object (the @df slot is extracted).
  # max_range_size: maximum allowed range size.
  # include_null_range: if TRUE, range size 0 is included.
  #
  # Returns: res, a list with res$observed_range_sizes (raw table of observed
  # range-size counts) and res$new_weighting_by_range_size (filled-in counts,
  # named by range size).

  # Accept a tipranges object directly
  if (inherits(dtf, "tipranges"))
    {
    dtf = dtf@df
    }

  # Allowed range sizes (0 = null range, if included)
  if (include_null_range == TRUE)
    {
    range_sizes = 0:max_range_size
    } else {
    range_sizes = 1:max_range_size
    }

  # Observed counts per range size; a row's range size is the number of
  # areas it occupies. NOTE: dfnums_to_numeric() is a package-internal
  # helper that converts character 0/1 columns to numeric.
  observed_range_sizes = table(rowSums(dfnums_to_numeric(dtf)))

  # One count per allowed range size; sizes absent from the table get 0
  # (vectorized lookup instead of growing a vector in a loop)
  new_observed_range_sizes = as.numeric(observed_range_sizes[as.character(range_sizes)])
  new_observed_range_sizes[is.na(new_observed_range_sizes)] = 0
  names(new_observed_range_sizes) = range_sizes

  # The null range (size 0) is never observed directly; give it a
  # pseudocount of 1 so it is not weighted at 0
  if ((include_null_range == TRUE) && (new_observed_range_sizes["0"] == 0))
    {
    new_observed_range_sizes["0"] = 1
    }

  # Counting down from the largest range size: the largest unobserved sizes
  # get 1, and any other 0 copies the value of the next-larger size
  last_value_input = 1
  for (i in rev(seq_along(range_sizes)))
    {
    if (new_observed_range_sizes[i] == 0)
      {
      new_observed_range_sizes[i] = last_value_input
      }
    last_value_input = new_observed_range_sizes[i]
    }

  # Assemble results
  res = NULL
  res$observed_range_sizes = observed_range_sizes
  res$new_weighting_by_range_size = new_observed_range_sizes
  return(res)
}
# res$numareas_per_state = numareas_per_state
# res$numstates_per_numareas = numstates_per_numareas
#' Get the size distribution of range sizes in a states_list
#'
#' This function gets the size distribution of range sizes
#' from a \code{states_list}, e.g. from
#' \code{\link[cladoRcpp]{rcpp_areas_list_to_states_list}}.
#'
#' This function can be used to help set up a rough prior on
#' range sizes, in cases where the frequency of different range
#' sizes in the state space is substantially different from the
#' observed distribution of range sizes. (This is very frequently
#' the case, and can cause a major bias towards inferring
#' widespread tips and ancestors in imperfect detection models.)
#'
#' @param states_list_0based A list of states, where each
#' list item is a vector of 0-based area numbers.
#' @return res, a list object containing (1) \code{res$numareas_per_state},
#' the number of areas (the range size) of each state/range; and
#' (2) \code{res$numstates_per_numareas}, the observed counts of
#' each range size in the states_list
#' @export
#' @author Nicholas J. Matzke \email{matzke@@berkeley.edu}
#' @examples
#' test=1
#' living_tipranges = structure(list(N = c("0", "0", "0", "0", "1", "1", "0", "1",
#' "0", "0", "0", "0", "0", "0", "0", "0", "0", "0", "0", "0", "0",
#' "0", "1", "1", "1", "0", "0", "1", "1", "1", "1", "0", "0", "0",
#' "0", "0", "0"), S = c("1", "0", "0", "0", "0", "0", "0", "0",
#' "0", "1", "1", "0", "1", "1", "0", "0", "0", "1", "1", "1", "1",
#' "1", "1", "1", "0", "0", "0", "0", "0", "0", "0", "0", "0", "0",
#' "0", "0", "0"), R = c("0", "0", "0", "1", "0", "0", "0", "0",
#' "0", "0", "0", "1", "0", "0", "0", "1", "0", "0", "0", "0", "0",
#' "0", "0", "0", "0", "1", "1", "1", "0", "0", "1", "0", "0", "0",
#' "0", "0", "0"), E = c("0", "0", "0", "1", "0", "0", "0", "0",
#' "0", "0", "0", "1", "0", "0", "0", "0", "0", "0", "0", "0", "0",
#' "0", "0", "0", "0", "0", "0", "1", "0", "0", "1", "0", "0", "0",
#' "1", "0", "1"), M = c("0", "0", "0", "1", "0", "0", "1", "0",
#' "1", "0", "0", "0", "0", "0", "1", "0", "1", "0", "0", "0", "0",
#' "0", "0", "0", "0", "0", "0", "0", "0", "0", "1", "1", "1", "0",
#' "1", "0", "1"), F = c("0", "1", "1", "1", "0", "0", "1", "0",
#' "0", "0", "0", "0", "0", "0", "1", "0", "1", "0", "0", "0", "0",
#' "0", "0", "0", "0", "0", "0", "0", "0", "0", "0", "0", "1", "0",
#' "0", "1", "0"), I = c("0", "0", "0", "1", "0", "0", "0", "0",
#' "0", "0", "0", "0", "0", "0", "0", "0", "0", "0", "0", "0", "0",
#' "0", "0", "0", "0", "1", "1", "0", "0", "0", "1", "0", "0", "1",
#' "1", "0", "1")), row.names = c("Atelocynus_microtis", "Lupulella_adustus",
#' "Canis_anthus", "Canis_aureus", "Canis_latrans", "Canis_lupus",
#' "Lupulella_mesomelas", "Canis_rufus", "Canis_simensis", "Cerdocyon_thous",
#' "Chrysocyon_brachyurus", "Cuon_alpinus", "Dusicyon_australis",
#' "Lycalopex_fulvipes", "Lycaon_pictus", "Nyctereutes_procyonoides",
#' "Otocyon_megalotis", "Lycalopex_culpaeus", "Lycalopex_griseus",
#' "Lycalopex_gymnocercus", "Lycalopex_sechurae", "Pseudalopex_vetulus",
#' "Speothos_venaticus", "Urocyon_cinereoargenteus", "Urocyon_littoralis",
#' "Vulpes_corsac", "Vulpes_ferrilata", "Vulpes_lagopus", "Vulpes_macrotis",
#' "Vulpes_velox", "Vulpes_vulpes", "Vulpes_zerda", "Vulpes_pallida",
#' "Vulpes_bengalensis", "Vulpes_cana", "Vulpes_chama", "Vulpes_rueppellii"
#' ), class = "data.frame")
#' dtf = living_tipranges
#' max_range_size = 7
#' include_null_range=TRUE
#' #areas = getareas_from_tipranges_object(tipranges)
#' areas = names(dtf)
#' states_list_0based = rcpp_areas_list_to_states_list(areas=areas, maxareas=max_range_size, include_null_range=TRUE)
#' tmpres = get_distribution_of_range_sizes_in_states_list(states_list_0based)
#' tmpres
#' names(tmpres)
#'
get_distribution_of_range_sizes_in_states_list <- function(states_list_0based)
{
  # Tabulate the distribution of range sizes (number of areas per state) in a
  # states_list, e.g. from cladoRcpp::rcpp_areas_list_to_states_list().
  #
  # states_list_0based: list of states; each item is a vector of 0-based area
  #   indices. The null range, if present, is conventionally the FIRST state,
  #   stored as NA, "_", or "".
  #
  # Returns: res, a list with res$numareas_per_state (range size of each
  # state) and res$numstates_per_numareas (table of counts per range size).

  # Range size (number of areas) of each state; vapply pins the return type
  numareas_per_state = vapply(X=states_list_0based, FUN=length, FUN.VALUE=integer(1))

  # Correct the range size of the null range (NA, "_", or "") to 0.
  # The length guard keeps the if() condition scalar: the original
  # is.na(states_list_0based[[1]]) errors in modern R when the first state
  # is a multi-area vector. is.na() is tested before the string comparisons
  # so an NA state cannot poison the "||" chain.
  first_state = states_list_0based[[1]]
  if (length(first_state) == 1)
    {
    if (is.na(first_state) || (first_state == "_") || (first_state == ""))
      {
      numareas_per_state[1] = 0
      }
    }

  # Count how many states there are of each range size
  numstates_per_numareas = table(numareas_per_state)

  res = NULL
  res$numareas_per_state = numareas_per_state
  res$numstates_per_numareas = numstates_per_numareas
  return(res)
}
#' Calculate a prior based on the distribution of range sizes
#'
#' This function sets up a rough prior on
#' range sizes, for use in cases where the frequency of different range
#' sizes in the state space is substantially different from the
#' observed distribution of range sizes. This is very frequently
#' the case, and can cause a major bias towards inferring
#' widespread tips and ancestors in imperfect detection models. This
#' prior should correct that bias.
#'
#' To demonstrate how this works:
#'
#' new_weighting_by_range_size: \cr
#'
#' range_size: 0 1 2 3 4 5 6 7 \cr
#' observed_range_size: 1 23 9 3 2 2 1 1 \cr
#' num_allowed_ranges_of_this_size: 1 7 21 ............ 1
#'
#' This suggests that the tip likelihoods, for tips with uncertain ranges, should be
#' multiplied by this prior:
#'
#' default: 1/numranges for all tips, total=1
#'
#' Based on the observed range size counts, the weight or prior on each range:
#'
#' 0: 1 range possible, so it gets prior prob. 1/43, divided among the 1 ranges \cr
#' 1: 7 ranges possible, so they get prior prob. 23/43, divided among the 7 ranges \cr
#' 2: 21 ranges possible, so they get prior prob. 9/43, divided among the 21 ranges \cr
#' ...etc... \cr
#'
#' @param observed_rangesizes_dtf A \code{data.frame} with counts
#' of observed range sizes, e.g. from res$new_weighting_by_range_size
#' returned by \code{\link{get_distribution_of_range_sizes_in_tipranges}}.
#' @param max_range_size The maximum allowed range size.
#' @param include_null_range Should the null range be included in the
#' state space? Default is \code{TRUE}.
#' @param states_list_0based A list of states, where each
#' list item is a vector of 0-based area numbers.
#' @return prior_by_range_size_of_each_state, a vector giving the prior
#' probability of each range size, assuming that the range sizes
#' are distributed roughly like \code{observed_rangesizes_dtf}. The
#' probability is divided up by the range size class (number of
#' ranges in each size class); see the worked example in Details.
#'
#' @export
#' @author Nicholas J. Matzke \email{matzke@@berkeley.edu}
#' @examples
#' test=1
#' living_tipranges = structure(list(N = c("0", "0", "0", "0", "1", "1", "0", "1",
#' "0", "0", "0", "0", "0", "0", "0", "0", "0", "0", "0", "0", "0",
#' "0", "1", "1", "1", "0", "0", "1", "1", "1", "1", "0", "0", "0",
#' "0", "0", "0"), S = c("1", "0", "0", "0", "0", "0", "0", "0",
#' "0", "1", "1", "0", "1", "1", "0", "0", "0", "1", "1", "1", "1",
#' "1", "1", "1", "0", "0", "0", "0", "0", "0", "0", "0", "0", "0",
#' "0", "0", "0"), R = c("0", "0", "0", "1", "0", "0", "0", "0",
#' "0", "0", "0", "1", "0", "0", "0", "1", "0", "0", "0", "0", "0",
#' "0", "0", "0", "0", "1", "1", "1", "0", "0", "1", "0", "0", "0",
#' "0", "0", "0"), E = c("0", "0", "0", "1", "0", "0", "0", "0",
#' "0", "0", "0", "1", "0", "0", "0", "0", "0", "0", "0", "0", "0",
#' "0", "0", "0", "0", "0", "0", "1", "0", "0", "1", "0", "0", "0",
#' "1", "0", "1"), M = c("0", "0", "0", "1", "0", "0", "1", "0",
#' "1", "0", "0", "0", "0", "0", "1", "0", "1", "0", "0", "0", "0",
#' "0", "0", "0", "0", "0", "0", "0", "0", "0", "1", "1", "1", "0",
#' "1", "0", "1"), F = c("0", "1", "1", "1", "0", "0", "1", "0",
#' "0", "0", "0", "0", "0", "0", "1", "0", "1", "0", "0", "0", "0",
#' "0", "0", "0", "0", "0", "0", "0", "0", "0", "0", "0", "1", "0",
#' "0", "1", "0"), I = c("0", "0", "0", "1", "0", "0", "0", "0",
#' "0", "0", "0", "0", "0", "0", "0", "0", "0", "0", "0", "0", "0",
#' "0", "0", "0", "0", "1", "1", "0", "0", "0", "1", "0", "0", "1",
#' "1", "0", "1")), row.names = c("Atelocynus_microtis", "Lupulella_adustus",
#' "Canis_anthus", "Canis_aureus", "Canis_latrans", "Canis_lupus",
#' "Lupulella_mesomelas", "Canis_rufus", "Canis_simensis", "Cerdocyon_thous",
#' "Chrysocyon_brachyurus", "Cuon_alpinus", "Dusicyon_australis",
#' "Lycalopex_fulvipes", "Lycaon_pictus", "Nyctereutes_procyonoides",
#' "Otocyon_megalotis", "Lycalopex_culpaeus", "Lycalopex_griseus",
#' "Lycalopex_gymnocercus", "Lycalopex_sechurae", "Pseudalopex_vetulus",
#' "Speothos_venaticus", "Urocyon_cinereoargenteus", "Urocyon_littoralis",
#' "Vulpes_corsac", "Vulpes_ferrilata", "Vulpes_lagopus", "Vulpes_macrotis",
#' "Vulpes_velox", "Vulpes_vulpes", "Vulpes_zerda", "Vulpes_pallida",
#' "Vulpes_bengalensis", "Vulpes_cana", "Vulpes_chama", "Vulpes_rueppellii"
#' ), class = "data.frame")
#' dtf = living_tipranges
#' observed_rangesizes_dtf = dtf
#' max_range_size = 7
#' include_null_range=TRUE
#' areas = names(dtf)
#' states_list_0based = rcpp_areas_list_to_states_list(areas=areas, maxareas=max_range_size, include_null_range=TRUE)
#' prior_by_range_size_of_each_state = get_rangesize_prior_by_observed_rangesize(observed_rangesizes_dtf=observed_rangesizes_dtf,
#'     max_range_size=max_range_size, include_null_range=include_null_range, states_list_0based=states_list_0based)
#' prior_by_range_size_of_each_state
#' sum(prior_by_range_size_of_each_state)
#'
get_rangesize_prior_by_observed_rangesize <- function(observed_rangesizes_dtf, max_range_size, include_null_range=TRUE, states_list_0based=NULL)
{
# Purpose: build a per-state prior vector that re-weights the state space so
# that the total prior probability of each range SIZE matches the observed
# distribution of tip range sizes, instead of the (usually very different)
# frequency of sizes in the combinatorial state space. Within a size class,
# the probability is split evenly among the states of that size.
#
# Inputs:
#   observed_rangesizes_dtf  data.frame of "0"/"1" presences (taxa x areas),
#                            or a tipranges object (its @df slot is used)
#   max_range_size           maximum allowed range size
#   include_null_range       include the null range in the state space (default TRUE)
#   states_list_0based       optional precomputed states list; generated from
#                            the area names if NULL
# Returns: numeric vector with one prior probability per state; sums to 1.

# Accept a tipranges object directly (inherits() is the idiomatic class test,
# rather than "tipranges" %in% class(x))
if (inherits(observed_rangesizes_dtf, "tipranges"))
	{
	observed_rangesizes_dtf = observed_rangesizes_dtf@df
	}
dtf = observed_rangesizes_dtf
areanames = names(dtf)

# Generate the states_list, if needed
if (is.null(states_list_0based))
	{
	states_list_0based = rcpp_areas_list_to_states_list(areas=areanames, maxareas=max_range_size, include_null_range=include_null_range)
	}
numstates = length(states_list_0based)

# Observed distribution of range sizes across the tips
res = get_distribution_of_range_sizes_in_tipranges(dtf, max_range_size=max_range_size, include_null_range=include_null_range)
observed_range_sizes = res$observed_range_sizes
new_weighting_by_range_size = res$new_weighting_by_range_size

# Distribution of range sizes across the state space
res = get_distribution_of_range_sizes_in_states_list(states_list_0based)
numareas_per_state = res$numareas_per_state
numstates_per_numareas = res$numstates_per_numareas

# Worked example of the weighting logic:
# range_size:                      0  1  2  3 4 5 6 7
# observed_range_size:             1 23  9  3 2 2 1 1
# num_allowed_ranges_of_this_size: 1  7 21  ......... 1
#
# Size class k gets total prior weight observed[k]/sum(observed); that weight
# is then divided evenly among the states of size k. E.g.:
# size 0:  1 state   -> each gets (1/43)  / 1
# size 1:  7 states  -> each gets (23/43) / 7
# size 2: 21 states  -> each gets (9/43)  / 21
# ...etc. This replaces the default flat 1/numstates weighting, correcting the
# bias toward widespread ranges in imperfect-detection models.

# NOTE(review): the block-assignment below assumes states_list_0based is
# ordered by range size (null range first, then all 1-area states, then all
# 2-area states, ...), as produced by rcpp_areas_list_to_states_list(). A
# custom, differently-ordered states list would receive wrong per-state
# priors -- confirm ordering if passing states_list_0based directly.
list_of_numareas_in_state_space = names(numstates_per_numareas)	# character size labels
prior_by_range_size_of_each_state = rep(0, times=numstates)
total_observed_weight = sum(new_weighting_by_range_size)	# loop-invariant; hoisted
statenums_end = 0
for (i in seq_along(list_of_numareas_in_state_space))
	{
	tmp_numareas = list_of_numareas_in_state_space[i]	# character label, used as a name index
	numstates_in_this_numareas = numstates_per_numareas[tmp_numareas]
	statenums_start = statenums_end + 1
	statenums_end = statenums_end + numstates_in_this_numareas
	# Weight of this size class, split evenly among its states
	prior_by_range_size_of_each_state[statenums_start:statenums_end] = (new_weighting_by_range_size[tmp_numareas] / total_observed_weight) * (1/numstates_in_this_numareas)
	}

# The per-state priors sum to 1 across all states
return(prior_by_range_size_of_each_state)
} # END get_rangesize_prior_by_observed_rangesize(observed_rangesizes_dtf, max_range_size, include_null_range=TRUE, states_list_0based=NULL)
# NOTE(review): the two lines below are website-embed boilerplate accidentally
# captured when this file was scraped; commented out so the file parses as R.
# Add the following code to your website.
# For more information on customizing the embed code, read Embedding Snippets.