R/BioGeoBEARS_detection_v1.R


#######################################################
# DETECTION PROBABILITIES
#######################################################

# source("/Dropbox/_njm/__packages/BioGeoBEARS_setup/BioGeoBEARS_detection_v1.R")

require("ape")
require("rexpokit")
require("cladoRcpp")


# Qmat contains the d & e probs
# COO_probs_columnar contains the simulation probs


#######################################################
# read_detections
#######################################################
#' Read a file with detection counts per area
#' 
#' This function reads in a tab-delimited text file containing counts of detections of 
#' each OTU in each region.  These could be from database records or some other source.
#' 
#' @param detects_fn The filename of the detections file.
#' @param OTUnames Default \code{NULL}, in which case the first column of the text file is used as row names/OTU names.
#' @param areanames Default \code{NULL}, in which case the text file column headings are used.
#' @param tmpskip How many lines should be skipped before reading the text file?  Default \code{0}.
#' @param phy An ape phylo object. If included, the rows will be sorted to match the order of tree tip labels.
#' @return \code{dtf} 
#' @export
#' @seealso \code{\link[cladoRcpp]{rcpp_calc_anclikes_sp_COOweights_faster}}
#' @note Go BEARS!
#' @author Nicholas J. Matzke \email{matzke@@berkeley.edu} 
#' @references
#' \url{http://phylo.wikidot.com/matzke-2013-international-biogeography-society-poster}
#' @bibliography /Dropbox/_njm/__packages/BioGeoBEARS_setup/BioGeoBEARS_refs.bib
#'   @cite Matzke_2012_IBS
#'	 @cite Bottjer_Jablonski_1988
#' @examples
#' testval=1
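#' 
#' # A minimal sketch of typical use, assuming the Psychotria example detections
#' # file ships in the package's extdata directory (as in the examples for
#' # mapply_calc_post_prob_presence):
#' extdata_dir = np(system.file("extdata", package="BioGeoBEARS"))
#' detects_fn = np(paste(extdata_dir, "/Psychotria_detections_v1.txt", sep=""))
#' detects_df = read_detections(detects_fn, OTUnames=NULL, areanames=NULL, tmpskip=0)
#' detects_df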
#' 
read_detections <- function(detects_fn, OTUnames=NULL, areanames=NULL, tmpskip=0, phy=NULL)
	{
	runjunk='
	detects_fn = "/Library/Frameworks/R.framework/Versions/2.15/Resources/library/BioGeoBEARS/extdata/Psychotria_detections_v1.txt"
	detects_fn = "/Dropbox/_njm/__packages/BioGeoBEARS_setup/inst/extdata/Psychotria_detections_v1.txt"
	OTUnames=NULL
	areanames=NULL
	tmpskip=0
	'
	if (is.null(OTUnames) == TRUE)
		{
		dtf = read.table(file=detects_fn, header=TRUE, skip=tmpskip, sep="\t", quote="", stringsAsFactors = FALSE, strip.white=TRUE, fill=TRUE, row.names=1)
		dtf
		} else {
		# OTU names supplied by the user; assumes the file itself has no OTU-name column
		dtf = read.table(file=detects_fn, header=TRUE, skip=tmpskip, sep="\t", quote="", stringsAsFactors = FALSE, strip.white=TRUE, fill=TRUE, row.names=OTUnames)
		dtf
		}

	if (is.null(areanames) == TRUE)
		{
		pass=1
		} else {
		names(dtf) = areanames
		}

	# Order tip labels
	if (!is.null(phy))
		{
		tipranges_df_order = match(phy$tip.label, rownames(dtf))
		dtf = dtf[tipranges_df_order, ]
		}
		
	# Standardize and return
	dtf = adf2(dtf)
	dtf = dfnums_to_numeric(dtf)
	
	return(dtf)
	}


#######################################################
# read_controls
#######################################################
#' Read a file with the total number of detections in a taphonomic control
#' 
#' This function reads in a tab-delimited text file containing counts of detections of 
#' the taphonomic controls in each region.  These numbers should always be equal to or
#' larger than the counts in the detections file.
#' 
#' The idea of taphonomic controls dates back at least to the work of Bottjer & Jablonski (1988).  The basic
#' idea is that if you have taxa of roughly similar detectability, then detections of other 
#' taxa give some idea of overall detection effort.  Obviously this is a very simple 
#' model that can be criticized in any number of ways (different alpha diversity in each 
#' region, different detectability of individual taxa, etc.), but it is a useful starting
#' point as there has been no implementation of any detection model in historical/phylogenetic 
#' biogeography to date.
#' 
#' One could imagine (a) every OTU and area has a different count of detections and taphonomic control 
#' detections, or (b) the taphonomic control detections are specified by area, and shared across all OTUs.
#' Situation (b) is likely more common, but this function implements (a).  Behavior (b) 
#' could be reproduced by summing each column, and/or copying this sum to all cells for a 
#' particular area.
#' 
#' @param controls_fn The filename of the file containing the counts of taphonomic control detections.
#' @param OTUnames Default \code{NULL}, in which case the first column of the text file is used as row names/OTU names.
#' @param areanames Default \code{NULL}, in which case the text file column headings are used.
#' @param tmpskip How many lines should be skipped before reading the text file?  Default \code{0}.
#' @param phy An ape phylo object. If included, the rows will be sorted to match the order of tree tip labels.
#' @return \code{dtf} 
#' @export
#' @seealso \code{\link[cladoRcpp]{rcpp_calc_anclikes_sp_COOweights_faster}}
#' @note Go BEARS!
#' @author Nicholas J. Matzke \email{matzke@@berkeley.edu} 
#' @references
#' \url{http://phylo.wikidot.com/matzke-2013-international-biogeography-society-poster}
#' @bibliography /Dropbox/_njm/__packages/BioGeoBEARS_setup/BioGeoBEARS_refs.bib
#'   @cite Matzke_2012_IBS
#'	 @cite Bottjer_Jablonski_1988
#' @examples
#' testval=1
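#' 
#' # A minimal sketch of typical use, assuming the Psychotria example taphonomic
#' # controls file ships in the package's extdata directory (as in the examples
#' # for mapply_calc_post_prob_presence):
#' extdata_dir = np(system.file("extdata", package="BioGeoBEARS"))
#' controls_fn = np(paste(extdata_dir, "/Psychotria_controls_v1.txt", sep=""))
#' controls_df = read_controls(controls_fn, OTUnames=NULL, areanames=NULL, tmpskip=0)
#' controls_df
#' 
#' # Behavior (b) in the Details (taphonomic controls shared across all OTUs)
#' # could be approximated by copying the column sums to every row, e.g.:
#' # controls_df[] = rep(colSums(controls_df), each=nrow(controls_df))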
#' 
read_controls <- function(controls_fn, OTUnames=NULL, areanames=NULL, tmpskip=0, phy=NULL)
	{
	runjunk='
	controls_fn = "/Library/Frameworks/R.framework/Versions/2.15/Resources/library/BioGeoBEARS/extdata/Psychotria_controls_v1.txt"
	controls_fn = "/Dropbox/_njm/__packages/BioGeoBEARS_setup/inst/extdata/Psychotria_controls_v1.txt"
	OTUnames=NULL
	areanames=NULL
	tmpskip=0
	'
	if (is.null(OTUnames) == TRUE)
		{
		dtf = read.table(file=controls_fn, header=TRUE, skip=tmpskip, sep="\t", quote="", stringsAsFactors = FALSE, strip.white=TRUE, fill=TRUE, row.names=1)
		dtf
		} else {
		# OTU names supplied by the user; assumes the file itself has no OTU-name column
		dtf = read.table(file=controls_fn, header=TRUE, skip=tmpskip, sep="\t", quote="", stringsAsFactors = FALSE, strip.white=TRUE, fill=TRUE, row.names=OTUnames)
		dtf
		}

	if (is.null(areanames) == TRUE)
		{
		pass=1
		} else {
		names(dtf) = areanames
		}
	
	# Order tip labels
	if (!is.null(phy))
		{
		tipranges_df_order = match(phy$tip.label, rownames(dtf))
		dtf = dtf[tipranges_df_order, ]
		}
	
	# Standardize and return
	dtf = adf2(dtf)
	dtf = dfnums_to_numeric(dtf)
	
	return(dtf)
	}






# Likelihood model
# Calculate P(obs | presence)
# Source: 
# source("/Users/nickm/Desktop/__projects/___phylokriging_example/__scripts/_ocean_data_predict_v6.R", chdir=TRUE)

#######################################################
# calc_obs_like
#######################################################
#' Calculate likelihood of count data given true presence/absence and parameters
#' 
#' This function calculates P(data|presence,parameters), i.e. the probability of some detection and 
#' taphonomic control counts, given the true geographic range/state, and parameters such as \code{dp}, a 
#' detection probability (and, optionally, a false detection probability, \code{fdp}).
#' 
#' The idea of taphonomic controls dates back at least to the work of Bottjer & Jablonski (1988).  The basic
#' idea is that if you have taxa of roughly similar detectability, then detections of other 
#' taxa give some idea of overall detection effort.  Obviously this is a very simple 
#' model that can be criticized in any number of ways (different alpha diversity in each 
#' region, different detectability of individual taxa, etc.), but it is a useful starting
#' point as there has been no implementation of any detection model in historical/phylogenetic 
#' biogeography to date.
#' 
#' One could imagine (a) every OTU and area has a different count of detections and taphonomic control 
#' detections, or (b) the taphonomic control detections are specified by area, and shared across all OTUs.
#' Situation (b) is likely more common, but this function assumes (a) as this is the more thorough case.
#' Behavior (b) could be reproduced by summing each column, and/or copying this sum to all cells for a 
#' particular area.
#' 
#' @param truly_present Is the OTU of interest known/conditionally assumed to be truly present (\code{TRUE}) or truly absent (\code{FALSE})?
#' @param obs_target_species A count of detections of your OTU of interest, e.g. as produced from a cell of the matrix output from \code{\link{read_detections}}.
#' @param obs_all_species A count of detections of your taphonomic controls, e.g. as produced from a cell of the output from \code{\link{read_controls}}.
#' @param mean_frequency This is the proportion of samples from the taphonomic control group that will truly be from this OTU, GIVEN that the OTU is present.
#' This could be estimated, but a decent first guess is (total # samples of OTU of interest / total # of samples in the taphonomic control group where
#' the OTU is known to be present).  All that is really needed is some reasonable value, such that more sampling without detection lowers the 
#' likelihood of the data on the hypothesis of true presence, and vice versa.  This value can only be 1 when the number of detections = the number 
#' of taphonomic control detections, for every OTU and area.  This is the implicit assumption in e.g. standard historical biogeography analyses in 
#' LAGRANGE or BioGeoBEARS.
#' @param dp The detection probability.  This is the per-sample probability that you will correctly detect the OTU in question, 
#' when you are looking at it.  Default is 1, which is the implicit assumption in standard analyses.
#' @param fdp The false detection probability.  This is the probability of falsely concluding a detection of the OTU of interest occurred, when in 
#' fact the specimen was of something else.  The default is 0, which assumes zero error rate, 
#' i.e. the assumption being made in all historical biogeography analyses that do not take into account detection 
#' probability.  These options are being included for completeness, but it may not be wise to try to infer \code{mean_frequency},  
#' \code{dp} and \code{fdp} all at once due to identifiability issues (and estimation of fdp may take a very large amount of data).  However, 
#' fixing some of these parameters to reasonable values can allow the user to effectively include beliefs about the uncertainty of the 
#' input data into the analysis, if desired.
#' @return The natural log-likelihood of the data, given the model and the assumption of true presence 
#' (\code{lnlike_allobs_given_presence}) or true absence (\code{lnlike_allobs_given_absence}).
#' @export
#' @seealso \code{\link{mapply_calc_post_prob_presence}}, \code{\link{calc_post_prob_presence}}, \code{\link{mapply_calc_obs_like}}
#' @note Go BEARS!
#' @author Nicholas J. Matzke \email{matzke@@berkeley.edu} 
#' @references
#' \url{http://phylo.wikidot.com/matzke-2013-international-biogeography-society-poster}
#' @bibliography /Dropbox/_njm/__packages/BioGeoBEARS_setup/BioGeoBEARS_refs.bib
#'   @cite Matzke_2012_IBS
#'	 @cite Bottjer_Jablonski_1988
#' @examples
#' 
#' # Example: 10 observations of the species mean a dramatically higher likelihood 
#' # of the data on the hypothesis that it is truly present.
#' 
#' # With zero error rate
#' obs_target_species = 10
#' obs_all_species = 100
#' mean_frequency=0.1
#' dp=1
#' fdp=0
#' LnL_under_presence = calc_obs_like(truly_present=TRUE, obs_target_species,
#' obs_all_species, mean_frequency, dp, fdp)
#' LnL_under_absence = calc_obs_like(truly_present=FALSE, obs_target_species,
#' obs_all_species, mean_frequency, dp, fdp)
#' LnL_under_presence
#' LnL_under_absence
#' # Note that the log-probability of getting these detections, under the 
#' # hypothesis of true absence, is -Inf (i.e., probability 0)
#' 
#' 
#' # With a small error rate, there is some small but positive probability of
#' # falsely getting 10 detections
#' obs_target_species = 10
#' obs_all_species = 100
#' mean_frequency=0.1
#' dp=0.99
#' fdp=0.001
#' LnL_under_presence = calc_obs_like(truly_present=TRUE, obs_target_species,
#' obs_all_species, mean_frequency, dp, fdp)
#' LnL_under_absence = calc_obs_like(truly_present=FALSE, obs_target_species,
#' obs_all_species, mean_frequency, dp, fdp)
#' LnL_under_presence
#' LnL_under_absence
#' # i.e. with a small error rate the data have a small but nonzero probability 
#' # under the hypothesis of absence, but presence is still vastly more likely
#' 
#' 
#' # Note that with very high error rates, your conclusion could reverse
#' obs_target_species = 10
#' obs_all_species = 100
#' mean_frequency=0.1
#' dp=0.5
#' fdp=0.3
#' LnL_under_presence = calc_obs_like(truly_present=TRUE, obs_target_species,
#' obs_all_species, mean_frequency, dp, fdp)
#' LnL_under_absence = calc_obs_like(truly_present=FALSE, obs_target_species,
#' obs_all_species, mean_frequency, dp, fdp)
#' LnL_under_presence
#' LnL_under_absence
#' 
#' 
#' # Example #2 -- what if you have ZERO detections, and only a few detections 
#' # of your taphonomic control?
#' obs_target_species = 0
#' obs_all_species = 1
#' mean_frequency=0.1
#' dp=1
#' fdp=0
#' LnL_under_presence = calc_obs_like(truly_present=TRUE, obs_target_species,
#' obs_all_species, mean_frequency, dp, fdp)
#' LnL_under_absence = calc_obs_like(truly_present=FALSE, obs_target_species,
#' obs_all_species, mean_frequency, dp, fdp)
#' LnL_under_presence
#' LnL_under_absence
#' 
#' # With a slight error rate
#' obs_target_species = 0
#' obs_all_species = 1
#' mean_frequency=0.1
#' dp=0.99
#' fdp=0.001
#' LnL_under_presence = calc_obs_like(truly_present=TRUE, obs_target_species,
#' obs_all_species, mean_frequency, dp, fdp)
#' LnL_under_absence = calc_obs_like(truly_present=FALSE, obs_target_species,
#' obs_all_species, mean_frequency, dp, fdp)
#' LnL_under_presence
#' LnL_under_absence
#' 
#' 
#' obs_target_species = 0
#' obs_all_species = 2
#' mean_frequency=0.1
#' dp=1
#' fdp=0
#' LnL_under_presence = calc_obs_like(truly_present=TRUE, obs_target_species,
#' obs_all_species, mean_frequency, dp, fdp)
#' LnL_under_absence = calc_obs_like(truly_present=FALSE, obs_target_species,
#' obs_all_species, mean_frequency, dp, fdp)
#' LnL_under_presence
#' LnL_under_absence
#' 
#' # With a slight error rate
#' obs_target_species = 0
#' obs_all_species = 2
#' mean_frequency=0.1
#' dp=0.99
#' fdp=0.001
#' LnL_under_presence = calc_obs_like(truly_present=TRUE, obs_target_species,
#' obs_all_species, mean_frequency, dp, fdp)
#' LnL_under_absence = calc_obs_like(truly_present=FALSE, obs_target_species,
#' obs_all_species, mean_frequency, dp, fdp)
#' LnL_under_presence
#' LnL_under_absence
#' 
#' 
#' 
#' 
#' 
#' # Example #3 -- what if you have ZERO detections, but lots of 
#' # detections of your taphonomic control?
#' obs_target_species = 0
#' obs_all_species = 100
#' mean_frequency=0.1
#' dp=1
#' fdp=0
#' LnL_under_presence = calc_obs_like(truly_present=TRUE, obs_target_species,
#' obs_all_species, mean_frequency, dp, fdp)
#' LnL_under_absence = calc_obs_like(truly_present=FALSE, obs_target_species,
#' obs_all_species, mean_frequency, dp, fdp)
#' LnL_under_presence
#' LnL_under_absence
#' 
#' # With a slight error rate
#' obs_target_species = 0
#' obs_all_species = 100
#' mean_frequency=0.1
#' dp=0.99
#' fdp=0.001
#' LnL_under_presence = calc_obs_like(truly_present=TRUE, obs_target_species,
#' obs_all_species, mean_frequency, dp, fdp)
#' LnL_under_absence = calc_obs_like(truly_present=FALSE, obs_target_species,
#' obs_all_species, mean_frequency, dp, fdp)
#' LnL_under_presence
#' LnL_under_absence
#' 
#' 
#' 
#' # Special cases -- e.g., no data
#' # Prob(data)=1, ln(prob)=0
#' obs_target_species = 0
#' obs_all_species = 0
#' mean_frequency=0.1
#' dp=0.99
#' fdp=0.001
#' LnL_under_presence = calc_obs_like(truly_present=TRUE, obs_target_species,
#' obs_all_species, mean_frequency, dp, fdp)
#' LnL_under_absence = calc_obs_like(truly_present=FALSE, obs_target_species,
#' obs_all_species, mean_frequency, dp, fdp)
#' LnL_under_presence
#' LnL_under_absence
#' 
#' obs_target_species = 0
#' obs_all_species = 0
#' mean_frequency=0.1
#' dp=1
#' fdp=0
#' LnL_under_presence = calc_obs_like(truly_present=TRUE, obs_target_species,
#' obs_all_species, mean_frequency, dp, fdp)
#' LnL_under_absence = calc_obs_like(truly_present=FALSE, obs_target_species,
#' obs_all_species, mean_frequency, dp, fdp)
#' LnL_under_presence
#' LnL_under_absence
#' 
#' 
#' # What if, for some reason, you put in identical detections and taphonomic control
#' # counts? (e.g., you load in a standard tipranges file)
#' obs_target_species = 1
#' obs_all_species = 1
#' mean_frequency=1
#' dp=1
#' fdp=0
#' LnL_under_presence = calc_obs_like(truly_present=TRUE, obs_target_species,
#' obs_all_species, mean_frequency, dp, fdp)
#' LnL_under_absence = calc_obs_like(truly_present=FALSE, obs_target_species,
#' obs_all_species, mean_frequency, dp, fdp)
#' LnL_under_presence
#' LnL_under_absence
#' 
#' # What if, for some reason, you put in identical detections and taphonomic control
#' # counts? (e.g., you load in a standard tipranges file)
#' obs_target_species = 1
#' obs_all_species = 1
#' mean_frequency=1
#' dp=0.99
#' fdp=0.001
#' LnL_under_presence = calc_obs_like(truly_present=TRUE, obs_target_species,
#' obs_all_species, mean_frequency, dp, fdp)
#' LnL_under_absence = calc_obs_like(truly_present=FALSE, obs_target_species,
#' obs_all_species, mean_frequency, dp, fdp)
#' LnL_under_presence
#' LnL_under_absence
#' 
#' 
calc_obs_like <- function(truly_present=TRUE, obs_target_species, obs_all_species, mean_frequency=0.1, dp=1, fdp=0)
	{
	# Error check -- if you have NO taphonomic controls, you have no information to calculate the likelihood.
	# Therefore just return 1 for any hypothesis (implicit flat prior)
	if (obs_all_species == 0)
		{
		# Probability of these "data" is 1 under any hypothesis; ln(prob=1) is 0
		return(0)
		}
	
	# Hard-coded model parameters
	#########################################
	# Sampling probability
	# Given a single sample, what is the probability that it is the target species?
	Psample_is_the_species_given_presence = mean_frequency
	Psample_is_NOT_species_given_presence = 1-mean_frequency
	Psample_is_the_species_given_absence = 0
	Psample_is_NOT_species_given_absence = 1-Psample_is_the_species_given_absence

	# Identification probability
	# Given an ID, what is the probability that it is accurate?
	P_ID_is_accurate_given_species_is_target = dp
	P_ID_is_NOT_accurate_given_species_is_target = 1-P_ID_is_accurate_given_species_is_target

	P_ID_is_accurate_given_NONtarget_species = 1-fdp
	P_ID_is_NOT_accurate_given_NONtarget_species = 1-P_ID_is_accurate_given_NONtarget_species	# false detection prob
	#########################################
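	
	# In other words, under this model each taphonomic-control sample is scored as
	# a detection of the target OTU with per-sample probability
	#    P(detection | truly present) = mean_frequency*dp + (1-mean_frequency)*fdp
	#    P(detection | truly absent)  = fdp
	# and the log-likelihoods below multiply the log of these per-sample
	# probabilities (or of their complements) by the corresponding
	# detection / non-detection counts.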
	
	
	# Get observations
	
	# count of the observations of the target species
	obs_target_species = obs_target_species
	
	# count of the observations of the other species (non-target)
	obs_NONtarget_species = obs_all_species-obs_target_species
	
	
	# Calculate P(obs | true_presence)
	if (truly_present == TRUE)
		{
		# prob of presence data (given presence in pixel)
		# prob of ways to observe species correctly (true positive)
		p1 = Psample_is_the_species_given_presence * P_ID_is_accurate_given_species_is_target
		# prob of ways to observe species incorrectly (false positive)
		p2 = Psample_is_NOT_species_given_presence * P_ID_is_NOT_accurate_given_NONtarget_species
		
		lnlike_presencedata_given_presence = log(p1 + p2)*obs_target_species
		exp(lnlike_presencedata_given_presence)
	
	
		# prob of absence data (given presence in pixel)
		# prob of ways to observe NOT species correctly (true negative)
		# still given presence
		p1 = Psample_is_NOT_species_given_presence * P_ID_is_accurate_given_NONtarget_species
		# prob of ways to observe NOT species incorrectly (false negative)
		p2 = Psample_is_the_species_given_presence * P_ID_is_NOT_accurate_given_species_is_target
		
		lnlike_absensedata_given_presence = log(p1 + p2)*obs_NONtarget_species
		# -Inf * 0 trap; prob is 1, log prob is 0
		if ( (log(p1 + p2) == -Inf) && obs_NONtarget_species == 0)
			{
			lnlike_absensedata_given_presence = 0
			}
		exp(lnlike_absensedata_given_presence)
		
		lnlike_allobs_given_presence = lnlike_presencedata_given_presence + lnlike_absensedata_given_presence
		exp(lnlike_allobs_given_presence)
		
		return(lnlike_allobs_given_presence)
		}
		
		
	# Calculate P(obs | true_absence)
	if (truly_present == FALSE)
		{
		# prob of ways to observe species correctly (true result, which can't be positive)
		p1 = Psample_is_the_species_given_absence * P_ID_is_accurate_given_species_is_target
		# prob of ways to observe species incorrectly (false result, which might be positive)
		p2 = Psample_is_NOT_species_given_absence * P_ID_is_NOT_accurate_given_NONtarget_species
		
		lnlike_presencedata_given_absence = log(p1 + p2)*obs_target_species
		# -Inf * 0 trap; prob is 1, log prob is 0
		if ( (log(p1 + p2) == -Inf) && obs_target_species == 0)
			{
			lnlike_presencedata_given_absence = 0
			}
		exp(lnlike_presencedata_given_absence)
		
	
		# prob of ways to observe NOT species correctly (true negative)
		# still given absence
		p1 = Psample_is_NOT_species_given_absence * P_ID_is_accurate_given_NONtarget_species
		# prob of ways to observe NOT species incorrectly (false negative)
		p2 = Psample_is_the_species_given_absence * P_ID_is_NOT_accurate_given_species_is_target
		
		lnlike_absensedata_given_absence = log(p1 + p2)*obs_NONtarget_species
		exp(lnlike_absensedata_given_absence)

		
		lnlike_allobs_given_absence = lnlike_presencedata_given_absence + lnlike_absensedata_given_absence
		exp(lnlike_allobs_given_absence)
		
		return(lnlike_allobs_given_absence)
		}
	}




#######################################################
# calc_post_prob_presence
#######################################################
#' Calculate posterior probability of presence, given count data and parameters
#' 
#' This function calculates P(presence|count data,parameters), i.e. the posterior 
#' probability of presence in an area, given data on detection counts and 
#' taphonomic control counts, and a detection model with the parameters mean_frequency, \code{dp}, a 
#' detection probability (and, optionally, a false detection probability, \code{fdp}).
#' 
#' Essentially, this function combines a prior probability, with the likelihood function 
#' (coded in \code{\link{calc_obs_like}}) to produce a posterior probability of presence
#' given Bayes' Theorem (Bayes & Price, 1763).
#' 
#' The idea of taphonomic controls dates back at least to the work of Bottjer & Jablonski (1988).  The basic
#' idea is that if you have taxa of roughly similar detectability, then detections of other 
#' taxa give some idea of overall detection effort.  Obviously this is a very simple 
#' model that can be criticized in any number of ways (different alpha diversity in each 
#' region, different detectability of individual taxa, etc.), but it is a useful starting
#' point as there has been no implementation of any detection model in historical/phylogenetic 
#' biogeography to date.
#' 
#' One could imagine (a) every OTU and area has a different count of detections and taphonomic control 
#' detections, or (b) the taphonomic control detections are specified by area, and shared across all OTUs.
#' Situation (b) is likely more common, but this function assumes (a) as this is the more thorough case.
#' Behavior (b) could be reproduced by summing each column, and/or copying this sum to all cells for a 
#' particular area.
#' 
#' @param prior_prob_presence The prior probability of presence, i.e. when no detection or taphonomic control data 
#' whatsoever are available.  The default is 0.01, which expresses the (totally uninformed) prior bias that, 
#' whatever your dataset is, your species of interest probably does not live in the typical area you are 
#' looking at.
#' @param obs_target_species A count of detections of your OTU of interest, e.g. as produced from a cell of the matrix output from \code{\link{read_detections}}.
#' @param obs_all_species A count of detections of your taphonomic controls, e.g. as produced from a cell of the output from \code{\link{read_controls}}.
#' @param mean_frequency This is the proportion of samples from the taphonomic control group that will truly be from this OTU, GIVEN that the OTU is present.
#' This could be estimated, but a decent first guess is (total # samples of OTU of interest / total # of samples in the taphonomic control group where
#' the OTU is known to be present).  All that is really needed is some reasonable value, such that more sampling without detection lowers the 
#' likelihood of the data on the hypothesis of true presence, and vice versa.  This value can only be 1 when the number of detections = the number 
#' of taphonomic control detections, for every OTU and area.  This is the implicit assumption in e.g. standard historical biogeography analyses in 
#' LAGRANGE or BioGeoBEARS.
#' @param dp The detection probability.  This is the per-sample probability that you will correctly detect the OTU in question, 
#' when you are looking at it.  Default is 1, which is the implicit assumption in standard analyses.
#' @param fdp The false detection probability.  This is the probability of falsely concluding a detection of the OTU of interest occurred, when in 
#' fact the specimen was of something else.  The default is 0, which assumes zero error rate, 
#' i.e. the assumption being made in all historical biogeography analyses that do not take into account detection 
#' probability.  These options are being included for completeness, but it may not be wise to try to infer \code{mean_frequency},  
#' \code{dp} and \code{fdp} all at once due to identifiability issues (and estimation of fdp may take a very large amount of data).  However, 
#' fixing some of these parameters to reasonable values can allow the user to effectively include beliefs about the uncertainty of the 
#' input data into the analysis, if desired.
#' @param print_progress If not the default (\code{""}), print whatever is in print_progress, followed by a space (for error checking/surveying results).
#' @return \code{post_prob} The posterior probability of presence, given the prior probability, the model parameters, and the data.
#' @export
#' @seealso \code{\link{calc_obs_like}},  \code{\link{mapply_calc_post_prob_presence}}, \code{\link{mapply_calc_obs_like}}
#' @note Go BEARS!
#' @author Nicholas J. Matzke \email{matzke@@berkeley.edu} 
#' @references
#' \url{http://phylo.wikidot.com/matzke-2013-international-biogeography-society-poster}
#' \url{http://en.wikipedia.org/wiki/Bayes'_theorem}
#' @bibliography /Dropbox/_njm/__packages/BioGeoBEARS_setup/BioGeoBEARS_refs.bib
#'   @cite Matzke_2012_IBS
#'	 @cite Bottjer_Jablonski_1988
#' 	 @cite Bayes_1763
#' @examples
#'
#' # Calculate posterior probability of presence in an area, 
#' # given a dp (detection probability) and detection model. 
#' 
#' # With zero error rate
#' obs_target_species = 10
#' obs_all_species = 100
#' mean_frequency=0.1
#' dp=1
#' fdp=0
#' prior_prob_presence = 0.01
#' post_prob = calc_post_prob_presence(prior_prob_presence, obs_target_species,
#' obs_all_species, mean_frequency, dp, fdp)
#' post_prob
#' # i.e., with perfect detection, the posterior prob. of presence is 1, 
#' # because the data are impossible under the hypothesis of absence 
#' # (the log-likelihood of the data under absence is -Inf, while under 
#' # presence it is finite).
#' 
#' 
#' # Note that with very high error rates, your conclusion could reverse
#' obs_target_species = 10
#' obs_all_species = 100
#' mean_frequency=0.1
#' dp=0.5
#' fdp=0.3
#' prior_prob_presence = 0.01
#' post_prob = calc_post_prob_presence(prior_prob_presence, obs_target_species,
#' obs_all_species, mean_frequency, dp, fdp)
#' post_prob
#' 
#' 
#' # With 0 error rate, even 1 observation makes P(presence) = 1
#' obs_target_species = 1
#' obs_all_species = 100
#' mean_frequency=0.1
#' dp=1
#' fdp=0
#' prior_prob_presence = 0.01
#' post_prob = calc_post_prob_presence(prior_prob_presence, obs_target_species,
#' obs_all_species, mean_frequency, dp, fdp)
#' post_prob
#' 
#' 
#' # With a small error rate, there is some small but positive probability of
#' # falsely getting 10 detections; but it may be effectively 0
#' obs_target_species = 10
#' obs_all_species = 100
#' mean_frequency=0.1
#' dp=0.99
#' fdp=0.001
#' prior_prob_presence = 0.01
#' post_prob = calc_post_prob_presence(prior_prob_presence, obs_target_species,
#' obs_all_species, mean_frequency, dp, fdp)
#' post_prob
#' 
#' 
#' # If you have only 1 detection, and you have 100 taphonomic controls and
#' # a mean_frequency of sampling the OTU of interest of 0.1, then there is 
#' # still a very low probability of presence (since, under your model, 
#' # you should expect to see about 10 detections, not 1)
#' obs_all_species = 100
#' mean_frequency=0.1
#' dp=0.99
#' fdp=0.001
#' prior_prob_presence = 0.01
#' 
#' obs_target_species = 0
#' calc_post_prob_presence(prior_prob_presence, obs_target_species,
#' obs_all_species, mean_frequency, dp, fdp)
#' 
#' obs_target_species = 1
#' calc_post_prob_presence(prior_prob_presence, obs_target_species,
#' obs_all_species, mean_frequency, dp, fdp)
#' 
#' obs_target_species = 2
#' calc_post_prob_presence(prior_prob_presence, obs_target_species,
#' obs_all_species, mean_frequency, dp, fdp)
#' 
#' obs_target_species = 3
#' calc_post_prob_presence(prior_prob_presence, obs_target_species,
#' obs_all_species, mean_frequency, dp, fdp)
#' 
#' 
#' 
#' # Note how quickly this changes if you drop the mean_frequency from 0.1 
#' # to 0.01. This means that if you want single detections to count for 
#' # a lot, you need either a low mean_frequency which matches the observed 
#' # frequency, or an extremely high/perfect detection probability (dp).
#' obs_all_species = 100
#' mean_frequency=0.01
#' dp=0.99
#' fdp=0.001
#' prior_prob_presence = 0.01
#' 
#' obs_target_species = 0
#' calc_post_prob_presence(prior_prob_presence, obs_target_species,
#' obs_all_species, mean_frequency, dp, fdp)
#' 
#' obs_target_species = 1
#' calc_post_prob_presence(prior_prob_presence, obs_target_species,
#' obs_all_species, mean_frequency, dp, fdp)
#' 
#' obs_target_species = 2
#' calc_post_prob_presence(prior_prob_presence, obs_target_species,
#' obs_all_species, mean_frequency, dp, fdp)
#' 
#' obs_target_species = 3
#' calc_post_prob_presence(prior_prob_presence, obs_target_species,
#' obs_all_species, mean_frequency, dp, fdp)
#' 
#' 
#' # Changing mean_frequency from 0.01 to 0.001 actually LOWERS the posterior
#' # probability of presence based on 1 detection, as we have a somewhat 
#' # significant false detection rate:
#' obs_target_species = 1
#' obs_all_species = 100
#' mean_frequency=0.001
#' dp=0.99
#' fdp=0.001
#' prior_prob_presence = 0.01
#' 
#' obs_target_species = 0
#' calc_post_prob_presence(prior_prob_presence, obs_target_species,
#' obs_all_species, mean_frequency, dp, fdp)
#' 
#' obs_target_species = 1
#' calc_post_prob_presence(prior_prob_presence, obs_target_species,
#' obs_all_species, mean_frequency, dp, fdp)
#' 
#' obs_target_species = 2
#' calc_post_prob_presence(prior_prob_presence, obs_target_species,
#' obs_all_species, mean_frequency, dp, fdp)
#' 
#' obs_target_species = 3
#' calc_post_prob_presence(prior_prob_presence, obs_target_species,
#' obs_all_species, mean_frequency, dp, fdp)
#' 
#' 
#' # Change false detection probability to a much lower value
#' obs_all_species = 100
#' mean_frequency=0.001
#' dp=0.99
#' fdp=0.00001
#' prior_prob_presence = 0.01
#' 
#' obs_target_species = 0
#' calc_post_prob_presence(prior_prob_presence, obs_target_species,
#' obs_all_species, mean_frequency, dp, fdp)
#' 
#' obs_target_species = 1
#' calc_post_prob_presence(prior_prob_presence, obs_target_species,
#' obs_all_species, mean_frequency, dp, fdp)
#' 
#' obs_target_species = 2
#' calc_post_prob_presence(prior_prob_presence, obs_target_species,
#' obs_all_species, mean_frequency, dp, fdp)
#' 
#' obs_target_species = 3
#' calc_post_prob_presence(prior_prob_presence, obs_target_species,
#' obs_all_species, mean_frequency, dp, fdp)
#' 
#' 
#' 
#' 
#' # Change false detection probability to 0
#' obs_all_species = 100
#' mean_frequency=0.001
#' dp=0.99
#' fdp=0.0
#' prior_prob_presence = 0.01
#' 
#' 
#' obs_target_species = 0
#' calc_post_prob_presence(prior_prob_presence, obs_target_species,
#' obs_all_species, mean_frequency, dp, fdp)
#' 
#' obs_target_species = 1
#' calc_post_prob_presence(prior_prob_presence, obs_target_species,
#' obs_all_species, mean_frequency, dp, fdp)
#' 
#' obs_target_species = 2
#' calc_post_prob_presence(prior_prob_presence, obs_target_species,
#' obs_all_species, mean_frequency, dp, fdp)
#' 
#' obs_target_species = 3
#' calc_post_prob_presence(prior_prob_presence, obs_target_species,
#' obs_all_species, mean_frequency, dp, fdp)
#' 
#' 
#' 
#' # Example #2 -- what if you have ZERO detections, but lots of detections 
#' # of your taphonomic control?
#' obs_target_species = 0
#' obs_all_species = 100
#' mean_frequency=0.1
#' dp=1
#' fdp=0
#' prior_prob_presence = 0.01
#' post_prob = calc_post_prob_presence(prior_prob_presence, obs_target_species,
#' obs_all_species, mean_frequency, dp, fdp)
#' post_prob
#' 
#' # With a slight error rate
#' obs_target_species = 0
#' obs_all_species = 100
#' mean_frequency=0.1
#' dp=0.99
#' fdp=0.001
#' prior_prob_presence = 0.01
#' post_prob = calc_post_prob_presence(prior_prob_presence, obs_target_species,
#' obs_all_species, mean_frequency, dp, fdp)
#' post_prob
#' 
#' 
#' obs_target_species = 0
#' obs_all_species = 2
#' mean_frequency=0.1
#' dp=1
#' fdp=0
#' prior_prob_presence = 0.01
#' post_prob = calc_post_prob_presence(prior_prob_presence, obs_target_species,
#' obs_all_species, mean_frequency, dp, fdp)
#' post_prob
#' 
#' # With a slight error rate
#' obs_target_species = 0
#' obs_all_species = 2
#' mean_frequency=0.1
#' dp=0.99
#' fdp=0.001
#' prior_prob_presence = 0.01
#' post_prob = calc_post_prob_presence(prior_prob_presence, obs_target_species,
#' obs_all_species, mean_frequency, dp, fdp)
#' post_prob
#' 
#' 
#' 
#' 
#' 
#' # Example #3 -- what if you have ZERO detections, but only a few 
#' # detections of your taphonomic control?
#' obs_target_species = 0
#' obs_all_species = 1
#' mean_frequency=0.1
#' dp=1
#' fdp=0
#' prior_prob_presence = 0.01
#' post_prob = calc_post_prob_presence(prior_prob_presence, obs_target_species,
#' obs_all_species, mean_frequency, dp, fdp)
#' post_prob
#' 
#' # With a slight error rate
#' obs_target_species = 0
#' obs_all_species = 1
#' mean_frequency=0.1
#' dp=0.99
#' fdp=0.001
#' prior_prob_presence = 0.01
#' post_prob = calc_post_prob_presence(prior_prob_presence, obs_target_species,
#' obs_all_species, mean_frequency, dp, fdp)
#' post_prob
#' 
#' 
#' 
#' # Special cases -- e.g., no data
#' # Prob(data)=1, ln(prob)=0
#' obs_target_species = 0
#' obs_all_species = 0
#' mean_frequency=0.1
#' dp=0.99
#' fdp=0.001
#' prior_prob_presence = 0.01
#' post_prob = calc_post_prob_presence(prior_prob_presence, obs_target_species,
#' obs_all_species, mean_frequency, dp, fdp)
#' post_prob
#' 
#' obs_target_species = 0
#' obs_all_species = 0
#' mean_frequency=0.1
#' dp=1
#' fdp=0
#' prior_prob_presence = 0.01
#' post_prob = calc_post_prob_presence(prior_prob_presence, obs_target_species,
#' obs_all_species, mean_frequency, dp, fdp)
#' post_prob
#' 
#' 
#' # What if, for some reason, you put in identical detections and taphonomic control
#' # counts? (e.g., you load in a standard tipranges file)
#' obs_target_species = 1
#' obs_all_species = 1
#' mean_frequency=1
#' dp=1
#' fdp=0
#' prior_prob_presence = 0.01
#' post_prob = calc_post_prob_presence(prior_prob_presence, obs_target_species,
#' obs_all_species, mean_frequency, dp, fdp)
#' post_prob
#' 
#' # What if, for some reason, you put in identical detections and taphonomic control
#' # counts? (e.g., you load in a standard tipranges file)
#' obs_target_species = 1
#' obs_all_species = 1
#' mean_frequency=1
#' dp=0.99
#' fdp=0.001
#' prior_prob_presence = 0.01
#' post_prob = calc_post_prob_presence(prior_prob_presence, obs_target_species,
#' obs_all_species, mean_frequency, dp, fdp)
#' post_prob
#' 
#' 
calc_post_prob_presence <- function(prior_prob_presence=0.01, obs_target_species, obs_all_species, mean_frequency=0.1, dp=1, fdp=0, print_progress="")
	{
	# Priors
	prior_prob_presence = prior_prob_presence
	prior_prob_absence = 1-prior_prob_presence
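	
	# Bayes' Theorem, computed in log space:
	# P(presence | data) = P(data | presence) * prior_prob_presence /
	#    ( P(data|presence)*prior_prob_presence + P(data|absence)*prior_prob_absence )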

	# likelihood of data (assuming truly present)
	lnlike_allobs_given_presence = calc_obs_like(truly_present=TRUE, obs_target_species, obs_all_species, mean_frequency=mean_frequency, dp=dp, fdp=fdp)
	
	numerator = lnlike_allobs_given_presence + log(prior_prob_presence)
	
	
	# prob(obs)
	# denominator
	d1 = lnlike_allobs_given_presence + log(prior_prob_presence)
	
	lnlike_allobs_given_absence = calc_obs_like(truly_present=FALSE, obs_target_species, obs_all_species, mean_frequency=mean_frequency, dp=dp, fdp=fdp)
	
	d2 = lnlike_allobs_given_absence + log(prior_prob_absence)
	
	denominator = log(exp(d1) + exp(d2))
	
	ln_post_prob = numerator - denominator
	post_prob = exp(ln_post_prob)
	
	if (print_progress == "")
		{
		nothing_happens = 0
		} else {
		cat(print_progress, " ", sep="")
		}
	
	return(post_prob)
	}










#######################################################
# mapply_calc_post_prob_presence
#######################################################
#' Mapply version of calc_post_prob_presence()
#' 
#' This function applies \code{\link{calc_post_prob_presence}} to all cells of 
#' the input matrices \code{obs_target_species} and \code{obs_all_species}. These matrices
#' obviously must have the same dimensions.
#' 
#' The inputs are the same as for \code{\link{calc_post_prob_presence}}, except that
#' \code{obs_target_species} and \code{obs_all_species} can be matrices.
#' 
#' @param prior_prob_presence The prior probability of presence, i.e. when no detection or taphonomic control data 
#' whatsoever are available.  The default is 0.01, which expresses the (totally uninformed) prior bias that, 
#' whatever your dataset is, your species of interest probably does not live in the typical area you are 
#' looking at.
#' @param obs_target_species A scalar or column/vector/matrix of detection counts, e.g. as produced from the output from \code{\link{read_detections}}.
#' @param obs_all_species A scalar or column/vector/matrix of detection counts, e.g. as produced from the output from \code{\link{read_controls}}.
#' @param mean_frequency This is the proportion of samples from the taphonomic control group that will truly be from this OTU, GIVEN that the OTU is present.
#' This could be estimated, but a decent first guess is (total # samples of OTU of interest / total # of samples in the taphonomic control group where
#' the OTU is known to be present).  All that is really needed is some reasonable value, such that more sampling without detection lowers the 
#' likelihood of the data on the hypothesis of true presence, and vice versa.  This value can only be 1 when the number of detections = the number 
#' of taphonomic control detections, for every OTU and area.  This is the implicit assumption in e.g. standard historical biogeography analyses in 
#' LAGRANGE or BioGeoBEARS.
#' @param dp The detection probability.  This is the per-sample probability that you will correctly detect the OTU in question, 
#' when you are looking at it.  Default is 1, which is the implicit assumption in standard analyses.
#' @param fdp The false detection probability.  This is the probability of falsely concluding a detection of the OTU of interest occurred, when in 
#' fact the specimen was of something else.  The default is 0, which assumes zero error rate, 
#' i.e. the assumption being made in all historical biogeography analyses that do not take into account detection 
#' probability.  These options are being included for completeness, but it may not be wise to try to infer \code{mean_frequency},  
#' \code{dp} and \code{fdp} all at once due to identifiability issues (and estimation of fdp may take a very large amount of data).  However, 
#' fixing some of these parameters to reasonable values can allow the user to effectively include beliefs about the uncertainty of the 
#' input data into the analysis, if desired.
#' @param print_progress If not the default (\code{""}), print whatever is in print_progress, followed by a space (for error checking/surveying results).
#' @return \code{pp_df} A matrix of the posterior probability of presence, given the prior probability, the model parameters, and the data.
#' @export
#' @seealso \code{\link{calc_obs_like}}, \code{\link{calc_post_prob_presence}}, \code{\link{mapply_calc_obs_like}}
#' \code{\link{Pdata_given_rangerow}}, \code{\link[base]{mapply}}, \code{\link{tiplikes_wDetectionModel}}
#' @note Go BEARS!
#' @author Nicholas J. Matzke \email{matzke@@berkeley.edu} 
#' @references
#' \url{http://phylo.wikidot.com/matzke-2013-international-biogeography-society-poster}
#' \url{http://en.wikipedia.org/wiki/Bayes'_theorem}
#' @bibliography /Dropbox/_njm/__packages/BioGeoBEARS_setup/BioGeoBEARS_refs.bib
#'   @cite Matzke_2012_IBS
#'	 @cite Bottjer_Jablonski_1988
#' 	 @cite Bayes_1763
#' @examples
#' 
#' # Calculate posterior probability of presence in an area, 
#' # given a dp (detection probability) and detection model. 
#' 
#' # soft-coded input files
#' extdata_dir = np(system.file("extdata", package="BioGeoBEARS"))
#' detects_fn = np(paste(extdata_dir, "/Psychotria_detections_v1.txt", sep=""))
#' controls_fn = np(paste(extdata_dir, "/Psychotria_controls_v1.txt", sep=""))
#' 
#' OTUnames=NULL
#' areanames=NULL
#' tmpskip=0
#' 
#' detects_df = read_detections(detects_fn, OTUnames=NULL, areanames=NULL, tmpskip=0)
#' controls_df = read_controls(controls_fn, OTUnames=NULL, areanames=NULL, tmpskip=0)
#' 
#' detects_df
#' controls_df
#' detects_df / controls_df
#' 
#' 
#' # Calculate data likelihoods, and posterior probability of presence=TRUE
#' mean_frequency=0.1
#' dp=1
#' fdp=0
#' 
#' mapply_calc_obs_like(truly_present=TRUE, obs_target_species=detects_df,
#' obs_all_species=controls_df, mean_frequency, dp, fdp)
#' 
#' mapply_calc_obs_like(truly_present=FALSE, obs_target_species=detects_df,
#' obs_all_species=controls_df, mean_frequency, dp, fdp)
#' 
#' mapply_calc_post_prob_presence(prior_prob_presence=0.01, 
#' obs_target_species=detects_df,
#' obs_all_species=controls_df, mean_frequency, dp, fdp)
#' 
mapply_calc_post_prob_presence <- function(prior_prob_presence=0.01, obs_target_species, obs_all_species, mean_frequency=0.1, dp=1, fdp=0, print_progress="")
	{
	# Calculate the posterior probability of presence in each area in a matrix / data.frame
	tmpdf = mapply(FUN=calc_post_prob_presence, obs_target_species=unlist(obs_target_species),
	obs_all_species=unlist(obs_all_species), MoreArgs=list(prior_prob_presence=prior_prob_presence, mean_frequency=mean_frequency, dp=dp, fdp=fdp), USE.NAMES=TRUE)
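	
	# unlist() flattens the input data.frames column-major; the vector of results
	# is reshaped below into the original OTU-by-area dimensions (byrow=FALSE
	# matches that column-major ordering)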

	pp_df = matrix(data=tmpdf, ncol=ncol(obs_target_species), nrow=nrow(obs_target_species), byrow=FALSE)
	pp_df
	
	pp_df = adf2(pp_df)
	names(pp_df) = names(obs_target_species)
	rownames(pp_df) = rownames(obs_target_species)
	
	return(pp_df)
	}


#######################################################
# mapply_calc_obs_like
#######################################################
#' Mapply version of calc_obs_like()
#' 
#' This function applies \code{\link{calc_obs_like}} to all cells of 
#' the input matrices \code{obs_target_species} and \code{obs_all_species}. These matrices
#' obviously must have the same dimensions.
#' 
#' The inputs are the same as for \code{\link{calc_obs_like}}, except that
#' \code{obs_target_species} and \code{obs_all_species} can be matrices.
#' 
#' @param truly_present Is the OTU of interest known/conditionally assumed to be truly present (\code{TRUE}) or truly absent (\code{FALSE})?
#' @param obs_target_species A scalar or column/vector/matrix of detection counts, e.g. as produced from the output from \code{\link{read_detections}}.
#' @param obs_all_species A scalar or column/vector/matrix of detection counts, e.g. as produced from the output from \code{\link{read_controls}}.
#' @param mean_frequency This is the proportion of samples from the taphonomic control group that will truly be from this OTU, GIVEN that the OTU is present.
#' This could be estimated, but a decent first guess is (total # samples of OTU of interest / total # of samples in the taphonomic control group where
#' the OTU is known to be present).  All that is really needed is some reasonable value, such that more sampling without detection lowers the 
#' likelihood of the data on the hypothesis of true presence, and vice versa.  This value can only be 1 when the number of detections = the number 
#' of taphonomic control detections, for every OTU and area.  This is the implicit assumption in e.g. standard historical biogeography analyses in 
#' LAGRANGE or BioGeoBEARS.
#' @param dp The detection probability.  This is the per-sample probability that you will correctly detect the OTU in question, 
#' when you are looking at it.  Default is 1, which is the implicit assumption in standard analyses.
#' @param fdp The false detection probability.  This is the probability of falsely concluding a detection of the OTU of interest occurred, when in 
#' fact the specimen was of something else.  The default is 0, which assumes zero error rate, 
#' i.e. the assumption being made in all historical biogeography analyses that do not take into account detection 
#' probability.  These options are being included for completeness, but it may not be wise to try to infer \code{mean_frequency},  
#' \code{dp} and \code{fdp} all at once due to identifiability issues (and estimation of fdp may take a very large amount of data).  However, 
#' fixing some of these parameters to reasonable values can allow the user to effectively include beliefs about the uncertainty of the 
#' input data into the analysis, if desired.
#' @return \code{like_df} A data.frame of the natural log-likelihood of the data, given the model & assumption of true presence or absence.
#' @export
#' @seealso \code{\link{calc_obs_like}}, \code{\link{calc_post_prob_presence}}, \code{\link{mapply_calc_post_prob_presence}}, 
#' \code{\link{Pdata_given_rangerow}}, \code{\link[base]{mapply}}, \code{\link{tiplikes_wDetectionModel}}
#' @note Go BEARS!
#' @author Nicholas J. Matzke \email{matzke@@berkeley.edu} 
#' @references
#' \url{http://phylo.wikidot.com/matzke-2013-international-biogeography-society-poster}
#' \url{http://en.wikipedia.org/wiki/Bayes'_theorem}
#' @bibliography /Dropbox/_njm/__packages/BioGeoBEARS_setup/BioGeoBEARS_refs.bib
#'   @cite Matzke_2012_IBS
#'	 @cite Bottjer_Jablonski_1988
#' 	 @cite Bayes_1763
#' @examples
#' # Calculate likelihood of data, given presence in an area, 
#' # given a dp (detection probability) and detection model. 
#' 
#' # soft-coded input files
#' extdata_dir = np(system.file("extdata", package="BioGeoBEARS"))
#' detects_fn = np(paste(extdata_dir, "/Psychotria_detections_v1.txt", sep=""))
#' controls_fn = np(paste(extdata_dir, "/Psychotria_controls_v1.txt", sep=""))
#' 
#' OTUnames=NULL
#' areanames=NULL
#' tmpskip=0
#' 
#' detects_df = read_detections(detects_fn, OTUnames=NULL, areanames=NULL, tmpskip=0)
#' controls_df = read_controls(controls_fn, OTUnames=NULL, areanames=NULL, tmpskip=0)
#' 
#' detects_df
#' controls_df
#' detects_df / controls_df
#' 
#' 
#' # Calculate data likelihoods, and posterior probability of presence=TRUE
#' mean_frequency=0.1
#' dp=1
#' fdp=0
#' 
#' mapply_calc_obs_like(truly_present=TRUE, obs_target_species=detects_df,
#' obs_all_species=controls_df, mean_frequency, dp, fdp)
#' 
#' mapply_calc_obs_like(truly_present=FALSE, obs_target_species=detects_df,
#' obs_all_species=controls_df, mean_frequency, dp, fdp)
#' 
#' mapply_calc_post_prob_presence(prior_prob_presence=0.01, 
#' obs_target_species=detects_df,
#' obs_all_species=controls_df, mean_frequency, dp, fdp)
#' 
mapply_calc_obs_like <- function(truly_present=TRUE, obs_target_species, obs_all_species, mean_frequency=0.1, dp=1, fdp=0)
	{
	# Calculate the likelihood of the data for each OTU/area cell in the matrix / data.frame
	tmpdf = mapply(FUN=calc_obs_like, obs_target_species=unlist(obs_target_species),
	obs_all_species=unlist(obs_all_species), MoreArgs=list(truly_present=truly_present, mean_frequency=mean_frequency, dp=dp, fdp=fdp), USE.NAMES=TRUE)
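	
	# unlist() flattens the input data.frames column-major; the vector of results
	# is reshaped below into the original OTU-by-area dimensions (byrow=FALSE
	# matches that column-major ordering)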

	like_df = matrix(data=tmpdf, ncol=ncol(obs_target_species), nrow=nrow(obs_target_species), byrow=FALSE)
	like_df
	
	like_df = adf2(like_df)
	names(like_df) = names(obs_target_species)
	rownames(like_df) = rownames(obs_target_species)
	
	return(like_df)
	}



#######################################################
# Pdata_given_rangerow_dp
#######################################################
#' Calculate probability of detection data given a true geographic range and a detection probability
#' 
#' This function calculates P(data|range,dp), i.e. the probability of some detection and 
#' taphonomic control counts, given the true geographic range/state, and \code{dp}, a 
#' detection probability (and, optionally, a false detection probability, \code{fdp}).
#' 
#' The idea of taphonomic controls dates back at least to the work of Bottjer & Jablonski (1988).  The basic
#' idea is that if you have taxa of roughly similar detectability, then detections of other 
#' taxa give some idea of overall detection effort.  Obviously this is a very simple 
#' model that can be criticized in any number of ways (different alpha diversity in each 
#' region, different detectability of individual taxa, etc.), but it is a useful starting
#' point as there has been no implementation of any detection model in historical/phylogenetic 
#' biogeography to date.
#' 
#' One could imagine (a) every OTU and area has a different count of detections and taphonomic control 
#' detections, or (b) the taphonomic control detections are specified by area, and shared across all OTUs.
#' Situation (b) is likely more common, but this function assumes (a) as this is the more thorough case.
#' Behavior (b) could be reproduced by summing each column, and/or copying this sum to all cells for a 
#' particular area.
#' 
#' @param truerange_areas The list of areas (as 0-based numbers) in this geographic range/state.
#' @param numareas The function needs to know the total number of areas in the analysis.
#' @param detects_df The data.frame of detection counts (OTUs x areas), as produced by \code{\link{read_detections}}.
#' @param controls_df The data.frame of taphonomic control counts (OTUs x areas), as produced by \code{\link{read_controls}}.
#' @param dp The detection probability.  This is the per-sample probability that you will detect the OTU in question.
#' In other words, the model assumes that each specimen from the taphonomic control group has a chance of being
#' a representative of the OTU you are looking for.  The default is 1, which assumes perfect detection, i.e. the assumption 
#' being made in all historical biogeography analyses that do not take into account detection probability.  A value of 1
#' will only work when the taphonomic control count equals the detection count; any other data would have likelihood=0.
#' @param fdp The false detection probability.  This is the probability of falsely concluding a detection occurred, when in 
#' fact the specimen was of something else.  The default is 0, which assumes zero error rate, 
#' i.e. the assumption being made in all historical biogeography analyses that do not take into account detection 
#' probability.  This option is being included for completeness, but it may not be wise to try to infer both 
#' \code{dp} and \code{fdp} at once due to identifiability issues (and estimation of fdp may take a very large amount of data).
#' @param mean_frequency This is the proportion of samples from the taphonomic control group that will truly be from this OTU, GIVEN that the OTU is present.
#' This could be estimated, but a decent first guess is (total # samples of OTU of interest / total # of samples in the taphonomic control group where
#' the OTU is known to be present).  All that is really needed is some reasonable value, such that more sampling without detection lowers the 
#' likelihood of the data on the hypothesis of true presence, and vice versa.  This value can only be 1 when the number of detections = the number 
#' of taphonomic control detections, for every OTU and area.  This is the implicit assumption in e.g. standard historical biogeography analyses in 
#' LAGRANGE or BioGeoBEARS.
#' @return \code{LnLs_of_this_range_for_each_OTU} The log-likelihood of each OTU's detection data, conditional on the assumed true geographic range.
#' @export
#' @seealso \code{\link[cladoRcpp]{rcpp_calc_anclikes_sp_COOweights_faster}}
#' @note Go BEARS!
#' @author Nicholas J. Matzke \email{matzke@@berkeley.edu} 
#' @references
#' \url{http://phylo.wikidot.com/matzke-2013-international-biogeography-society-poster}
#' @bibliography /Dropbox/_njm/__packages/BioGeoBEARS_setup/BioGeoBEARS_refs.bib
#'   @cite Matzke_2012_IBS
#'	 @cite Bottjer_Jablonski_1988
#' @examples
#' testval=1
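#' 
#' # A hedged sketch, assuming the Psychotria example files ship in the package's
#' # extdata directory: the log-likelihood of each OTU's detection data,
#' # conditional on the true range being the first two areas (0-based areas 0 and 1)
#' extdata_dir = np(system.file("extdata", package="BioGeoBEARS"))
#' detects_df = read_detections(np(paste(extdata_dir,
#' "/Psychotria_detections_v1.txt", sep="")))
#' controls_df = read_controls(np(paste(extdata_dir,
#' "/Psychotria_controls_v1.txt", sep="")))
#' Pdata_given_rangerow_dp(truerange_areas=c(0,1), numareas=ncol(detects_df),
#' detects_df=detects_df, controls_df=controls_df, mean_frequency=0.1, dp=0.99,
#' fdp=0.001)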
#' 
Pdata_given_rangerow_dp <- function(truerange_areas, numareas, detects_df, controls_df, mean_frequency=0.1, dp=1, fdp=0)
	{
	runjunk='
	controls_fn = "/Library/Frameworks/R.framework/Versions/2.15/Resources/library/BioGeoBEARS/extdata/Psychotria_controls_v1.txt"
	controls_fn = "/Dropbox/_njm/__packages/BioGeoBEARS_setup/inst/extdata/Psychotria_controls_v1.txt"
	OTUnames=NULL
	areanames=NULL
	tmpskip=0
	' # end runjunk

	# Likelihood of a state = the probability of data at ALL areas (occupied, and not, in the state/geographic range) given the
	# conditionally assumed true presences/absences in the specified range
	
	# Build a TRUE/FALSE matrix specifying the ranges in this assumed true state/geographic range
	# (copy for every row, this does 1 state/geographic range across all OTUs)
	range_as_areas_TF = matrix(data=FALSE, nrow=nrow(detects_df), ncol=numareas)
	range_as_areas_TF[, truerange_areas+1] = TRUE
	
	# OK, calculate likelihood OF THE DATA ON THAT RANGE
	# (unlist() flattens the data.frames column-major, matching the TRUE/FALSE
	# matrix, so each cell is paired with its assumed true presence/absence)
	LnLs_of_data_in_each_area = mapply(FUN=calc_obs_like, truly_present=range_as_areas_TF, obs_target_species=unlist(detects_df),
	obs_all_species=unlist(controls_df), MoreArgs=list(mean_frequency=mean_frequency, dp=dp, fdp=fdp), USE.NAMES=TRUE)
	
	# Reshape back into an OTUs x areas matrix (column-major, matching unlist())
	LnLs_of_data_in_each_area = matrix(data=LnLs_of_data_in_each_area, nrow=nrow(detects_df), ncol=numareas, byrow=FALSE)
	
	# LnL of the data at each tip, for this state
	LnLs_of_this_range_for_each_OTU = rowSums(LnLs_of_data_in_each_area)
	LnLs_of_this_range_for_each_OTU	
	
	return(LnLs_of_this_range_for_each_OTU)
	}





#######################################################
# Pdata_given_rangerow
#######################################################
#' Calculate probability of detection data given a true geographic range and a detection probability
#' 
#' This function calculates P(data|range,dp), i.e. the probability of some detection and 
#' taphonomic control counts, given the true geographic range/state, and \code{dp}, a 
#' detection probability (and, optionally, a false detection probability, \code{fdp}).
#' 
#' The idea of taphonomic controls dates back at least to the work of Bottjer & Jablonski (1988).  The basic
#' idea is that if you have taxa of roughly similar detectability, then detections of other 
#' taxa give some idea of overall detection effort.  Obviously this is a very simple 
#' model that can be criticized in any number of ways (different alpha diversity in each 
#' region, different detectability of individual taxa, etc.), but it is a useful starting
#' point as there has been no implementation of any detection model in historical/phylogenetic 
#' biogeography to date.
#' 
#' One could imagine (a) every OTU and area has a different count of detections and taphonomic control 
#' detections, or (b) the taphonomic control detections are specified by area, and shared across all OTUs.
#' Situation (b) is likely more common, but this function assumes (a) as this is the more thorough case.
#' Behavior (b) could be reproduced by summing each column, and/or copying this sum to all cells for a 
#' particular area.
#' 
#' @param range_as_areas_TF The list of areas (as TRUE/FALSE) in this geographic range/state.
#' @param detects_df_row A vector of detection counts, i.e. one row of the output from \code{\link{read_detections}}.
#' @param controls_df_row A vector of taphonomic control detection counts, i.e. one row of the output from \code{\link{read_controls}}.
#' @param mean_frequency This is the proportion of samples from the taphonomic control group that will truly be from this OTU, GIVEN that the OTU is present.
#' This could be estimated, but a decent first guess is (total # samples of OTU of interest / total # of samples in the taphonomic control group where
#' the OTU is known to be present).  All that is really needed is some reasonable value, such that more sampling without detection lowers the 
#' likelihood of the data on the hypothesis of true presence, and vice versa.  This value can only be 1 when the number of detections = the number 
#' of taphonomic control detections, for every OTU and area.  This is the implicit assumption in e.g. standard historical biogeography analyses in 
#' LAGRANGE or BioGeoBEARS.
#' @param dp The detection probability.  This is the per-sample probability that you will correctly detect the OTU in question, 
#' when you are looking at it.  Default is 1, which is the implicit assumption in standard analyses.
#' @param fdp The false detection probability.  This is the probability of falsely concluding a detection of the OTU of interest occurred, when in 
#' fact the specimen was of something else.  The default is 0, which assumes zero error rate, 
#' i.e. the assumption being made in all historical biogeography analyses that do not take into account detection 
#' probability.  These options are being included for completeness, but it may not be wise to try to infer \code{mean_frequency},  
#' \code{dp} and \code{fdp} all at once due to identifiability issues (and estimation of fdp may take a very large amount of data).  However, 
#' fixing some of these parameters to reasonable values can allow the user to effectively include beliefs about the uncertainty of the 
#' input data into the analysis, if desired.
#' @param return_LnLs If \code{FALSE} (default), return exp(sum(LnLs of data in each area)), i.e. the likelihood of the data, 
#' non-logged. If \code{TRUE}, return the LnLs of the data in each area.
#' @return \code{likelihood_of_data_given_range} The (non-logged!) likelihood of the data given the input range, and the 
#' detection model parameters. If \code{return_LnLs=TRUE}, returns \code{LnLs_of_data_in_each_area}, the LnLs of the data in each area.
#' @export
#' @seealso \code{\link{calc_obs_like}}, \code{\link[base]{mapply}}, \code{\link{tiplikes_wDetectionModel}}
#' @note Go BEARS!
#' @author Nicholas J. Matzke \email{matzke@@berkeley.edu} 
#' @references
#' \url{http://phylo.wikidot.com/matzke-2013-international-biogeography-society-poster}
#' @bibliography /Dropbox/_njm/__packages/BioGeoBEARS_setup/BioGeoBEARS_refs.bib
#'   @cite Matzke_2012_IBS
#'	 @cite Bottjer_Jablonski_1988
#' @examples
#' testval=1
#' 
#' # soft-coded input files
#' extdata_dir = np(system.file("extdata", package="BioGeoBEARS"))
#' detects_fn = np(paste(extdata_dir, "/Psychotria_detections_v1.txt", sep=""))
#' controls_fn = np(paste(extdata_dir, "/Psychotria_controls_v1.txt", sep=""))
#' 
#' OTUnames=NULL
#' areanames=NULL
#' tmpskip=0
#' 
#' detects_df = read_detections(detects_fn, OTUnames=NULL, areanames=NULL, tmpskip=0)
#' controls_df = read_controls(controls_fn, OTUnames=NULL, areanames=NULL, tmpskip=0)
#' 
#' detects_df
#' controls_df
#' detects_df / controls_df
#' 
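#' # Illustrative only: one rough first guess at mean_frequency (see the
#' # parameter description above) is the overall fraction of taphonomic
#' # control samples that were the OTU of interest, in areas where the OTU
#' # was detected at least once.  "TF" and "rough_mean_frequency" are just
#' # temporary names for this sketch, not package objects.
#' TF = detects_df > 0
#' rough_mean_frequency = sum(detects_df[TF]) / sum(controls_df[TF])
#' rough_mean_frequency
#' 
#' # Below, the fixed value 0.1 is used instead.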
#' mean_frequency=0.1
#' dp=1
#' fdp=0
#' 
#' mapply_calc_obs_like(truly_present=TRUE, obs_target_species=detects_df,
#' obs_all_species=controls_df, mean_frequency, dp, fdp)
#' 
#' mapply_calc_obs_like(truly_present=FALSE, obs_target_species=detects_df,
#' obs_all_species=controls_df, mean_frequency, dp, fdp)
#' 
#' mapply_calc_post_prob_presence(prior_prob_presence=0.01, 
#' obs_target_species=detects_df,
#' obs_all_species=controls_df, mean_frequency, dp, fdp)
#' 
#' 
#' 
#' # Now, calculate the likelihood of the data given a geographic range
#' numareas = 4
#' tmpranges = list(c(0), c(1), c(0,1))
#' truerange_areas = tmpranges[[3]]
#' truerange_areas
#' 
#' 	
#' # Build a TRUE/FALSE row specifying the ranges in this assumed true 
#' # state/geographic range
#' range_as_areas_TF = matrix(data=FALSE, nrow=1, ncol=numareas)
#' range_as_areas_TF[truerange_areas+1] = TRUE
#' range_as_areas_TF
#' 
#' detects_df_row = detects_df[1,]
#' controls_df_row = controls_df[1,]
#' 
#' # Manual method, superseded by Pdata_given_rangerow():
#' # LnLs_of_data_in_each_area = mapply(FUN=calc_obs_like, 
#' # obs_target_species=detects_df_row,
#' # obs_all_species=controls_df_row, truly_present=range_as_areas_TF, 
#' # MoreArgs=list(mean_frequency=mean_frequency, dp=dp, fdp=fdp), 
#' # USE.NAMES=TRUE)
#' 
#' 
#' # Calculate data likelihoods for this geographic range
#' mean_frequency=0.1
#' dp=1
#' fdp=0
#' 
#' 
#' # Get the likelihood (the probability of the data, given this range)
#' likelihood_of_data_given_range = Pdata_given_rangerow(
#' range_as_areas_TF=range_as_areas_TF,
#' detects_df_row=detects_df_row, 
#' controls_df_row=controls_df_row, mean_frequency=mean_frequency, dp=dp, fdp=fdp)
#' likelihood_of_data_given_range
#' 
#' # Return the raw log-likelihoods:
#' LnLs_of_data_in_each_area = Pdata_given_rangerow(range_as_areas_TF=range_as_areas_TF,
#' detects_df_row=detects_df_row, 
#' controls_df_row=controls_df_row, mean_frequency=mean_frequency, dp=dp, fdp=fdp, 
#' return_LnLs=TRUE)
#' 
#' detects_df_row
#' controls_df_row
#' LnLs_of_data_in_each_area
#' 
#' # The likelihood: the probability of the data in each area:
#' exp(LnLs_of_data_in_each_area)
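#' 
#' # Illustrative sanity check: per the return-value description, the
#' # non-logged likelihood above should equal exp() of the summed per-area
#' # log-likelihoods.
#' all.equal(likelihood_of_data_given_range, exp(sum(LnLs_of_data_in_each_area)))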
#' 
Pdata_given_rangerow <- function(range_as_areas_TF, detects_df_row, controls_df_row, mean_frequency=0.1, dp=1, fdp=0, return_LnLs=FALSE)
	{
	runjunk='
	controls_fn = "/Library/Frameworks/R.framework/Versions/2.15/Resources/library/BioGeoBEARS/extdata/Psychotria_controls_v1.txt"
	controls_fn = "/Dropbox/_njm/__packages/BioGeoBEARS_setup/inst/extdata/Psychotria_controls_v1.txt"
	OTUnames=NULL
	areanames=NULL
	tmpskip=0
	' # end runjunk

	# Likelihood of a state = the probability of data at ALL areas (occupied, and not, in the state/geographic range) given the
	# conditionally assumed true presences/absences in the specified range
	
	# Build a TRUE/FALSE matrix specifying the ranges in this assumed true state/geographic range
	# (copy for every row, this does 1 state/geographic range across all OTUs)
	#ranges_as_TF = matrix(data=FALSE, nrow=nrow(detects_df), ncol=numareas)
	#ranges_as_TF[truerange_areas+1] = TRUE

	# OK, calculate likelihood OF THE DATA ON THAT RANGE
	LnLs_of_data_in_each_area = mapply(FUN=calc_obs_like, truly_present=range_as_areas_TF, obs_target_species=detects_df_row,
obs_all_species=controls_df_row, MoreArgs=list(mean_frequency=mean_frequency, dp=dp, fdp=fdp), USE.NAMES=TRUE)
	
	if (return_LnLs == FALSE)
		{
		LnL_range = sum(LnLs_of_data_in_each_area)
		likelihood_of_data_given_range = exp(LnL_range)
		return(likelihood_of_data_given_range)
		} else {
		return(LnLs_of_data_in_each_area)
		}
	}







#######################################################
# tiplikes_wDetectionModel
#######################################################
#' Calculate probability of detection data for each OTU at each range in a list of states/geographic ranges
#' 
#' This function calculates P(data|range,dp), i.e. the probability of some detection and 
#' taphonomic control counts, given the true geographic range/state, and \code{dp}, a 
#' detection probability (and, optionally, a false detection probability, \code{fdp}).
#' 
#' This function performs the operation for all states/ranges for all tips.
#' 
#' The idea of taphonomic controls dates back at least to work of Bottjer & Jablonski (1988).  The basic
#' idea is that if you have taxa of roughly similar detectability, then detections of other 
#' taxa give some idea of overall detection effort.  Obviously this is a very simple 
#' model that can be criticized in any number of ways (different alpha diversity in each 
#' region, different detectability of individual taxa, etc.), but it is a useful starting
#' point as there has been no implementation of any detection model in historical/phylogenetic 
#' biogeography to date.
#' 
#' One could imagine (a) every OTU and area has a different count of detections and taphonomic control 
#' detections, or (b) the taphonomic control detections are specified by area, and shared across all OTUs.
#' Situation (b) is likely more common, but this function assumes (a) as this is the more thorough case.
#' Behavior (b) could be reproduced by summing each column, and/or copying this sum to all cells for a 
#' particular area.
#' 
#' @param states_list_0based_index A states_list, 0-based, e.g. from \code{\link[cladoRcpp]{rcpp_areas_list_to_states_list}}.
#' @param numareas The number of areas being considered in the analysis. If \code{NULL} (default), this is calculated to be the maximum range length, or 
#' one plus the maximum 0-based index in any of the ranges.
#' @param detects_df A matrix/data.frame of detection counts, as produced from the output from \code{\link{read_detections}}.
#' @param controls_df A matrix/data.frame of detection counts, as produced from the output from \code{\link{read_controls}}.
#' @param mean_frequency This is the proportion of samples from the taphonomic control group that will truly be from this OTU, GIVEN that the OTU is present.
#' This could be estimated, but a decent first guess is (total # samples of OTU of interest / total # of samples in the taphonomic control group where
#' the OTU is known to be present).  All that is really needed is some reasonable value, such that more sampling without detection lowers the 
#' likelihood of the data on the hypothesis of true presence, and vice versa.  This value can only be 1 when the number of detections = the number 
#' of taphonomic control detections, for every OTU and area.  This is the implicit assumption in e.g. standard historical biogeography analyses in 
#' LAGRANGE or BioGeoBEARS.
#' @param dp The detection probability.  This is the per-sample probability that you will correctly detect the OTU in question, 
#' when you are looking at it.  Default is 1, which is the implicit assumption in standard analyses.
#' @param fdp The false detection probability.  This is the probability of falsely concluding a detection of the OTU of interest occurred, when in 
#' fact the specimen was of something else.  The default is 0, which assumes zero error rate, 
#' i.e. the assumption being made in all historical biogeography analyses that do not take into account detection 
#' probability.  These options are being included for completeness, but it may not be wise to try to infer \code{mean_frequency},  
#' \code{dp} and \code{fdp} all at once due to identifiability issues (and estimation of fdp may take a very large amount of data).  However, 
#' fixing some of these parameters to reasonable values can allow the user to effectively include beliefs about the uncertainty of the 
#' input data into the analysis, if desired.
#' @param null_range_gets_0_like If \code{TRUE} (default), then the data is given zero probability on the hypothesis that the 
#' range is a null range (i.e., no areas occupied).  This is equivalent to saying that you are sure/are willing to assume that 
#' the OTU exists somewhere in your study area, at the timepoint being considered.  Null ranges are identified by length=1, 
#' containing NULL, NA, "", "_", etc.
#' @return \code{tip_condlikes_of_data_on_each_state} The (non-logged!) likelihood of the data for each tip, given each possible range, and the 
#' detection model parameters.
#' @export
#' @seealso \code{\link{Pdata_given_rangerow}}, \code{\link{calc_obs_like}}, \code{\link[base]{mapply}}, \code{\link{read_detections}}, \code{\link{read_controls}}
#' @note Go BEARS!
#' @author Nicholas J. Matzke \email{matzke@@berkeley.edu} 
#' @references
#' \url{http://phylo.wikidot.com/matzke-2013-international-biogeography-society-poster}
#' @bibliography /Dropbox/_njm/__packages/BioGeoBEARS_setup/BioGeoBEARS_refs.bib
#'   @cite Matzke_2012_IBS
#'	 @cite Bottjer_Jablonski_1988
#' @examples
#' testval=1
#' 
#' # soft-coded input files
#' extdata_dir = np(system.file("extdata", package="BioGeoBEARS"))
#' detects_fn = np(paste(extdata_dir, "/Psychotria_detections_v1.txt", sep=""))
#' controls_fn = np(paste(extdata_dir, "/Psychotria_controls_v1.txt", sep=""))
#' 
#' detects_df = read_detections(detects_fn, OTUnames=NULL, areanames=NULL, tmpskip=0)
#' controls_df = read_controls(controls_fn, OTUnames=NULL, areanames=NULL, tmpskip=0)
#' 
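#' # Illustrative only: to mimic "situation (b)" described above -- taphonomic
#' # control counts shared across all OTUs within each area -- one could sum
#' # each column and copy that sum to every OTU.  "shared_controls_df" is just
#' # a temporary name for this sketch.
#' shared_controls_df = as.data.frame(matrix(colSums(controls_df), 
#'     nrow=nrow(controls_df), ncol=ncol(controls_df), byrow=TRUE, 
#'     dimnames=dimnames(controls_df)))
#' shared_controls_df
#' 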
#' # Calculate the likelihood of the data at each tip, for each possible geographic range
#' numareas = 4
#' tmpranges = list(c(0), c(1), c(0,1))
#' 
#' mean_frequency=0.1
#' dp=1
#' fdp=0
#' 
#' tip_condlikes_of_data_on_each_state = 
#' tiplikes_wDetectionModel(states_list_0based_index=tmpranges, numareas=numareas, 
#' detects_df, controls_df, mean_frequency=mean_frequency, dp=dp, fdp=fdp, 
#' null_range_gets_0_like=TRUE)
#' 
#' tip_condlikes_of_data_on_each_state
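#' 
#' # Each row is a tip (OTU), each column one of the ranges in tmpranges; e.g.
#' # the likelihood of tip 1's detection data, if its true range were the
#' # single area with 0-based index 0:
#' tip_condlikes_of_data_on_each_state[1, 1]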
#' 
tiplikes_wDetectionModel <- function(states_list_0based_index, numareas=NULL, detects_df, controls_df, mean_frequency=0.1, dp=1, fdp=0, null_range_gets_0_like=TRUE)
	{
	runjunk='
	controls_fn = "/Library/Frameworks/R.framework/Versions/2.15/Resources/library/BioGeoBEARS/extdata/Psychotria_controls_v1.txt"
	controls_fn = "/Dropbox/_njm/__packages/BioGeoBEARS_setup/inst/extdata/Psychotria_controls_v1.txt"
	OTUnames=NULL
	areanames=NULL
	tmpskip=0
	' # end runjunk
	
	
	# Get the number of areas, if needed
	if (is.null(numareas))
		{
		maxlength = max(unlist(lapply(X=states_list_0based_index,FUN=length)), na.rm=TRUE)
		maxnum = 1+max(unlist(lapply(X=states_list_0based_index,FUN=max, na.rm=TRUE)), na.rm=TRUE)
		numareas = max(c(maxnum, maxlength), na.rm=TRUE)
		}
	
	# Likelihood of a state = the probability of data at ALL areas (occupied, and not, in the state/geographic range) given the
	# conditionally assumed true presences/absences in the specified range
	
	# Go through the states
	#states_list_0based_index = tmpstates
	numstates = length(states_list_0based_index)
	numtips = nrow(detects_df)
	tip_condlikes_of_data_on_each_state = matrix(data=0, nrow=numtips, ncol=numstates)
	for (statenum in 1:numstates)
		{
		# Get the current state/geographic range
		truerange_areas = states_list_0based_index[[statenum]]
		
		# Check for null range, if you're supposed to, and the length is 1
		if ((null_range_gets_0_like==TRUE) && (length(truerange_areas)==1))
			{
			if ( (truerange_areas=="_") || (is.null(truerange_areas)==TRUE) || (is.na(truerange_areas)==TRUE) || (truerange_areas=="") )
				{
				# Then you have a null range, and you have decided to give the data 0 likelihood on all states
				likes_vector = rep(0, times=numtips)
				tip_condlikes_of_data_on_each_state[, statenum] = likes_vector
				
				# skip to next loop
				next()
				}
			}
		
		# Build a TRUE/FALSE row specifying the ranges in this assumed true 
		# state/geographic range
		range_as_areas_TF = matrix(data=FALSE, nrow=nrow(detects_df), ncol=numareas)
		range_as_areas_TF[, truerange_areas+1] = TRUE	# MUST BE OVER ALL ROWS
		range_as_areas_TF

		# iterate over the rows
		likes_vector = rep(0, times=numtips)
		for (i in 1:nrow(range_as_areas_TF))
			{
			likes_vector[i] = Pdata_given_rangerow(range_as_areas_TF=range_as_areas_TF[i,], detects_df_row=detects_df[i,], 
		controls_df_row=controls_df[i,], mean_frequency=mean_frequency, dp=dp, fdp=fdp, return_LnLs=FALSE)
			}
	
		tip_condlikes_of_data_on_each_state[, statenum] = likes_vector
		}

	tip_condlikes_of_data_on_each_state



	return(tip_condlikes_of_data_on_each_state)
	}







#######################################################
# prob_of_states_from_prior_prob_areas
#######################################################
#' Calculate the prior probability of each state/geographic range from a per-area prior probability of presence
#' 
#' This function calculates the prior probability of each state/geographic range, given a single prior 
#' probability of presence (\code{prior_prob_presence}) that is applied independently to each area.  For a range 
#' occupying \emph{k} of the \code{numareas} areas, the un-normalized probability is 
#' \code{prior_prob_presence^k * (1-prior_prob_presence)^(numareas-k)}; optionally, the null range is given 
#' probability 0, and the probabilities are normalized to sum to 1.
#' 
#' Note that putting the same prior probability of occurrence on each individual area does NOT produce a 
#' flat prior across the states/geographic ranges; depending on \code{prior_prob_presence}, ranges with more 
#' (or fewer) occupied areas can be strongly disfavored (or favored).  These prior state probabilities can be 
#' combined with the data likelihoods from the detection model (e.g. \code{\link{tiplikes_wDetectionModel}}) 
#' to obtain posterior probabilities of the states, as in \code{\link{post_prob_states}}.
#' 
#' @param states_list_0based_index A states_list, 0-based, e.g. from \code{\link[cladoRcpp]{rcpp_areas_list_to_states_list}}.
#' @param numareas The number of areas being considered in the analysis. If \code{NULL} (default), this is calculated to be the maximum range length, or 
#' one plus the maximum 0-based index in any of the ranges.
#' @param prior_prob_presence The prior probability of presence, i.e. when no detection or taphonomic control data 
#' whatsoever is available.  Default is set to 0.01 which expresses my totally uninformed bias that 
#' in whatever your data is, your species of interest probably doesn't live in the typical area you are 
#' looking at.
#' @param null_range_gets_0_prob If \code{TRUE} (default), then the null range is given zero probability. 
#' A null range has no areas occupied.  This is equivalent to saying that you are sure/are willing to assume that 
#' the OTU exists somewhere in your study area, at the timepoint being considered.  Null ranges are identified by length=1, 
#' containing NULL, NA, "", "_", etc.
#' @param normalize_probs If \code{TRUE}, the probabilities of each range will be normalized so that they sum to 1. Otherwise, they 
#' won't.
#' @return \code{prob_of_each_range} The probability of each range, given the prior probability of presence in each area.
#' @export
#' @seealso \code{\link[cladoRcpp]{rcpp_areas_list_to_states_list}}, \code{\link{Pdata_given_rangerow}}, 
#' \code{\link{calc_obs_like}}, \code{\link[base]{mapply}}, \code{\link{read_detections}}, \code{\link{read_controls}}
#' @note Go BEARS!
#' @author Nicholas J. Matzke \email{matzke@@berkeley.edu} 
#' @references
#' \url{http://phylo.wikidot.com/matzke-2013-international-biogeography-society-poster}
#' @bibliography /Dropbox/_njm/__packages/BioGeoBEARS_setup/BioGeoBEARS_refs.bib
#'   @cite Matzke_2012_IBS
#'	 @cite Bottjer_Jablonski_1988
#' @examples
#' testval=1
#' 
#' prior_prob_presence = 0.01
#' 
#' areas = c("K", "O", "M", "H")
#' numareas = length(areas)
#' states_list_0based_index = 
#' rcpp_areas_list_to_states_list(areas=areas, maxareas=4, include_null_range=TRUE)
#' states_list_0based_index
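#' 
#' # With include_null_range=TRUE, the first state is the null range (no areas
#' # occupied; typically represented by NA), which gets probability 0 below when
#' # null_range_gets_0_prob=TRUE:
#' states_list_0based_index[[1]]
#' 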
#' numareas = 4
#' 
#' mean_frequency=0.1
#' dp=1
#' fdp=0
#' 
#' prob_of_states_from_prior_prob_areas(states_list_0based_index, numareas=numareas,
#' prior_prob_presence=prior_prob_presence, null_range_gets_0_prob=TRUE, 
#' normalize_probs=TRUE)
#' 
#' prob_of_states_from_prior_prob_areas(states_list_0based_index, numareas=numareas,
#' prior_prob_presence=prior_prob_presence, null_range_gets_0_prob=TRUE, 
#' normalize_probs=FALSE)
#' 
#' prob_of_states_from_prior_prob_areas(states_list_0based_index, numareas=numareas,
#' prior_prob_presence=prior_prob_presence, null_range_gets_0_prob=FALSE, 
#' normalize_probs=TRUE)
#' 
#' prob_of_states_from_prior_prob_areas(states_list_0based_index, numareas=numareas,
#' prior_prob_presence=prior_prob_presence, null_range_gets_0_prob=FALSE, 
#' normalize_probs=FALSE)
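#' 
#' # Illustration of the underlying formula: for a range occupying k of the
#' # numareas areas, the un-normalized probability is
#' # prior_prob_presence^k * (1 - prior_prob_presence)^(numareas - k).
#' # E.g., for a single-area range (k = 1), which should match the
#' # corresponding entry of the un-normalized output above:
#' prior_prob_presence^1 * (1 - prior_prob_presence)^(numareas - 1)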
#' 
prob_of_states_from_prior_prob_areas <- function(states_list_0based_index, numareas=NULL, prior_prob_presence=0.01, null_range_gets_0_prob=TRUE, normalize_probs=TRUE)
	{
	runjunk='
	' # end runjunk
	
	
	# Get the number of areas, if needed
	if (is.null(numareas))
		{
		maxlength = max(unlist(lapply(X=states_list_0based_index,FUN=length)), na.rm=TRUE)
		maxnum = 1+max(unlist(lapply(X=states_list_0based_index,FUN=max, na.rm=TRUE)), na.rm=TRUE)
		numareas = max(c(maxnum, maxlength), na.rm=TRUE)
		}
	
	numstates = length(states_list_0based_index)

	range_as_areas_TF = matrix(data=FALSE, nrow=numstates, ncol=numareas)
	prob_of_TF = matrix(data=0, nrow=numstates, ncol=numareas)
	for (statenum in 1:numstates)
		{
		# Get the current state/geographic range
		truerange_areas = states_list_0based_index[[statenum]]

		# Check for null range, if you're supposed to, and the length is 1
		if ((null_range_gets_0_prob==TRUE) && (length(truerange_areas)==1))
			{
			if ( (truerange_areas=="_") || (is.null(truerange_areas)==TRUE) || (is.na(truerange_areas)==TRUE) || (truerange_areas=="") )
				{
				# Then you have a null range; mark it with NA so it gets probability 0 below
				range_as_areas_TF[statenum, ] = NA
			
				# skip to next loop
				next()
				}
			}

	
		# Build a TRUE/FALSE row specifying the ranges in this assumed true 
		# state/geographic range
		range_as_areas_TF[statenum, truerange_areas+1] = TRUE	# mark the occupied areas for this state
		range_as_areas_TF
		}
	range_as_areas_TF

	# Probability of presence/absence in each area, for each range.
	# (NA entries mark the null range; an NA logical index selects nothing in an
	# assignment, so that row of prob_of_TF stays all 0, i.e. the null range gets probability 0.)
	prob_of_TF[range_as_areas_TF == TRUE] = prior_prob_presence
	prob_of_TF[range_as_areas_TF == FALSE] = 1-prior_prob_presence

	# Clear the NAs marking the null range (prob_of_TF is already 0 for that row)
	range_as_areas_TF[is.na(range_as_areas_TF)] = 0

	prob_of_TF
	prob_of_each_range = exp(rowSums(log(prob_of_TF)))
	prob_of_each_range

	if (normalize_probs == TRUE)
		{
		prob_of_each_range = prob_of_each_range / sum(prob_of_each_range)
		prob_of_each_range
		}



	return(prob_of_each_range)
	}




#######################################################
# post_prob_states
#######################################################
#' Calculate posterior probability of each states/geographic ranges, given prior probabilities and data likelihoods
#' 
#' This function calculates P(range|data,detection model), i.e. the probability of each possible 
#' range, given a prior probability of each range, and the likelihood of each range.
#' 
#' The prior probability of each range should be considered by the user.  Note that putting the same prior on 
#' the probability of occurrence in each individual range does NOT mean a flat prior on each 
#' state/geographic range.  This fact is demonstrated in the function \code{\link{prob_of_states_from_prior_prob_areas}}.
#' 
#' @param prob_of_each_range The probability of each range, given the prior probability of presence in each area.
#' @param condlikes_of_data_on_each_range The probability of the data, conditional on each range (i.e., the likelihood), as 
#' found in e.g. a row of the output from \code{\link{tiplikes_wDetectionModel}}.
#' 
#' @return \code{posterior_probs} The posterior probability of each range.
#' @export
#' @seealso \code{\link{prob_of_states_from_prior_prob_areas}}, \code{\link{tiplikes_wDetectionModel}}, 
#' \code{\link[cladoRcpp]{rcpp_areas_list_to_states_list}}, \code{\link{Pdata_given_rangerow}}, 
#' \code{\link{calc_obs_like}}, \code{\link[base]{mapply}}, \code{\link{read_detections}}, \code{\link{read_controls}}
#' @note Go BEARS!
#' @author Nicholas J. Matzke \email{matzke@@berkeley.edu} 
#' @references
#' \url{http://phylo.wikidot.com/matzke-2013-international-biogeography-society-poster}
#' \url{http://en.wikipedia.org/wiki/Log_probability}
#' @bibliography /Dropbox/_njm/__packages/BioGeoBEARS_setup/BioGeoBEARS_refs.bib
#'   @cite Matzke_2012_IBS
#'	 @cite Bottjer_Jablonski_1988
#' @examples
#' testval=1
#' 
#' # soft-coded input files
#' extdata_dir = np(system.file("extdata", package="BioGeoBEARS"))
#' detects_fn = np(paste(extdata_dir, "/Psychotria_detections_v1.txt", sep=""))
#' controls_fn = np(paste(extdata_dir, "/Psychotria_controls_v1.txt", sep=""))
#' 
#' detects_df = read_detections(detects_fn, OTUnames=NULL, areanames=NULL, tmpskip=0)
#' controls_df = read_controls(controls_fn, OTUnames=NULL, areanames=NULL, tmpskip=0)
#' 
#' 
#' # Setup
#' prior_prob_presence = 0.01
#' areas = c("K", "O", "M", "H")
#' numareas = length(areas)
#' maxareas = length(areas)
#' states_list_0based_index = 
#' rcpp_areas_list_to_states_list(areas=areas, maxareas=maxareas, 
#'                                include_null_range=TRUE)
#' states_list_0based_index
#' 
#' mean_frequency=0.1
#' dp=1
#' fdp=0
#' 
#' tip_condlikes_of_data_on_each_state = 
#' tiplikes_wDetectionModel(states_list_0based_index, numareas=numareas, 
#' detects_df, controls_df, mean_frequency=mean_frequency, dp=dp, fdp=fdp, 
#' null_range_gets_0_like=TRUE)
#' 
#' tip_condlikes_of_data_on_each_state
#' 
#' 
#' 
#' # To get denominator, just iterate over all the states
#' # Prior probability
#' prob_of_each_range = prob_of_states_from_prior_prob_areas(states_list_0based_index, 
#' numareas=numareas,
#' prior_prob_presence=prior_prob_presence, null_range_gets_0_prob=TRUE, 
#' normalize_probs=TRUE)
#' 
#' # Likelihoods of the data on each range
#' condlikes_of_data_on_each_range = tip_condlikes_of_data_on_each_state[1,]
#' 
#' posterior_probs = post_prob_states(prob_of_each_range, 
#'                   condlikes_of_data_on_each_range)
#' posterior_probs
#' 
#' # Should sum to 1
#' sum(posterior_probs)
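#' 
#' # Illustrative check: the same posterior probabilities via a direct,
#' # non-log application of Bayes' theorem ("direct" is just a temporary
#' # name for this sketch):
#' direct = (prob_of_each_range * condlikes_of_data_on_each_range) / 
#'     sum(prob_of_each_range * condlikes_of_data_on_each_range)
#' all.equal(as.numeric(direct), as.numeric(posterior_probs))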
#' 
post_prob_states <- function(prob_of_each_range, condlikes_of_data_on_each_range)
	{
	# The denominator in Bayes's Theorem is P(data), i.e. the probability of the 
	# data marginalized over all possible hypotheses, i.e. the sum of (data likelihood times 
	# prior) over all states.

	# Sum probabilities in log space
	# http://en.wikipedia.org/wiki/Log_probability
	log_probs_in_denominator = log(prob_of_each_range) + log(condlikes_of_data_on_each_range)
	log_probs_in_denominator

	# Simplest way to add probabilities stored in log format
	denominator = log(sum(exp(log_probs_in_denominator)))
	denominator


	numstates = length(prob_of_each_range)
	numerators = rep(NA, times=numstates)
	for (i in 1:numstates)
		{
		prior_prob_of_this_range = prob_of_each_range[i]
		like_of_data_on_this_range = condlikes_of_data_on_each_range[i]
	
		# Multiply prior and likelihood, as a sum in log space
		# http://en.wikipedia.org/wiki/Log_probability
		numerators[i] = log(prior_prob_of_this_range) + log(like_of_data_on_this_range)
		}

	# Calculate the posterior probabilities
	posterior_probs = exp(numerators - denominator)
	posterior_probs
	
	# They should sum to 1
	sum(posterior_probs)
	
	return(posterior_probs)
	}








#######################################################
# post_prob_states_matrix
#######################################################
#' Calculate posterior probability of each states/geographic ranges, given prior probabilities and data likelihoods
#' 
#' This function calculates P(range|data,detection model), i.e. the probability of each possible 
#' range, given a prior probability of each range, and the likelihood of each range.
#' 
#' The prior probability of each range should be considered by the user.  Note that putting the same prior on 
#' the probability of occurrence in each individual range does NOT mean a flat prior on each 
#' state/geographic range.  This fact is demonstrated in the function \code{\link{prob_of_states_from_prior_prob_areas}}.
#' 
#' @param prob_of_each_range The probability of each range, given the prior probability of presence in each area.
#' @param tip_condlikes_of_data_on_each_state The probability of the data, conditional on each range (i.e., the likelihood), as 
#' found in e.g. a row of the output from \code{\link{tiplikes_wDetectionModel}}.
#' @return \code{posterior_probs} The posterior probability of each range.
#' @export
#' @seealso \code{\link{prob_of_states_from_prior_prob_areas}}, \code{\link{tiplikes_wDetectionModel}}, 
#' \code{\link[cladoRcpp]{rcpp_areas_list_to_states_list}}, \code{\link{Pdata_given_rangerow}}, 
#' \code{\link{calc_obs_like}}, \code{\link[base]{mapply}}, \code{\link{read_detections}}, \code{\link{read_controls}}
#' @note Go BEARS!
#' @author Nicholas J. Matzke \email{matzke@@berkeley.edu} 
#' @references
#' \url{http://phylo.wikidot.com/matzke-2013-international-biogeography-society-poster}
#' \url{http://en.wikipedia.org/wiki/Log_probability}
#' @bibliography /Dropbox/_njm/__packages/BioGeoBEARS_setup/BioGeoBEARS_refs.bib
#'   @cite Matzke_2012_IBS
#'	 @cite Bottjer_Jablonski_1988
#' @examples
#' testval=1
#' 
#' # soft-coded input files
#' extdata_dir = np(system.file("extdata", package="BioGeoBEARS"))
#' #extdata_dir = "/Dropbox/_njm/__packages/BioGeoBEARS_setup/inst/extdata/"
#' detects_fn = np(paste(extdata_dir, "/Psychotria_detections_v1.txt", sep=""))
#' controls_fn = np(paste(extdata_dir, "/Psychotria_controls_v1.txt", sep=""))
#' 
#' detects_df = read_detections(detects_fn, OTUnames=NULL, areanames=NULL, tmpskip=0)
#' controls_df = read_controls(controls_fn, OTUnames=NULL, areanames=NULL, tmpskip=0)
#' 
#' 
#' # Setup
#' prior_prob_presence = 0.01
#' areas = c("K", "O", "M", "H")
#' numareas = length(areas)
#' maxareas = length(areas)
#' states_list_0based_index = 
#' rcpp_areas_list_to_states_list(areas=areas, maxareas=maxareas, 
#'                                include_null_range=TRUE)
#' states_list_0based_index
#' 
#' mean_frequency=0.1
#' dp=1
#' fdp=0
#' 
#' tip_condlikes_of_data_on_each_state = 
#' tiplikes_wDetectionModel(states_list_0based_index, numareas=numareas, 
#' detects_df, controls_df, mean_frequency=mean_frequency, dp=dp, fdp=fdp, 
#' null_range_gets_0_like=TRUE)
#' 
#' tip_condlikes_of_data_on_each_state
#' 
#' 
#' 
#' # To get denominator, just iterate over all the states
#' # Prior probability
#' prob_of_each_range = prob_of_states_from_prior_prob_areas(states_list_0based_index, 
#' numareas=numareas,
#' prior_prob_presence=prior_prob_presence, null_range_gets_0_prob=TRUE, 
#' normalize_probs=TRUE)
#' 
#' posterior_probs_matrix = post_prob_states_matrix(prob_of_each_range, 
#'                   tip_condlikes_of_data_on_each_state)
#' posterior_probs_matrix
#' 
#' # Should sum to 1
#' rowSums(posterior_probs_matrix)
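#' 
#' # Illustrative check: each row of the matrix version should match a
#' # per-tip call to post_prob_states(); here for tip 1 ("row1" is just a
#' # temporary name for this sketch):
#' row1 = post_prob_states(prob_of_each_range, 
#'     tip_condlikes_of_data_on_each_state[1,])
#' all.equal(as.numeric(posterior_probs_matrix[1,]), as.numeric(row1))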
#' 
#' # How does posterior probability correlate with likelihood and prior probability?
#' par(mfrow=c(1,2))
#' plot(x=jitter(log(tip_condlikes_of_data_on_each_state)), 
#' y=jitter(log(posterior_probs_matrix)))
#' title("Correlation of data likelihoods\nand posterior probabilities")
#' 
#' prob_of_each_range_matrix = matrix(data=prob_of_each_range, 
#' nrow=nrow(posterior_probs_matrix), ncol=length(prob_of_each_range))
#' plot(x=jitter(log(prob_of_each_range_matrix)), 
#' y=jitter(log(posterior_probs_matrix)))
#' title("Correlation of prior probability\nand posterior probabilities")
#' 
post_prob_states_matrix <- function(prob_of_each_range, tip_condlikes_of_data_on_each_state)
	{
	# The denominator in Bayes's Theorem is P(data), i.e. the probability of the 
	# data marginalized over all possible hypotheses, i.e. the sum of (data likelihood times 
	# prior) over all states.
	
	numtips = nrow(tip_condlikes_of_data_on_each_state)
	posterior_probs_matrix = matrix(NA, nrow=numtips, ncol=ncol(tip_condlikes_of_data_on_each_state))
	
	# Iterate through each tip
	for (tipnum in 1:numtips)
		{
		condlikes_of_data_on_each_range = tip_condlikes_of_data_on_each_state[tipnum, ]
	
		# Sum probabilities in log space
		# http://en.wikipedia.org/wiki/Log_probability
		log_probs_in_denominator = log(prob_of_each_range) + log(condlikes_of_data_on_each_range)
		log_probs_in_denominator

		# Simplest way to add probabilities stored in log format
		denominator = log(sum(exp(log_probs_in_denominator)))
		denominator


		numstates = length(prob_of_each_range)
		numerators = rep(NA, times=numstates)
		for (i in 1:numstates)
			{
			prior_prob_of_this_range = prob_of_each_range[i]
			like_of_data_on_this_range = condlikes_of_data_on_each_range[i]
	
			# Multiply prior and likelihood, as a sum in log space
			# http://en.wikipedia.org/wiki/Log_probability
			numerators[i] = log(prior_prob_of_this_range) + log(like_of_data_on_this_range)
			}

		# Calculate the posterior probabilities
		posterior_probs = exp(numerators - denominator)
		posterior_probs
		
		posterior_probs_matrix[tipnum, ] = posterior_probs
		}
	
	# They should sum to 1
	rowSums(posterior_probs_matrix)
	
	return(posterior_probs_matrix)
	}





# add_probs_as_logs <- function(lnP_list)
# 	{
# 	
# 	# Complex:
# #	num_lnPs_to_add = length(lnP_list)
# # 	sum_lnPs = lnP_list[1]
# # 	for (i in 2:num_lnPs_to_add)
# # 		{
# # 		lnP = lnP_list[i]
# # 		
# # 		# Sum probabilities in log space
# # 		# http://en.wikipedia.org/wiki/Log_probability
# # 		#sum_lnPs = -1 * log( exp(-sum_lnPs) + exp(-lnP) )
# # 		
# # 		# Simpler:
# # 		sum_lnPs = 1 * log( exp(sum_lnPs) + exp(lnP) )
# # 		}
# 
# 	# Simple:
# 	sum_lnPs = log(sum(exp(lnP_list)))
# 
# 	return(sum_lnPs)
# 	}
