### This is the code containing all the functions required to run GenePlot (apart
### from the saddlepoint functions, which are lengthy and are stored in a separate
### file).
#' Run GenePlot calculations without plotting.
#'
#' Run GenePlot calculations to obtain Log-Genotype-Probabilities for all
#' individuals from populations in \code{refpopnames} or \code{includepopnames},
#' with respect to each of the populations in \code{refpopnames}.
#'
#' All individuals in \code{dat} whose population label in the 'pop' column of
#' \code{dat} matches one of the populations in \code{refpopnames} or
#' \code{includepopnames} will be included.
#'
#' NOTE that if a population is not in \code{includepopnames}/\code{refpopnames},
#' then any alleles private to that population will NOT be included in the
#' prior / posterior. Thus the posterior for a given refpop will change slightly
#' depending on which populations are in \code{includepopnames}/\code{refpopnames}.
#'
#' Leave-one-out will be used when calculating the log-genotype probability for
#' an individual with respect to their own reference population, if specified in
#' the inputs (default is NON leave-one-out).
#' Default is NON leave-one-out but WE STRONGLY RECOMMEND USING LEAVE-ONE-OUT,
#' ESPECIALLY FOR SMALL SAMPLES (<30).
#'
#' @param dat The data, in a data frame, with two columns labelled as 'id' and
#' 'pop', and with two additional columns per locus. Missing data at any
#' locus should be marked as '0' for each allele.
#' The locus columns must be labelled in the format Loc1.a1, Loc1.a2,
#' Loc2.a1, Loc2.a2, etc.
#' Missing data must be for BOTH alleles at any locus. Missing data for
#' ONE allele at any locus will produce an error.
#' See \code{\link{read_genepop_format}} for details of how to import Genepop
#' format data files into the appropriate format.
#'
#' @param refpopnames Character vector of reference population names, that must
#' match the values in the 'pop' column of \code{dat}.
#'
#' @param locnames Character vector, names of the loci, which must match the
#' column names in the data so e.g. if dat has columns
#' id, pop, EV1.a1, EV1.a2, EV14.a1, EV14.a2, etc.
#' then you could use 'locnames = c("EV1","EV14") etc.
#' The locnames do not need to be in any particular order but all of them
#' must be in \code{dat}.
#'
#' @param includepopnames Character vector (default NULL) of population names to
#' be included in the calculations. All individuals with 'pop' value in
#' \code{includepopnames} will have their Log-Genotype-Probabilities
#' calculated with respect to all populations in \code{refpopnames}. This
#' input parameter should be used to specify the population labels of any
#' new/additional individuals that you want to compare to the reference pops.
#' For example, if the reference pops are Pop1 and Pop2, and you have some
#' new individuals which you have labelled as PopNew, then use
#' \code{includepopnames=c("PopNew")} to compare those individuals to Pop1
#' and Pop2.
#' You can specify the populations in any order, provided that they are all
#' in \code{dat}.
#' If NULL (default) then only the individuals from \code{refpopnames} will
#' have their Log-Genotype-Probabilities calculated.
#'
#' @param prior (default="Rannala") String, either "Rannala" or "Baudouin",
#' giving the choice of prior parameter for the Dirichlet priors for the
#' allele frequency estimates. Both options define parameter values that
#' depend on the number of alleles at each locus, k.
#' "Baudouin" gives slightly more weight to
#' rare alleles than "Rannala" does, or less weight to the data, so
#' Baudouin may be more suitable for small reference samples, but there is
#' no major difference between them. For more details, see McMillan and
#' Fewster (2017), Biometrics.
#' Additional options are "Half" or "Quarter" which specify parameters 1/2
#' or 1/4, respectively. These options have priors whose parameters do not
#' depend on the number of alleles at each locus, and so may be more suitable
#' for microsatellite data with varying numbers of alleles at each locus.
#'
#' @param saddlepoint Boolean (default TRUE), indicates whether or not to use the
#' saddlepoint method for imputing missing data/leave-one-out results.
#' For more details, see McMillan and Fewster (2017), Biometrics.
#'
#' @param leave_one_out Boolean (default TRUE), indicates whether or not to
#' calculate leave-one-out results for any individual from the reference
#' pops. If TRUE, any individual from a reference population will have
#' their Log-Genotype-Probability with respect to their own reference
#' population after temporarily removing the individual's genotype from the
#' sample data for that reference population. The individual's
#' Log-Genotype-Probabilities with respect to all populations they are not
#' a member of will be calculated as normal.
#' We STRONGLY RECOMMEND using leave-one-out=TRUE for any small reference
#' samples (<30).
#'
#' @param logten (default TRUE) Boolean, indicates whether to use base 10 for the
#' logarithms, or base e (i.e. natural logarithms). logten=TRUE is default
#' because it's easier to recalculate the original non-log numbers in your
#' head when looking at the plots. Use FALSE for natural logarithms.
#'
#' @param min_loci (default 6) is the minimum number of loci that an individual
#' must have (within the set of loci defined in \code{locnames}) for the
#' individual to have its Log-Genotype-Probabilities calculated. Any
#' individuals with fewer loci than this (i.e. with too many missing loci)
#' will not have its Log-Genotype-Probabilities plotted.
#' Thus if the calculations are based on 9 loci i.e. locnames is length 9,
#' and min_loci=6, then every individual included in the results has data
#' for at least 6 of these 9 loci.
#'
#' @param quantiles (default c(0.01,1.00)) Vector of probabilities, specifying the
#' quantiles of the posterior distribution to be calculated.
#' Default plots the 1\% and 100\% quantiles of the Log-Genotype-Probability
#' distributions for each of the reference populations.
#' For example, only 1\% of all possible genotypes that could arise from the
#' given population will have Log-Genotype-Probabilities below the 1\% quantile,
#' and 99\% of all possible genotypes arising from that population will have
#' Log-Genotype-Probabilities above the 1\% quantile.
#' The 100\% quantile is the maximum possible Log-Genotype-Probability that
#' any genotype can have with respect to this population.
#' Quantile values will be provided as attributes to the output object of
#' calc_logprob (see the Value section.)
#' If no quantiles are wanted, supply quantiles=NULL.
#'
#' @param Ndraw (default 100000) is only used if saddlepoint=FALSE. Defines the
#' number of draws that will be taken from the distribution of
#' log-posterior genotype probabilities for each reference population.
#' These draws, i.e. simulated genotypes from the posterior distributions
#' of the reference populations, are used when imputing the
#' log-genotype-probabilities for individuals with
#' missing data, or when calculating quantiles of the distribution. For
#' more details, see McMillan and Fewster (2017), Biometrics.
#'
#' @returns The structure of the output from \code{calc_logprob} and/or \code{geneplot}
#' is a data frame, with one row per individual.
#'
#' The first two columns are "id" and "pop", as in the input data.
#'
#' The next column (col3) is "status" which is "complete" or "impute" depending on
#' whether the individual had data for all loci, or had some loci missing.
#'
#' The next column (col4) is "nloci" which is how many loci the individual has
#' data for
#'
#' The next columns are the final/imputed log-genotype probabilities for the
#' individual with respect to each of the reference populations.
#' They are named in the form "Pop1", "Pop2" etc. corresponding to the names in
#' the refpopnames input.
#'
#' Then the final columns are the "raw" log-genotype probabilities for the same pops.
#' These are named in the form "Pop1.raw", Pop2.raw", etc. again corresponding
#' to the names in refpopnames.
#'
#' For individuals with full data at all loci, i.e. no missing data, these two
#' sets of columns will be the same, and give the individual's log-genotype
#' probabilities with respect to each of the reference populations.
#'
#' For individuals with missing data at some loci then the raw values are the
#' log-genotype probabilities calculated based on the loci that *are* present in
#' the data, and the final/imputed columns, at the start of the results data
#' frame, are the imputed log-genotype probabilities for the full set of loci
#' i.e. the final LGPs for the missing-data individuals are comparable to the
#' final LGPs for the complete-data individuals.
#'
#' ---- Additional attributes of the results object ----------------------------
#'
#' At the end of \code{calc_logprob} the details of the algorithm used to calculate
#' the results are attached as attributes to the results object.
#' If your call to \code{calc_logprob} or \code{geneplot} is e.g.
#'
#' \code{Pop1_vs_Pop2_results <- calc_logprob(dat, c("Pop1","Pop2"), locnames=whaleLocnames)}
#'
#' then you would find out the attributes using \code{attributes(Pop1_vs_Pop2_results)$saddlepoint} etc.
#'
#' Other attributes attached to the results object are:
#' \code{attributes(results)$min_loci} -- the minimum number of loci to require
#' for any individual to be assigned, so any individual with fewer loci
#' will be excluded from analysis
#'
#' \code{attributes(results)$n_too_few} -- the number of individuals that have
#' been excluded from the analysis because they had too few loci
#'
#' \code{attributes(results)$percent_missing} -- the percentage of individuals
#' that have been excluded, out of all those in the samples listed in
#' allpopnames
#'
#' \code{attributes(results)$qmat} -- the values of the plotted quantiles
#' for the populations, with the \% labels of the quantiles as the column names
#' e.g. if quantiles=c(0.05,0.99) was the input to chart.func then
#' qmat will be of the form
#'
#' \tabular{lrr}{
#' \tab 5\% \tab 99\% \cr
#' Pop1 \tab xx \tab xx\cr
#' Pop2 \tab xx \tab xx}
#'
#' \code{attributes(results)$allele_freqs} -- the posterior estimates of the
#' allele frequencies for the populations, as a list, where each element
#' of the list corresponds to one locus (and the list elements are named
#' with the loci names), and at a single locus the allele frequencies are
#' given as a matrix with the allele type names as the columns and the
#' reference populations as the rows e.g. one locus example:
#'
#' \code{$TR3G2}
#' \tabular{lrrrrrr}{
#' \tab 150 \tab 158 \tab 168 \tab 172 \tab 176 \tab 180 \cr
#' Pop1 \tab 0.125 \tab 0.125 \tab 12.125 \tab 26.125 \tab 22.125 \tab 12.125 \cr
#' Pop2 \tab 1.125 \tab 2.125 \tab 13.125 \tab 29.125 \tab 21.125 \tab 10.125}
#'
#' These are allele COUNT estimates, NOT PROPORTION estimates, so
#' they do not need to add up to 1.
#'
#' \code{attributes(results)$allpopnames} -- a vector of refpopnames, followed by includepopnames
#' i.e. allpopnames <- c(refpopnames, includepopnames)
#'
#' \code{attributes(results)$refpopnames} -- vector of reference population names
#'
#' \code{attributes(results)$includepopnames} -- vector of included pop names for assignment
#'
#' \code{attributes(results)$saddlepoint} -- TRUE/FALSE for whether saddlepoint was used
#'
#' \code{attributes(results)$leave_one_out} -- TRUE/FALSE for whether leave_one_out was used
#'
#' \code{attributes(results)$logten} -- TRUE/FALSE for whether log_10 was used (TRUE) or
#' log_e was used (FALSE)
#'
#' \code{attributes(results)$prior} -- "Rannala"/"Baudouin", for whether Rannala
#' and Mountain or Baudouin and Lebrun prior was used (see McMillan & Fewster, 2017 Biometrics)
#'
#' @references McMillan, L. and Fewster, R. "Visualizations for genetic assignment
#' analyses using the saddlepoint approximation method" (2017) \emph{Biometrics}.
#'
#' @references Rannala, B., and Mountain, J. L. (1997). Detecting immigration by using
#' multilocus genotypes. \emph{Proceedings of the National Academy of Sciences}
#' \strong{94}, 9197--9201.
#'
#' @references Piry, S., Alapetite, A., Cornuet, J.-M., Paetkau, D., Baudouin, L., and
#' Estoup, A. (2004). GENECLASS2: A software for genetic assignment and
#' first-generation migrant detection. \emph{Journal of Heredity} \strong{95},
#' 536--539.
#'
#' @author Log-Genotype-Probability calculations based on the method of Rannala
#' and Mountain (1997) as implemented in GeneClass2, updated to allow for
#' individuals with missing data and to enable accurate calculations of
#' quantiles of the Log-Genotype-Probability distributions of the reference
#' populations.
#' See McMillan and Fewster (2017) for details.
#' Coded by Rachel Fewster and Louise McMillan.
#'
#' @importFrom stats dbeta dnorm pnorm rgamma rbeta rbinom rmultinom ecdf prcomp quantile uniroot
#'
#' @export
calc_logprob <- function(dat, refpopnames, locnames,
includepopnames=NULL,
prior="Rannala",
saddlepoint=T,
leave_one_out=T,
logten=T,
min_loci=6,
quantiles=c(0.01, 1.00),
Ndraw=100000)
{
## Validate inputs ---------------------------------------------------------
validate_geneplot_inputs(dat,refpopnames,includepopnames,locnames,
prior,saddlepoint,leave_one_out,logten,min_loci,quantiles)
## Check quantiles are valid
if (!is.null(quantiles) & (!is.vector(quantiles) || !is.numeric(quantiles) || any(quantiles > 1) || any(quantiles < 0))) stop("quantiles must be a vector of values between 0 and 1.")
## Make sure that dat$pop is stored as a character vector ------------------
dat$pop <- as.character(dat$pop)
allpopnames <- unique(c(refpopnames, includepopnames))
## Reduce the data frame "dat" to include only the populations in includepopnames:
dat <- dat[!is.na(match(dat$pop, allpopnames)),]
## If you want to reorder refpopnames to the same order as includepopnames, use the line below; by default,
## keep the order specified in refpopnames because this sets the axes:
## refpopnames <- includepopnames[sort(match(refpopnames, includepopnames))]
## Make popx2 --------------------------------------------------------------
## popx2 is useful so we can concatenate the vector for the first allele followed by
## the vector for the second allele, keeping the population information intact:
popx2 <- rep(dat$pop, 2)
## Calculate the posterior allele frequencies for all populations at each locus
## This function returns an array with rows matching the refpopnames, and with entries being the
## vector (nu_1, ..., nu_k) for this row. For example the output could be (for a locus with 8 alleles):
## 150 154 156 164 172 178 182 184
## Pop1 1 1 19 6 12 1 18 10
## Pop2 1 1 16 1 42 1 1 1
## meaning that the posterior for allele frequencies for population Pop2 is (p1, ..., p8)~Dirichlet(1, 1, 16, ..., 1).
posterior_nu_list <- lapply(locnames, calc_posterior_locus, dat, popx2, prior, refpopnames)
names(posterior_nu_list) <- locnames
## Create for each locus a vector of nusums, such that the first element is the value nusum for the
## first refpop, the second for the second refpop, etc:
posterior_nusumvec_list <- lapply(posterior_nu_list, rowSums)
## Split into complete and missing data individuals -------------------------------
## Now find the assignment probabilities, i.e. the log posterior genotype
## probabilities for each individual in dat.
## Split the data into complete-data individuals, dat_complete, and incomplete-data individuals, dat_missing.
## Just check all the locname.a1 columns for 0's, because the locname.a2 column will be the same:
any_missing <- apply(dat[paste(locnames, ".a1", sep="")], 1, function(x) any(x==0))
dat_complete <- dat[!any_missing,]
dat_missing <- dat[any_missing,]
n_complete <- nrow(dat_complete)
n_missing <- nrow(dat_missing)
## Generate LGP distribution -----------------------------------------------
## In three situations, we need to simulate from the distribution of
## log-posterior genotype probabilities for each reference population.
## These situations are (a) when there are any missing data, in which case
## the simulations enable us to place the missing-data individuals on the
## plot so they are represented with the same p-values as they would be if
## only their complete-data loci were shown; and (b) if the vector
## "quantiles" is supplied so that crosslines representing the corresponding
## quantiles are wanted on the twopop or 3spin plot; and (c) if doing
## leave-one-out, where we need to impute the log-posterior genotype
## probabilities calculated after leaving the individual out on the same
## scale as the full-population LGPs.
## Initially set up both the all_loci_SCDF_qsearch_params and
## all_loci_sim_logprob objects so that they can be passed in to functions
## (otherwise we end up with multiple versions of function calls, one for
## each option)
all_loci_SCDF_qsearch_params = NULL
all_loci_sim_logprob = NULL
if((n_missing > 0 | !is.null(quantiles) | leave_one_out) & saddlepoint){
# Calculate the SCDF for all loci
all_loci_SCDF_qsearch_params <- calc_qsearch_params_usingRcpp(posterior_nu_list, refpopnames,
logten=logten, leave_one_out=leave_one_out)
} else if (!saddlepoint) {
## If saddlepoint has not been used, will need to have access to
## all_loci_sim_logprob as an attribute of logprob_results, in order to
## plot the bars in the multi-pop bar plot.
## So even if is.null(quantiles) and n_missing==0 and leave_one_out=FALSE
## we still need to calculate all_loci_sim_logprob when saddlepoint=FALSE.
## The logs will be base 10 if logten=T, and natural logs otherwise.
sim_logprob_bylocus <- lapply(posterior_nu_list, sim_logprob_locus, Nsim=Ndraw, logten=logten)
## For both situations (n_missing>0 or !is.null(quantiles))), we need the all-loci aggregation of these
## probabilities: i.e. the distribution of all-locus genotype probabilities under the posterior distributions
## for each reference population.
all_loci_sim_logprob <- aggregate_logprob(sim_logprob_bylocus, which_loci_names=locnames)
}
## Quantiles ---------------------------------------------------------------
## Create the matrix of quantile information if quantiles is non-NULL, for
## plotting via plot_logprob_twopop or threespin.plot:
## quantile_mat below has the following format:
## 1\% 99\%
## Pop1 -37.3 -25.7
## Pop2 -29.8 -11.9
## meaning that an individual with a log-genotype probability of less than
## -37.3 is below the 1% percentile for individuals genuinely simulated from
## the posterior genotype distribution for population Pop1.
if(!is.null(quantiles)){
## If only one quantile is required, the quantile matrix loses
## dimensionality and loses the nice names.
## Cheap hack to stop this happening is to ask for the same quantile twice:
if(length(quantiles)==1) quantiles_vec <- rep(quantiles, 2)
else quantiles_vec <- quantiles
## Make sure quantiles are in order:
quantiles_vec <- sort(quantiles_vec)
###### Here, change to using quantiles from all-loci SCDF
if (saddlepoint)
{
if (leave_one_out) qsearch_params <- calc_qsearch_params_usingRcpp(
posterior_nu_list, refpopnames,
logten=logten, leave_one_out=leave_one_out)
else qsearch_params <- all_loci_SCDF_qsearch_params
## Make sure to use leave-one-out for calc_quantiles_usingRcpp as
## well as for calc_qsearch_params, otherwise the K functions will
## not work correctly -- need to be using the twoPops/leave-one-out
## mode for all K functions when calculating leave-one-out quantiles
quantile_mat <- calc_quantiles_usingRcpp(qsearch_params,
posterior_nu_list, refpopnames,
quantiles_vec, logten=logten,
leave_one_out=leave_one_out)
## Add column headings to the quantile table
colnames(quantile_mat) <- paste0(round(100*quantiles_vec,0),"%")
}
else
{
quantile_mat <- t(apply(all_loci_sim_logprob, 1, quantile, probs=quantiles_vec))
}
if(length(quantiles)==1) quantile_mat <- quantile_mat[,1, drop=F]
}
else quantile_mat <- NULL
## Deal with complete-data individuals --------------------------------------------
if(n_complete > 0){
dat_complete$status <- rep("complete", n_complete)
dat_complete$nloc <- rep(length(locnames), n_complete)
logprob_complete <- t(sapply(1:n_complete, calc_logprob_complete, dat_complete, refpopnames, locnames,
posterior_nu_list, posterior_nusumvec_list,
leave_one_out, saddlepoint, logten, Ndraw,
all_loci_sim_logprob, all_loci_SCDF_qsearch_params))
## Put logprob together with whatever columns out of id, pop, subpop, and nloc are present,
## into logprob_results, a data frame:
logprob_results_complete <- data.frame(dat_complete[,!is.na(match(names(dat_complete),
c("id", "pop", "subpop", "status", "nloc")))],
logprob_complete)
names_complete <- names(logprob_results_complete)
}
else{
logprob_results_complete <- NULL
if (!is.na(match("subpop",names(dat)))) basic_cols <- c("id", "pop", "subpop", "status", "nloc")
else basic_cols <- c("id", "pop", "status", "nloc")
names_complete <- c(basic_cols, refpopnames, paste(refpopnames, "raw", sep="."))
}
## Deal with missing-data individuals ---------------------------------------------
if(n_missing > 0){
dat_missing$status <- rep("impute", n_missing)
logprob_missing <- t(sapply(1:n_missing, calc_logprob_missing, dat_missing, refpopnames, locnames,
posterior_nu_list, posterior_nusumvec_list,
leave_one_out, saddlepoint, logten, Ndraw, sim_logprob_bylocus,
all_loci_sim_logprob, all_loci_SCDF_qsearch_params))
logprob_results_missing <- data.frame(dat_missing[,!is.na(match(names(dat_missing),
c("id", "pop", "subpop", "status")))], logprob_missing)
## Reorder the columns of logprob_results_missing if necessary, in order to match those of logprob_results_complete:
logprob_results_missing <- logprob_results_missing[ , match(names_complete, names(logprob_results_missing))]
## If they still don't match, I don't know why: it would need investigating.
if(any(names(logprob_results_missing)!=names(logprob_results_complete))) stop("Naming of the complete and missing arrays is different. They shouldn't be.")
## Individuals with too few loci ----------------------------------------------
## Remove any individuals from logprob_results_missing that don't meet min_loci
n_too_few <- length((1:n_missing)[logprob_results_missing$nloc < min_loci])
if(n_too_few > 0){
cat("Removing ", n_too_few, "individuals with number of loci < ", min_loci, "\n\n")
logprob_results_missing <- logprob_results_missing[logprob_results_missing$nloc >= min_loci,]
}
}
else logprob_results_missing <- NULL
## Combine complete-data results and missing-data results ------------------
## The two arrays should have been set up so that they have the same names and the function should
## already have dumped if they don't.
logprob_results <- rbind(logprob_results_complete, logprob_results_missing)
## Add attributes to the results object ------------------------------------
attributes(logprob_results)$min_loci <- min_loci
if (n_missing > 0) attributes(logprob_results)$n_too_few <- n_too_few
else attributes(logprob_results)$n_too_few <- 0
attributes(logprob_results)$percent_missing <- count_missing_loci(logprob_results, dat)
attributes(logprob_results)$qmat <- quantile_mat
attributes(logprob_results)$allele_freqs <- posterior_nu_list
attributes(logprob_results)$refpopnames <- refpopnames
attributes(logprob_results)$includepopnames <- includepopnames
attributes(logprob_results)$saddlepoint <- saddlepoint
attributes(logprob_results)$leave_one_out <- leave_one_out
attributes(logprob_results)$logten <- logten
attributes(logprob_results)$prior <- prior
attributes(logprob_results)$allpopnames <- allpopnames
## Attach the full-loci simulated distribution, which will be NULL except when
## saddlepoint=FALSE and quantiles are not NULL
attributes(logprob_results)$all_loci_sim_logprob <- all_loci_sim_logprob
class(logprob_results) <- c(class(logprob_results), "geneplot")
logprob_results
}
validate_geneplot_inputs <- function(dat,refpopnames,includepopnames,locnames,
prior,saddlepoint,leave_one_out,logten,min_loci,quantiles)
{
if (is.null(dat)) stop("dat cannot be null.")
if (is.null(refpopnames)) stop("refpopnames cannot be null.")
if (is.null(locnames)) stop("locnames cannot be null.")
if (!is.data.frame(dat)) stop("dat must be a data frame.")
if (!("pop" %in% names(dat))) stop("dat does not have a column named 'pop' containing the population/sample labels.")
if (is.factor(dat$pop)) stop("The 'pop' column in dat is a factor. Please convert to character and rerun.")
# if (!is.character(dat$pop)) stop("The 'pop' column in dat must be a character vector of the population/sample labels.")
## Check refpopnames and includepopnames are valid
if (!is.vector(refpopnames) || !is.character(refpopnames)) stop("refpopnames must be a character vector of population/sample names.")
if (length(refpopnames) == 0) stop("refpopnames is an empty vector.")
if (length(refpopnames) < 2) stop("You must specify at least two populations in refpopnames.")
if (!all(refpopnames %in% unique(dat$pop))) stop("At least one refpopname is not found in the dataset. Please check the inputs.")
## Check if refpopnames have spaces, because spaces get filled in as "." in
## calc_logprob but then that doesn't match with plot_logprob still using
## the original refpopnames with spaces
if (any(grepl(" ",refpopnames))) stop("refpopnames cannot have spaces in them. Please edit refpopnames and the pop field in the dataset before rerunning.")
## Check that all the locnames valid and are in the data columns
if (!is.vector(locnames) || !is.character(locnames)) stop("locnames must be a character vector of locus names.")
if (length(locnames) == 0) stop("locnames is an empty vector.")
datColNames <- as.vector(outer(c(".a1",".a2"),locnames, function(allele,loc) paste0(loc,allele)))
if (!all(datColNames %in% colnames(dat))) stop("Some of the locnames are not present in the dataset. Please check that dat has two columns for each locus, named <<locusname>>.a1 and <<locusname>>.a2.")
if (!is.null(includepopnames))
{
if (!is.vector(includepopnames) || !is.character(includepopnames)) stop("includepopnames must be a character vector of sample names to be compared to the reference populations.")
if (length(includepopnames) == 0) stop("includepopnames is an empty vector.")
if (!all(includepopnames %in% unique(dat$pop))) stop("At least one of includepopnames is not found in the dataset. Please check the inputs.")
}
## Check if any individuals are missing all their data
allMissing <- sapply(dat$id,function(id){
all(dat[dat$id == id,datColNames] == 0)
})
if (any(allMissing)) stop("At least one individual is missing data at all loci. Please remove these individuals before rerunning analysis.")
## Check optional inputs
if (is.null(prior)) stop("Invalid prior. Options are 'Rannala', 'Baudouin', 'Half', or 'Quarter'.")
if (!is.character(prior) || !is.vector(prior) || length(prior) != 1 || !(prior %in% c("Rannala","Baudouin","Half","Quarter"))) stop("Invalid prior. Options are 'Rannala', 'Baudouin', 'Half', or 'Quarter'.")
if (is.null(saddlepoint)) stop("'saddlepoint' cannot be null. Please set to TRUE or FALSE.")
if (!is.logical(saddlepoint) || !is.vector(saddlepoint) || length(saddlepoint)!=1) stop("'saddlepoint' must be TRUE or FALSE.")
if (is.null(leave_one_out)) stop("'leave_one_out' cannot be null. Please set to TRUE or FALSE.")
if (!is.logical(leave_one_out) || !is.vector(leave_one_out) || length(leave_one_out)!=1) stop("'leave_one_out' must be TRUE or FALSE.")
if (is.null(logten)) stop("'logten' cannot be null. Please set to TRUE or FALSE.")
if (!is.logical(logten) || !is.vector(logten) || length(logten)!=1) stop("'logten' must be TRUE or FALSE.")
if (is.null(min_loci)) stop("'min_loci' cannot be null. Please specify a positive integer, smaller than the length of locnames.")
if (!is.numeric(min_loci) || !is.vector(min_loci) ||
length(min_loci)!=1 || min_loci%%1 != 0 ||
min_loci < 1 || min_loci >= length(locnames)) stop("'min_loci' must be a positive integer, smaller than the length of locnames.")
if (!is.null(quantiles) & (!is.vector(quantiles) || !is.numeric(quantiles) ||
!all(quantiles >= 0) || !all(quantiles <= 1))) stop("'quantiles' must be null, or a vector of values between 0 and 1.")
}
## calc_posterior_locus takes a single locus, and returns the parameters of the
## Dirichlet posterior for allele frequencies for each refpop at this locus.
## Let (p1, ..., pk) be the allele frequencies for a population at one locus with
## k possible alleles.
## The prior is (p1, ..., pk) ~ Dirichlet(alpha/k, ..., alpha/k) where
## alpha = 1 if prior="Rannala" or
## alpha = k if prior = "Baudouin" or
## alpha = k/2 if prior="Half" or
## alpha = k/4 if prior="Quarter".
## The posterior is (p1, ..., pk) ~ Dirichlet(nu_1, ..., nu_k) where nu_1, ..., nu_k
## are determined by the data.
## This function returns an array with rows matching the refpopnames, and with
## entries being the vector (nu_1, ..., nu_k) for this row. For example the
## output could be (for a locus with 8 alleles):
## 150 154 156 164 172 178 182 184
## Pop1 1 1 19 6 12 1 18 10
## Pop2 1 1 16 1 42 1 1 1
## meaning that the posterior for allele frequencies for population Pop2 is
## (p1, ..., p8)~Dirichlet(1, 1, 16, ..., 1).
calc_posterior_locus <- function(loc, dat, popx2, prior, refpopnames){
## First find all alleles present in dat (where dat includes only the
## reference pops specified in the call to calc_logprob) at this locus:
a1_name <- paste0(loc, ".a1")
a2_name <- paste0(loc, ".a2")
dat_alleles <- c(dat[, a1_name], dat[, a2_name])
## allele_tab is a table of all possible alleles in the specified reference
## populations, and their frequency in those populations, e.g.:
## 150 154 156 164 172 178 182 184
## Pop1 0 0 18 5 11 0 17 9
## Pop2 0 0 15 0 41 0 0 0
## Using dnn=NULL below prevents extra names "popx2" and "dat_alleles" being
## inherited by other objects later in the coding.
allele_tab <- table(popx2, dat_alleles, dnn=NULL)
## Remove missing data (code = 0):
allele_tab <- allele_tab[ , colnames(allele_tab)!="0", drop=F]
## Determine choice for prior: the prior distribution of allele frequencies
## at each locus will be
## prior(p1, ..., pk) ~ Dirichlet(alpha/k, ..., alpha/k)
## for a locus with k possible alleles, and user chooses the value of alpha.
k <- ncol(allele_tab)
if(prior=="Rannala") alpha <- 1
else if(prior=="Baudouin") alpha <- k
else if(prior=="Half") alpha <- k/2
else if(prior=="Quarter") alpha <- k/4
## Find the posterior nuvec for each population in refpopnames.
## It's as simple as adding alpha/k to the existing allele_tab, because the
## posterior is
## (p1, ..., pk) ~ Dirichlet ( alpha/k + n_1, ...., alpha/k + n_k ),
## and the values n_1, ..., n_k are exactly what are found in allele_tab.
posterior_nu_tab <- alpha/k + allele_tab[refpopnames, , drop=F]
## Using the example above, if prior="Baudouin"
## (so prior ~ Dirichlet(1, 1, ..., 1))
## then posterior_nu_tab would just have 1 added to every table entry:
## posterior ~ Dirichlet(nuvec)
## 150 154 156 164 172 178 182 184
## Pop1 1 1 19 6 12 1 18 10
## Pop2 1 1 16 1 42 1 1 1
## To extract the posterior nuvec for a particular population, e.g. such
## that the posterior for Pop2 is Dirichlet(nuvec), just use posterior_nu_tab["Pop2",].
posterior_nu_tab
}
## Functions to calculate the log genotype probability for each an individual
## with respect to each population.
## The posterior genotype probability is
## prod_loci { int_pvec P(locus genotype | pvec) f_posterior(pvec) d pvec }
## and the locus-wise integral is given by the Dirichlet compound multinomial
## distribution:
## suppose the posterior ~ Dirichlet(nuvec), and nusum = sum(nuvec).
## Then:
## (a) if the individual has two distinct alleles s and t, we get:
## posterior genotype prob for this locus
## = int_pvec P( genotype
## = (s,t) | pvec) dDirichlet(pvec | nuvec) dpvec
## = 2 * nu[s] * nu[t] / ( nusum * (nusum+1) )
## (b) if the individual has one repeated allele, t, we get:
## posterior genotype prob for this locus
## = int_pvec P( genotype
## = (t,t) | pvec) dDirichlet(pvec | nuvec) dpvec
## = nu[t] * (nu[t] + 1) / ( nusum * (nusum+1) )
calc_logprob_complete <- function(row, dat_complete, refpopnames, locnames,
posterior_nu_list, posterior_nusumvec_list,
leave_one_out, saddlepoint, logten, Ndraw,
all_loci_sim_logprob = NULL,
all_loci_SCDF_qsearch_params = NULL){
## calc_logprob_complete takes a row of the data frame "dat_complete",
## and finds the posterior genotype probability for that
## individual at every locus, then combines by summing the logs.
## log_indiv_probvec is the output log-posterior genotype probability for this individual.
indiv_dat <- dat_complete[row,]
## For leave-one-out, create a separate copy of posterior_nu_list to store the left-one-out
## allele frequencies for this individual
if (leave_one_out && indiv_dat$pop %in% refpopnames) LOO_posterior_nu_list <- posterior_nu_list
indiv_allele_freqs <- vector()
indiv_allele_freqs_LOO <- vector()
log_indiv_probvec <- 0
for(loc in locnames){
loc_table <- posterior_nu_list[[loc]]
nusumvec <- posterior_nusumvec_list[[loc]]
a1_name <- paste0(loc, ".a1")
a2_name <- paste0(loc, ".a2")
indiv_a1 <- as.character(indiv_dat[a1_name])
indiv_a2 <- as.character(indiv_dat[a2_name])
col_a1 <- which(colnames(loc_table)==indiv_a1)
col_a2 <- which(colnames(loc_table)==indiv_a2)
if (length(col_a1) == 0 || length(col_a2) == 0) browser()
## For leave-one-out, correct the posterior allele frequencies by
## subtracting 1 for each of the individual's alleles -- but only for
## the individual's own labelled population
## Do not do this for individuals in includepopnames
if (leave_one_out && indiv_dat$pop %in% refpopnames)
{
loc_table[indiv_dat$pop, indiv_a1] = loc_table[indiv_dat$pop, indiv_a1] - 1
loc_table[indiv_dat$pop, indiv_a2] = loc_table[indiv_dat$pop, indiv_a2] - 1
nusumvec[indiv_dat$pop] = nusumvec[indiv_dat$pop] - 2
## Also change the estimated allele frequencies at this locus by
## removing the individual's own alleles
## Must subtract 1 from LOO_posterior_nu_list so that for homozygotes,
## take same allele away twice (if subtracting each time directly from
## from posterior_nu_list, end up only taking allele away once)
LOO_posterior_nu_list[[loc]][indiv_dat$pop, indiv_a1] = LOO_posterior_nu_list[[loc]][indiv_dat$pop, indiv_a1] - 1
LOO_posterior_nu_list[[loc]][indiv_dat$pop, indiv_a2] = LOO_posterior_nu_list[[loc]][indiv_dat$pop, indiv_a2] - 1
indiv_allele_freqs_LOO <- c(indiv_allele_freqs_LOO,loc_table[,col_a1],loc_table[,col_a2])
}
indiv_allele_freqs <- c(indiv_allele_freqs,loc_table[,col_a1]+1,loc_table[,col_a2]+1)
if(col_a1==col_a2)
prob_loc <- loc_table[,col_a1] * (loc_table[,col_a1] + 1) / ( nusumvec * (nusumvec+1) ) # Homozygote
else
prob_loc <- 2 * loc_table[,col_a1] * loc_table[,col_a2] / ( nusumvec * (nusumvec+1) ) # Heterozygote
if(logten) log_indiv_probvec <- log_indiv_probvec + log10(prob_loc)
else log_indiv_probvec <- log_indiv_probvec + log(prob_loc)
} # end for-locus loop
## The log_indiv_probvec returned is of the following form:
## Pop1 Pop2 DBM
## -13.9 -22.3 -17.4
## NOTE: since 2018-09, no longer converting LOO logprobs back into non-LOO LGP space
if (leave_one_out && indiv_dat$pop %in% refpopnames) ## If doing leave-one-out, impute individual LGP for its own population
{
if (!saddlepoint)
{
if (is.null(all_loci_sim_logprob)) stop("all_loci_sim_logprob cannot be null when using non-saddlepoint Ndraw method.")
## First need to know the probability in indiv_agg of getting an answer <= the one in
## log_indiv_present,
## e.g. length((1:Ndraw)[indiv_agg["Pop1",] <= log_indiv_present["Pop1"]])/Ndraw
indiv_p <- length((1:Ndraw)[all_loci_sim_logprob[indiv_dat$pop,] <= log_indiv_probvec[indiv_dat$pop]])/Ndraw
## The result is the p-value for each reference pop:
## e.g. indiv_p = 0.501 for rpop = Pop1 means that 50.1% of individuals from the genuine Pop1 posterior
## have a log-genotype probability less than or equal to the one that this individual has.
## Now find the corresponding quantile in all_loci_sim_logprob: this will be the imputed value for
## this individual.
q <- quantile(all_loci_sim_logprob[indiv_dat$pop,], probs=indiv_p)
}
leave_one_out_probvec <- log_indiv_probvec
# leave_one_out_probvec[indiv_dat$pop] <- q
if (!saddlepoint) leave_one_out_probvec[indiv_dat$pop] <- q
## Add the raw (non-imputed) columns to the imputed columns to return both:
names(log_indiv_probvec) <- paste(names(log_indiv_probvec), "raw", sep=".")
log_indiv_complete <- c(leave_one_out_probvec, log_indiv_probvec)
}
else
{
## For complete-data individuals when not using leave-one-out, the "raw" results are the same as the logprob_complete results:
logprob_raw <- log_indiv_probvec
names(logprob_raw) <- paste(names(log_indiv_probvec), "raw", sep=".")
log_indiv_complete <- c(log_indiv_probvec, logprob_raw)
}
log_indiv_complete
} # end calc_logprob_complete
calc_logprob_missing <- function(row, dat_missing, refpopnames, locnames,
posterior_nu_list, posterior_nusumvec_list,
leave_one_out, saddlepoint, logten, Ndraw,
sim_logprob_bylocus=NULL,
all_loci_sim_logprob = NULL,
all_loci_SCDF_qsearch_params = NULL){
## calc_logprob_missing takes a row of the data frame "dat_missing".
## It finds the posterior genotype probability for that individual at the
## loci that are present, and combines them by summing the logs.
## It then finds the p-value of that individual's genotype probability among
## those loci that are present, such that if the p-value is indiv_p for
## population A, then indiv_p is the probability that an individual generated
## from the known posterior genotype distribution for A has genotype
## probability less than or equal to that of this target individual.
## Once we know indiv_p, we find the quantile in the all-locus genotype
## probability distribution corresponding to indiv_p. This quantile is the
## logprob output for this individual for refpop A.
## A similar process is applied to all the other reference populations.
##
## log_indiv_probvec is the output "imputed" log-posterior genotype probability
## for this individual.
indiv_dat <- dat_missing[row,]
## For leave-one-out, create a separate copy of posterior_nu_list to store
## the left-one-out allele frequencies for this individual
if (leave_one_out && indiv_dat$pop %in% refpopnames) LOO_posterior_nu_list <- posterior_nu_list
## Find which loci are present for this individual:
indiv_loc_a1 <- indiv_dat[paste(locnames, ".a1", sep="")]
locnames_missing <- locnames [which( indiv_loc_a1 ==0)]
locnames_present <- locnames [which( indiv_loc_a1 !=0)]
if (length(locnames_present) == 0) browser()
## Find the aggregate log genotype probabilities for the loci present, so we
## can look up this individual's values:
if (!saddlepoint)
{
if (is.null(sim_logprob_bylocus)) stop("sim_logprob_bylocus cannot be null when using non-saddlepoint Ndraw method.")
indiv_agg <- aggregate_logprob(sim_logprob_bylocus, which_loci_names=locnames_present)
}
log_indiv_present <- 0
for(loc in locnames_present){
loc_table <- posterior_nu_list[[loc]]
nusumvec <- posterior_nusumvec_list[[loc]]
a1_name <- paste0(loc, ".a1")
a2_name <- paste0(loc, ".a2")
indiv_a1 <- as.character(indiv_dat[a1_name])
indiv_a2 <- as.character(indiv_dat[a2_name])
col_a1 <- which(colnames(loc_table)==indiv_a1)
col_a2 <- which(colnames(loc_table)==indiv_a2)
## For leave-one-out, correct the posterior allele frequencies by
## subtracting 1 for each of the individual's alleles -- but only for
## the individual's own labelled population
if (leave_one_out && indiv_dat$pop %in% refpopnames)
{
loc_table[indiv_dat$pop, indiv_a1] = loc_table[indiv_dat$pop, indiv_a1] - 1
loc_table[indiv_dat$pop, indiv_a2] = loc_table[indiv_dat$pop, indiv_a2] - 1
nusumvec[indiv_dat$pop] = nusumvec[indiv_dat$pop] - 2
## Also change the estimated allele frequencies at this locus by
## removing the individual's own alleles
## Must subtract 1 from LOO_posterior_nu_list so that for homozygotes,
## take same allele away twice (if subtracting each time directly from
## from posterior_nu_list, end up only taking allele away once)
LOO_posterior_nu_list[[loc]][indiv_dat$pop, indiv_a1] = LOO_posterior_nu_list[[loc]][indiv_dat$pop, indiv_a1] - 1
LOO_posterior_nu_list[[loc]][indiv_dat$pop, indiv_a2] = LOO_posterior_nu_list[[loc]][indiv_dat$pop, indiv_a2] - 1
}
if(col_a1==col_a2)
prob_loc <- loc_table[,col_a1] * (loc_table[,col_a1] + 1) / ( nusumvec * (nusumvec+1) ) # Homozygote
else
prob_loc <- 2 * loc_table[,col_a1] * loc_table[,col_a2] / ( nusumvec * (nusumvec+1) ) # Heterozygote
if(logten) log_indiv_present <- log_indiv_present + log10(prob_loc)
else log_indiv_present <- log_indiv_present + log(prob_loc)
} # end for-locus loop for loci with data intact
## We now have something like this for log_indiv_present (in the case of three
## refpops):
## Pop1 Pop2 Pop3
## -8.099003 -10.186118 -8.159208
##
## We also have something like this for indiv_agg if using saddlepoint=FALSE:
##
## Pop1 -7.36 -9.00 -8.38 -8.89 .... and so on for Ndraw=10000 columns.
## Pop2 -4.97 -3.71 -4.81 -7.47
## Pop3 -6.36 -9.25 -8.45 -7.48
##
## Or for saddlepoint=TRUE we have post_info, which defines the distributions
## of log-genotype-probabilities for the complete set of all loci, and for
## the reduced set of loci present in this individual.
impute_logprob_refpop <- function(rpop){
## For each refpop, find the "imputed" log-genotype probability for the individual in this refpop.
if (saddlepoint)
{
###### Here, calculate the SCDF for this set of loci
###### For leave-one-out, if this individual is from this refpop,
###### use the Leave-One-Out allele frequencies for the
###### reduced set of loci
## Note for next two lines: must use "lapply" to preserve correct
## format and ,drop=FALSE to preserve correct matrix format
if (leave_one_out & indiv_dat$pop==rpop) post_nu_pop <- lapply(LOO_posterior_nu_list[locnames_present], function(x) x[rpop,,drop=FALSE])
else post_nu_pop <- lapply(posterior_nu_list[locnames_present], function(x) x[rpop,,drop=FALSE])
if (length(post_nu_pop) == 0) browser()
post_info <- rcpp_calc_multi_locus_dist(post_nu_pop, leave_one_out=leave_one_out)
mean_pop <- rcpp_calc_mu(post_info$dist)
indiv_p <- rcpp_calc_Fhat(log_indiv_present[rpop], post_info$dist, post_info$min, post_info$max, mean_pop, logten=logten)
if (is.na(indiv_p))
{
print("indiv_p is NA")
test <- rcpp_calc_Fhat(log_indiv_present[rpop], post_info$dist, post_info$min, post_info$max, mean_pop, logten=logten)
}
if (is.infinite(indiv_p))
{
print("indiv_p is infinite")
test <- rcpp_calc_Fhat(log_indiv_present[rpop], post_info$dist, post_info$min, post_info$max, mean_pop, logten=logten)
}
}
else
{
## First need to know the probability in indiv_agg of getting an answer <= the one in
## log_indiv_present,
## e.g. length((1:Ndraw)[indiv_agg["Pop1",] <= log_indiv_present["Pop1"]])/Ndraw
indiv_p <- length((1:Ndraw)[indiv_agg[rpop,] <= log_indiv_present[rpop]])/Ndraw
}
###### Here, use Qhat for the full SCDF
if (saddlepoint)
{
if (is.null(all_loci_SCDF_qsearch_params)) stop("all_loci_SCDF_qsearch_params cannot be null when using saddlepoint.")
#print("Starting to calculate the quantiles for a single population")
q <- calc_quantiles_usingRcpp(all_loci_SCDF_qsearch_params, posterior_nu_list,
rpop, indiv_p, logten=logten, leave_one_out=leave_one_out)
#print("Calculated the quantiles for a single population")
}
else
{
if (is.null(all_loci_sim_logprob)) stop("all_loci_sim_logprob cannot be null when using non-saddlepoint Ndraw method.")
## The result is the p-value for each reference pop:
## e.g. indiv_p = 0.501 for rpop = Pop1 means that 50.1% of individuals from the genuine Pop1 posterior
## have a log-genotype probability less than or equal to the one that this individual has.
## Now find the corresponding quantile in all_loci_sim_logprob: this will be the imputed value for
## this individual.
q <- quantile(all_loci_sim_logprob[rpop,], probs=indiv_p)
}
q
}
log_indiv_probvec <- sapply(refpopnames, impute_logprob_refpop)
## For missing-data individuals, tack the number of complete loci on to the beginning so it
## gels with the equivalent for the complete-data individuals data frame:
log_indiv_probvec <- c(length(locnames_present), log_indiv_probvec)
names(log_indiv_probvec) <- c("nloc", refpopnames)
## The log_indiv_probvec returned is of the following form:
## nloc Pop1 Pop2 DBM
## 8 -13.9 -22.3 -17.4
## Add the raw (non-imputed) columns to the imputed columns to return both:
names(log_indiv_present) <- paste(names(log_indiv_present), "raw", sep=".")
c(log_indiv_probvec, log_indiv_present)
} # end calc_logprob_missing
calc_qsearch_params_usingRcpp <- function(posterior_nu_list, refpopnames, logten=FALSE, leave_one_out=FALSE)
{
scdf_qsearch_params <- list()
for (pop in refpopnames)
{
## Note: must use "lapply" specifically, in order to preserve the right format!!
## Also need to use ,drop=FALSE to preserve the right format (matrix)
post_nu_pop <- lapply(posterior_nu_list, function(x) x[pop,,drop=FALSE])
post_info <- rcpp_calc_multi_locus_dist(post_nu_pop, leave_one_out=leave_one_out)
scdf_qsearch_params[[pop]] <- rcpp_calc_qsearch_params(post_info$dist, post_info$min,
post_info$max, logten=logten)
if (is.na(scdf_qsearch_params[[pop]]$Fh_max)) browser()
}
scdf_qsearch_params
}
calc_quantiles_usingRcpp <- function(scdf_qsearch_params, posterior_nu_list, refpopnames, quantiles_vec, logten=FALSE, leave_one_out=FALSE)
{
quantile_mat <- matrix(nrow = length(refpopnames), ncol = length(quantiles_vec))
rownames(quantile_mat) <- refpopnames
for (pop in refpopnames)
{
scdf_qsearch_params_pop <- scdf_qsearch_params[[pop]]
post_nu_pop <- sapply(posterior_nu_list, function(x) x[pop,,drop=FALSE])
for (i in 1:length(quantiles_vec))
{
quantile_mat[pop,i] <- rcpp_calc_Qhat(quantiles_vec[i], scdf_qsearch_params_pop,
post_nu_pop, logten=logten)
}
}
rownames(quantile_mat) <- refpopnames
quantile_mat
}
find_nloc <- function(dat, locnames){
## find_nloc 10/6/10
## Takes input data frame and adds a column nloc specifying how many loci
## are successful for each individual.
## It also ensures that dat$pop is a character vector.
## EXAMPLE:
## gbi.dat <- find_nloc(gbi.dat)
## saveres("dat")
##
## OR:
## find_nloc(gbi.dat, locnames=loc.names.fixed[-c(1, 3, 8)])$nloc
dat$pop <- as.character(dat$pop)
ninds <- nrow(dat)
## nloc is a record for each individual giving the number of successful loci:
nloc <- rep(0, ninds)
loc_col_names <- paste(rep(locnames, each=2), c("a1", "a2"), sep=".")
for(loc_col in loc_col_names){
dat[,loc_col][is.na(dat[,loc_col])] <- 0
## Add 0.5 to the number of present loci for each individual which DOESN'T have missing
## record at this loc.allele. The 0.5 should be replicated by the other allele for the same locus.
nloc[dat[,loc_col]!=0] <- nloc[dat[,loc_col]!=0] + 0.5
}
dat$nloc <- nloc
dat
}
count_missing_loci <- function(logprob_results, dat){
nloci <- (ncol(dat) - 2)/2
percent_missing <- 100*(1-sum(logprob_results$nloc)/(nloci*nrow(logprob_results)))
}
sim_logprob_locus <- function(nutab, Nsim, logten){
## sim_logprob_locus 19/5/11
## Returns the log-probabilities of animals genuinely simulated from the
## posterior distribution of allele frequencies specified in nutab.
## Operates in log_10 if logten=T, otherwise in log_e.
## nutab is a table of nu-vectors which are the posterior parameters for
## the different populations, one row per population, e.g.
## 150 154 156 164 172 178 182 184
## Pop1 1 1 19 6 12 1 18 10
## Pop2 1 1 16 1 42 1 1 1
## meaning that the posterior distribution for allele frequencies is:
## (p1, ..., p8) ~ Dirichlet(1, 1, 19, 6, 12, 1, 18, 10) in population Pop1;
## (p1, ..., p8) ~ Dirichlet(1, 1, 16, 1, 42, 1, 1, 1) in population Pop2.
##
## A single individual generated from one of these populations has the Dirichlet
## compound multinomial distribution for its genotype probability at this
## locus (assuming Hardy-Weinberg equilibrium): so if nu is the vector of
## Dirichlet parameters for a single population, i.e. the posterior
## distribution parameters, and nusum=sum(nu), then the genotype probability is:
## 2 * nu[s] * nu[t] / (nusum * (nusum + 1)) with probability 2 * nu[s] * nu[t] / (nusum * (nusum + 1)) ;
## nu[t] * (nu[t]+1) / (nusum * (nusum + 1)) with probability nu[t] * (nu[t]+1) / (nusum * (nusum + 1)).
##
## (This is equivalent to saying, let X be a discrete random variable, and
## let f(X) be the probability function of X: it's a function of X and so it
## is itself a random variable. Then (X=x) => (f(X) = f(x)) and this
## happens with probability f(x). So we add f(x) to the probability of
## getting (f(X) = f(x)). Here, X is the genotype, e.g. X = (s, t) or X = (t, t);
## and f(X) is the genotype probability which is what we're most interested in.)
##
## This function generates Nsim realisations of the genotype probabilities:
## i.e. it generates Nsim probabilities between 0 and 1 that would correspond
## to the posterior genotype probability of an individual *genuinely* drawn from
## this population (as estimated by the posterior distribution) at this locus.
## For example, if the results are:
## Pop1 : (0.2, 0.1, 0.05, 0.05, 0.3, 0.2, 0.05, ...) then we learn that
## individuals genuinely drawn from the posterior Pop1 population would have
## genotype probabilities 0.05 (commonly), 0.1, 0.2, 0.3; but maybe that an
## individual with genotype probability 0.002 would be vanishingly unlikely
## from this population. On the other hand, if there are many alleles then
## we might get Pop1 : (0.002, 0.001, 0.03, 0.001, 0.002, 0.007, ...) making
## 0.002 perfectly respectable for an individual drawn from this population.
##
## The output from this function is a matrix with rows corresponding the the
## populations in nutab, and Nsim columns corresponding to the Nsim draws
## from each population. The results are presented as LOG genotype probabilities:
## e.g. Pop1:(log(0.02), log(0.03), log(0.05), log(0.05), log(0.3), log(0.2), log(0.05), ....)
## e.g. Pop2:(log(0.30), log(0.13), log(0.24), log(0.15), log(0.1), log(0.13), log(0.1), ....)
sim_logprob_onepop <- function(nupop){
## Do the simulations for a single population.
## To avoid problems with sample() in the case that there is only one
## allele, return rep(0, Nsim) if this happens, because all animals
## sampled from this population must have genotype probability 1 if
## there is only one allele, so the log genotype probability is 0.
if(length(nupop)==1) return(rep(0, Nsim))
## If there is more than one allele, continue.
## First create the vector of all possible genotype probabilities.
## Because all probabilities have the same denominator (nusum * (nusum+1)),
## calculate the common denominator to divide by at the end:
nusum <- sum(nupop)
common_denom <- nusum* (nusum + 1)
## To find all homozygotes, the probabilities are
## nupop * (nupop+1) / (nusum * (nusum+1)).
## Create the vector of numerators of these probabilities:
homozyg_top <- nupop * (nupop + 1)
## For heterozygotes, we need all pairs of alleles (call them s and t),
## then the probabilities are 2 nu[s] nu[t] / common_denom.
## Using outer() gives the values nu[s] * nu[t] for all possible pairings
## of s and t. Use the lower triangle of these only, i.e. the pairs where
## s is not equal to t, and multiply by 2. Dscard the diagonal homozygotes.
## The result in heterozyg_top is a vector.
outer_nu <- outer(nupop, nupop)
heterozyg_top <- 2 * outer_nu[lower.tri(outer_nu, diag=F)]
## The vector of all genotype probabilities is geno_probvec:
geno_probvec <- c(homozyg_top, heterozyg_top) / common_denom
## Now simulate Nsim log-genotype probabilities with the probabilities
## given in geno_probvec:
if(logten)
log_geno_prob_sim <- sample(log10(geno_probvec), size=Nsim, replace=T, prob=geno_probvec)
else
log_geno_prob_sim <- sample(log(geno_probvec), size=Nsim, replace=T, prob=geno_probvec)
return(log_geno_prob_sim)
}
## Run the simulation for each row i.e. each population in nutab
t(apply(nutab, 1, sim_logprob_onepop))
}
aggregate_logprob <- function(sim_logprob, which_loci_names){
## aggregate_logprob
## Given a sim_logprob object corresponding to the output from
## lapply(posterior_nu_list, sim_logprob_locus, Nsim),
## this function aggregates across the loci specified by which_loci_names,
## and returns a matrix with rows corresponding to populations, and Nsim
## columns such that the Nsim columns are the SUMMED results across the
## specified loci of the log-genotype probabilities.
## It doesn't matter whether the sim_logprob is in log_10 or log_e, because
## it will all be in the same base.
##
## EXAMPLE: suppose which_loci_names=c("D10Rat20", "D11Mgh5"),
## and the input sim_logprob has Nsim=5 and looks like this:
## $D10Rat20
## Pop1 -0.62 -0.62 -2.39 -0.62 -0.62
## Pop2 -3.49 -2.95 -3.55 -2.90 -4.80
##
## $D11Mgh5
## Pop1 -1.06 -0.823 -1.06 -4.52 -0.823
## Pop2 -1.84 -3.999 -2.08 -1.84 -2.076
##
## $D15Rat77
## Pop1 -2.54 -1.49 -1.84 -6.73 -1.84
## Pop2 -1.51 -6.01 -1.90 -1.51 -1.90
##
## Then the output from this function is:
##
## Pop1 -1.680 -1.443 -3.450 -5.140 -1.443
## Pop2 -5.330 -6.949 -5.630 -4.740 -6.876
##
## where the results for Pop1 are added across the two specified loci only,
## and likewise for Pop2:
## e.g. -0.62 - 1.06 = -1.680 for the first element; etc.
subset_list <- sim_logprob[which_loci_names]
subset_sum <- subset_list[[1]]
n_in_list <- length(which_loci_names)
## If there is more than one locus, add the others to the first one:
if(n_in_list > 1)
for(locn in 2:n_in_list) subset_sum <- subset_sum + subset_list[[locn]]
subset_sum
}
rDirichlet <- function(ndraw, alphvec){
## See Wikipedia on the Dirichlet distribution for confirmation:
gamdraw <- rgamma(ndraw, shape=alphvec, rate=1)
gamdraw / sum(gamdraw)
}
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.