R/RcppExports.R

Defines functions dirch_from_counts dirch_from_allocations gsi_mcmc_1 gsi_em_1 gprob_sim_gc_missing gprob_sim_ind gprob_sim_gc samp_from_mat rcpp_per_locus_logls rcpp_indiv_specific_logl_means_and_vars geno_logL_ssq geno_logL gsi_mcmc_fb rcpp_close_matchers

Documented in dirch_from_allocations dirch_from_counts geno_logL geno_logL_ssq gprob_sim_gc gprob_sim_gc_missing gprob_sim_ind gsi_em_1 gsi_mcmc_1 gsi_mcmc_fb rcpp_close_matchers rcpp_indiv_specific_logl_means_and_vars rcpp_per_locus_logls samp_from_mat

# Generated by using Rcpp::compileAttributes() -> do not edit by hand
# Generator token: 10BE3573-1514-4C36-9D1C-5A225CD40393

#' Find all pairs that have close matches
#'
#'
#' @keywords internal
#'
#' @param par_list genetic data converted to the param_list format by \code{tcf2param_list}
#'
#' @return Gotta say more
#' @examples
#' # gotta do something here too
#' @export
rcpp_close_matchers <- function(par_list, non_miss_fract, match_fract) {
    .Call('_rubias_rcpp_close_matchers', PACKAGE = 'rubias', par_list, non_miss_fract, match_fract)
}

#' MCMC from the fully Bayesian GSI model for pi and the individual posterior probabilities
#'
#' Given a list of key parameters from a genetic dataset, this function samples values of pi
#' and the posteriors for all the individuals. Each MCMC iteration includes a recalculation
#' of the scaled genotype likelihood matrix, with baseline allele frequencies updated
#' based on the previous iteration's allocations. It returns the output in a list.
#' @keywords internal
#'
#' @param par_list genetic data converted to the param_list format by \code{tcf2param_list}
#' @param Pi_init  Starting value for the pi (collection mixture proportion) vector.
#' @param lambda the prior to be added to the collection allocations, in order to
#' generate pseudo-count Dirichlet parameters for the simulation of a new pi vector
#' @param reps total number of reps (sweeps) to do.
#' @param burn_in how many reps to discard in the beginning when doing the mean calculation. They will still be
#' returned in the traces if desired
#' @param sample_int_Pi the number of reps between samples being taken for Pi traces.  If 0 no trace samples are taken
#' @param sample_int_PofZ the number of reps between samples being taken for the traces of posterior of each individual's origin. If 0
#' no trace samples are taken.
#'
#' @return \code{gsi_mcmc_fb} returns a list of three. \code{$mean} lists the posterior
#' means for collection proportions \code{pi}, for the individual posterior
#' probabilities of assignment \code{PofZ}, and for the allele frequencies \code{theta}.
#' \code{$sd} returns the posterior standard deviations for the same values.
#'
#' If the corresponding \code{sample_int} variables are not 0, \code{$trace} contains
#' samples taken from the Markov chain at intervals of \code{sample_int_}(variable) steps.
#'
#' @examples
#' # this demonstrates it with scaled likelihoods computed from
#' # assignment of the reference samples
#'
#' # we have to get the ploidies to pass to tcf2param_list
#' locnames <- names(alewife)[-(1:16)][c(TRUE, FALSE)]
#' ploidies <- rep(2, length(locnames))
#' names(ploidies) <- locnames
#'
#' params <- tcf2param_list(alewife, 17, ploidies = ploidies)
#' lambda <- rep(1/params$C, params$C)
#' # use very short run and burn in so it doesn't take too long
#' # when checking on CRAN
#' mcmc <- gsi_mcmc_fb(params, lambda, lambda, 20, 5, 4, 4)
#' @export
gsi_mcmc_fb <- function(par_list, Pi_init, lambda, reps, burn_in, sample_int_Pi, sample_int_PofZ) {
    .Call('_rubias_gsi_mcmc_fb', PACKAGE = 'rubias', par_list, Pi_init, lambda, reps, burn_in, sample_int_Pi, sample_int_PofZ)
}

#' Calculate a matrix of genotype log-likelihoods for a genetic dataset
#'
#' Takes a list of key parameters from a genetic dataset, and calculates
#' the log-likelihood of each individual's genotype, given the allele counts
#' in each collection
#'
#' Leave-One-Out cross-validation is used to avoid bias in log-likelihood for an
#' individual's known collection of origin
#'
#' @keywords internal
#'
#' @param par_list genetic data converted to the param_list format by \code{tcf2param_list}
#'
#' @return \code{geno_logL} returns a matrix with C rows and I columns. Each column
#' represents an individual, and each row the log-likelihood of that individual's
#' genotype, given the allele counts in that collection
#'
#' @examples
#' example(tcf2param_list)
#' ale_glL <- geno_logL(ale_par_list)
#' @export
geno_logL <- function(par_list) {
    .Call('_rubias_geno_logL', PACKAGE = 'rubias', par_list)
}

#' Calculate a matrix of sum-of-squares of genotype log-likelihoods for a genetic dataset
#'
#' Takes a list of key parameters from a genetic dataset, and calculates
#' the sum of squared log-likelihood of each individual's genotype, given the allele counts
#' in each collection. This is used for the quick-and-dirty Z-score calculations.
#'
#' Leave-One-Out cross-validation is used to avoid bias in log-likelihood for an
#' individual's known collection of origin
#'
#' @keywords internal
#'
#' @param par_list genetic data converted to the param_list format by \code{tcf2param_list}
#'
#' @return \code{geno_logL} returns a matrix with C rows and I columns. Each column
#' represents an individual, and each row the log-likelihood of that individual's
#' genotype, given the allele counts in that collection
#'
#' @examples
#' example(tcf2param_list)
#' ale_glL <- geno_logL(ale_par_list)
#' @export
geno_logL_ssq <- function(par_list) {
    .Call('_rubias_geno_logL_ssq', PACKAGE = 'rubias', par_list)
}

#' From the pattern of missing data at each individual, compute the expected mean and variance of the logl
#'
#' This takes a param_list so that it has access to individual genotypes (and hence can cycle through them
#' and know which are missing and which are not.)  It also takes a matrix of per-locus logl means
#' and variances like what is computed by \code{\link{per_locus_means_and_vars}}.
#'
#' This function doesn't do any checking to assure that the par_list and the per-locus logl means
#' matrix are made for one another.  (i.e. use the same collections in the same order.)
#'
#' @keywords internal
#'
#' @param par_list genetic data converted to the param_list format by \code{tcf2param_list}
#' @param MV a list of two matrices, one of means and the other of variances, which are C x L
#' matrices.  This is basically the list that is returned by \code{\link{per_locus_means_and_vars}}.
#'
#' @return a matrix with C rows and I columns. Each row
#' represents a collection, and each column an individual.
#'
#' @export
rcpp_indiv_specific_logl_means_and_vars <- function(par_list, MV) {
    .Call('_rubias_rcpp_indiv_specific_logl_means_and_vars', PACKAGE = 'rubias', par_list, MV)
}

#' Return a matrix of locus-specific self-assignment logls
#'
#' Takes a list of key parameters from a genetic dataset, and calculates
#' the log-likelihood of each individual's single-locus genotype, given the allele counts
#' in the individual's collection.
#'
#' This uses Leave-One-Out cross-validation is used to avoid bias in log-likelihood for an
#' individual's known collection of origin
#'
#' @keywords internal
#'
#' @param par_list genetic data converted to the param_list format by \code{tcf2param_list}
#'
#' @return \code{rcpp_per_locus_logls} returns a matrix with I rows and L columns. Each row
#' represents an individual, and each column a locus. Note that missing data at a locus
#' returns a zero.  That should be changed to NA later.
#'
#' @export
rcpp_per_locus_logls <- function(par_list) {
    .Call('_rubias_rcpp_per_locus_logls', PACKAGE = 'rubias', par_list)
}

#' Sample 1 observation from cell probabilities that are columns of a matrix
#'
#' Takes a matrix in which columns sum to one. For each column, performs a
#' single multinomial draw from the rows, weighted by their values in that column
#'
#' @keywords internal
#'
#' @param M a matrix whose columns are reals summing to one
#'
#' @return a vector length = \code{ncol(M)} of indices, with each element being
#' the row that was chosen in that column's sampling
#' @export
samp_from_mat <- function(M) {
    .Call('_rubias_samp_from_mat', PACKAGE = 'rubias', M)
}

#' Simulate genotype log-likelihoods from a population by gene copy
#'
#' Takes a list of parameters from a genetic dataset, and returns a genotype log-likelihood
#' matrix for individuals simulated by gene copy from the specified collections
#'
#' In simulation by gene copy, the genotype at a locus for any individual is the result
#' of two random draws from the allele count matrix of that locus. Draws within an individual
#' are performed without replacement, but allele counts are replaced between individuals.
#'
#' @keywords internal
#'
#' @param par_list genetic data converted to the param_list format by \code{tcf2param_list}
#' @param sim_colls a vector of indices for the collections desired for simulation;
#' each element of the list corresponds to an individual
#'
#'
#' @return \code{gprob_sim} returns a matrix of the summed log-likelihoods
#' for all loci of a simulated population mixture; columns represent individuals,
#' with each row containing their log-likelihood of belonging to the collection
#' of the same index, given the selection of two independent gene copies from the
#' desired collection of origin's reference allele frequencies
#'
#' @examples
#' example(tcf2param_list)
#' sim_colls <- sample(ale_par_list$C, 1070, replace = TRUE)
#' ale_sim_gprobs_gc <- gprob_sim_gc(ale_par_list, sim_colls)
#' @export
gprob_sim_gc <- function(par_list, sim_colls) {
    .Call('_rubias_gprob_sim_gc', PACKAGE = 'rubias', par_list, sim_colls)
}

#' Simulate genotype log-likelihoods from a population by individual
#'
#' Takes a list of parameters from a genetic dataset, and returns a genotype log-likelihood
#' matrix for individuals simulated by individual from the specified collections
#'
#' In simulation by individual, the genotype for any simulated individual is the
#' result of a single random draw from the genotypes of all individuals in the collection.
#' Gene copies and loci are therefore not independent.
#'
#' @param par_list genetic data converted to the param_list format by \code{tcf2param_list}
#' @param sim_colls a vector of indices for the collections desired for simulation;
#' each element of the list corresponds to an individual
#'
#'
#' @return \code{gprob_sim} returns a matrix of the summed log-likelihoods
#' for all loci of a simulated population mixture; columns represent individuals,
#' with each row containing their log-likelihood of belonging to the collection
#' of the same index, given the selection of an individual's genotype from the
#' reference collection of interest. Selection at the locus and gene copy level
#' are not independent, and missing data is included in selection.
#'
#' @keywords internal
#'
#' @examples
#' example(tcf2param_list)
#' sim_colls <- sample(ale_par_list$C, 1070, replace = TRUE)
#' ale_sim_gprobs_ind <- gprob_sim_ind(ale_par_list, sim_colls)
#' @export
gprob_sim_ind <- function(par_list, sim_colls) {
    .Call('_rubias_gprob_sim_ind', PACKAGE = 'rubias', par_list, sim_colls)
}

#' Simulate genotypes by gene copy, with missing data from chosen individuals
#'
#' Takes a list of parameters from a genetic dataset, and returns a genotype log-likelihood
#' matrix for individuals simulated by gene copy from the specified collections, with
#' genotypes masked by missing data patterns from reference individuals
#'
#' In simulation by gene copy, the genotype at a locus for any individual is the result
#' of two random draws from the allele count matrix of that locus. Draws within an individual
#' are performed without replacement, but allele counts are replaced between individuals.
#' If the data at a particular locus is missing for individual i in \code{sim_missing},
#' this data will also be missing in simulated individual i for the
#' log-likelihood calculation.
#'
#' @param par_list genetic data converted to the param_list format by \code{tcf2param_list}
#' @param sim_colls a vector; element i specifies the collection from which to sample
#' the genotypes for individual i
#' @param sim_missing a vector; element i specifies the index for the individual in
#' params$I whose missing data should be copied for individual i
#'
#' @keywords internal
#' @examples
#'
#' # If one wanted to simulate the missing data patterns
#' # of a troublesome mixture dataset, one would run tcf2param_list,
#' # selecting samp_type = "mixture", then draw sim_miss from
#' # the mixture individual genotype list
#'
#' # make a fake mixture data set to demonstrate...
#' drawn <- mixture_draw(alewife, rhos = c(1/3, 1/3, 1/3),N = 100)
#' ref <- drawn$reference
#' mix <- drawn$mix
#'
#' # then run it...
#' # we have to get the ploidies to pass to tcf2param_list
#' locnames <- names(alewife)[-(1:16)][c(TRUE, FALSE)]
#' ploidies <- rep(2, length(locnames))
#' names(ploidies) <- locnames
#' params <- tcf2param_list(rbind(ref,mix), 17, samp_type = "mixture", ploidies = ploidies)
#' sim_colls <- sample(params$C, 1070, replace = TRUE)
#' sim_miss <- sample(length(params$coll), 1070, replace = TRUE)
#' ale_sim_gprobs_miss <- gprob_sim_gc_missing(params, sim_colls, sim_miss)
#' @export
gprob_sim_gc_missing <- function(par_list, sim_colls, sim_missing) {
    .Call('_rubias_gprob_sim_gc_missing', PACKAGE = 'rubias', par_list, sim_colls, sim_missing)
}

#' EM algorithm from the simplest GSI model for pi and the individual posterior probabilities
#'
#' Using a matrix of scaled likelihoods, this function does an EM algorithm to climb the
#' likelihood surface for pi, and computes the plug-in estimate of the posteriors
#' for all the individuals.  It returns the output in a list.
#' @keywords internal
#' @param SL  a matrix of the scaled likelihoods.  This is should have values for each individual in a column
#' (going down in the rows are values for different collections).
#' @param Pi_init  Starting value for the pi (collection mixture proportion) vector.
#' @param max_iterations the maximum total number of reps iterations to do.
#' @param tolerance the EM-algorithm will be considered converged when the sum over the elements of pi of the absolute value
#' of the difference between the previous and the current estimate is less than tolerance.
#' @param return_progression  If true, then the pi_trace component of the output shows the value of pi visited en route to the end.
#'
#' @return \code{gsi_em_1} returns a final Maximum-Likelihood estimate for pi and PofZ,
#' as well as the number of iterations needed to reach convergence ("iterations_performed"),
#' and traces of the pi values and change in pi in each iteration
#'
#' @examples
#' # this is shown with a scaled likelihood matrix from self-assignment
#' # of the reference individuals
#'
#' # we have to get the ploidies to pass to tcf2param_list
#' locnames <- names(alewife)[-(1:16)][c(TRUE, FALSE)]
#' ploidies <- rep(2, length(locnames))
#' names(ploidies) <- locnames
#' params <- tcf2param_list(alewife, 17, ploidies = ploidies)
#' logl <- geno_logL(params)
#' SL <- apply(exp(logl), 2, function(x) x/sum(x))
#' test_em <- gsi_em_1(SL,
#'                     rep(1/params$C, params$C),
#'                     max_iterations = 10^6,
#'                     tolerance = 10^-7,
#'                     return_progression = TRUE)
#' @export
gsi_em_1 <- function(SL, Pi_init, max_iterations, tolerance, return_progression) {
    .Call('_rubias_gsi_em_1', PACKAGE = 'rubias', SL, Pi_init, max_iterations, tolerance, return_progression)
}

#' MCMC from the simplest GSI model for pi and the individual posterior probabilities
#'
#' Using a matrix of scaled likelihoods, this function samples values of pi and the posteriors
#' for all the individuals.  It returns the output in a list.
#' @keywords internal
#' @param SL  matrix of the scaled likelihoods.  This is should have values for each individual in a column
#' (going down in the rows are values for different populations).
#' @param Pi_init  Starting value for the pi (collection mixture proportion) vector.
#' @param lambda the prior to be added to the collection allocations, in order to
#' generate pseudo-count Dirichlet parameters for the simulation of a new pi vector
#' @param reps total number of reps (sweeps) to do.
#' @param burn_in how many reps to discard in the beginning when doing the mean calculation. They will still be
#' returned in the traces if desired
#' @param sample_int_Pi the number of reps between samples being taken for Pi traces.  If 0 no trace samples are taken
#' @param sample_int_PofZ the number of reps between samples being taken for the traces of posterior of each individual's origin. If 0
#' no trace samples are taken.
#'
#' @return \code{gsi_mcmc_1} returns a list of three. \code{$mean} lists the posterior
#' means for collection proportions \code{pi} and for the individual posterior
#' probabilities of assignment \code{PofZ}. \code{$sd} returns the posterior standard
#' deviations for the same values.
#'
#' If the corresponding \code{sample_int} variables are not 0, \code{$trace} contains
#' samples taken from the Markov chain at intervals of \code{sample_int_}(variable) steps.
#'
#' @examples
#' # this demonstrates it with scaled likelihoods computed from
#' # assignment of the reference samples
#'
#' # we have to get the ploidies to pass to tcf2param_list
#' locnames <- names(alewife)[-(1:16)][c(TRUE, FALSE)]
#' ploidies <- rep(2, length(locnames))
#' names(ploidies) <- locnames
#'
#' params <- tcf2param_list(alewife, 17, ploidies = ploidies)
#' logl <- geno_logL(params)
#' SL <- apply(exp(logl), 2, function(x) x/sum(x))
#' lambda <- rep(1/params$C, params$C)
#' mcmc <- gsi_mcmc_1(SL, lambda, lambda, 200, 50, 5, 5)
#' @export
gsi_mcmc_1 <- function(SL, Pi_init, lambda, reps, burn_in, sample_int_Pi, sample_int_PofZ) {
    .Call('_rubias_gsi_mcmc_1', PACKAGE = 'rubias', SL, Pi_init, lambda, reps, burn_in, sample_int_Pi, sample_int_PofZ)
}

#' Given a vector of different categories in 1...n and a prior,
#' simulate a Dirichlet random vector
#'
#' Takes a vector of collection indices to which individuals (vector elements) were assigned,
#' and returns a Dirichlet random variable generated by adding the prior to the sum
#' of each collection's occurrences, and simulating an alpha from a gamma distribution
#' with this shape parameter.
#'
#' The categories are labeled in C from 1 up to n.  n is the length of \code{lambda},
#' which is a vector of priors. Note that all elements of \code{lambda}
#' must be strictly greater than 0.
#'
#' @keywords internal
#'
#' @param C  a vector giving different categories of individual
#' (not counts of categories - untabulated)
#' @param lambda priors for the categories
#' @export
dirch_from_allocations <- function(C, lambda) {
    .Call('_rubias_dirch_from_allocations', PACKAGE = 'rubias', C, lambda)
}

#' Given a vector of counts for different categories in 1...n and a prior,
#' simulate a Dirichlet random vector
#'
#' Takes a vector of counts for 1:n collections,
#' and returns a Dirichlet random variable generated by adding the prior to each
#' collection value, and simulating an alpha from a gamma distribution
#' with this shape parameter.
#'
#' The categories are labeled in C from 1 up to n.  n is the length of \code{lambda},
#' which is a vector of priors. Note that all elements of \code{lambda}
#' must be strictly greater than 0.
#' @keywords internal
#' @param C  a vector giving counts of categories
#' @param lambda priors for the categories
#' @export
dirch_from_counts <- function(C, lambda) {
    .Call('_rubias_dirch_from_counts', PACKAGE = 'rubias', C, lambda)
}
benmoran11/rubias documentation built on Feb. 1, 2024, 10:52 p.m.