R/hint_single_urn.R
In hint: Tools for Hypothesis Testing Based on Hypergeometric Intersection Distributions

Documented in dhydist phydist qhydist rhydist

###########################################################################
#
# Functions for calculating distributions associated with drawing distinct
#  categories from a single urn containing duplicates in q<=n of its
#  categories.
#
# Author: Alex T. Kalinka (alex.t.kalinka@gmail.com)
#
###########################################################################


#### Main user-callable functions that do the dispatching to user-invisible worker functions below. ####
#
# Density, distribution (probability mass function), quantile, and random generation functions:


.hydist.check.params <- function(n, a, q)
	{
	if(!a>=0 || !n>=0 || !q>=0 || !a<=n || !q<=n || !q>=0){
		stop("the following constraints must be met:\nn > 0\n0 <= a <= n\n0 <= q <= n\n", call. = FALSE)
		}
	cmin <- a-min(floor(a/2),q)
	cmax <- a
	crange <- cmin:cmax
	return(crange)
	}


#' Drawing Distinct Categories from a Single Urn
#' 
#' Density, distribution function, quantile function and random generation for the distribution of distinct categories drawn from a single urn in which there are duplicates in q of the categories.
#' 
#' @param n An integer specifying the number of categories in the urn.
#' @param a An integer specifying the number of balls drawn from the urn.
#' @param q An integer specifying the number of categories in the urn which have duplicate members.
#' @name Hyperdistinct
NULL
#> NULL

#' @rdname Hyperdistinct
#' @param range A vector of integers specifying the intersection sizes for which probabilities (dhydist) or cumulative probabilites (phydist) should be computed (can be a single number). If range is NULL (default) then probabilities will be returned over the entire range of possible values.
#' @param log Logical. If TRUE, probabilities p are given as log(p). Defaults to FALSE.
#' @export
#' @examples
#' ## Generate the distribution of distinct categories drawn from a single urn.
#' dd <- dhydist(20, 10, 12)
#' ## Restrict the range of intersections.
#' dd <- dhydist(20, 10, 12, range = 5:10)
dhydist <- function(n, a, q, range = NULL, log = FALSE)
	{
	# range is a vector giving intersection sizes for which the user wishes to retrieve probabilities.
	crange <- .hydist.check.params(n, a, q)
	if(is.null(range)){
		range <- crange
		}
	rn <- intersect(range, crange)
	if(length(rn)==0){
		dist <- data.frame(c=range, p=rep(0,length(range)))
		}
	dist <- .hydist.dup(n, a, q, range = rn)
	if(log){
		dist[,2] <- log(dist[,2])
		}
	return(dist)
	}


#' @rdname Hyperdistinct
#' @param vals A vector of integers specifying the intersection sizes for which probabilities (dhydist) or cumulative probabilites (phydist) should be computed (can be a single number). If range is NULL (default) then probabilities will be returned over the entire range of possible values.
#' @param log.p Logical. If TRUE, probabilities p are given as log(p). Defaults to FALSE.
#' @param upper.tail Logical. If TRUE, probabilities are P(X >= c), else P(X <= c). Defaults to TRUE.
#' @export
#' @examples
#' ## Generate cumulative probabilities.
#' pp <- phydist(29, 15, 8, vals = 5)
#' pp <- phydist(29, 15, 8, vals = 2, upper.tail = FALSE)
phydist <- function(n, a, q, vals, upper.tail = TRUE, log.p = FALSE)
	{
	# vals are the values of v for which we want cumulative probabilities.
	dist <- dhydist(n, a, q)
	if(log.p){
		dist[,2] <- log(dist[,2])
		}
	crange <- .hydist.check.params(n, a, q)
	rn <- intersect(vals, crange)
	if(length(rn)==0){
		pp <- NULL
		for(i in 1:length(vals)){
			if(vals[i] < min(crange)){
				if(upper.tail){
					pp[i] <- 1
				}else{
					pp[i] <- 0
					}
			}else{
				if(upper.tail){
					pp[i] <- 0
				}else{
					pp[i] <- 1
					}
				}
			}
		pval <- data.frame(c=vals, cum.p=rep(pp, length(vals)))
	}else{
		pv <- NULL
		for(i in 1:length(rn)){
			if(upper.tail){
				inds <- which(dist[,1]==rn[i]):nrow(dist)
			}else{
				inds <- 1:which(dist[,1]==rn[i])
				}
			pv[i] <- sum(dist[inds, 2])
			}
		pval <- data.frame(c=rn, cum.p=pv)
		}
	if(log.p){
		pval[,2] <- log(pval[,2])
		}
	return(pval)
	}


#' @rdname Hyperdistinct
#' @param p A probability between 0 and 1.
#' @export
#' @examples 
#' ## Extract quantiles:
#' qq <- qhydist(0.15, 23, 12, 10)
qhydist <- function(p, n, a, q, upper.tail = TRUE, log.p = FALSE)
	{
	# p is a probability.
	if(!p>=0 || !p<=1){
		stop("p must be between 0 and 1\n", call. = FALSE)
		}
	crange <- .hydist.check.params(n, a, q)
	vals <- crange
	dist <- dhydist(n, a, q, range=vals)
	pvals <- phydist(n, a, q, upper.tail=upper.tail, vals=vals)
	inds <- which(pvals[,2]<=p)
	if(length(inds) == 0)
	  inds <- nrow(pvals)
	pv <- sum(dist[inds, 2])
	qq <- pvals[max(inds),1]
	if(log.p){
		pv <- log(pv)
		}
	ret <- data.frame(c=qq, cum.p=pv)
	return(ret)
	}


#' @rdname Hyperdistinct
#' @param num An integer specifying the number of random numbers to generate. Defaults to 5.
#' @export
#' @examples 
#' ## Generate random samples based on this distribution.
#' rr <- rhydist(num = 10, 18, 9, 12)
rhydist <- function(num = 5, n, a, q)
	{
	crange <- .hydist.check.params(n, a, q)
	probs <- dhydist(n, a, q, range=crange)
	samp <- sample(crange, num, prob = probs[,2], replace = TRUE)
	return(samp)
	}


#### Worker function(s): ####


# Single urn with q duplicates.
.hydist.dup <- function(n, a, q, range)
	{
	dist <- NULL
	for(c in range){
		qs <- 0
		jmax <- q-a+c
		for(j in 0:jmax){
			if((q-a+c-j+1)<=0 || (2*c-a-j+1)<=0){
				qs <- qs
			}else{
				qs <- sum(qs, exp(lgamma(q+1) - lgamma(a-c+1) - lgamma(q-a+c-j+1) - lgamma(j+1) + lgamma(n-a+c-j+1) - lgamma(n-c+1) - lgamma(2*c-a-j+1) - lgamma(n+q+1) + lgamma(n+q-a+1) + lgamma(a+1)) )
				}
			}
		dist <- append(dist, qs)
		}
	ret <- data.frame(c=range,p=dist)
	return(ret)
	}


### Simulation ###

.sim.hydist <- function(n, a, sims = 10000, Na = rep(2,n))
	{
	# Na is a vector with the numbers in each category.
	sdist <- NULL; na <- NULL
	for(i in 1:n){
		na <- append(na, rep(i,Na[i]))
		}

	for(i in 1:sims){
		cat("\r   Completed...",round((i/sims)*100,3),"%",collapse="")
		s1 <- sample(na, a, replace = FALSE)
		sdist[i] <- length(unique(s1))
		}
	cat("\n")
	return(sdist)
	}

Any scripts or data that you put into this service are public.

hint documentation built on Feb. 2, 2022, 5:10 p.m.

rdrr.io home R language documentation Run R code online

CRAN packages Bioconductor packages R-Forge packages GitHub packages

Note that we can't provide technical support on individual packages. You should contact the package authors for that.

hint
Tools for Hypothesis Testing Based on Hypergeometric Intersection Distributions

R/hint_single_urn.R
In hint: Tools for Hypothesis Testing Based on Hypergeometric Intersection Distributions

Defines functions .sim.hydist .hydist.dup rhydist qhydist phydist dhydist .hydist.check.params

Documented in dhydist phydist qhydist rhydist

Try the hint package in your browser

R Package Documentation

Browse R Packages

We want your feedback!

hint Tools for Hypothesis Testing Based on Hypergeometric Intersection Distributions

R/hint_single_urn.R In hint: Tools for Hypothesis Testing Based on Hypergeometric Intersection Distributions

Defines functions .sim.hydist .hydist.dup rhydist qhydist phydist dhydist .hydist.check.params

Documented in dhydist phydist qhydist rhydist

Try the hint package in your browser

R Package Documentation

Browse R Packages

We want your feedback!

hint
Tools for Hypothesis Testing Based on Hypergeometric Intersection Distributions

R/hint_single_urn.R
In hint: Tools for Hypothesis Testing Based on Hypergeometric Intersection Distributions