R/composition_estimation_utils.R

Defines functions subGroup_breakdown PBT_breakdown PBT_expand_calc nonPBT_breakdown PBT_var3_grad PBT_var3_log_likelihood PBT_var3_optimllh PBT_var3_calc_MLE PBT_var2_grad PBT_var2_log_likelihood PBT_var2_optimllh PBT_var2_calc_MLE PBT_grad PBT_optimllh PBT_log_likelihood PBT_expand_calc_MLE softMax

# functions used in estimating composition with the trap biological data


####
## Maximum likelihood utility functions
####


#' softmax function with numerical stability
#' Values will sum to 1
#' may return values of 0, but not Inf
#' @param x real numbers to transform into simplex via softmax
#' @keywords internal
#' @noRd
softMax <- function(x){
	pr <- exp(x - max(x))
	return(pr / sum(pr))
}


#' MLE estimates composition for PBT groups
#' @param values values of the category for all the samples, a vector
#' @param tagRates a tibble with release group as "group" and tag rate as "tagRate"
#' @return returns tibble with group (release Group or Unassigned) and
#'   prop (proportion of the stratum) columns
#' @keywords internal
#' @noRd
PBT_expand_calc_MLE <- function(values, tagRates){
	values <- values[!is.na(values)]
	numUnassign <- sum(values == "Unassigned")
	numAssign <- table(values[values != "Unassigned"])
	# no PBT assigned fish
	if(length(numAssign) < 1) return(tibble(group = "Unassigned", prop = 1))

	pr <- tibble(group = names(numAssign), count = as.numeric(numAssign)) %>% left_join(tagRates, by = "group")

	# now MLE estimation of proportions\
	propsMLE <- optim(par = rep(1, nrow(pr) + 1), fn = PBT_optimllh, gr = PBT_grad,
								nGroups = pr$count, nUntag = numUnassign, tagRates = pr$tagRate,
							  control = list(fnscale = -1, maxit = 1000), method = "BFGS"
							)
	if(propsMLE$convergence != 0) warning("Convergence error in PBT_expand_calc_MLE")
	# apply softmax to translate into probabilities
	par_prob <- softMax(propsMLE$par)
	# fixing computational limit on inferring parameters to be zero
	if(numUnassign == 0){
		par_prob[length(par_prob)] <- 0
		par_prob <- par_prob / sum(par_prob)
	}
	return(tibble(group = c(pr$group, "Unassigned"), prop = par_prob))
}

#' log-likelihood of PBT groups
#' @param pGroups proportion belonging to each PBT group
#' @param pW proportion that is wild
#' @param nGroups counts for each PBT group
#' @param nUntag counts of untagged
#' @param tagRates tagRates in order of pGroups
#' @keywords internal
#' @noRd
#' @return log likelihood value
PBT_log_likelihood <- function(pGroups, pW, nGroups, nUntag, tagRates){
	y <- pW + sum((1 - tagRates) * pGroups)
	pGroups_tag <- pGroups * tagRates
	return(dmultinom(x = c(nGroups, nUntag), prob = c(pGroups_tag, y), log = TRUE))
}

#' wrapper for PBT log-likelihood that keeps all variables in par
#' This requires at least one PBT tagged group is present
#' @param par real numbers corresponding to softmax probabilities of c(PBT groups, wild)
#' @param nGroups counts for each PBT group
#' @param nUntag counts of untagged
#' @param tagRates tagRates in order of pGroups
#' @keywords internal
#' @noRd
#' @return log likelihood value
PBT_optimllh <- function(par, nGroups, nUntag, tagRates){
	# apply softmax to translate into probabilities
	par_prob <- softMax(par)
	pGroups <- par_prob[1:(length(par_prob) - 1)]
	pW <- par_prob[length(par_prob)]
	return(PBT_log_likelihood(pGroups = pGroups, pW = pW,
									  nGroups = nGroups, nUntag = nUntag, tagRates = tagRates))
}

PBT_grad <- function(par, nGroups, nUntag, tagRates){
	# apply softmax to translate into probabilities
	par_prob <- softMax(par)
	nPBT <- length(par_prob) - 1
	pGroups <- par_prob[1:nPBT]
	pW <- par_prob[nPBT + 1]
	# calculate probs for multinomial
	y <- pW + sum((1 - tagRates) * pGroups)

	gr <- rep(0, length(par))

	for(i in 1:(nPBT + 1)){
		smDeriv <- -par_prob * par_prob[i]
		smDeriv[i] <- par_prob[i] * (1 - par_prob[i])
		gr[i] <- sum((nGroups / pGroups) * smDeriv[1:nPBT]) +
			((nUntag / y) * (sum((1 - tagRates[1:nPBT]) * smDeriv[1:nPBT]) + smDeriv[nPBT + 1]))
	}

	# print(data.frame(an = gr,
	# 					  num = numDeriv::grad(PBT_optimllh, par, nGroups = nGroups, nUntag = nUntag, tagRates = tagRates)) %>%
	# 					  	mutate(diff = an - num)
	# 					  )

	return(gr)
}



#' MLE estimates composition for var2 within PBT groups
#' @param tagRates a tibble with release group as "group" and tag rate as "tagRate"
#' @return returns a matrix with PBT group/Unassigned as rows and var2 as cols, and props within each group as values
#' @keywords internal
#' @noRd
PBT_var2_calc_MLE <- function(v2Data, piGroup, tagRates){
	# now MLE estimation of proportions\
	propsMLE <- optim(par = rep(1, nrow(v2Data) * ncol(v2Data)), fn = PBT_var2_optimllh, gr = PBT_var2_grad,
							piGroup = piGroup, v2Data = v2Data, tagRates = tagRates,
							control = list(fnscale = -1, maxit = 1000), method = "BFGS"
					)
	if(propsMLE$convergence != 0) warning("Convergence error in PBT_var2_calc_MLE")
	# apply softmax to translate into probabilities
	varProbs <- matrix(propsMLE$par, nrow = nrow(v2Data), ncol = ncol(v2Data), byrow = TRUE)
	for(i in 1:nrow(varProbs)){
		varProbs[i,] <- softMax(varProbs[i,])
	}
	rownames(varProbs) <- rownames(v2Data)
	colnames(varProbs) <- colnames(v2Data)
	return(varProbs)
}

#' wrapper for estimating var 2 with PBT as var1
#' @param par real numbers corresponding to softmax probabilities of
#' @param piGroup "known" proportions of each group in order of rows of v2Data
#' @param v2Data counts of each combination
#' @param tagRates tagRates in order of piGroup
#' @keywords internal
#' @noRd
#' @return log likelihood value
PBT_var2_optimllh <- function(par, piGroup, v2Data, tagRates){
	# apply softmax to translate into probabilities
	varProbs <- matrix(par, nrow = nrow(v2Data), ncol = ncol(v2Data), byrow = TRUE)
	for(i in 1:nrow(varProbs)){
		varProbs[i,] <- softMax(varProbs[i,])
	}
	return(PBT_var2_log_likelihood(varProbs = varProbs, piGroup = piGroup,
											 v2Data = v2Data, tagRates = tagRates))
}

#' log_likelihood for estimating var 2 with PBT as var1
#' @param varProbs rows are groups, cols are var2 cats, values are probs within each group
#' @param piGroup "known" proportions of each group in order of rows of v2Data
#' @param v2Data counts of each combination
#' @param tagRates tagRates in order of piGroup
#' @keywords internal
#' @noRd
#' @return log likelihood value
PBT_var2_log_likelihood <- function(varProbs, piGroup,
												v2Data, tagRates){
	# product of multinomial likelihoods (sum of log-likelihoods)
	llh <- 0
	nPBT <- nrow(v2Data) - 1
	# first the tagged fish
	for(i in 1:nPBT){
		llh <- llh + dmultinom(x = v2Data[i,], prob = varProbs[i,], log = TRUE)
	}
	# now the untagged fish
	untagProb <- piGroup
	untagProb[1:nPBT] <- untagProb[1:nPBT] * (1 - tagRates[1:nPBT])
	untagProb <- untagProb / sum(untagProb)

	untagVarProbs <- drop(matrix(untagProb, nrow = 1) %*% varProbs)
	llh <- llh + dmultinom(x = v2Data[nrow(v2Data),], prob = untagVarProbs, log = TRUE)

	return(llh)
}

PBT_var2_grad <- function(par, piGroup, v2Data, tagRates){
	gr <- rep(0, length(par))
	nPBT <- nrow(v2Data) - 1
	# apply softmax to translate into probabilities
	varProbs <- matrix(par, nrow = nrow(v2Data), ncol = ncol(v2Data), byrow = TRUE)
	for(i in 1:nrow(varProbs)){
		varProbs[i,] <- softMax(varProbs[i,])
	}

	# first the tagged fish
	pos <- 1
	for(i in 1:nPBT){
		for(j in 1:ncol(v2Data)){
			gr[pos] <- sum(v2Data[i,] * (((1:ncol(v2Data)) == j) - varProbs[i,j]))
			pos <- pos + 1
		}
	}

	# now the untagged fish
	untagProb <- piGroup
	untagProb[1:nPBT] <- untagProb[1:nPBT] * (1 - tagRates[1:nPBT])
	untagProb <- untagProb / sum(untagProb)

	untagVarProbs <- drop(matrix(untagProb, nrow = 1) %*% varProbs)

	unTagData <- v2Data[nPBT + 1,]
	pos <- 1
	for(p in 1:(nPBT+1)){
		for(j in 1:ncol(v2Data)){
			smDeriv <- varProbs[p,] * (((1:ncol(v2Data)) == j) - varProbs[p,j])
			gr[pos] <- gr[pos] + sum((unTagData / untagVarProbs) * untagProb[p] * smDeriv)
			pos <- pos + 1
		}
	}

	# print(data.frame(an = gr,
	# 					  num = numDeriv::grad(PBT_var2_optimllh, par, piGroup = piGroup, v2Data = v2Data, tagRates = tagRates)) %>%
	# 					  	mutate(diff = an - num) %>% pull(diff) %>% mean
	# 					  )

	return(gr)
}


#' MLE estimates composition for var3 within W
#' @param piGroup "known" proportions of each group in order of rows of
#' @param tagRates tagRates in order of piGroup
#' @param var2Probs known composition of var2(of 3). rows are PBT groups with Unassigned (wild) at the end
#' @param hnc_var3 counts of PBT assigned in cats of var3
#' @param un_counts counts of unassigned in cats of var2 (rows) and var3 (cols)
#' @return returns a matrix with var2 as rows and var3 as cols, and props within each group as values
#' @keywords internal
#' @noRd
PBT_var3_calc_MLE <- function(piGroup, tagRates, var2Probs, hnc_var3, un_counts){
	# now MLE estimation of proportions\
	propsMLE <- optim(par = rep(1, (nrow(hnc_var3) * ncol(hnc_var3)) + (nrow(un_counts) * ncol(un_counts))),
							fn = PBT_var3_optimllh, gr = PBT_var3_grad,
							piGroup = piGroup, var2Probs = var2Probs, tagRates = tagRates,
							hnc_var3 = hnc_var3, un_counts = un_counts,
							control = list(fnscale = -1, maxit = 1000), method = "BFGS"
	)
	if(propsMLE$convergence != 0) warning("Convergence error in PBT_var3_calc_MLE")
	# apply softmax to translate into probabilities
	var3Probs <- matrix(propsMLE$par, nrow = nrow(hnc_var3) + nrow(un_counts), ncol = ncol(hnc_var3), byrow = TRUE)
	# don't need HNC values
	var3Probs <- var3Probs[(nrow(hnc_var3) + 1):nrow(var3Probs),,drop = FALSE]
	for(i in 1:nrow(var3Probs)){
		var3Probs[i,] <- softMax(var3Probs[i,])
	}
	rownames(var3Probs) <- rownames(un_counts)
	colnames(var3Probs) <- colnames(un_counts)
	return(var3Probs)
}


#' wrapper for estimating var 3 in W
#' @param par real numbers corresponding to softmax probabilities of
#' @param piGroup "known" proportions of each group in order of rows of
#' @param tagRates tagRates in order of piGroup
#' @param var2Probs known composition of var2(of 3). rows are PBT groups with Unassigned (wild) at the end
#' @param hnc_var3 counts of PBT assigned in cats of var3
#' @param un_counts counts of unassigned in cats of var2 (rows) and var3 (cols)
#' @keywords internal
#' @noRd
#' @return log likelihood value
PBT_var3_optimllh <- function(par, piGroup, tagRates, var2Probs, hnc_var3, un_counts){
	# apply softmax to translate into probabilities
	# these are the probabilities of var3 within the pbt groups and then
	# within the var2 groups of the unassigned fish
	# so, need one row for each PBT group and one for each var2 group of the Unassigned
	# and one column for each category of var3
	# PBT groups are first, and then cats of var2 in Unassigned
	var3Probs <- matrix(par, nrow = nrow(hnc_var3) + nrow(un_counts), ncol = ncol(hnc_var3), byrow = TRUE)
	for(i in 1:nrow(var3Probs)){
		var3Probs[i,] <- softMax(var3Probs[i,])
	}
	return(PBT_var3_log_likelihood(var3Probs = var3Probs, piGroup = piGroup,
											 tagRates = tagRates, var2Probs = var2Probs,
											 hnc_var3 = hnc_var3, un_counts = un_counts))
}


#' log_likelihood for estimating var 3 in W
#' @param var3Probs Probabilities of var3 cats (cols) in PBT groups and in var2 cats of W (rows)
#' @param piGroup "known" proportions of each group in order of rows of
#' @param tagRates tagRates in order of piGroup
#' @param var2Probs known composition of var2(of 3). rows are PBT groups with Unassigned (wild) at the end
#' @param hnc_var3 counts of PBT assigned in cats of var3
#' @param un_counts counts of unassigned in cats of var2 (rows) and var3 (cols)
#' @keywords internal
#' @noRd
#' @return log likelihood value
PBT_var3_log_likelihood <- function(var3Probs, piGroup, tagRates, var2Probs,
												hnc_var3, un_counts){
	# product of multinomial likelihoods (sum of log-likelihoods)
	llh <- 0
	nPBT <- nrow(hnc_var3)
	# first the tagged fish
	for(i in 1:nPBT){
		llh <- llh + dmultinom(x = hnc_var3[i,], prob = var3Probs[i,], log = TRUE)
	}
	# now the untagged fish

	untagProb <- piGroup
	untagProb[1:nPBT] <- untagProb[1:nPBT] * (1 - tagRates[1:nPBT])
	untagProb <- untagProb / sum(untagProb) # this is compostion of the Unassigned fish

	# this is probabilities an unassigned fish is in cat of var2 x var3
	untagVar3Probs <- matrix(0, nrow = nrow(un_counts), ncol = ncol(un_counts))
	for(i in 1:nrow(un_counts)){
		for(j in 1:nPBT){
			untagVar3Probs[i,] <- untagVar3Probs[i,] + (untagProb[j] * var2Probs[j,i] * var3Probs[j,])
		}
		untagVar3Probs[i,] <- untagVar3Probs[i,] + (untagProb[nPBT + 1] * var2Probs[nPBT + 1,i] * var3Probs[nPBT + i,])
	}

	llh <- llh + dmultinom(x = as.vector(un_counts), prob = as.vector(untagVar3Probs), log = TRUE)

	return(llh)
}


PBT_var3_grad <- function(par, piGroup, tagRates, var2Probs, hnc_var3, un_counts){
	# apply softmax to translate into probabilities
	# these are the probabilities of var3 within the pbt groups and then
	# within the var2 groups of the unassigned fish
	# so, need one row for each PBT group and one for each var2 group of the Unassigned
	# and one column for each category of var3
	# PBT groups are first, and then cats of var2 in Unassigned

	var3Probs <- matrix(par, nrow = nrow(hnc_var3) + nrow(un_counts), ncol = ncol(hnc_var3), byrow = TRUE)
	for(i in 1:nrow(var3Probs)){
		var3Probs[i,] <- softMax(var3Probs[i,])
	}

	gr <- rep(0, length(par))
	nPBT <- nrow(hnc_var3)

	# first the tagged fish
	pos <- 1
	for(i in 1:nPBT){
		for(j in 1:ncol(var3Probs)){
			gr[pos] <- sum(hnc_var3[i,] * (((1:ncol(var3Probs)) == j) - var3Probs[i,j]))
			pos <- pos + 1
		}
	}

	# then the untagged fish
	untagProb <- piGroup
	untagProb[1:nPBT] <- untagProb[1:nPBT] * (1 - tagRates[1:nPBT])
	untagProb <- untagProb / sum(untagProb) # this is compostion of the Unassigned fish

	# this is probabilities an unassigned fish is in cat of var2 x var3
	untagVar3Probs <- matrix(0, nrow = nrow(un_counts), ncol = ncol(un_counts))
	for(i in 1:nrow(un_counts)){
		for(j in 1:nPBT){
			untagVar3Probs[i,] <- untagVar3Probs[i,] + (untagProb[j] * var2Probs[j,i] * var3Probs[j,])
		}
		untagVar3Probs[i,] <- untagVar3Probs[i,] + (untagProb[nPBT + 1] * var2Probs[nPBT + 1,i] * var3Probs[nPBT + i,])
	}

	# PBT group influence
	for(r in 1:nPBT){
		for(c in 1:ncol(var3Probs)){
			pos <- ((r - 1) * ncol(var3Probs)) + c
			# for a given parameter
			for(i1 in 1:nrow(un_counts)){
				for(i2 in 1:ncol(un_counts)){
					# for a given category (var2 x var3)
					gr[pos] <- gr[pos] + (
						(un_counts[i1,i2] / untagVar3Probs[i1,i2]) *
							untagProb[r] * var2Probs[r,i1] *
							var3Probs[r,i2] * ((i2 == c) - var3Probs[r,c]) # softmax derivative
					)

				}
			}
		}
	}

	# wild group influence
	for(r in (nPBT+1):nrow(var3Probs)){
		for(c in 1:ncol(var3Probs)){
			pos <- ((r - 1) * ncol(var3Probs)) + c
			# for a given parameter
			for(i2 in 1:ncol(un_counts)){
				# for a given category (var2 x var3)
				gr[pos] <- gr[pos] + (
					(un_counts[r - nPBT,i2] / untagVar3Probs[r - nPBT,i2]) *
						untagProb[nPBT + 1] * var2Probs[nPBT + 1, r - nPBT] *
						var3Probs[r,i2] * ((i2 == c) - var3Probs[r,c]) # softmax derivative
				)
			}
		}
	}

	# print(data.frame(an = gr,
	# 					  num = numDeriv::grad(PBT_var3_optimllh, par, piGroup = piGroup,
	# 					  							tagRates = tagRates, var2Probs = var2Probs,
	# 					  							hnc_var3 = hnc_var3, un_counts = un_counts)) %>%
	# 					  	mutate(diff = abs(an - num)) %>% pull(diff) %>% summary
	# 					  )
	return(gr)
}


#####
## Accounting method utility functions
#####



#' estimates composition for a non-PBT categorical variable
#' @param values values of the category for all the samples, a vector
#' @param boots number of bootstrap iterations
#' @return a list with two components: first is tibble of point estimates,
#'   second is matrix with bootstraps. Columns are categories are in order following the
#'   point estimates and are named accordingly. Rows are bootstrap iterations.
#' @keywords internal
#' @noRd
nonPBT_breakdown <- function(values, boots){
	props <- table(values) # removes NAs automatically
	nam <- names(props)
	sampleSize <- sum(props)
	props <- as.numeric(props/sampleSize)
	output <- list(tibble(group = nam, prop = props))
	output[[2]] <- t(rmultinom(n = boots, size = sampleSize, prob = props)) / sampleSize
	colnames(output[[2]]) <- nam
	return(output)
}

#' estimates composition for PBT groups
#' @param values values of the category for all the samples, a vector
#' @param tagRates a tibble with release group in col1 and tag rate in col2
#' @return returns tibble with group (release Group or Unassigned) and
#'   prop (proportion of the stratum) columns
#' @keywords internal
#' @noRd
PBT_expand_calc <- function(values, tagRates){
	props <- table(values) # removes NAs automatically
	pr <- tibble(group = names(props), count = as.numeric(props)) %>% left_join(tagRates, by = "group") %>%
		mutate(expand = count / tagRate, diff = expand - count)
	if(!"Unassigned" %in% pr$group) pr <- pr %>% bind_rows(tibble(group = "Unassigned",
																					  count = 0, tagRate = 1, expand = 0, diff = 0))
	# truncate expansions when not enough Unassigned
	if((pr %>% filter(group == "Unassigned") %>% pull(count)) < sum(pr$diff)){
		pr$diff <- (pr %>% filter(group == "Unassigned") %>% pull(count)) * (pr$diff / sum(pr$diff))
		pr$expand <- pr$count + pr$diff
		pr$expand[pr$group == "Unassigned"] <- 0
	} else {
		pr$expand[pr$group == "Unassigned"] <- pr$expand[pr$group == "Unassigned"] - sum(pr$diff)
	}
	return(pr %>% mutate(prop = expand / sum(expand)) %>% select(group, prop))
}


#' @param values values of the category for all the samples, a vector
#' @param tagRates a tibble with release group in col1 and tag rate in col2
#' @param boots number of bootstrap iterations
#' @return a list with two components: first is tibble of point estimates,
#'   second is matrix with bootstraps. Columns are categories are in order following the
#'   point estimates and are named accordingly. Rows are bootstrap iterations.
#' @keywords internal
#' @noRd
PBT_breakdown <- function(values, tagRates, boots){
	colnames(tagRates) <- c("group", "tagRate")
	if("Unassigned" %in% tagRates[[1]]){
		if(tagRates[[2]][tagRates[[1]] == "Unassigned"] != 1){
			stop("Unassigned must not be included in the tag rate file or have a tag rate of 1")
		}
	} else {
		# add Unassigned with tag rate of 1
		tagRates <- bind_rows(tagRates, tibble(group = "Unassigned", tagRate = 1))
	}
	output <- list(PBT_expand_calc(values, tagRates))
	# non-parametric bootstrapping
	output[[2]] <- matrix(nrow = boots, ncol = nrow(output[[1]]))
	for(i in 1:boots){
		temp <- PBT_expand_calc(values = sample(values, size = length(values), replace = TRUE), tagRates)
		output[[2]][i,] <- output[[1]] %>% select(group) %>% left_join(temp, by = "group") %>%
			mutate(prop = replace_na(prop, 0)) %>% pull(prop)
	}
	colnames(output[[2]]) <- output[[1]]$group
	return(output)
}

#' estimates composition of one stratum for one or two variables
#' @param trapStratumData values of the category for all the samples, a vector
#' @param vars a tibble with release group in col1 and tag rate in col2
#' @param pbt_var name of pbt variable (character)
#' @return a list with one component for each variable.They contain point
#'   estimates and bootstrap estimates
#' @keywords internal
#' @noRd
subGroup_breakdown <- function(trapStratumData, vars, pbt_var, tagRates, boots){
	list_break <- list()
	v1 <- vars[1]
	v2 <- if(length(vars) == 2) vars[2] else NULL
	trapStratumData <- trapStratumData[!is.na(trapStratumData[[v1]]),]
	if(v1 == pbt_var){
		list_break[[1]] <- PBT_breakdown(values = trapStratumData[[v1]], tagRates = tagRates, boots = boots)
	} else {
		list_break[[1]] <- nonPBT_breakdown(values = trapStratumData[[v1]], boots = boots)
	}
	names(list_break)[1] <- v1
	if(!is.null(v2)){
		trapStratumData <- trapStratumData[!is.na(trapStratumData[[v2]]),] # remove observations missing data for v2
		# storing all the values for the subvariables
		# as a massive list and named/indexed according to order
		list_break[[2]] <- list() # index here is category of the first variable
		for(j in 1:nrow(list_break[[1]][[1]])){ # run breakdown for all categories
			v1Cat <- list_break[[1]][[1]]$group[j] # current value of v1
			tempData <- trapStratumData[[v2]][trapStratumData[[v1]] == v1Cat]
			if(list_break[[1]][[1]]$prop[j] == 0 && length(tempData) == 0){
				# makign one entry for when there are no Unassigned fish
				list_break[[2]][[j]] <- list(tibble(group = sort(trapStratumData[[v2]])[1], prop = 0),
													  matrix(0, nrow = boots, ncol = 1))
				colnames(list_break[[2]][[j]][[2]]) <- list_break[[2]][[j]][[1]]$group[1]
				next
			}
			if(v2 == pbt_var){
				list_break[[2]][[j]] <- PBT_breakdown(values = tempData, tagRates = tagRates, boots = boots)
			} else {
				list_break[[2]][[j]] <- nonPBT_breakdown(values = tempData, boots = boots)
			}
		}
		names(list_break[[2]]) <- list_break[[1]][[1]]$group
		names(list_break)[2] <- v2
	}
	return(list_break)
}
delomast/escapeLGD documentation built on June 9, 2025, 10:52 p.m.