# R/qhat_estimation_functions.R In drpop: Efficient and Doubly Robust Population Size Estimation

#### Documented in qhat_gamqhat_logitqhat_mlogitqhat_rangerqhat_rangerlogit

```#' Estimate marginal and joint distribution of lists j and k using logistic regression.
#'
#' @param List.train The training data matrix used to estimate the distibution functions.
#' @param List.test The data matrix on which the estimator function is applied.
#' @param K The number of lists in the data.
#' @param j The first list that is conditionally independent.
#' @param k The second list that is conditionally independent.
#' @param margin The minimum value the estimates can attain to bound them away from zero.
#' @param ... Any extra arguments passed into the function.
#' @return A list of the marginal and joint distribution probabilities \code{q1}, \code{q2} and \code{q12}.
#' @examples
#' \dontrun{
#' qhat = qhat_logit(List.train = List.train, List.test = List.test, margin = 0.005)
#' q1 = qhat\$q1
#' q2 = qhat\$q2
#' q12 = qhat\$q12
#' }
#' @importFrom stats binomial formula glm model.matrix na.omit predict rnorm sigma var
#' @export
qhat_logit <- function(List.train, List.test, K = 2, j = 1, k = 2, margin = 0.005, ...){

stopifnot(ncol(List.train) > K)
if(missing(List.test)){
List.test = List.train
}
colnames(List.train) = c(paste("L", 1:K, sep = ''), paste("x", 1:(ncol(List.train) - K), sep = ''))
colnames(List.test) = c(paste("L", 1:K, sep = ''), paste("x", 1:(ncol(List.train) - K), sep = ''))

fitj0 = try(glm(formula(paste("L", j, "*(1 - L", k, ") ~.", sep = '')), family = binomial(link = "logit"), data = List.train[,c(j, k, (K + 1):ncol(List.train))]))
fit0k = try(glm(formula(paste("L", k, "*(1 - L", j, ") ~.", sep = '')), family = binomial(link = "logit"), data = List.train[,c(j, k, (K + 1):ncol(List.train))]))
fitjk = try(glm(formula(paste("L", j, "*L", k, " ~.", sep = '')), family = binomial(link = "logit"), data = List.train[,c(j, k, (K + 1):ncol(List.train))]))

if("try-error" %in% c(class(fitj0), class(fit0k), class(fitjk))){
warning("One or more fits with logistic regression failed.")
return(NULL)
}else{
q12 = pmax(predict(fitjk, newdata = List.test, type = "response"), margin)
q1 = pmax(q12 + predict(fitj0, newdata = List.test, type = "response"), margin)
q2 = pmax(q12 + predict(fit0k, newdata = List.test, type = "response"), margin)

return(list(q1 = q1, q2 = q2, q12 = q12))
}
}
#' Estimate marginal and joint distribution of lists j and k using generalized additive models.
#'
#' @param List.train The training data matrix used to estimate the distibution functions.
#' @param List.test The data matrix on which the estimator function is applied.
#' @param K The number of lists in the data.
#' @param j The first list that is conditionally independent.
#' @param k The second list that is conditionally independent.
#' @param margin The minimum value the estimates can attain to bound them away from zero.
#' @param ... Any extra arguments passed into the function.
#' @return A list of the marginal and joint distribution probabilities \code{q1}, \code{q2} and \code{q12}.
#' @examples
#' \dontrun{
#' qhat = qhat_gam(List.train = List.train, List.test = List.test, margin = 0.005)
#' q1 = qhat\$q1
#' q2 = qhat\$q2
#' q12 = qhat\$q12
#' }
#' @import gam
#' @references Trevor Hastie (2020). gam: Generalized Additive Models. _R package version 1.20_. https://CRAN.R-project.org/package=gam
#' @export
qhat_gam <- function(List.train, List.test, K = 2, j = 1, k = 2, margin = 0.005, ...){

stopifnot(ncol(List.train) > K)
if(missing(List.test)){
List.test = List.train
}
colnames(List.train) = c(paste("L", 1:K, sep = ''), paste("x", 1:(ncol(List.train) - K), sep = ''))
colnames(List.test) = c(paste("L", 1:K, sep = ''), paste("x", 1:(ncol(List.train) - K), sep = ''))

requireNamespace("gam", quietly = TRUE)
l = ncol(List.train) - K
nnum = intersect(names(which(lapply(subset(List.train, select = -c(1:K)), "class") != "factor")),
names(which(lapply(subset(List.train, select =  -c(1:K)), function(icol){length(unique(icol))}) > 3)))
nfac = setdiff(names(List.train[,-c(1:K)]), nnum)
if(!length(nnum)){
covformula = paste0(nfac, collapse = ' + ')
}else if(!length(nfac)){
covformula = paste0('s(', nnum, ')', collapse = ' + ')
}else{
covformula = paste0(paste0('s(', nnum, ')', collapse = ' + '),
'+', paste0('s(', nnum, ')', collapse = ' + '))
}

fitj = try(gam::gam(formula(paste0("L", j, " ~", covformula)), data = List.train, family = binomial("logit")))
fitk = try(gam::gam(formula(paste0("L", k, " ~", covformula)), data = List.train, family = binomial("logit")))
fitjk = try(gam::gam(formula(paste0("L", j, "*L", k, " ~", covformula)), data = List.train, family = binomial("logit")))

if("try-error" %in% c(class(fitj), class(fitk), class(fitjk))){
warning("One or more fits with GAM regression failed.")
return(NULL)
}else{
q12 = pmax(predict(fitjk, newdata = List.test, type = "response"), margin)
q1 = pmax(predict(fitj, newdata = List.test, type = "response"), margin)
q2 = pmax(predict(fitk, newdata = List.test, type = "response"), margin)

return(list(q1 = q1, q2 = q2, q12 = q12))
}
}
#' Estimate marginal and joint distribution of lists j and k using ranger.
#'
#' @param List.train The training data matrix used to estimate the distibution functions.
#' @param List.test The data matrix on which the estimator function is applied.
#' @param K The number of lists in the data.
#' @param j The first list that is conditionally independent.
#' @param k The second list that is conditionally independent.
#' @param margin The minimum value the estimates can attain to bound them away from zero.
#' @param ... Any extra arguments passed into the function.
#' @return A list of the marginal and joint distribution probabilities \code{q1}, \code{q2} and \code{q12}.
#' @examples
#' \dontrun{
#' qhat = qhat_ranger(List.train = List.train, List.test = List.test, margin = 0.005)
#' q1 = qhat\$q1
#' q2 = qhat\$q2
#' q12 = qhat\$q12
#' }
#' @import ranger
#' @references Marvin N. Wright, Andreas Ziegler (2017). ranger: A Fast Implementation of Random Forests for High Dimensional Data in C++ and R. _Journal of Statistical Software_, *77*(1), 1-17. doi:10.18637/jss.v077.i01
#' @export
qhat_ranger <- function(List.train, List.test, K = 2, j = 1, k = 2, margin = 0.005, ...){
requireNamespace("ranger", quietly = TRUE)
l = ncol(List.train) - K

stopifnot(l>0)
if(missing(List.test)){
List.test = List.train
}

colnames(List.train) = c(paste("L", 1:K, sep = ''), paste("x", 1:(ncol(List.train) - K), sep = ''))
colnames(List.test) = c(paste("L", 1:K, sep = ''), paste("x", 1:(ncol(List.train) - K), sep = ''))

fitj = ranger(formula(paste("factor(L", j, ") ~.", sep = '')), data = List.train[,-c(1:K)[-j]], probability = TRUE, classification = TRUE)
fitk = ranger(formula(paste("factor(L", k, ") ~.", sep = '')), data = List.train[,-c(1:K)[-k]], probability = TRUE, classification = TRUE)
fitjk = ranger(formula(paste("factor(L", j, "*L", k, ") ~.", sep = '')), data = List.train[,c(j, k, K + 1:l)], probability = TRUE, classification = TRUE)

if("try-error" %in% c(class(fitj), class(fitk), class(fitjk))){
warning("One or more fits with GAM regression failed.")
return(NULL)
}else{
q12 = pmax(predict(fitjk, data = List.test, type = "response")\$predictions[,'1'], margin)
q1 = pmax(predict(fitj, data = List.test, type = "response")\$predictions[,'1'], margin)
q2 = pmax(predict(fitk, data = List.test, type = "response")\$predictions[,'1'], margin)

return(list(q1 = q1, q2 = q2, q12 = q12))
}
}
#' Estimate marginal and joint distribution of lists j and k using super learner.
#'
#' @param List.train The training data matrix used to estimate the distibution functions.
#' @param List.test The data matrix on which the estimator function is applied.
#' @param K The number of lists in the data.
#' @param j The first list that is conditionally independent.
#' @param k The second list that is conditionally independent.
#' @param sl.lib The functions from the SuperLearner library to be used for model fitting. See [SuperLearner::listWrappers()].
#' @param margin The minimum value the estimates can attain to bound them away from zero.
#' @param num_cores The number of cores to be used for paralellization in Super Learner.
#' @param ... Any extra arguments passed into the function.
#' @return A list of the marginal and joint distribution probabilities \code{q1}, \code{q2} and \code{q12}.
#' @examples
#' \dontrun{
#' qhat = qhat_sl(List.train = List.train, List.test = List.test, margin = 0.005, num_cores = 1)
#' q1 = qhat\$q1
#' q2 = qhat\$q2
#' q12 = qhat\$q12
#' }
#' @import SuperLearner
#' @references Eric Polley, Erin LeDell, Chris Kennedy and Mark van der Laan (2021). SuperLearner: Super Learner Prediction. _R package version 2.0-28_. https://CRAN.R-project.org/package=SuperLearner
#' @references van der Laan, M. J., Polley, E. C. and Hubbard, A. E. (2008) Super Learner, _Statistical Applications of Genetics and Molecular Biology_, 6, article 25.
#' @export
qhat_sl <- function (List.train, List.test, K = 2, j = 1, k = 2, margin = 0.005,
sl.lib = c("SL.glm", "SL.gam", "SL.glm.interaction", "SL.ranger", "SL.glmnet"), num_cores = NA,...)
{

stopifnot(ncol(List.train) > K)
if(missing(List.test)){
List.test = List.train
}

requireNamespace("SuperLearner", quietly = TRUE, warn.conflicts = FALSE)
requireNamespace("parallel", quietly = TRUE, warn.conflicts = FALSE)
requireNamespace("dplyr")
requireNamespace("janitor", quietly = TRUE, warn.conflicts = FALSE)
requireNamespace("tidyr", quietly = TRUE, warn.conflicts = FALSE)
slib = intersect(sl.lib, c("SL.glm", "SL.gam",
"SL.glm.interaction"))
slib1 = setdiff(sl.lib, slib)
if(length(slib) == 0){
slib2 = slib1
}else{
slib2 <- c(slib1, slib, split(rbind(slib, "screen.corP"),
rep(1:length(slib), each = 2)), split(rbind(slib, "screen.glmnet"),
rep(1:length(slib), each = 2)))
}
num_cores = min(num_cores, parallel::detectCores() - 1, na.rm = TRUE)

factor_cols <- subset(List.train, select = -c(1:K)) %>% select_if(Negate(is.numeric)) %>% names()

if(length(factor_cols)) {
List.train = data.frame(List.train[,!(names(List.train) %in% factor_cols)],
model.matrix(formula(paste('~', paste(factor_cols, collapse = '+'))), List.train))
List.test = data.frame(List.test[,!(names(List.test) %in% factor_cols)],
model.matrix(formula(paste('~', paste(factor_cols, collapse = '+'))), List.test))
}

if (ncol(List.train) == K + 1) {
xtrain = data.frame(x1 = List.train[, K + 1])
xtest = data.frame(x1 = List.test[, K + 1])
} else {
xtrain = List.train[, -c(1:K)]
xtest = List.test[, -c(1:K)]
}

xtrain = janitor::clean_names(xtrain)
xtest = janitor::clean_names(xtest)

if(num_cores == 1){
fitj = tryCatch(SuperLearner(Y = as.numeric(List.train[,j]), X = xtrain, family = binomial(),
SL.library = slib2, method = "method.AUC",
verbose = FALSE), silent = TRUE)
fitk = tryCatch(SuperLearner(Y = as.numeric(List.train[,k]), X = xtrain, family = binomial(),
SL.library = slib2, method = "method.AUC",
verbose = FALSE), silent = TRUE)
fitjk = tryCatch(SuperLearner(Y = as.numeric(pmin(List.train[, j], List.train[, k])),
X = xtrain, family = binomial(), method = "method.AUC",
SL.library = slib2, verbose = FALSE), silent = TRUE)

}else{
options(mc.cores = num_cores)
getOption("mc.cores")
cl <- parallel::makeCluster(num_cores, type = "PSOCK")
foo <-parallel::clusterEvalQ(cl, library(SuperLearner))
parallel::clusterSetRNGStream(cl, iseed = 1)
#
parallel::clusterExport(cl, foo)

fitj = tryCatch(snowSuperLearner(cluster = cl, Y = as.numeric(List.train[,j]), X = xtrain, family = binomial(),
SL.library = slib2, method = "method.AUC",
verbose = FALSE), silent = TRUE)
fitk = tryCatch(snowSuperLearner(cluster = cl, Y = as.numeric(List.train[,k]), X = xtrain, family = binomial(),
SL.library = slib2, method = "method.AUC",
verbose = FALSE), silent = TRUE)
fitjk = tryCatch(snowSuperLearner(cluster = cl, Y = as.numeric(pmin(List.train[, j], List.train[, k])),
X = xtrain, family = binomial(), method = "method.AUC",
SL.library = slib2, verbose = FALSE), silent = TRUE)
parallel::stopCluster(cl)
}
if ("try-error" %in% c(class(fitj), class(fitk), class(fitjk))) {
warning("One or more fits with SuperLearner regression failed.")
return(NULL)
}
else {
q12 = pmax(pmin(predict(fitjk, newdata = xtest, onlySL = TRUE)\$pred,
1), margin)
q1 = pmin(pmax(predict(fitj, newdata = xtest, onlySL = TRUE)\$pred,
q12), 1)
q2 = pmin(pmax(predict(fitk, newdata = xtest, onlySL = TRUE)\$pred,
q12), 1)
return(list(q1 = q1, q2 = q2, q12 = q12))
}
}
#' Estimate marginal and joint distribution of lists j and k using multinomial logistic model.
#'
#' @param List.train The training data matrix used to estimate the distibution functions.
#' @param List.test The data matrix on which the estimator function is applied.
#' @param K The number of lists in the data.
#' @param j The first list that is conditionally independent.
#' @param k The second list that is conditionally independent.
#' @param margin The minimum value the estimates can attain to bound them away from zero.
#' @param ... Any extra arguments passed into the function.
#' @return A list of the marginal and joint distribution probabilities \code{q1}, \code{q2} and \code{q12}.
#' @examples
#' \dontrun{
#' qhat = qhat_mlogit(List.train = List.train, List.test = List.test, margin = 0.005)
#' q1 = qhat\$q1
#' q2 = qhat\$q2
#' q12 = qhat\$q12
#' }
#' @importFrom nnet "multinom"
#' @references Croissant Y (2020). Estimation of Random Utility Models in R: The mlogit Package. _Journal of Statistical Software_, *95*(11), 1-41. doi: 10.18637/jss.v095.i11 (URL: https://doi.org/10.18637/jss.v095.i11).
#' @references Venables, W. N. & Ripley, B. D. (2002) Modern Applied Statistics with S. Fourth Edition. _Springer_, New York. ISBN 0-387-95457-0
#' @export
qhat_mlogit <- function(List.train, List.test, K = 2, j = 1, k = 2, margin = 0.005, ...){

stopifnot(ncol(List.train) > K)
if(missing(List.test)){
List.test = List.train
}

requireNamespace("nnet", quietly = TRUE)

l = ncol(List.train) - K

stopifnot(l > 0)

colnames(List.train) = c(paste("L", 1:K, sep = ''), paste("x", 1:(ncol(List.train) - K), sep = ''))
colnames(List.test) = c(paste("L", 1:K, sep = ''), paste("x", 1:(ncol(List.train) - K), sep = ''))

eval(parse(text = paste0("List.train\$caphist = paste0(List.train\$L", j, ", List.train\$L", k, ")")))
eval(parse(text = paste0("List.test\$caphist = paste0(List.test\$L", j, ", List.test\$L", k, ")")))

mfit = try(multinom(formula = formula(paste("caphist ~ ", paste("x", 1:l, sep = '', collapse = ' + '))), data = List.train), silent = TRUE)
if(!("try-error" %in% class(mfit))){

pred = predict(mfit, newdata = List.test, "probs")
if(is.null(setdiff(c("10", "11", "01"), colnames(pred)))){
stop("Training data is missing one or more of the capture history combinations.")
}
q12 = pmax(pred[,"11"], margin)
q1 = pmin(pred[,"10"] + q12, 1)
q2 = pmin(pred[,"01"] + q12, 1)
}else{
warning("One or more fits with SuperLearner regression failed.")
}
return(list(q1 = q1, q2 = q2, q12 = q12))
}
#' Estimate marginal and joint distribution of lists j and k using ensemble of ranger and logit.
#'
#' @param List.train The training data matrix used to estimate the distibution functions.
#' @param List.test The data matrix on which the estimator function is applied.
#' @param K The number of lists in the data.
#' @param j The first list that is conditionally independent.
#' @param k The second list that is conditionally independent.
#' @param margin The minimum value the estimates can attain to bound them away from zero.
#' @param ... Any extra arguments passed into the function.
#' @return A list of the marginal and joint distribution probabilities \code{q1}, \code{q2} and \code{q12}.
#' @examples
#' \dontrun{
#' qhat = qhat_ranger(List.train = List.train, List.test = List.test, margin = 0.005)
#' q1 = qhat\$q1
#' q2 = qhat\$q2
#' q12 = qhat\$q12
#' }
#' @import ranger nnls
#' @references Marvin N. Wright, Andreas Ziegler (2017). ranger: A Fast Implementation of Random Forests for High Dimensional Data in C++ and R. _Journal of Statistical Software_, *77*(1), 1-17. doi:10.18637/jss.v077.i01
#' @references Polley, Eric C. and van der Laan, Mark J., (May 2010) Super Learner In Prediction. _U.C. Berkeley Division of Biostatistics Working Paper Series_. Working Paper 266. https://biostats.bepress.com/ucbbiostat/paper266
#' @export
qhat_rangerlogit <- function(List.train, List.test, K = 2, j = 1, k = 2, margin = 0.005, ...){
requireNamespace("ranger", quietly = TRUE)
requireNamespace("nnls")
l = ncol(List.train) - K
if(missing(List.test)){
List.test = List.train
}

stopifnot(l>0)

colnames(List.train) = c(paste("L", 1:K, sep = ''), paste("x", 1:(ncol(List.train) - K), sep = ''))
colnames(List.test) = c(paste("L", 1:K, sep = ''), paste("x", 1:(ncol(List.train) - K), sep = ''))

fitj = ranger(formula(paste("factor(L", j, ") ~.", sep = '')), data = List.train[,-c(1:K)[-j]], probability = TRUE, classification = TRUE)
fitk = ranger(formula(paste("factor(L", k, ") ~.", sep = '')), data = List.train[,-c(1:K)[-k]], probability = TRUE, classification = TRUE)
fitjk = ranger(formula(paste("factor(L", j, "*L", k, ") ~.", sep = '')), data = List.train[,c(j, k, K + 1:l)], probability = TRUE, classification = TRUE)

if("try-error" %in% c(class(fitj), class(fitk), class(fitjk))){
warning("One or more fits with GAM regression failed.")
return(NULL)
}else{
q12.r = pmax(predict(fitjk, data = List.test, type = "response")\$predictions[,'1'], margin)
q1.r = pmax(predict(fitj, data = List.test, type = "response")\$predictions[,'1'], margin)
q2.r = pmax(predict(fitk, data = List.test, type = "response")\$predictions[,'1'], margin)

ql = qhat_logit(List.train = List.train, List.test = List.test, K = K, j = j, k = k, margin = margin)
q12.l = ql\$q12
q1.l = ql\$q1
q2.l = ql\$q2

requireNamespace("nnls")
A = cbind(1, q12.r, q12.l)
coef = coef(nnls(b = List.test[,paste0('L',j)]*List.test[,paste0('L',k)], A = A))
coef = coef/sum(coef)
q12 = A %*% coef
A = cbind(1, q1.r, q1.l)
coef = coef(nnls(b = List.test[,paste0('L',j)], A = A))
coef = coef/sum(coef)
q1 = pmax(A %*% coef, q12)
A = cbind(1, q2.r, q2.l)
coef = coef(nnls(b = List.test[,paste0('L',k)], A = A))
coef = coef/sum(coef)
q2 = pmin(pmax(A %*% coef, q12), 1 - q1 + q12)

return(list(q1 = q1, q2 = q2, q12 = q12))
}
}
```

## Try the drpop package in your browser

Any scripts or data that you put into this service are public.

drpop documentation built on Nov. 6, 2021, 1:06 a.m.