#' Build model for multiple systems estimation
#'
#' For multiple systems estimation model corresponding to a specified set of two-list effects,
#' set up the GLM model formula and data matrix.
#'
#'
#'@param zdat Data matrix with \eqn{t+1} columns. The first \eqn{t} columns, each corresponding to a particular list,
#' are 0s and 1s defining the capture histories
#' observed. The last column is the count of cases with that particular capture history.
#' List names A, B, ... are constructed if not supplied. Where a capture history is not explicitly listed,
#' it is assumed that it has observed count zero.
#'
#'@param mX A \eqn{2 \times k} matrix giving the \eqn{k} two-list interactions to be included in the model.
#' Each column of \code{mX} contains the numbers of the corresponding pair of lists.
#' If \code{mX = 0}, then all two-list interactions are included. If \code{mX = NULL}, no interactions are included and
#' the main effects model is fitted.
#'
#' @return A list with components as below.
#'
#' \code{datamatrix} A matrix with all possible capture histories, other than those corresponding to empty overlaps
#' within the model. An empty overlap is a pair of lists \eqn{(i,j)} such that no case is observed in both lists,
#' regardless of whether it is present on any other lists. If \eqn{(i,j)} is within the model specified by \code{mX},
#' all capture histories containing both \eqn{i} and \eqn{j} are
#' then excluded.
#'
#' \code{modelform} The model formula suitable to be called by the Generalized Linear Model function \code{glm}
#'
#' \code{emptyoverlaps} A matrix with two rows, whose columns give the indices of pairs of lists for which there are empty overlaps and where
#' the pair is within the specified model. The column names give the names of the lists
#' corresponding to each pair.
#'
#' @examples
#' data(NewOrl)
#' buildmodel(NewOrl, mX=NULL)
#'@export
buildmodel <- function(zdat, mX) {
m = dim(zdat)[2] - 1
#Recover the data matrix with combination of lists which have count zero
zdfull = tidylists(zdat, includezerocounts = T)
vn = dimnames(zdfull)[[2]]
#count
cn = vn[m+1]
#names of list excluding count
vn = vn[ -(m+1)]
modf = paste(cn, "~ .")
if (is.null(mX)) return(list(datamatrix = zdfull, modelform = modf, emptyoverlaps = matrix(nrow=2,ncol=0)))
if (sum(mX) == 0) {
mX = t(expand.grid(1:m, 1:m))
mX = mX[ , mX[1,]<mX[2,]]
}
mX=as.matrix(mX)
# --- find if any of the overlaps specified within the model are empty
neff = dim(mX)[2]
effc = rep(T,neff)
for (j in (1:neff)) {
i1 = mX[1,j]
i2 = mX[2,j]
joverlap = sum(zdfull[,i1]*zdfull[,i2]*zdfull[,m+1])
if (joverlap == 0) {
effc[j] = F
zdfull = zdfull[zdfull[,i1]*zdfull[,i2]==0, ]
}
}
#----construct matrix of interaction parameters where overlap is empty
mempty = as.matrix(mX[, !effc])
dimnames(mempty)[[1]] = c("","")
menames = apply(matrix(vn[mempty],nrow=2),2, paste, collapse=":")
dimnames(mempty)[[2]] = menames
#----construct matrix of interaction parameters where overlap is not empty
mX = as.matrix(mX[,effc])
mXc = vn[mX]
dim(mXc) = dim(mX)
#----construct the model formula to include
# the interactions in the model for which the overlap is not empty
mXmod = paste(apply(mXc,2, paste, collapse=":"), collapse=" + ")
if (nchar(mXmod) > 0) modf = paste(modf, mXmod, sep= " + ")
modf = as.formula(modf)
return(list(datamatrix = zdfull, modelform = modf, emptyoverlaps = mempty))
}
#' Check all possible models
#'
#'This routine checks every possible model for existence and identifiability of the maximum likelihood estimates.
#'
#' The routine calls the routine \code{\link{checkident}} for every model.
#' If there are \eqn{t} lists then there are \eqn{t(t-1)/2} pairs of lists, and hence \eqn{2^{t(t-1)/2}} possible models, because
#' the models correspond to subsets of the set of all pairs of lists. If \eqn{t = 7} there are 2,097,152 models to check, which would take
#' several hours. If \eqn{t} is equal to 8 or more, the routine terminates with a statement of the number of models and an explanation that
#' checking all of these is not possible in a reasonable time.
#'
#'@param zdata Data matrix with \eqn{t+1} columns. The first \eqn{t} columns, each corresponding to a particular list,
#' are 0s and 1s defining the capture histories
#' observed. The last column is the count of cases with that particular capture history.
#' List names A, B, ... are constructed if not supplied. Where a capture history is not explicitly listed,
#' it is assumed that it has observed count zero.
#'
#'@param verbose Specifies whether all possible models are listed in the output, or only those which generate a non-zero error code.
#'
#'@return
#' If \code{verbose=F}, it gives a matrix each of whose rows specifies a model, with last entry equal to the error code. Only those models yielding a non-zero error code are included, so if no models lead to an error the matrix is empty. Each of the first \eqn{t(t-1)/2} columns corresponds to a pair of lists, and for each row, presence in or absence from the corresponding model is indicated by the value 1 or 0 respectively.
#'
#' If \code{verbose=T}, it gives the full matrix of models together with the error codes they generate.
#'
#'
#' @examples
#'data(Artificial_3)
#'data(Western)
#'checkallmodels(Artificial_3, verbose=T)
#'checkallmodels(Western)
#'@export
checkallmodels <- function (zdata, verbose=F){
m = dim(zdata)[2] - 1
if (m == 6)
print("The program is checking 32728 models.")
if (m == 7)
print("There are 2,097,152 models to check. This may take several hours.")
if (m >= 8) {
cat("The number of models is", 2^{
m*(m - 1)/2
}, ". It is impossible to check them in a reasonable time.")
return(invisible())
}
mX0 = rbind(rep(1:m, each = m), rep(1:m, times = m))
mX0 = mX0[, (mX0[1, ] < mX0[2, ])]
varnames = dimnames(zdata)[[2]][1:m]
mvnames = rbind(varnames[mX0[1,]], varnames[mX0[2,]])
pairnames = apply(mvnames, 2, paste, collapse=":")
npairs = m * (m - 1)/2
nmodels = 2^npairs
modmat = matrix(0, nrow=nmodels, ncol=npairs)
dimnames(modmat)[[2]] = pairnames
ierr = rep(NA, nmodels)
for (j in (1:nmodels)) {
jkeep = (intToBits(j)[1:m] == 1)
mX = mX0[, jkeep]
modmat[j,jkeep] = 1
ierr[j] = checkident(zdata, mX)
}
resmat = cbind(modmat, ierr)
if (!verbose) resmat= resmat[ierr>0,]
return(resmat)
}
#' Produce a data matrix with a unique row for each capture history
#'
#' This routine finds rows with the same capture history and consolidates them into a single row whose count is the sum of counts of
#' the relevant rows. If \code{includezerocounts = T} then it also includes rows for all the capture histories with zero counts; otherwise
#' these are all removed.
#'
#' @param zdat Data matrix with \eqn{t+1} columns. The first \eqn{t} columns, each corresponding to a particular list,
#' are 0s and 1s defining the capture histories
#' observed. The last column is the count of cases with that particular capture history.
#' List names A, B, ... are constructed if not supplied. Where a capture history is not explicitly listed,
#' it is assumed that it has observed count zero.
#'
#' @param includezerocounts If \code{F} then remove rows corresponding to capture histories with zero count.
#' If \code{T} then include all possible capture histories including those with zero count,
#' excluding the all-zero row corresponding to the dark figure.
#'
#' @return A data matrix in the form specified above, including all capture histories with zero counts if \code{includezerocounts=T}.
#'
#' @examples
#' data(NewOrl)
#' zdat<-tidylists(NewOrl,includezerocounts=T)
#'@export
tidylists <- function(zdat, includezerocounts = F) {
zdat = as.matrix(zdat)
m = dim(zdat)[2] - 1
# ----construct full capture history matrix
zm = NULL
#-----produce an unordered matrix of all possible capture history including the one corresponds to dark figure
for (j in (1:m)) zm = rbind(cbind(1, zm), cbind(0, zm))
# ----calculate the number of 1s in each row of the unordered matrix
ztot = apply(zm, 1, sum)
#----order the rows of the matrix according to the number of 1s appearing on each row in ascending order
zm = zm[order(ztot), ]
#---remove the row that has 0s for all its entries and add a column of 0s
zm = cbind(zm[-1, ], 0)
#---supply column names if necessary
vn = dimnames(zdat)[[2]]
if(is.null(vn)) {
vn = c(LETTERS[1:m], "count")
}
dimnames(zm)[[2]] = vn
# find row of zm corresponding to each row of zmse and update count
bcode = zdat[, -(m + 1)] %*% (2^(1:m))
bc = zm[, -(m + 1)] %*% (2^(1:m))
for (j in (1:dim(zdat)[1])) {
ij = (1:dim(zm)[1])[bc == bcode[j]]
zm[ij, m + 1] = zm[ij, m + 1] + zdat[j, m + 1]
}
# remove rows with zero counts if includezerocounts is F
if (!includezerocounts)
zm = zm[zm[, m + 1] > 0, ]
zdatr = as.data.frame(zm)
return(zdatr)
}
#' Build the model matrix based on particular data, as required to check for identifiability and existence of the maximum likelihood estimate
#'
#' This routine builds a model matrix as required by the linear program check \code{\link{checkident}} and checks if the matrix is of full rank.
#' In addition, for each individual list, and for each pair of lists included in the model,
#' it returns the total count of individuals appearing on the specific list or lists whether or not in combination with other lists.
#'
#' @param zdat Data matrix with \eqn{t+1} columns. The first \eqn{t} columns, each corresponding to a particular list,
#' are 0s and 1s defining the capture histories
#' observed. The last column is the count of cases with that particular capture history.
#' List names A, B, ... are constructed if not supplied. Where a capture history is not explicitly listed,
#' it is assumed that it has observed count zero.
#'
#' @param mX A \eqn{2 \times k} matrix giving the \eqn{k} two-list interactions to be included in the model.
#' Each column of \code{mX} contains the numbers of the corresponding pair of lists.
#' If \code{mX = 0}, then all two-list interactions are included. If \code{mX = NULL}, no interactions are included and
#' the main effects model is fitted.
#'
#' @return A list with components as below
#'
#' \code{modmat} The matrix that maps the parameters in the model (excluding any corresponding to non-overlapping lists) to the log expected value
#' of the counts of capture histories that do not contain non-overlapping pairs in the data.
#'
#' \code{tvec} A vector indexed by the parameters in the model, excluding those corresponding to non-overlapping pairs of lists. For each parameter
#' the vector contains the total count of individuals in all the capture histories that dominate that parameter.
#'
#' \code{rankdef} The column rank deficiency of the matrix \code{modmat}. If \code{rankdef = 0}, the matrix has full column rank.
#'
#'
#'@examples
#' data(NewOrl)
#' buildmodelmatrix(NewOrl, mX=NULL)
#'
#' @export
#'
buildmodelmatrix = function(zdat, mX=NULL) {
m = dim(zdat)[2] - 1
#Recover the data matrix
zdfull = tidylists(zdat, T)
if (length(mX)==0) { mX= matrix(nrow=2,ncol=0)
}
else {
if (sum(mX) == 0) {
mX = t(expand.grid(1:m, 1:m))
mX = mX[ , mX[1,]<mX[2,]]
}
mX=as.matrix(mX)
# --- find if any of the overlaps specified within the model are empty
neff = dim(mX)[2]
effc = rep(T,neff)
for (j in (1:neff)) {
i1 = mX[1,j]
i2 = mX[2,j]
joverlap = sum(zdfull[,i1]*zdfull[,i2]*zdfull[,m+1])
if (joverlap == 0) {
effc[j] = F
zdfull = zdfull[zdfull[,i1]*zdfull[,i2]==0, ]
}
}
#----construct matrix of interaction parameters where overlap is not empty
mX = as.matrix(mX[,effc])
}
counts = zdfull[,m+1]
nobs = dim(zdfull)[1]
vn = c(0,dimnames(zdfull)[[2]][-(m+1)])
modmat = cbind(rep(1,nobs), zdfull[, -(m+1)])
#----construct the model matrix to include
# the interactions in the model for which the overlap is not empty
neffs = dim(mX)[2]
if (neffs > 0) {for ( j in (1:dim(mX)[2])) {
i1 = mX[1,j]
i2 = mX[2,j]
modmat = cbind(modmat, zdfull[,i1]*zdfull[,i2])
vn = c(vn, paste(vn[i1+1],vn[i2+1],sep=":"))
}}
dimnames(modmat)[[2]] =vn
tvec = t(modmat)%*%counts
rankdef = dim(modmat)[2] - qr(modmat)$rank
return(list(modmat=modmat, tvec=tvec, rankdef=rankdef))
}
#' Check a model for the existence and identifiability of the maximum likelihood estimate
#'
#' Apply the linear programming test as derived by Fienberg and Rinaldo (2012), and a calculation of the rank of the design
#' matrix, to check whether a particular model yields an identifiable maximum likelihood estimate
#' based on the given data. The particular algorithm applied is described on page 3 of the supplementary material, with a typographical error corrected.
#'
#' @param zdat Data matrix with \eqn{t+1} columns. The first \eqn{t} columns, each corresponding to a particular list,
#' are 0s and 1s defining the capture histories
#' observed. The last column is the count of cases with that particular capture history.
#' List names A, B, ... are constructed if not supplied. Where a capture history is not explicitly listed,
#' it is assumed that it has observed count zero.
#'
#' @param mX A \eqn{2 \times k} matrix giving the \eqn{k} two-list interactions to be included in the model.
#' Each column of \code{mX} contains the numbers of the corresponding pair of lists.
#' If \code{mX = 0}, then all two-list interactions are included. If \code{mX = NULL}, no interactions are included and
#' the main effects model is fitted.
#'
#' @param verbose Specifies the output. If \code{F} then the error code is returned. If \code{T} then
#' in addition the routine prints an error message if the model/data fail either of the two tests, and also
#' returns both the error code and the \code{lp} object.
#'
#' @return
#'
#'If \code{verbose=F}, then return the error code \code{ierr} which is 1 if the linear program test shows that the maximum likelihood
#' estimate does not exist, 2 if it is not identifiable, and 3 if both tests are failed.
#'
#'If \code{verbose=T}, then return a list with components as below
#'
#'\code{ierr} As described above
#'
#'\code{zlp} Linear programming object, in particular giving the value of the objective function at optimum.
#'
#' @references Fienberg, S. E. and Rinaldo, A. (2012). Maximum likelihood estimation in log-linear
#' models. Ann. Statist. 40, 996-1023. Supplementary material: Technical report, Carnegie Mellon University. Available from \url{http://www.stat.cmu.edu/~arinaldo/Fienberg_Rinaldo_Supplementary_Material.pdf}.
#' @export
#'
checkident <- function(zdat, mX=0, verbose=F)
{
#---use LP to check if ML estimate exists
#--- construct model matrix
zb = buildmodelmatrix(zdat, mX)
amat = t(zb$modmat)
tt = zb$tvec
npar = dim(amat)[1]
nobs = dim(amat)[2]
#----construct objective and constraint matrix
f.obj = c(rep(0,nobs),1)
const.rhs = c( tt, rep(0,nobs))
amat1 = cbind(amat, rep(0,npar))
amat2 = cbind(diag(nobs), rep(-1, nobs))
#amat3 = c(rep(0,nobs), 1)
const.mat = rbind(amat1, amat2)
const.dir = c( rep("=",npar), rep(">=",nobs))
#--------------carry out the LP step
zlp = lpSolve::lp("max",f.obj,const.mat,const.dir,const.rhs)
ierr = (zlp$objval == 0) + 2*(zb$rankdef > 0 )
if (!verbose) return(ierr)
if (zlp$objval == 0) cat ("The estimate is not finite \n")
if (zb$rankdef > 0 ) cat ("The estimate is not identifiable \n")
return(list(ierr=ierr, lp= zlp))
}
#' Estimate the total population including the dark figure
#'
#' This routine calculates the estimate of the total population, including the dark figure, together with confidence intervals as specified. It also returns the details of the fitted model.
#'
#' @param zdat Data matrix with \eqn{t+1} columns. The first \eqn{t} columns, each corresponding to a particular list,
#' are 0s and 1s defining the capture histories
#' observed. The last column is the count of cases with that particular capture history.
#' List names A, B, ... are constructed if not supplied. Where a capture history is not explicitly listed,
#' it is assumed that it has observed count zero.
#'
#' @param method If \code{method = "stepwise"} the stepwise method implemented in \code{\link{stepwisefit}} is used. If \code{method = "fixed"} then a specified fixed model is used; the model is then given by \code[mX}. If \code{method = "main"} then main effects only are fitted.
#'
#' @param quantiles Quantiles of interest for confidence intervals.
#' @param mX A \eqn{2 \times k} matrix giving the \eqn{k} two-list interactions to be included in the model if \code{method = "fixed"}.
#' Each column of \code{mX} contains the numbers of the corresponding pair of lists.
#' If \code{mX = 0}, then all two-list interactions are included. If \code{mX = NULL}, no interactions are included and
#' the main effects model is fitted.
#' If only one interaction is to be fitted, it is ok to specify it as a vector of length 2, e.g \code{mX=c(1,3)}
#' for interactions of list 1 and 3. If \code{method} is equal to \code{"stepwise"} or \code{"fixed"} then \code{mX} is ignored.
#'
#' @param pthresh Threshold p-value used if \code{method = "stepwise"}.
#'
#' @return A list of components as below
#'
#' \code{estimate} Point estimate and confidence interval estimates corresponding to specified quantiles
#'
#' \code{MSEfit} The model fitted to the data in the format described in \code{\link{modelfit}}.
#'
#' @examples
#' data(NewOrl)
#' data(NewOrl_5)
#' estimatepopulation(NewOrl, method="stepwise", quantiles=c(0.025,0.975))
#' estimatepopulation(NewOrl_5, method="main", quantiles=c(0.01, 0.05,0.95, 0.99))
#'
#'@export
estimatepopulation <-function(zdat,method="stepwise", quantiles=c(0.025,0.975),mX=NULL, pthresh=0.001){
if (method=="stepwise"){
zMSE=stepwisefit(zdat, pthresh)
}
else if (method=="fixed"){
zMSE=modelfit(zdat,mX)
}
else if (method=="main") {
zMSE=modelfit(zdat, mX=NULL)
}
else {stop ("Unknown method")}
zfit=zMSE$fit
zzdat=zfit$data
totobs = sum(zzdat[, dim(zzdat)[2]])#summing all observed history
estdarklogmean = zfit$coefficients[1]#intercept of the fit
estdarklogsd = sqrt(summary(zfit)$cov.unscaled[1,1])
quantiles_with_pt.estimate<-sort(unique(c(quantiles,0.5)))
estdarkci = exp(qnorm(quantiles_with_pt.estimate, estdarklogmean, estdarklogsd))#estimated dark figure confidence interval
totci = totobs + estdarkci
names(totci)<-paste( quantiles_with_pt.estimate*100,"%",sep="")
for (i in 1:length(totci)){
if (names(totci)[i]=="50%"){
names(totci)[i]<-"point est."}
}
return(list(estimate=totci, MSEfit = zMSE))
}
#' Fit a specified model to Multiple Systems Estimation data
#'
#' This routine fits a specified model to multiple systems estimation data, taking account of the possibility of empty overlaps between observed lists.
#'
#' @param zdat Data matrix with \eqn{t+1} columns. The first \eqn{t} columns, each corresponding to a particular list,
#' are 0s and 1s defining the capture histories
#' observed. The last column is the count of cases with that particular capture history.
#' List names A, B, ... are constructed if not supplied. Where a capture history is not explicitly listed,
#' it is assumed that it has observed count zero.
#' @param mX A \eqn{2 \times k} matrix giving the \eqn{k} two-list interactions to be included in the model.
#' Each column of \code{mX} contains the numbers of the corresponding pair of lists.
#' If \code{mX = 0}, then all two-list interactions are included. If \code{mX = NULL}, no interactions are included and
#' the main effects model is fitted.
#' If only one interaction is to be fitted, it may be specified as a vector of length 2, e.g \code{mX=c(1,3)}
#' for interactions of list 1 and 3.
#'
#' @param check If \code{check = T} check first of all if the maximum likelihood
#' estimate exists and is identifiable, using the routine \code{\link{checkident}}. If either condition fails, print an appropriate error message
#' and return the error code.
#'
#'
#' @return A list of components as below
#'
#' \code{fit} Details of the fit of the specified model as output by \code{glm}. The Akaike information criterion is adjusted to take account
#' of the number of parameters corresponding to empty overlaps.
#'
#' \code{emptyoverlaps} Matrix with two rows, giving the list pairs within the model for which no cases are observed in common.
#' Each column gives the indices of a pair of lists, with the names of the lists in the column name.
#'
#' \code{poisspempty} the Poisson p-values of the empty overlaps.
#'
#' @examples
#' data(NewOrl)
#' modelfit(NewOrl,c(1,3))
#' @export
modelfit <- function(zdat, mX=NULL, check = T) {
#--- perform check if asked for
if (check) {
zcheck = checkident(zdat, mX, verbose = T)
if (zcheck$ierr > 0) return(zcheck$ierr)
}
zz = buildmodel(zdat, mX)
eo = zz$emptyoverlaps
nover= dim(eo)[2]
fit = glm(zz$modelform, family=poisson, data=zz$datamatrix, x=TRUE)
fit$aic = fit$aic + 2*nover
#----------deal with empty overlaps if necessary
probzero = vector("numeric",nover)
if (nover > 0) {
names(probzero) = dimnames(eo)[[2]]
for (iover in (1:nover)) {
lambda = sum(fit$coefficients[c(1,1 + eo[,iover])])
probzero[iover] = exp( -exp(lambda))
}
}
return( list (fit=fit, emptyoverlaps = eo, poisspempty = probzero))
}
#'Stepwise fit using Poisson pvalues.
#'
#'Starting with a model with main effects only, pairwise interactions are added one by one.
#'At each stage the interaction with the lowest p-value is added, provided that p-value is lower than \code{pthresh}, and provided that the resulting
#' model does not fail either of the tests in \code{\link{checkident}}.
#'
#' For each candidate interaction for possible addition to the model, the p-value is calculated as follows.
#' The total of cases occurring on both lists indexed by the interaction (regardless of
#' whether or not they are on any other lists) is calculated.
#' On the null hypothesis that the effect is not included in the model, this statistic has a Poisson
#' distribution whose mean depends on the parameters within the model. The one-sided Poisson p-value of the observed statistic is calculated.
#'
#'@param zdat Data matrix with \eqn{t+1} columns. The first \eqn{t} columns, each corresponding to a particular list,
#' are 0s and 1s defining the capture histories
#' observed. The last column is the count of cases with that particular capture history.
#' List names A, B, ... are constructed if not supplied. Where a capture history is not explicitly listed,
#' it is assumed that it has observed count zero.
#'
#'@param pthresh this is the threshold below which the p-value of the newly added parameter needs to be in order to be included in the model.
#' If \code{pthresh = 0} then the model with main effects only is returned.
#'
#'@return A list with components as follow
#'
#'\code{fit} Details of the fit of the specified model as output by \code{glm}. The Akaike information criterion is adjusted to take account
#' of the parameters corresponding to empty overlaps.
#'
#'\code{emptyoverlaps} Matrix with two rows, each column of which gives the list pairs within the model for which empty overlaps are observed
#'
#' \code{poisspempty} the Poisson p-value of the empty overlaps.
#'
#'@examples
#'data(NewOrl)
#'stepwisefit(NewOrl, pthresh=0.001)
#'
#'@export
stepwisefit<- function(zdat, pthresh=0.001) {
m = dim(zdat)[2] - 1
mX = NULL
zfit = modelfit(zdat, mX=NULL, check=F)
if (pthresh == 0) return(zfit)
# ------ Set up a list of interactions to be included in the model with interactions
ints = NULL
for (i in (1:(m - 1))) for (j in ((i + 1):m)) {
ints = cbind(ints,c(i, j))
}
nints = dim(ints)[2]
nstar = rep(0,nints)
for (j in (1:nints)) {
nstar[j] = sum( zdat[,ints[1,j]]*zdat[,ints[2,j]]*zdat[,m+1])
}
#--------cycle through and find best interaction to add.
intsincluded = rep(F, nints)
for (jcycle in (1:nints)) {
pred0 = predict(zfit$fit)
dat0 = zfit$fit$data
pvalnew = rep(1, nints)
for (j in (1:nints)) {
if (!intsincluded[j]) {
#----- find the predicted Poisson intensity of this overlap
pstar = sum(exp(pred0) * dat0[,ints[1,j]]*dat0[,ints[2,j]] )
pvalnew[j] = min( ppois(nstar[j],pstar), ppois(nstar[j]-1, pstar, F) )
#----- test the potential model and set the p-value to 1 if the model
#--- would yield an infinite result or be non-identifiable
ierr = checkident(zdat, cbind(mX, ints[,j]))
if (ierr > 0) pvalnew[j] = 1
}
}
#---------if none of them have p value below threshold, finish, otherwise incorporate and proceed
pvmin = min(pvalnew)
if (pvmin >= pthresh) return(zfit)
jintmax = min((1:nints)[pvalnew == pvmin])
mX1 = cbind(mX, ints[, jintmax])
zf1 = modelfit(zdat, mX = mX1, check=F)
zfit = zf1
mX = mX1
intsincluded[jintmax] = T
}
return(zfit)
}
#' Plot of simulation study
#'
#' This routine produces Figure 1 of Chan, Silverman and Vincent (2019).
#'
#' Simulations are carried out for two different three-list models.
#' In one model, the probabilities of capture are 0.01, 0.04 and 0.2 for the three lists respectively, while in the other the probability
#' is 0.3 on all three lists. In both cases, there are no interaction
#' effects, so that captures on the lists occur independently of each other.
#' The first model is chosen to be somewhat more typical of the sparse capture
#' case, of the kind which often occurs in the human trafficking context,
#' while the second is a more classical multiple systems estimate.
#'
#' The probability of an individual having each possible capture history is first evaluated.
#' Then these probabilities are multiplied by \code{Nsamp = 1000} and, for each simulation
#' replicate, Poisson random values with expectations equal to these values are generated
#' to give a full set of observed capture histories;
#' together with the null capture history the expected number of counts
#' (population size) is equal to \code{Nsamp}.
#' Inference was carried out both for the model with main effects only, and for the model with the addition of an interaction effect between the first two lists.
#' The reduction in deviance between the two models was determined.
#'
#' Checking for compliance with the conditions for existence and identifiability of the
#' estimates shows that a very small number of the simulations for the sparse model (two out of
#' ten thousand) fail the checks for existence even within the extended maximum likelihood context.
#' Detailed investigation shows that in neither of these cases is the dark figure itself not estimable;
#' although the parameters themselves cannot all be estimated, there is a maximum likelihood estimate
#' of the expected capture frequencies, and hence the deviance can still be calculated.
#'
#' The routine produces QQ-plots
#' of the resulting deviance reductions against quantiles of the \eqn{\chi^2_1} distribution,
#' for \code{nsim} simulation replications.
#'
#' @param nsim=10000 The number of simulation replications
#' @param Nsamp=1000 The expected value of the total population within each simulation
#' @param seed=1001 The random number seed
#'
#' @return
#'
#' An \code{nsim} \eqn{\times} 2 matrix giving the changes in deviance for each replication for each of the two models
#'
#'
#'
#' @references
#'Chan, L., Silverman, B. W., and Vincent, K. (2019).
#' Multiple systems estimation for Sparse Capture Data: Inferential Challenges when there are Non-Overlapping Lists. Available from \url{https://arxiv.org/abs/1902.05156}.
#'
#'@export
investigateAIC <- function(nsim=10000, Nsamp= 1000, seed = 1001) {
#
# ---- define functions to be called
#
simulate_deviances <- function(N, plist=0.1, nsim=100, seed= NULL ) {
if (length(plist)==1) plist= rep(plist, 3)
set.seed(seed)
devdiffs = rep(NA, nsim)
for (j in (1:nsim)) {
zdat1 = simulate_capture(N, plist)
devdiffs[j] = deviance_compare(zdat1)
}
return(devdiffs)
}
#
#
simulate_capture <- function(N=1000, plist=rep(0.1,3)) {
zdata = rbind(diag(3), 1-diag(3), rep(1,3))
meancounts = N* exp( zdata %*% log(plist) + (1-zdata)%*% log(1-plist))
simcounts = rpois(7, meancounts)
zdata = cbind(zdata,simcounts)
dimnames(zdata)[[2]] = c("A", "B", "C", "count")
return(zdata)
}
#
#
deviance_compare <- function(zdat) {
if(sum(zdat[4:7,4])==0) return(0)
devnull = modelfit(zdat)$fit$deviance
devalt = modelfit(zdat, mX=c(1,2),check=F)$fit$deviance
devdiff = devnull - devalt
return(devdiff)
}
#
# ----carry out the simulations and plot the results
#
zdev0 = simulate_deviances(Nsamp, plist=c(0.01,0.04,0.2),nsim, seed=seed )
zdev1 = simulate_deviances(Nsamp, plist=0.3,nsim, seed=seed )
zdev= cbind(sort(zdev0), sort(zdev1))
zchisq = qchisq(ppoints(nsim), df=1)
matplot(zchisq, zdev, type="l", xlim=c(0,8), ylim=c(0,8), xaxs="i", yaxs="i",
xlab="Chi-Squared distribution", ylab="Simulated deviances", main="QQ plot")
abline( 0,1, lty=3)
return(zdev)
}
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.