`e.calibrate` <-
function (design, df.population,
calmodel = if (inherits(df.population, "pop.totals"))
attr(df.population, "calmodel"),
partition = if (inherits(df.population, "pop.totals"))
attr(df.population, "partition") else FALSE,
calfun = c("linear", "raking", "logit"), bounds = c(-Inf, Inf),
aggregate.stage = NULL, sigma2 = NULL,
maxit = 50, epsilon = 1e-07, force = TRUE)
#######################################################################
# Definisce un oggetto di classe 'cal.analytic' #
# NOTA: L'algoritmo e' iterativo (calibrazioni indipendenti sui #
# singoli domini) o non iterativo (calibrazione globale) a #
# seconda della struttura del dataframe dei totali noti #
# 'df.population'. #
# NOTA: Il programma verifica se 'df.population' sia conforme allo #
# standard richiesto da e.calibrate (la struttura standard #
# e' quella generata dalla funzione pop.template); se cosi' #
# e', allora la struttura di 'df.population' identifica in #
# modo univoco gli eventuali domini di calibrazione e la #
# formula che definisce il modello di calibrazione. #
# Il programma e', quindi, in grado di generare #
# automaticamente, a partire da 'df.population', i parametri #
# 'formula e 'population' della ez.calib. #
# NOTA: L'argomento 'force' consente di richiedere di accettare #
# l'approssimazione finale dei pesi "calibrati" in caso di #
# mancata convergenza dell'algoritmo di calibrazione. #
# NOTA: L'argomento 'sigma2' modella un eventuale effetto #
# eteroschedastico nel modello di calibrazione (vedi 1/q_k di #
# Deville e Sarndal). #
#######################################################################
{
# First verify if the function has been called inside another function:
# this is needed to correctly manage metadata when e.g. the caller is a
# GUI stratum. (affects ONLY ecal.status)
directly <- !( length(sys.calls()) > 1 )
if (!inherits(design, "analytic"))
stop("Object 'design' must inherit from class analytic")
if (!inherits(df.population, "data.frame"))
stop("Object 'df.population' must be of class data frame")
# Prevent havoc caused by tibbles:
if (inherits(df.population, c("tbl_df", "tbl")))
df.population <- as.data.frame(df.population)
if (!inherits(calmodel, "formula"))
stop("Parameter 'calmodel' must be supplied as a formula")
if (!identical(partition, FALSE)) {
if (!inherits(partition, "formula"))
stop("Parameter 'partition' must be supplied as a formula")
}
if (inherits(df.population, "pop.totals")) {
# DEBUG 15/02/2019: switched from !identical to !all.equal to avoid false error maessages
# generated by different environments in model.formulae
# DEBUG 12/06/2020: BUG: if clauses have to use !isTRUE(all.equal()). Fixed
if (!isTRUE(all.equal(calmodel, attr(df.population, "calmodel"))))
warning("'calmodel' formula (1) does not agree with the 'calmodel' attribute of df.population (2).\nValue (2) will be used", immediate. = TRUE)
if (!isTRUE(all.equal(partition, attr(df.population, "partition"))))
warning("'partition' formula (1) does not agree with the 'partition' attribute of df.population (2).\nValue (2) will be used", immediate. = TRUE)
if ( !identical(attr(design, "data"), attr(df.population, "data")) &&
!identical(design, eval(attr(df.population, "data"))) &&
!is.null(attr(df.population, "data")) &&
!is.null(attr(design, "data")) )
warning("Data frames used to build objects design and df.population have different names",
immediate. = TRUE)
}
else {
df.population <- population.check(df.population, design,
calmodel, partition)
}
if ( !is.numeric(bounds) || (bounds[1] > 1) || (1 > bounds[2]) )
stop("Bounds must be numeric and must satisfy bounds[1] <= 1 <= bounds[2]")
if (!is.logical(force))
stop("Parameter 'force' must be logical")
if (is.character(calfun))
calfun <- match.arg(calfun)
if (calfun=="logit") {
if (any(is.infinite(bounds)))
stop("Bounds must be finite for logit calibration")
if (any(zapsmall(bounds)==1))
stop("Logit distance is not defined for bounds[1]=1 or bounds[2]=1")
}
e.df <- design$variables
calmodel <- attr(df.population, "calmodel")
calmodel.vars <- all.vars(calmodel)
na.Fail(e.df, calmodel.vars)
partition <- attr(df.population, "partition")
ids <- attr(design, "ids")
ids.char <- all.vars(ids)
stages <- length(ids.char)
weights <- attr(design, "weights")
weights.char <- all.vars(weights)
# If design is already calibrated, check that it has no negative "initial
# weights" (i.e. calibration weights generated at the previous calibration step
# NOTE: This cannot happen for uncalibrated designs, owing to e.svydesign checks
if (inherits(design, "cal.analytic")) {
if (any(e.df[, weights.char] < 0)) {
stop("Negative weights in *calibrated* input object 'design': cannot calibrate it further!")
}
}
cal.weights.char <- paste(weights.char, ".cal",sep = "")
if (!is.null(sigma2)){
if (!inherits(sigma2,"formula"))
stop("Heteroskedasticity variable must be passed as a formula")
sigma2.char <- all.vars(sigma2)
if (length(sigma2.char)>1)
stop("Heteroskedasticity formula must reference only one variable")
na.Fail(e.df, sigma2.char)
if (!is.numeric(variance <- e.df[, sigma2.char]))
stop("Heteroskedasticity variable must be numeric")
### BLOCK BELOW IS COMMENTED BECAUSE IT DOES NOT WORK YET ###
# Prevent negative sigma2, with the possible EXCEPTION of those
# coming from an invokation of ext.calibrated(), as they are actually
# convenience values derived from CALIBRATED external weights (see
# ext.calibrated)
# if ( any(is.infinite(variance)) || any(variance <= 0) && !isTRUE(attr(e.df, "negw.pass")))
### BLOCK ABOVE IS COMMENTED BECAUSE IT DOES NOT WORK YET ###
if ( any(is.infinite(variance)) || any(variance <= 0) )
stop("Heteroskedasticity variable must have finite and strictly positive values")
} else sigma2.char <- NULL
mk.ecal.status <- function(df.population,partition,partition.names=NULL){
############################################################
# Diagnostica sul processo di calibrazione. #
# Crea nel .GlobalEnv la lista a 2/3 componenti #
# 'ecal.status': #
# - 'call': #
# identifica la chiamata di e.calibrate che ha #
# generato la lista 'ecal.status'; #
# - 'return.code': #
# matrice che contiene i codici di ritorno dei singoli #
# sub-task del processo di calibrazione complessivo: #
# -1 -> task non ancora affrontato; #
# 0 -> convergenza ottenuta; #
# 1 -> convergenza NON ottenuta ma force=TRUE. #
# - 'fail.diagnostics': #
# presente solo se almeno un return code e' 1, fornisce #
# utili informazioni di diagnostica per ogni partizione #
# in cui la convergenza sia stata forzata. #
############################################################
tasks <- 1
parts <- nrow(df.population)
rownam <- "code"
colnam <- if (identical(partition,FALSE)) "global" else partition.names
ret.cod <- matrix(-1, nrow = tasks, ncol = parts, dimnames = list(rownam, colnam))
if (directly) {
# assign("ecal.status", list(call = sys.call(-1), return.code = ret.cod), envir = .GlobalEnv)
assign2GE("ecal.status", list(call = sys.call(-1), return.code = ret.cod))
}
else {
# assign("ecal.status", list(return.code = ret.cod), envir = .GlobalEnv)
assign2GE("ecal.status", list(return.code = ret.cod))
}
}
upd.ecal.status <- function(n.sub.task,code){
###############################################
# Aggiorna la lista 'ecal.status' nel #
# .GlobalEnv con il codice 'code' ritornato #
# dal sub-task di ordine 'n.subtask' e, se #
# code=1, con i dati diagnostici. #
###############################################
last.task <- col(t(ecal.status[["return.code"]]))[n.sub.task]
last.part <- row(t(ecal.status[["return.code"]]))[n.sub.task]
ecal.status[["return.code"]][last.task,last.part] <<- code
# Add fail diagnostics data:
if (code==1) {
fd <- list(attr(code, "fail.diagnostics"))
names(fd) <- colnames(ecal.status[["return.code"]])[last.part]
ecal.status[["fail.diagnostics"]] <<- c(ecal.status[["fail.diagnostics"]], fd)
}
}
#################################################################
# Il parametro need.gc determina se la garbage collection #
# debba, o non debba, essere gestita dal programma. #
# Il valore di soglia per la dimensione della model-matrix #
# completa del modello di calibrazione e' fissato ad 1/10 #
# della memoria massima allocabile. #
# NOTA: Non dividere per il numero di domini di calibrazione #
# nella stima della dimensione massima della #
# model-matrix serve a contrastare potenziali casi #
# "sfortunati" in cui le dimensioni dei domini di #
# calibrazione siano molto squilibrate (cioe' quando #
# esistano domini con un numero di unita' molto maggiore #
# che nell'equidistribuzione). #
#################################################################
need.gc <- FALSE
if (Sys.info()["sysname"] == "Windows"){
naux <- if (identical(partition, FALSE)) {
ncol(df.population)
}
else {
(ncol(df.population) - length(all.vars(partition)))
}
nrec <- nrow(e.df)
# MEM.mega <- memory.limit() # NOTE: memory.limit() was intended for
# 32-bit versions of Windows, which
# are no longer supported in R
# versions >= 4.2.0 (21/03/2022)
MEM.mega <- 4096
mem.frac <- 10 #Default value
need.gc <- ( ((8 * nrec * naux )/(1024^2)) > (MEM.mega / mem.frac) )
}
if (need.gc)
warning("Complete calibration model-matrix ", if (identical(partition, FALSE)) "takes" else "would take",
" up more than 0.4 GB of allocable memory", immediate. = TRUE)
gc.here <- function(doit) {
#################################################
# Se doit=TRUE effettua la garbage collection #
# quando viene invocata. #
#################################################
if (doit)
gc()
}
# Require MASS package to get ginv at hand ## No longer needed: ReGenesees now IMPORTS MASS
# require(MASS)
cal.sub.task <- 0
if (!is.null(aggregate.stage)) {
if (aggregate.stage < 1 || aggregate.stage > stages)
stop("aggregate.stage must be between 1 and the number of sampling stages (",
stages, ")")
aggindex <- design$cluster[[aggregate.stage]]
# Check that weights are constant at stage aggregate.stage
if (any(err <- tapply(e.df[, weights.char], aggindex,
function(x) length(unique(x)) > 1))) {
first.err <- names(err[err])[1]
stop("Weights are not constant inside stage-", aggregate.stage,
" clusters! (e.g. cluster ", first.err,")")
}
# Check that sigma2 is constant at stage aggregate.stage
if (!is.null(sigma2)){
if (any(err <- tapply(variance, aggindex,
function(x) length(unique(x)) > 1))) {
first.err <- names(err[err])[1]
stop("Heteroskedasticity variable isn't constant inside stage-", aggregate.stage,
" clusters! (e.g. cluster ", first.err,")")
}
}
}
check.totals <- function(tot)
#######################################
# "Type" check for population totals. #
#######################################
{
num.col <- function(df) sapply(names(df), function(v) is.numeric(df[, v]))
if ( any(is.na(as.matrix(tot))) || any(is.infinite(as.matrix(tot))) || !(all(num.col(tot))) )
stop("Population totals must be numeric and finite")
}
if (identical(partition,FALSE)) {
#####################################################
# Calibrazione globale (senza domini) #
#####################################################
check.totals(df.population)
N.cal.constr <- ncol(df.population)
mk.ecal.status(df.population,partition)
qr.list <- vector("list", length = 1)
names(qr.list) <- "population"
gc.here(need.gc)
des <- ez.design(data = e.df, weights = weights, ids = ids,
variables = c(ids.char, weights.char, calmodel.vars, sigma2.char))
gc.here(need.gc)
w.cal <- ez.calib(design = des, population = as.numeric(df.population),
formula = calmodel, calfun = calfun, bounds = bounds,
aggregate.stage = aggregate.stage, sigma2 = sigma2,
maxit = maxit, epsilon = epsilon, force = force)
gc.here(need.gc)
cal.sub.task <- (cal.sub.task+1)
upd.ecal.status(cal.sub.task,attr(w.cal,"ret.code"))
qr.list[[1]] <- list(group = 1:nrow(e.df), qr = attr(w.cal,"qr"), gwhalf = attr(w.cal,"gwhalf"))
gc.here(need.gc)
w.cal <- as.numeric(w.cal)
gc.here(need.gc)
}
else {
#####################################################
# Calibrazione sui singoli domini di calibrazione #
#####################################################
partition.vars <- all.vars(partition)
check.totals(df.population[, -(1:length(partition.vars)), drop = FALSE])
na.Fail(e.df, partition.vars)
N.cal.constr <- prod(nrow(df.population), ncol(df.population)- length(partition.vars))
partition.names <- apply(df.population[,partition.vars,drop=FALSE],1,paste,collapse=".")
mk.ecal.status(df.population,partition,partition.names)
# 'interact': factor i cui livelli identificano le partizioni
interact <- interaction(e.df[, rev(partition.vars), drop = FALSE], drop=TRUE)
# 'groups': lista che contiene gli indici di riga delle osservazioni nelle diverse partizioni
# groups <- .Internal(split(1:nrow(e.df), interact))
groups <- split(1:nrow(e.df), interact)
qr.list <- vector("list", length = length(groups))
names(qr.list) <- partition.names
# 'w.cal' conterra' i pesi calibrati calcolati nel loop partizione per partizione
w.cal <- rep(NA,nrow(e.df))
i.g <- 0
for (g in groups) {
i.g <- i.g+1
des.g <- ez.design(data = e.df[g,], weights = weights, ids = ids,
variables = c(ids.char, weights.char, calmodel.vars, partition.vars, sigma2.char))
pop.g <- df.population[i.g, which(!(names(df.population) %in%
partition.vars))]
w.cal.g <- ez.calib(design = des.g, population = as.numeric(pop.g),
formula = calmodel, calfun = calfun, bounds = bounds,
aggregate.stage = aggregate.stage, sigma2 = sigma2,
maxit = maxit, epsilon = epsilon, force = force)
gc.here(need.gc)
cal.sub.task <- (cal.sub.task+1)
upd.ecal.status(cal.sub.task,attr(w.cal.g,"ret.code"))
qr.list[[i.g]] <- list(group = g, qr = attr(w.cal.g,"qr"), gwhalf = attr(w.cal.g,"gwhalf"))
w.cal[g] <- as.numeric(w.cal.g)
}
gc.here(need.gc)
}
# Aggiunge al dataframe interno a design i pesi calibrati calcolati w.cal
design$variables[[cal.weights.char]] <- w.cal
design$prob <- 1/w.cal
# NOTE: All ReGenesees function that *change* the weights of design objects (e.g.
# e.calibrate, trimcal, ...) do *not* update the *design$allprob* slot.
# Therefore, *design$allprob* acts as a persistent memory of the *initial*
# weights in arbitrary multi-step weights adjustment procedures!
attr(design, "weights") <- as.formula(paste("~", cal.weights.char, sep = ""), env = .GlobalEnv)
if (!is.null(sigma2)){
attr(design, "sigma2") <- sigma2
}
# Add some component to qr.list (so it becomes conform to caldata in calibrate.survey.design2)
psvar <- list(qr.list = qr.list, stage = 0, index = NULL)
class(psvar) <- "analytic_calibration"
design$postStrata <- list(psvar)
# ADD calibration model metadata (2016/07/21 ADDED TO BENEFIT trimcal)
attr(design, "calmodel") <- calmodel
attr(design, "partition") <- partition
if (!is.null(aggregate.stage)){
attr(design, "aggregate.stage") <- aggregate.stage
}
#### REMOVE TOKENS TESTIFYING PREVIOUS WEIGHTS ADJUSTMENTS (if any) -- START
# If design was tokenized:
# 1) remove the token (i.e. calibrating a tokenized object generates an
# 'ordinary' calibrated output)
# 2) store a 'in.change' flag to recall the object is living within a weights
# changing pipeline
in.change <- FALSE
if (is.trimmed(design)) {
attr(design, "trimmed") <- NULL
in.change <- TRUE
}
if (is.smoothed(design)) {
attr(design, "smoothed") <- NULL
in.change <- TRUE
}
if (is.ext.cal(design)) {
attr(design, "ext.cal") <- NULL
in.change <- TRUE
}
#### REMOVE TOKENS TESTIFYING PREVIOUS WEIGHTS ADJUSTMENTS (if any) -- STOP
# Add calibration diagnostics
attr(design, "ecal.status") <- s <- get("ecal.status", envir = .GlobalEnv)
attr(design, "epsilon") <- epsilon
attr(design, "N.cal.constr") <- N.cal.constr
failed <- any(s$return.code != 0)
## SPC ZONE - START
# If design has just been calibrated under a *special purpose calibration* task
# but is *not* currently being calibrated under another spc:
# (0) remove the "expired" 'spc.justdone' token
if (isTRUE(attr(design, "spc.justdone"))) {
attr(design, "spc.justdone") <- NULL
}
# If design was prepared for a *special purpose calibration* task that is *now
# being completed*:
# (1) attach to design an 'spc.justdone' token for usage by other functions
# (e.g. check.cal)
# (2) If required, drop the synthetic linearized z variables just used for
# calibration from design (to save space)
# (3) remove the original "spec.purp.calib" attribute (to save space and for
# information hiding)
# (4) check for possible *false convergence* (i.e. all w.cal ~ 0), if the task
# involves *simple* (i.e. *non-fused*) control totals.
# NOTE: For *fused* control totals, all w.cal ~ 0 would likely cause some
# internal function to raise a message signaling lack of convergence.
if ( is.spc(design) && inherits(df.population, "spc.pop") ) {
# (1)
attr(design, "spc.justdone") <- TRUE
if (isTRUE(attr(design, "spec.purp.calib")$drop.z)) {
# (2)
design$variables[, all.vars(attr(df.population, "spec.purp.calib")$calmodel)] <- NULL
gc.here(need.gc)
}
# (3)
attr(design, "spec.purp.calib") <- NULL
if (!is.fused.spc(df.population)) {
# (4)
# Here, the threshold for condition (all w.cal ~ 0) is set in such a way
# that the calibrated estimate of the population count would drop below
# one: N.cal < 1
if ( sum(abs(w.cal)) < 1 ) {
if (!failed) {
warning("All calibration weights collapsed to (nearly) zero! This indicates a FALSE CONVERGENCE of the *special purpose calibration* task.")
} else {
warning("All calibration weights collapsed to (nearly) zero!")
}
}
}
}
## SPC ZONE - END
# Define cal.analytic class
if (inherits(design, "cal.analytic")) {
design$call <- c(sys.call(), design$call)
gc.here(need.gc)
}
else {
# $call slot
# NOTE: Here accumulate the original design object $call ALSO whenever it
# contains weights that ALREADY underwent a previous modification
# as recalled by the 'in.change' flag
if (in.change) {
# NOTE: IF CLAUSE ABOVE MUST INCLUDE ANY WEIGHTS MODIFICATOR FUNCTION
# AVAILABLE IN ReGenesees
design$call <- c(sys.call(), design$call)
} else {
design$call <- sys.call()
}
class(design) <- c("cal.analytic", class(design))
gc.here(need.gc)
}
design
}
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.