#' Density functions using Anastasiade-Tillé Decomposition
#'
#' Plots the original and the counterfactual density functions
#'
#' @param formula a formula specifying the income variable and the variables to calibrate upon.
#' @param design a design object of class \code{survey.design} or class \code{svyrep.design} from the \code{survey} library.
#' @param sex a formula indicating the sex variable, a factor of two levels (male and female).
#' @param grake.args arguments to be passed to the \code{grake} function of the \code{survey} library.
#' @param ... future expansion
#'
#' @details This function
#'
#' @return Object of class "\code{svystat}", which are vectors with a "\code{var}" attribute giving the variance-covariance matrix and a "\code{statistic}" attribute giving the name of the decomposed variable.
#'
#' @author Guilherme Jacob
#'
#' @seealso \code{\link{svyatdens}}
#'
#' @references Anastasiade, M.-C., and Tillé, Y. (2017). Decomposition of gender wage inequalities through calibration: Application to the Swiss structure of earnings survey.
#' \emph{Survey Methodology}, Statistics Canada, Catalogue No. 12-001-X, Vol. 43, No. 2. URL \url{http://www.statcan.gc.ca/pub/12-001-x/2017002/article/54887-eng.htm}.
#'
#' DiNardo, J., Fortin, N., and Lemieux, T. (1996). Labor Market Institutions and the Distribution of Wages, 1973-1992: A Semiparametric Approach. \emph{Econometrica},
#' vol. 64, no. 5, pp. 1001–1044, URL \url{http://www.jstor.org/stable/2171954}.
#'
#' Oaxaca, R. (1973). Male-Female Wage Differentials in Urban Labor Markets. \emph{International Economic Review}, vol. 14, no. 3, pp. 693–709.
#' URL \url{www.jstor.org/stable/2525981}.
#'
#' Blinder, A. (1973). Wage Discrimination: Reduced Form and Structural Estimates. \emph{The Journal of Human Resources}, vol. 8, no. 4, 1973, pp. 436–455. JSTOR, www.jstor.org/stable/144855
#'
#' @keywords survey
#'
#' @examples
#' library(survey)
#' library(vardpoor)
#' data(eusilc) ; names( eusilc ) <- tolower( names( eusilc ) )
#'
#' # linearized design
#' des_eusilc <- svydesign( ids = ~rb030 , strata = ~db040 , weights = ~rb050 , data = eusilc )
#' des_eusilc <- convey_prep(des_eusilc)
#'
#' svyatd( ~eqincome , design = des_eusilc )
#'
#' # replicate-weighted design
#' des_eusilc_rep <- as.svrepdesign( des_eusilc , type = "bootstrap" )
#' des_eusilc_rep <- convey_prep(des_eusilc_rep)
#'
#' svyatd( ~eqincome , design = des_eusilc_rep )
#'
#' \dontrun{
#'
#' # linearized design using a variable with missings
#' svyatd( ~ py010n , design = des_eusilc )
#' svyatd( ~ py010n , design = des_eusilc , na.rm = TRUE )
#' # replicate-weighted design using a variable with missings
#' svyatd( ~ py010n , design = des_eusilc_rep )
#' svyatd( ~ py010n , design = des_eusilc_rep , na.rm = TRUE )
#'
#' # database-backed design
#' library(MonetDBLite)
#' library(DBI)
#' dbfolder <- tempdir()
#' conn <- dbConnect( MonetDBLite::MonetDBLite() , dbfolder )
#' dbWriteTable( conn , 'eusilc' , eusilc )
#'
#' dbd_eusilc <-
#' svydesign(
#' ids = ~rb030 ,
#' strata = ~db040 ,
#' weights = ~rb050 ,
#' data="eusilc",
#' dbname=dbfolder,
#' dbtype="MonetDBLite"
#' )
#'
#' dbd_eusilc <- convey_prep( dbd_eusilc )
#'
#' svyatd( ~ eqincome , design = dbd_eusilc )
#'
#' dbRemoveTable( conn , 'eusilc' )
#'
#' dbDisconnect( conn , shutdown = TRUE )
#'
#' }
#'
#' @export
svyatdens <-
function(formula, design, ...) {
# if( length( attr( terms.formula( formula ) , "term.labels" ) ) != 2 ) stop( "convey package functions currently only support one variable in the `formula=` argument" )
UseMethod("svyatdens", design )
}
#' @rdname svyatdens
#' @export
svyatdens.survey.design <-
function( formula , design , sex , ngrid = 401 , bandwidth = NULL , grake.args = list( calfun = survey::cal.raking , bounds = list( lower = -Inf , upper = Inf ) , epsilon = 1e-7 , verbose = FALSE , maxit = 1000 , variance=NULL) ) {
# collect weights
ww <- weights( design )
# collect data
sexvar <- model.frame( formula = sex , data = design$variables[ ww > 0 , , drop = FALSE ] , na.action = na.pass )
xx <- model.frame( formula = formula , data = design$variables[ ww > 0 , , drop = FALSE ] , na.action = na.pass , drop.unused.levels = TRUE )
# test consistency
if ( length( levels( sexvar[,1] ) ) != 2 ) stop( "sex variable must be a factor with two levels only.")
# replace infinite for missing
numcols <- as.logical( sapply( xx, is.numeric , USE.NAMES = FALSE , simplify = TRUE ) )
xx[ apply( xx[ , numcols ] , 2 , is.infinite ) ] <- NA
# test values
nas <- rowSums( is.na( cbind( sexvar , xx ) ) )
# filter out observations
design <- design[ nas == 0 , ]
# collect new weights
ww <- weights( design )
# collect data
sexvar <- model.frame( formula = sex , data = design$variables[ ww > 0 , , drop = FALSE ] , na.action = na.pass )
xx <- model.frame( formula = formula , data = design$variables[ ww > 0 , , drop = FALSE ] , na.action = na.pass , drop.unused.levels = TRUE )
# filter weights
ww <- ww[ ww > 0 ]
# add contrasts
sexvar <- model.matrix( update( sex , ~. + 0 ) , sexvar )[,]
yy <- as.matrix( xx[,1,drop=FALSE] )
xx <- model.matrix( ~. , xx[,-1,drop=FALSE] )[ , ]
# calculate adjusted totals for calibration
population <- sum( ww * sexvar[ , 2 ] ) * colSums( ww * sexvar[ , 1 ] * xx ) / sum( ww * sexvar[ , 1 ] )
# calibrate group 2 weights on group 1 totals
g <- tryCatch( do.call( grake , c( list( mm = xx[ as.logical(sexvar[,2]) , ] , ww = ww[ as.logical(sexvar[,2]) ] , eta = rep( 0, NCOL(xx) ) ) , population = list(population) , grake.args ) ) )
g <- (function(a,b) { a[names(a) %in% names(b)] <- b ; a[!(names(a) %in% names(b))] <- 1 ; a })(ww,g)
# calculate densities
if ( length(ngrid != 3) ) ngrid <- rep( ngrid[1] , 3 )
if ( is.null(bandwidth) ) bandwidth <- NULL else if ( length(bandwidth != 3) ) bandwidth <- rep( bandwidth[1] , 3 )
densities <- list( custom.locpoly( yy , ww * sexvar[ , 1] , ngrid = ngrid[1] , bandwidth = bandwidth[1] ) ,
custom.locpoly( yy , ww * sexvar[ , 2] , ngrid = ngrid[2] , bandwidth = bandwidth[2] ) ,
custom.locpoly( yy , ww * g * sexvar[ , 2] , ngrid = ngrid[3] , bandwidth = bandwidth[3] ) )
names(densities) <- c( colnames( sexvar ) , paste0( "counterfactual(" , colnames( sexvar )[2] , ")" ) )
# result object
densities
}
#' @rdname svyatdens
#' @export
svyatdens.svyrep.design <-
function( formula , design , sex , grake.args = list( calfun = survey::cal.raking , bounds = list( lower = .5 , upper = 2 ) , epsilon = 1e-7 , eta = NULL , maxit = 1000 , verbose = FALSE , variance=NULL) , ... ) {
# collect data
sexvar <- model.frame( formula = sex , data = design$variables , na.action = na.pass )
xx <- model.frame( formula = formula , design$variables , na.action = na.pass , drop.unused.levels = TRUE )
# test consistency
if ( length( levels( sexvar[,1] ) ) != 2 ) stop( "sex variable must be a factor with two levels only.")
# replace infinite for missing
numcols <- as.logical( sapply( xx, is.numeric , USE.NAMES = FALSE , simplify = TRUE ) )
xx[ apply( xx[ , numcols ] , 2 , is.infinite ) ] <- NA
# test values
nas <- 1*( is.na( cbind( sexvar , xx ) ) )
nas <- rowSums( nas ) > 0
# filter out observations
design <- design[ !nas , ]
# collect data
sexvar <- model.frame( formula = sex , data = design$variables , na.action = na.pass )
xx <- model.frame( formula = formula , design$variables , na.action = na.pass , drop.unused.levels = TRUE )
# add contrasts
sexvar <- model.matrix( update( sex , ~. + 0 ) , sexvar )[,]
yy <- as.matrix( xx[ , 1, drop = FALSE ] )
xx <- model.matrix( ~. , xx[ , -1 , drop = FALSE ] )[ , ]
# collect sampling weights
ww <- weights( design , "sampling" )
# calculate adjusted totals for calibration
population <- sum( ww * sexvar[ , 2 ] ) * colSums( ww * sexvar[ , 1 ] * xx ) / sum( ww * sexvar[ , 1 ] )
# calibrate group 2 weights on group 1 totals
g <- do.call( grake , c( list( mm = xx[ as.logical(sexvar[,2]) , ] , ww = ww[ as.logical(sexvar[,2]) ] , eta = rep( 0, NCOL(xx) ) ) , population = list(population) , grake.args ) )
g <- (function(a,b) { a[names(a) %in% names(b)] <- b ; a[!(names(a) %in% names(b))] <- 1 ; a })(ww,g)
# calculate densities
if ( length(ngrid != 3) ) ngrid <- rep( ngrid[1] , 3 )
if ( is.null(bandwidth) ) bandwidth <- NULL else if ( length(bandwidth != 3) ) bandwidth <- rep( bandwidth[1] , 3 )
densities <- list( custom.locpoly( yy , ww * sexvar[ , 1] , ngrid = ngrid[1] , bandwidth = bandwidth[1] ) ,
custom.locpoly( yy , ww * sexvar[ , 2] , ngrid = ngrid[2] , bandwidth = bandwidth[2] ) ,
custom.locpoly( yy , ww * g * sexvar[ , 2] , ngrid = ngrid[3] , bandwidth = bandwidth[3] ) )
names(densities) <- c( colnames( sexvar ) , paste0( "counterfactual(" , colnames( sexvar )[2] , ")" ) )
# result object
densities
}
#' @rdname svyatdens
#' @export
svyatdens.DBIsvydesign <-
function (formula, design, sex , ...){
design$variables <-
survey:::getvars(
unique( c( all.vars(formula) , all.vars(sex) ) ),
design$db$connection,
design$db$tablename,
updates = design$updates,
subset = design$subset ,
)
NextMethod("svyatdens", design)
}
custom.locpoly <- function ( x, w, ngrid = 401, xlim = NULL, ylim = NULL,
bandwidth = NULL, ...)
{
if (is.null(xlim)) { xlim <- apply(x, 2, range) }
if (!is.matrix(xlim)) xlim <- matrix(xlim, nrow = 2)
if (is.null(bandwidth)) { bandwidth <- KernSmooth::dpik( x , gridsize = ngrid ) } else { bandwidth <- rep(bandwidth, length = ncol(x)) }
ll <- vector("list", 1)
gx <- seq(min(xlim[, 1]), max(xlim[, 1]), length = ngrid)
nx <- rowsum(c(rep(0, ngrid), w), c(1:ngrid, findInterval(x[,1], gx)))
ll[[1]] <- KernSmooth::locpoly(rep(1, ngrid), nx *
ngrid/(diff(xlim[, 1]) * sum(w)), binned = TRUE,
bandwidth = bandwidth, range.x = xlim[, 1])
names(ll) <- colnames(x)
attr(ll, "call") <- sys.call(-1)
attr(ll, "density") <- TRUE
attr(ll, "ylab") <- "Density"
class(ll) <- "svysmooth"
ll
}
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.