#' Anastasiade-Tillé Decomposition by Calibration
#'
#' Decomposes the difference of average wages between men and women in composition effects and wage structure effects.
#'
#' @param formula a formula specifying the income variable and the variables to calibrate upon.
#' @param design a design object of class \code{survey.design} or class \code{svyrep.design} from the \code{survey} library.
#' @param sex a formula indicating the sex variable, a factor of two levels (male and female).
#' @param na.rm Should cases with missing values be dropped?
#' @param grake.args arguments to be passed to the \code{grake} function of the \code{survey} library.
#' @param ... future expansion
#'
#' @details This function runs on a survey design object produced by the \code{survey} package.
#'
#' @return Object of class "\code{svystat}", which are vectors with a "\code{var}" attribute giving the variance-covariance matrix and a "\code{statistic}" attribute giving the name of the decomposed variable.
#'
#' @author Guilherme Jacob
#'
#' @seealso \code{\link{svyatddens}}
#'
#' @references Anastasiade, M.-C., and Tillé, Y. (2017). Decomposition of gender wage inequalities through calibration: Application to the Swiss structure of earnings survey.
#' \emph{Survey Methodology}, Statistics Canada, Catalogue No. 12-001-X, Vol. 43, No. 2. URL \url{http://www.statcan.gc.ca/pub/12-001-x/2017002/article/54887-eng.htm}.
#'
#' DiNardo, J., Fortin, N., and Lemieux, T. (1996). Labor Market Institutions and the Distribution of Wages, 1973-1992: A Semiparametric Approach. \emph{Econometrica},
#' vol. 64, no. 5, pp. 1001–1044, URL \url{http://www.jstor.org/stable/2171954}.
#'
#' Oaxaca, R. (1973). Male-Female Wage Differentials in Urban Labor Markets. \emph{International Economic Review}, vol. 14, no. 3, pp. 693–709.
#' URL \url{www.jstor.org/stable/2525981}.
#'
#' Blinder, A. (1973). Wage Discrimination: Reduced Form and Structural Estimates. \emph{The Journal of Human Resources}, vol. 8, no. 4, 1973, pp. 436–455. JSTOR, www.jstor.org/stable/144855
#'
#' @keywords survey
#'
#' @examples
#' library(vardpoor)
#' library(survey)
#'
#' data(ses)
#' names( ses ) <- gsub( "size" , "size_" , tolower( names( ses ) ) )
#' ses$sex <- relevel(ses$sex, "male")
#'
#' # define arguments for raking
#' grake.args <- list( calfun = survey::cal.raking,
#' bounds = list(lower = 0 , upper = 3 ),
#' epsilon = 1e-7,
#' eta = NULL,
#' maxit = 100 ,
#' verbose = FALSE,
#' variance = NULL )
#'
#' # define starting values
#' grake.args$eta <- c( -4.05134, -0.60522, -0.50329, -0.57568, -0.25707, 2.66533, 0.02911 )
#'
#' # lrun decomposition
#' des_ses <- svydesign(id=~1, weights=~weights, data=ses)
#' svyatd( formula = log(earningsmonth) ~ factor(age) + hourspaid , design = des_ses , sex = ~sex , na.rm = TRUE , grake.args = grake.args )
#'
#' @export
svyatd <-
function(formula, design, ...) {
design$subset <- weights( design ) > 0
# if( length( attr( terms.formula( formula ) , "term.labels" ) ) != 2 ) stop( "convey package functions currently only support one variable in the `formula=` argument" )
UseMethod("svyatd", design )
}
#' @rdname svyatd
#' @export
svyatd.survey.design <-
function( formula , design , sex , na.rm = TRUE , grake.args = list( calfun = survey::cal.raking , bounds = list( lower = .5 , upper = 2 ) , epsilon = 1e-7 , eta = NULL , maxit = 1000 , verbose = FALSE , variance=NULL) , ... ) {
# collect weights
ww <- weights( design )
# collect data
sexvar <- model.frame( formula = sex , data = design$variables[ ww > 0 , , drop = FALSE ] , na.action = na.pass )
xx <- model.frame( formula = formula , data = design$variables[ ww > 0 , , drop = FALSE ] , na.action = na.pass , drop.unused.levels = TRUE )
# test consistency
if ( length( levels( sexvar[,1] ) ) != 2 ) stop( "sex variable must be a factor with two levels only.")
# replace infinite for missing
numcols <- as.logical( sapply( xx, is.numeric , USE.NAMES = FALSE , simplify = TRUE ) )
if ( sum( numcols ) >= 2 ) {
xx[ which( apply( xx[ , numcols ] , 2 , is.infinite ) , arr.ind = TRUE ) ] <- NA
}else if ( sum( numcols ) ==1 ) {
xx[ is.infinite( xx[ , numcols ] ) , numcols ] <- NA
}
# remove missing values
if ( na.rm ) {
# test values
nas <- rowSums( is.na( cbind( sexvar , xx ) ) )
# filter out observations
design <- design[ nas == 0 , ]
# collect new weights
ww <- weights( design )
# collect data
sexvar <- model.frame( formula = sex , data = design$variables[ ww > 0 , , drop = FALSE ] , na.action = na.pass )
xx <- model.frame( formula = formula , data = design$variables[ ww > 0 , , drop = FALSE ] , na.action = na.pass , drop.unused.levels = TRUE )
}
# filter weights
ww <- ww[ ww > 0 ]
# test for NA
if ( any( is.na( cbind( sexvar , xx ) ) ) ) {
# result object
rval <- rep(NA,3)
names(rval) <- c( "total effect" , "composition effect" , "structural effect" )
vmat <- matrix( NA , ncol = 3 , nrow = 3 )
colnames( vmat ) <- rownames( vmat ) <- c( "total effect" , "composition effect" , "structural effect" )
class(rval) <- c( "svystat" )
attr(rval, "var") <- vmat
attr(rval, "statistic") <- strsplit( as.character( formula )[[2]] , ' \\+ ' )[[1]]
return(rval)
}
# add contrasts
sexvar <- model.matrix( update( sex , ~. + 0 ) , sexvar )[,]
yy <- as.matrix( xx[,1,drop=FALSE] )
xx <- model.matrix( ~. , xx[,-1,drop=FALSE] )[ , ]
# calculate adjusted totals for calibration
population <- sum( ww * sexvar[ , 2 ] ) * colSums( ww * sexvar[ , 1 ] * xx ) / sum( ww * sexvar[ , 1 ] )
# if eta is NULL, set base values
if ( is.null( grake.args$eta ) ) grake.args$eta <- rep(0 , ncol( xx ) )
# calibrate group 2 weights on group 1 totals
g <- tryCatch( do.call( grake , c( list( mm = xx[ as.logical(sexvar[,2]) , ] , ww = ww[ as.logical(sexvar[,2]) ] ) , population = list(population) , grake.args ) ) )
g_indices <- names( g )
eta <- attr( g , "eta" )
g <- (function(a,b) { a[names(a) %in% names(b)] <- b ; a[!(names(a) %in% names(b))] <- 1 ; a })(ww,g)
attr( g , "eta" ) <- eta
attr( g , "indices" ) <- g_indices
# group 1 average
ybar_m <- sum( ww * sexvar[,1] * yy ) / sum( ww * sexvar[,1] )
# group 2 average
ybar_f <- sum( ww * sexvar[,2] * yy ) / sum( ww * sexvar[,2] )
# group 2|1 average
ybar_fm <- sum( ww * sexvar[,2] * g * yy ) / sum( ww * g * sexvar[,2] )
# linearizations
zm <- ww * sexvar[,1] * (yy - ybar_m) * 1/sum(ww*sexvar[,1])
zf <- ww * sexvar[,2] * (yy - ybar_f) * 1/sum(ww*sexvar[,2])
x_bar0 <- colSums( ww * sexvar[ , 1 ] * xx ) / sum( ww * sexvar[ , 1 ] )
bigT <- crossprod( ww * g * sexvar[ , 2] * xx , xx )
bigB <- crossprod( MASS::ginv( bigT ) , t( crossprod( ww * sexvar[ , 2] * g * yy , xx ) ) )
zfm <- ifelse( as.logical( sexvar[ , 2 ] ) ,
ww * sexvar[ , 2 ] * ( g * yy - ybar_fm - crossprod( t( sexvar[ , 2 ] * sweep( g * xx , 2 , x_bar0 , "-" ) ) , bigB ) ) / sum( ww * sexvar[ , 2 ] ) ,
ww * sexvar[ , 1 ] * crossprod( t( sexvar[ , 1 ] * sweep( xx , 2 , x_bar0 , "-" ) ) , bigB ) / sum( ww * sexvar[ , 1 ] ) )
zfm <- matrix( zfm , ncol = 1 )
zmat <- cbind( zm , zf , zfm )
rownames(zmat) <- rownames(sexvar)
colnames(zmat) <- NULL
rm( zm , zf , zfm )
zmat[ , 1:3 ] <- cbind( zmat[,1] - zmat[,2] , zmat[,3] - zmat[,2] , zmat[,1] - zmat[,3] )
# vcov matrix
res <- matrix( rep( as.numeric( weights( design ) > 0 , 3 ) ) , ncol = 3 , nrow = length( weights( design ) ) , byrow = FALSE )
res[ is.element( names( weights( design ) ) , rownames( zmat ) ) , 1:3 ] <- zmat
vmat <- survey::svyrecvar( res , clusters = design$cluster , stratas = design$strata , fpcs = design$fpc , postStrata = design$postStrata )
# result object
rval <- c(ybar_m - ybar_f , ybar_fm - ybar_f , ybar_m - ybar_fm )
names(rval) <- c( "total effect" , "composition effect" , "structural effect" )
colnames( vmat ) <- rownames( vmat ) <- c( "total effect" , "composition effect" , "structural effect" )
class(rval) <- c( "svystat" )
attr(rval, "var") <- vmat
attr(rval, "statistic") <- as.character( formula )[[2]]
attr(rval, "eta") <- eta
# attr(rval, "formula") <- list( model = formula , sex = sex )
rval
}
#' @rdname svyatd
#' @export
svyatd.svyrep.design <-
function( formula , design , sex , na.rm = TRUE , grake.args = list( calfun = survey::cal.raking , bounds = list( lower = .5 , upper = 2 ) , epsilon = 1e-7 , eta = NULL , maxit = 1000 , verbose = FALSE , variance=NULL) , ... ) {
# collect data
sexvar <- model.frame( formula = sex , data = design$variables , na.action = na.pass )
xx <- model.frame( formula = formula , design$variables , na.action = na.pass , drop.unused.levels = TRUE )
# test consistency
if ( length( levels( sexvar[,1] ) ) != 2 ) stop( "sex variable must be a factor with two levels only.")
# replace infinite for missing
numcols <- as.logical( sapply( xx, is.numeric , USE.NAMES = FALSE , simplify = TRUE ) )
if ( sum( numcols ) >= 2 ) {
xx[ which( apply( xx[ , numcols ] , 2 , is.infinite ) , arr.ind = TRUE ) ] <- NA
}else if ( sum( numcols ) ==1 ) {
xx[ is.infinite( xx[ , numcols ] ) , numcols ] <- NA
}
# remove missing values
if ( na.rm ) {
# test values
nas <- 1*( is.na( cbind( sexvar , xx ) ) )
nas <- rowSums( nas ) > 0
# filter out observations
design <- design[ !nas , ]
# collect data
sexvar <- model.frame( formula = sex , data = design$variables , na.action = na.pass )
xx <- model.frame( formula = formula , design$variables , na.action = na.pass , drop.unused.levels = TRUE )
}
# add contrasts
sexvar <- model.matrix( update( sex , ~. + 0 ) , sexvar )[,]
yy <- as.matrix( xx[ , 1, drop = FALSE ] )
xx <- model.matrix( ~. , xx[ , -1 , drop = FALSE ] )[ , ]
# test for NA
if ( any( is.na( cbind( sexvar , xx ) ) ) ) {
# result object
rval <- rep(NA,3)
names(rval) <- c( "total effect" , "composition effect" , "structural effect" )
vmat <- matrix( NA , ncol = 3 , nrow = 3 )
colnames( vmat ) <- rownames( vmat ) <- c( "total effect" , "composition effect" , "structural effect" )
class(rval) <- c( "svrepstat" )
attr(rval, "var") <- vmat
attr(rval, "statistic") <- strsplit( as.character( wage )[[2]] , ' \\+ ' )[[1]]
return(rval)
}
# collect sampling weights
ww <- weights( design , "sampling" )
# calculate adjusted totals for calibration
population <- sum( ww * sexvar[ , 2 ] ) * colSums( ww * sexvar[ , 1 ] * xx ) / sum( ww * sexvar[ , 1 ] )
# if eta is NULL, set base values
if ( is.null( grake.args$eta ) ) grake.args$eta <- rep(0 , ncol( xx ) )
# calibrate group 2 weights on group 1 totals
g <- tryCatch( do.call( grake , c( list( mm = xx[ as.logical(sexvar[,2]) , ] , ww = ww[ as.logical(sexvar[,2]) ] ) , population = list(population) , grake.args ) ) )
g_indices <- names( g )
eta <- attr( g , "eta" )
g <- (function(a,b) { a[names(a) %in% names(b)] <- b ; a[!(names(a) %in% names(b))] <- 1 ; a })(ww,g)
attr( g , "eta" ) <- eta
attr( g , "indices" ) <- g_indices
# group 1 average
ybar_m <- sum( ww * sexvar[,1] * yy ) / sum( ww * sexvar[,1] )
# group 2 average
ybar_f <- sum( ww * sexvar[,2] * yy ) / sum( ww * sexvar[,2] )
# group 2|1 average
ybar_fm <- sum( ww * sexvar[,2] * g * yy ) / sum( ww * g * sexvar[,2] )
# collect analysis weights
wr <- weights( design , "analysis" )
# calibrate group 2 weights on group 1 totals: replicates
grake.args$eta <- eta
gq <- apply( wr , 2 , function(wi) {
these_indices <- which( as.logical(sexvar[,2]) & (wi > 0))
# z <- do.call( grake , c( list( mm = xx[ these_indices , ] , ww = wi[ these_indices ] ) , population = list(population) , grake.args ) )
z <- tryCatch( do.call( grake , c( list( mm = xx[ these_indices , ] , ww = wi[ these_indices ] ) , population = list(population) , grake.args ) ) , warning = function(e) { NULL } )
if (is.null(z)) return( rep(NA,nrow(wr)) )
res <- rep( 1 , nrow( sexvar ) )
res[ as.logical(sexvar[,2]) & (wi > 0) ] <- z
res
} )
# filter out replicates that did not converge
ncr <- apply(gq , 2 , function(z) any( is.na(z) ) )
if ( sum( ncr ) > 0 ) warning( sum(ncr) , " replicates did not converge. Variance calculated using " , sum( !ncr ) , " replicates left." )
gq <- gq[ , !ncr ]
wr <- wr[ , !ncr ]
# calculate on replicate values
qq1 <- apply( wr, 2, function(wi) sum( wi * sexvar[,1] * yy ) / sum( wi * sexvar[,1] ) )
qq2 <- apply( wr, 2, function(wi) sum( wi * sexvar[,2] * yy ) / sum( wi * sexvar[,2] ) )
qq3 <- apply( wr * gq , 2, function(wi) sum( wi * sexvar[,2] * yy ) / sum( wi * sexvar[,2] ) )
thetas <- matrix( c(qq1 - qq2 , qq3 - qq2, qq1 - qq3 ) , ncol = 3 ) ; rm( qq1,qq2,qq3)
vmat <- survey::svrVar( thetas = thetas , design$scale, design$rscales[ !ncr ], mse = design$mse, coef = c(ybar_m - ybar_f , ybar_fm - ybar_f , ybar_m - ybar_fm ) )
vmat <- as.matrix( vmat )
# result object
rval <- c(ybar_m - ybar_f , ybar_fm - ybar_f , ybar_m - ybar_fm )
names(rval) <- c( "total effect" , "composition effect" , "structural effect" )
colnames( vmat ) <- rownames( vmat ) <- names( rval )
class(rval) <- c( "svrepstat" )
attr(rval, "var") <- vmat
attr(rval, "statistic") <- as.character( formula )[[1]]
attr(rval, "eta") <- eta
# attr(rval, "formula") <- list( model = formula , sex = sex )
rval
}
#' @rdname svyatd
#' @export
svyatd.DBIsvydesign <-
function (formula, design, sex , ...){
design$variables <-
survey:::getvars(
unique( c( all.vars(formula) , all.vars(sex) ) ),
design$db$connection,
design$db$tablename,
updates = design$updates,
subset = design$subset ,
)
NextMethod("svyatd", design)
}
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.