R/svyatd.R

Defines functions svyatd svyatd.survey.design svyatd.svyrep.design

Documented in svyatd svyatd.survey.design svyatd.svyrep.design

#' Anastasiade-Tillé Decomposition by Calibration
#'
#' Decomposes the difference of average wages between men and women in composition effects and wage structure effects.
#'
#' @param formula a formula specifying the income variable and the variables to calibrate upon.
#' @param design a design object of class \code{survey.design} or class \code{svyrep.design} from the \code{survey} library.
#' @param sex a formula indicating the sex variable, a factor of two levels (male and female).
#' @param na.rm Should cases with missing values be dropped?
#' @param grake.args arguments to be passed to the \code{grake} function of the \code{survey} library.
#' @param ... future expansion
#'
#' @details This function runs on a survey design object produced by the \code{survey} package.
#'
#' @return Object of class "\code{svystat}", which are vectors with a "\code{var}" attribute giving the variance-covariance matrix and a "\code{statistic}" attribute giving the name of the decomposed variable.
#'
#' @author Guilherme Jacob
#'
#' @seealso \code{\link{svyatddens}}
#'
#' @references Anastasiade, M.-C., and Tillé, Y. (2017). Decomposition of gender wage inequalities through calibration: Application to the Swiss structure of earnings survey.
#' \emph{Survey Methodology}, Statistics Canada, Catalogue No. 12-001-X, Vol. 43, No. 2. URL \url{http://www.statcan.gc.ca/pub/12-001-x/2017002/article/54887-eng.htm}.
#'
#' DiNardo, J., Fortin, N., and Lemieux, T. (1996). Labor Market Institutions and the Distribution of Wages, 1973-1992: A Semiparametric Approach. \emph{Econometrica},
#' vol. 64, no. 5, pp. 1001–1044, URL \url{http://www.jstor.org/stable/2171954}.
#'
#' Oaxaca, R. (1973). Male-Female Wage Differentials in Urban Labor Markets. \emph{International Economic Review}, vol. 14, no. 3, pp. 693–709.
#' URL \url{www.jstor.org/stable/2525981}.
#'
#' Blinder, A. (1973). Wage Discrimination: Reduced Form and Structural Estimates. \emph{The Journal of Human Resources}, vol. 8, no. 4, 1973, pp. 436–455. JSTOR, www.jstor.org/stable/144855
#'
#' @keywords survey
#'
#' @examples
#' library(vardpoor)
#' library(survey)
#'
#' data(ses)
#' names( ses ) <- gsub( "size" , "size_" , tolower( names( ses ) ) )
#' ses$sex <- relevel(ses$sex, "male")
#'
#' # define arguments for raking
#' grake.args <- list( calfun = survey::cal.raking,
#'                  bounds = list(lower = 0 , upper = 3 ),
#'                  epsilon = 1e-7,
#'                  eta = NULL,
#'                  maxit = 100 ,
#'                  verbose = FALSE,
#'                  variance = NULL )
#'
#' # define starting values
#' grake.args$eta <- c( -4.05134, -0.60522, -0.50329, -0.57568, -0.25707, 2.66533, 0.02911 )
#'
#' # lrun decomposition
#' des_ses <- svydesign(id=~1, weights=~weights, data=ses)
#' svyatd( formula = log(earningsmonth) ~ factor(age) + hourspaid , design = des_ses , sex = ~sex , na.rm = TRUE , grake.args = grake.args )
#'
#' @export
svyatd <-
  function(formula, design, ...) {

    design$subset <- weights( design ) > 0

    # if( length( attr( terms.formula( formula ) , "term.labels" ) ) != 2 ) stop( "convey package functions currently only support one variable in the `formula=` argument" )

    UseMethod("svyatd", design )

  }


#' @rdname svyatd
#' @export
svyatd.survey.design <-
  function( formula , design , sex , na.rm = TRUE , grake.args = list( calfun = survey::cal.raking , bounds = list( lower = .5 , upper = 2 ) , epsilon = 1e-7 , eta = NULL , maxit = 1000 , verbose = FALSE , variance=NULL) , ... ) {

    # collect weights
    ww <- weights( design )

    # collect data
    sexvar <- model.frame( formula = sex , data = design$variables[ ww > 0 , , drop = FALSE ] , na.action = na.pass )
    xx <- model.frame( formula = formula , data = design$variables[ ww > 0 , , drop = FALSE ] , na.action = na.pass , drop.unused.levels = TRUE )

    # test consistency
    if ( length( levels( sexvar[,1] ) ) != 2 ) stop( "sex variable must be a factor with two levels only.")

    # replace infinite for missing
    numcols <- as.logical( sapply( xx, is.numeric , USE.NAMES = FALSE , simplify = TRUE ) )
    if ( sum( numcols ) >= 2 ) {
      xx[ which( apply( xx[ , numcols ] , 2 , is.infinite ) , arr.ind = TRUE ) ] <- NA
    }else if ( sum( numcols ) ==1 ) {
      xx[ is.infinite( xx[ , numcols ] ) , numcols ] <- NA
    }

    # remove missing values
    if ( na.rm ) {

      # test values
      nas <- rowSums( is.na( cbind( sexvar , xx ) ) )

      # filter out observations
      design <- design[ nas == 0 , ]

      # collect new weights
      ww <- weights( design )

      # collect data
      sexvar <- model.frame( formula = sex , data = design$variables[ ww > 0 , , drop = FALSE ] , na.action = na.pass )
      xx <- model.frame( formula = formula , data = design$variables[ ww > 0 , , drop = FALSE ] , na.action = na.pass , drop.unused.levels = TRUE )

    }

    # filter weights
    ww <- ww[ ww > 0 ]

    # test for NA
    if ( any( is.na( cbind( sexvar , xx ) ) ) ) {

      # result object
      rval <- rep(NA,3)
      names(rval) <- c( "total effect" , "composition effect" , "structural effect" )
      vmat <- matrix( NA , ncol = 3 , nrow = 3 )
      colnames( vmat ) <- rownames( vmat ) <-  c( "total effect" , "composition effect" , "structural effect" )
      class(rval) <- c( "svystat" )
      attr(rval, "var") <- vmat
      attr(rval, "statistic") <- strsplit( as.character( formula )[[2]] , ' \\+ ' )[[1]]
      return(rval)

    }


    # add contrasts
    sexvar <- model.matrix( update( sex , ~. + 0 ) , sexvar )[,]
    yy <- as.matrix( xx[,1,drop=FALSE] )
    xx <- model.matrix( ~. , xx[,-1,drop=FALSE] )[ , ]

    # calculate adjusted totals for calibration
    population <- sum( ww * sexvar[ , 2 ] ) * colSums( ww * sexvar[ , 1 ] * xx ) / sum( ww * sexvar[ , 1 ] )

    # if eta is NULL, set base values
    if ( is.null( grake.args$eta ) ) grake.args$eta <- rep(0 , ncol( xx ) )

    # calibrate group 2 weights on group 1 totals
    g <- tryCatch( do.call( grake , c( list( mm = xx[ as.logical(sexvar[,2]) , ] , ww = ww[ as.logical(sexvar[,2]) ] ) , population = list(population) , grake.args ) ) )
    g_indices <- names( g )
    eta <- attr( g , "eta" )
    g <- (function(a,b) { a[names(a) %in% names(b)] <- b ; a[!(names(a) %in% names(b))] <- 1 ; a })(ww,g)
    attr( g , "eta" ) <- eta
    attr( g , "indices" ) <- g_indices

    # group 1 average
    ybar_m <- sum( ww * sexvar[,1] * yy ) / sum( ww * sexvar[,1] )

    # group 2 average
    ybar_f <- sum( ww * sexvar[,2] * yy ) / sum( ww * sexvar[,2] )

    # group 2|1 average
    ybar_fm <- sum( ww * sexvar[,2] * g * yy ) / sum( ww * g * sexvar[,2] )

    # linearizations
    zm <- ww * sexvar[,1] * (yy - ybar_m) * 1/sum(ww*sexvar[,1])
    zf <- ww * sexvar[,2] * (yy - ybar_f) * 1/sum(ww*sexvar[,2])
    x_bar0 <- colSums( ww * sexvar[ , 1 ] * xx ) / sum( ww * sexvar[ , 1 ] )
    bigT <- crossprod( ww * g * sexvar[ , 2] * xx , xx )
    bigB <- crossprod( MASS::ginv( bigT ) , t( crossprod(  ww * sexvar[ , 2] * g * yy , xx ) ) )
    zfm <- ifelse( as.logical( sexvar[ , 2 ] ) ,
                   ww * sexvar[ , 2 ] * ( g * yy - ybar_fm - crossprod( t( sexvar[ , 2 ] * sweep( g * xx , 2 , x_bar0 , "-" ) ) , bigB ) ) / sum( ww * sexvar[ , 2 ] ) ,
                   ww * sexvar[ , 1 ] * crossprod( t( sexvar[ , 1 ] * sweep( xx , 2 , x_bar0 , "-" ) ) , bigB ) / sum( ww * sexvar[ , 1 ] ) )
    zfm <- matrix( zfm , ncol = 1 )
    zmat <- cbind( zm , zf , zfm )
    rownames(zmat) <- rownames(sexvar)
    colnames(zmat) <- NULL
    rm( zm , zf , zfm )
    zmat[ , 1:3 ] <- cbind( zmat[,1] - zmat[,2] , zmat[,3] - zmat[,2] , zmat[,1] - zmat[,3] )

    # vcov matrix
    res <- matrix( rep( as.numeric( weights( design ) > 0 , 3 ) ) , ncol = 3 , nrow = length( weights( design ) ) , byrow = FALSE )
    res[ is.element( names( weights( design ) ) , rownames( zmat ) ) , 1:3 ] <- zmat
    vmat <- survey::svyrecvar( res , clusters = design$cluster , stratas = design$strata , fpcs = design$fpc , postStrata = design$postStrata )

    # result object
    rval <- c(ybar_m - ybar_f , ybar_fm - ybar_f , ybar_m - ybar_fm )
    names(rval) <- c( "total effect" , "composition effect" , "structural effect" )
    colnames( vmat ) <- rownames( vmat ) <-  c( "total effect" , "composition effect" , "structural effect" )
    class(rval) <- c( "svystat" )
    attr(rval, "var") <- vmat
    attr(rval, "statistic") <- as.character( formula )[[2]]
    attr(rval, "eta") <- eta
    # attr(rval, "formula") <- list( model = formula , sex = sex )
    rval
  }

#' @rdname svyatd
#' @export
svyatd.svyrep.design <-
  function( formula , design , sex , na.rm = TRUE , grake.args = list( calfun = survey::cal.raking , bounds = list( lower = .5 , upper = 2 ) , epsilon = 1e-7 , eta = NULL , maxit = 1000 , verbose = FALSE , variance=NULL) , ... ) {

    # collect data
    sexvar <- model.frame( formula = sex , data = design$variables , na.action = na.pass )
    xx <- model.frame( formula = formula , design$variables , na.action = na.pass , drop.unused.levels = TRUE )

    # test consistency
    if ( length( levels( sexvar[,1] ) ) != 2 ) stop( "sex variable must be a factor with two levels only.")

    # replace infinite for missing
    numcols <- as.logical( sapply( xx, is.numeric , USE.NAMES = FALSE , simplify = TRUE ) )
    if ( sum( numcols ) >= 2 ) {
      xx[ which( apply( xx[ , numcols ] , 2 , is.infinite ) , arr.ind = TRUE ) ] <- NA
    }else if ( sum( numcols ) ==1 ) {
      xx[ is.infinite( xx[ , numcols ] ) , numcols ] <- NA
    }

    # remove missing values
    if ( na.rm ) {

      # test values
      nas <- 1*( is.na( cbind( sexvar , xx ) ) )
      nas <- rowSums( nas ) > 0

      # filter out observations
      design <- design[ !nas , ]

      # collect data
      sexvar <- model.frame( formula = sex , data = design$variables , na.action = na.pass )
      xx <- model.frame( formula = formula , design$variables , na.action = na.pass , drop.unused.levels = TRUE )

    }

    # add contrasts
    sexvar <- model.matrix( update( sex , ~. + 0 ) , sexvar )[,]
    yy <- as.matrix( xx[ , 1, drop = FALSE ] )
    xx <- model.matrix( ~. , xx[ , -1 , drop = FALSE ] )[ , ]

    # test for NA
    if ( any( is.na( cbind( sexvar , xx ) ) ) ) {

      # result object
      rval <- rep(NA,3)
      names(rval) <- c( "total effect" , "composition effect" , "structural effect" )
      vmat <- matrix( NA , ncol = 3 , nrow = 3 )
      colnames( vmat ) <- rownames( vmat ) <-  c( "total effect" , "composition effect" , "structural effect" )
      class(rval) <- c( "svrepstat" )
      attr(rval, "var") <- vmat
      attr(rval, "statistic") <- strsplit( as.character( wage )[[2]] , ' \\+ ' )[[1]]
      return(rval)

    }

    # collect sampling weights
    ww <- weights( design , "sampling" )

    # calculate adjusted totals for calibration
    population <- sum( ww * sexvar[ , 2 ] ) * colSums( ww * sexvar[ , 1 ] * xx ) / sum( ww * sexvar[ , 1 ] )

    # if eta is NULL, set base values
    if ( is.null( grake.args$eta ) ) grake.args$eta <- rep(0 , ncol( xx ) )

    # calibrate group 2 weights on group 1 totals
    g <- tryCatch( do.call( grake , c( list( mm = xx[ as.logical(sexvar[,2]) , ] , ww = ww[ as.logical(sexvar[,2]) ] ) , population = list(population) , grake.args ) ) )
    g_indices <- names( g )
    eta <- attr( g , "eta" )
    g <- (function(a,b) { a[names(a) %in% names(b)] <- b ; a[!(names(a) %in% names(b))] <- 1 ; a })(ww,g)
    attr( g , "eta" ) <- eta
    attr( g , "indices" ) <- g_indices

    # group 1 average
    ybar_m <- sum( ww * sexvar[,1] * yy ) / sum( ww * sexvar[,1] )

    # group 2 average
    ybar_f <- sum( ww * sexvar[,2] * yy ) / sum( ww * sexvar[,2] )

    # group 2|1 average
    ybar_fm <- sum( ww * sexvar[,2] * g * yy ) / sum( ww * g * sexvar[,2] )

    # collect analysis weights
    wr <- weights( design , "analysis" )

    # calibrate group 2 weights on group 1 totals: replicates
    grake.args$eta <- eta
    gq <- apply( wr , 2 , function(wi) {
      these_indices <- which( as.logical(sexvar[,2]) & (wi > 0))
      # z <- do.call( grake , c( list( mm = xx[ these_indices , ] , ww = wi[ these_indices ] ) , population = list(population) , grake.args ) )
      z <- tryCatch( do.call( grake , c( list( mm = xx[ these_indices , ] , ww = wi[ these_indices ] ) , population = list(population) , grake.args ) ) , warning = function(e) { NULL } )
      if (is.null(z)) return( rep(NA,nrow(wr)) )
      res <- rep( 1 , nrow( sexvar ) )
      res[ as.logical(sexvar[,2]) & (wi > 0) ] <- z
      res
    } )

    # filter out replicates that did not converge
    ncr <- apply(gq , 2 , function(z) any( is.na(z) ) )
    if ( sum( ncr ) > 0 ) warning( sum(ncr) , " replicates did not converge. Variance calculated using " , sum( !ncr ) , " replicates left." )
    gq <- gq[ , !ncr ]
    wr <- wr[ , !ncr ]

    # calculate on replicate values
    qq1 <- apply( wr, 2, function(wi) sum( wi * sexvar[,1] * yy ) / sum( wi * sexvar[,1] ) )
    qq2 <- apply( wr, 2, function(wi) sum( wi * sexvar[,2] * yy ) / sum( wi * sexvar[,2] ) )
    qq3 <- apply( wr * gq , 2, function(wi) sum( wi * sexvar[,2] * yy ) / sum( wi * sexvar[,2] ) )
    thetas <- matrix( c(qq1 - qq2 , qq3 - qq2, qq1 - qq3 ) , ncol = 3 ) ; rm( qq1,qq2,qq3)
    vmat <- survey::svrVar( thetas = thetas , design$scale, design$rscales[ !ncr ], mse = design$mse, coef = c(ybar_m - ybar_f , ybar_fm - ybar_f , ybar_m - ybar_fm ) )
    vmat <- as.matrix( vmat )

    # result object
    rval <- c(ybar_m - ybar_f , ybar_fm - ybar_f , ybar_m - ybar_fm )
    names(rval) <- c( "total effect" , "composition effect" , "structural effect" )
    colnames( vmat ) <- rownames( vmat ) <-  names( rval )
    class(rval) <- c( "svrepstat" )
    attr(rval, "var") <- vmat
    attr(rval, "statistic") <- as.character( formula )[[1]]
    attr(rval, "eta") <- eta
    # attr(rval, "formula") <- list( model = formula , sex = sex )
    rval
  }


#' @rdname svyatd
#' @export
svyatd.DBIsvydesign <-
  function (formula, design, sex , ...){

    design$variables <-
      survey:::getvars(
        unique( c( all.vars(formula) , all.vars(sex) ) ),
        design$db$connection,
        design$db$tablename,
        updates = design$updates,
        subset = design$subset ,
      )

    NextMethod("svyatd", design)
  }
guilhermejacob/svycdec documentation built on Nov. 4, 2019, 1:24 p.m.