R/GREGWT.R

# Created by Esteban Munoz (emunozh@gmail.com).
#
# 13.10.2014
# last edit:
# Mi  26 Aug 2015 11:46:05    AEST
# Sat 02 Apr 2016 11:56:02 PM CEST
#


#TODO: document GREGWT
#TODO: implement weighted benchmarks

#' @title GREGWT
#'
#' @description
#' Implementation of the GREGWT algorithm in the R language
#'
#' @param data_in (default=FALSE) input data
#' @param area_code (default = 1) area code to use for the reweighting, defines
#' the area code name. If area_code is a list, the GREGWT function will loop
#' through that list and reweight every area on the list. If area_code is of
#' class logical the GREGWT function will loop through all simulation area on
#' the data set. If area_code is set to `internal` the GREGWT function will
#' loop through all areas on the data set but will used the area codes define
#' by function `prepareData`.
#' @param use_ginv (default = FALSE) use the ginv function of the MASS library
#' to compute the inverse matrix.
#' @param Tx (default = FALSE) manually specify benchmark vector
#' @param X_input (default = FALSE) manually specify survey matrix as dummy
#' variables without reference categories.
#' @param X_complete_input (default = FALSE) manually specify complete survey
#' matrix as dummy variables with all categories.
#' @param dx_input (default = FALSE) manually specify initial weights.
#' @param area_pop (default = FALSE) manually specify the total population. The
#' area_pop is a vector of size == dim(Tx)[2]. If Tx is a vector area_pop needs
#' to be define as a single value representing the total population of a single
#' area.
#' @param group (default = FALSE) position of the grouping variable, implies an
#' integrated reweighting.
#' @param bounds (default = c(0,Inf)) defines the bounds of the estimated
#' new weights.
#' @param epsilon (default = 0.001) defines the desire precision of the model
#' to achieve convergence.
#' @param max_iter (default = 10) maximum number of iterations to run
#' @param cat_weights (default = FALSE, under development) weights for
#' individual benchmarks. If not FALSE the function will compute individual
#' weights for each benchmark (or benchmark group) and compute a weighted mean
#' with the weights define in variable 'cat_weights'.
#' @param align_pop (default = TRUE) align weights to total population.
#' @param error (default = TRUE) compute the error in the simulation.
#' @param output_log (default = FALSE) create en output log of the simulation.
#' This will create a file called GREGWT.out on the current working directory.
#' This option will suppress the output to the command line. You can process
#' the output by calling the function logtocsv() provided by this package.
#' @param verbose (optional, default = FALSE) be verbose
#' @return GREGWT
#' @examples
#'
#' library('GREGWT')
#' data("GREGWT.census")
#' data("GREGWT.survey")
#'
#' simulation_data <- prepareData(
#'   GREGWT.census, GREGWT.survey,
#'   survey_id=FALSE,
#'   pop_benchmark=c(2,12),
#'   census_categories=seq(2,24),
#'   survey_categories=seq(1,3)
#' )
#'
#' # reweight for 4 areas
#' areas <- c("02", "11", "04011", "04012")
#' weights_GREGWT = GREGWT(data_in=simulation_data, area_code=areas)
#' plot(weights_GREGWT)
#'
#' # reweight and plot the results for a single area
#' acode = "02"
#' weights_GREGWT_02 <- GREGWT(data_in= simulation_data, area_code=acode)
#'
#' plot(weights_GREGWT_02)
#' print(weights_GREGWT_02)
#' summary(weights_GREGWT_02)
#'
#' @author M. Esteban Munoz H.
GREGWT <- function(x, ...) UseMethod("GREGWT")

GREGWT.default <- function(data_in           = FALSE,
                           X_input           = FALSE,
                           X_complete_input  = FALSE,
                           Tx                = FALSE,
                           dx_input          = FALSE,
                           area_pop          = FALSE,     # area population
                           area_code         = 1,
                           use_ginv          = FALSE,
                           group             = FALSE,
                           bounds            = c(0,Inf),
                           epsilon           = 0.001,
                           max_iter          = 10,
                           benchmark_weights = FALSE,
                           align_pop         = TRUE,
                           error             = TRUE,
                           output_log        = FALSE,
                           verbose           = FALSE){

    if (output_log){
        log_con <- file("GREGWT.log", open="a")
        sink(log_con, append=TRUE)
        sink(log_con, append=TRUE, type="message")
    }

    if (!(is.logical(data_in))){
        if (class(data_in) != "prepareData"){
            stop("data_in has to be of class 'prepareData'")
        }
    }

    if (verbose) cat("\nstart GREGWT, read variables")
    if (is.logical(data_in) &
       !(is.logical(X_input)) & !(is.logical(dx_input))){
        if (verbose) cat("\nusing X_complete, Tx and dx")
        X_complete <- X_complete_input
        X_temp <- X_input
        dx <- dx_input
        if (!(is.null(dim(Tx)))){
            if (is.numeric(area_code)){
                total_pop <- area_pop[area_code]
            } else if (is.character(area_code)){
                total_pop <- area_pop[rownames(Tx) %in% area_code, ]
            }
        } else {
            total_pop <- area_pop
        }
        survey <- X_complete_input
    } else if (!(is.logical(data_in))){
        if (verbose) cat("\nusing data_in")
        X_complete  <- data_in$X_complete
        X_temp      <- data_in$X
        dx          <- data_in$dx
        total_pop   <- data_in$total_pop
        survey      <- data_in$survey
    } else {
        stop("Either data_in or X_input and dx_input have to be define")
    }

    survey_id <- X_temp[,colnames(X_temp) %in% "survey_id"]
    X <- X_temp[,!(colnames(X_temp) %in% "survey_id")]
    X <- as.matrix(X)
    if (verbose) cat(" ... ok")

    # Get area benchmarks
    if (verbose) cat("\nget area benchmarks")
    if (is.logical(Tx) & !(is.logical(data_in))){
        if(verbose) cat("\n\tget Tx ")
        if (is.logical(area_code)){
            if(verbose) cat("(for", dim(data_in$Tx)[1], "areas):")
            area_index = seq(dim(data_in$Tx)[1])
        } else if (area_code == 'internal'){
            area_index = seq(dim(data_in$Tx)[1])
            area_code = data_in$area_id[,1]
            if(verbose) cat("(for", length(area_index), "areas):")
        } else if (length(area_code) == 1){
            if(verbose) cat("(for single area): ")
            if(is.numeric(area_code)){
                area_index = area_code
            } else {
                area_index = which(data_in$area_id==area_code)
            }
            if(verbose) cat('index ', area_index)
        } else {
            if(verbose) cat("(for", length(area_code), "areas):")
            area_index = which(data_in$area_id[,1] %in% area_code)
        }
        Tx <- data_in$Tx[area_index, ]
        Tx_complete <- data_in$Tx_complete[area_index,]
    } else if (!(is.logical(Tx))){
        if(verbose) cat("\n\tgiven Tx")
        if (is.numeric(area_code) & !(is.null(dim(Tx)))){
            Tx <- Tx[area_code, ]
        } else if (is.character(area_code) & !(is.null(dim(Tx)))){
            Tx <- Tx[rownames(Tx) %in% area_code, ]
        }
        Tx_complete <- Tx
    } else {
        stop("Either data_in or Tx have to be define as input")
    }
    if (verbose){
        cat("\n\tX_complete: ", dim(X_complete))
        if (!(is.null(dim(Tx)))){
            cat("\n\tTx: ", dim(Tx))
        } else {
            cat("\n\tTx: ", length(Tx))
        }
        cat("\n\ttotal_pop: ", dim(total_pop)[1], dim(total_pop)[2])
        cat("\n\tdx: ", length(dx))
    }
    if(verbose) cat(" ... ok")

    ## Define breaks and weighted benchmarks
    #if (!(is.logical(data_in)) & is.logical(breaks)){
    #    breaks <- data_in$breaks
    #} else if (is.logical(data_in) & is.logical(breaks)){
    #    stop("Either data_in or breaks have to be defined as input")
    #}
    #breaks <- c(breaks, dim(Tx)[2])
    #if((length(breaks) != length(benchmark_weights)) & !(is.logical(benchmark_weights))){
    #    #TODO: implement weighted benchmarks
    #    stop("Expected ", length(breaks), "weights, ",
    #         length(benchmark_weights), "given\n")
    #}

    # Get total area population
    if (verbose) cat("\nget area population")
    if (is.logical(area_pop)){
        if (is.logical(total_pop)){
            pop       <- FALSE
            align_pop <- FALSE
            if(verbose) cat("\nunknown total population: ", pop)
        } else {
            if (is.logical(area_code)){
                pop <- total_pop$pop
            } else if (length(area_code) == 1){
                if(is.numeric(area_code)){
                    pop <- total_pop[area_code, ]
                } else {
                    pop <- total_pop[which(total_pop$area_id==area_code), ]
                }
                pop <- pop$pop
            } else {
                pop <- total_pop[total_pop$area_id %in% area_code, ]
                pop <- pop$pop
            }
            if (verbose) cat("\nusing total population by area_id: ",
                            area_code)
        }
    } else {
        pop <- area_pop
        if (verbose) cat("\nusing variable <area_pop>: ", pop)
    }
    if (verbose) cat("\npopulation: ")
    if (verbose) {
        if(length(pop) == 1){
            cat(pop)
        } else {
            cat('vector')
        }
    }
    if (verbose) cat(" ... ok")

    if (!(is.logical(pop))){
        # Add population vector
        X <- cbind(X, vector(length=dim(X)[1]) + 1)
        colnames(X)[length(colnames(X))] <- "pop"
        X_complete <- cbind(X_complete, vector(length=dim(X)[1]) + 1)
        colnames(X_complete)[length(colnames(X_complete))] <- "pop"

        # And the corresponding population benchmark to Tx
        if (is.null(dim(Tx))){
            Tx[length(Tx)+1] <- pop
            Tx_complete[length(Tx_complete)+1] <- pop
        } else {
            Tx <- cbind(Tx, pop)
            Tx_complete <- cbind(Tx_complete, pop)
        }
        names(Tx)[length(names(Tx))] <- "pop"
        names(Tx_complete)[length(names(Tx_complete))] <- "pop"
    } else {
        if(verbose) cat("\nno population...")
    }
    if(verbose) cat(" ... ok")

    if(verbose) cat("\narrange data and data formats")
    constrains_complete <- names(Tx_complete)
    constrains_names    <- names(Tx)
    Tx                  <- as.matrix(Tx)
    Tx_complete         <- as.matrix(Tx_complete)
    bounds              <- as.numeric(bounds)
    epsilon             <- as.numeric(epsilon)
    max_iter            <- as.numeric(max_iter)
    if(verbose) cat(" ... ok")

    if(verbose) cat("\nconvert NA to 0")
    Tx[is.na(Tx)]                   <- 0
    Tx_complete[is.na(Tx_complete)] <- 0

    # get number of simulation areas
    if (verbose) cat("\ndim(Tx) --> ", dim(Tx), "\n")
    if (is.null(dim(Tx))){
        area_numbers <- 1
    } else {
        if (dim(Tx)[2] == 1){
            if (verbose) cat("\nndim 2 TX", "\n")
            area_numbers <- dim(Tx)[2]
        } else {
            if (verbose) cat("\nndim 1 TX", "\n")
            area_numbers <- dim(Tx)[1]
        }
    }

    # loop throgh all simulation areas
    final_weights = as.numeric()
    TAE    <- as.numeric()
    SAE    <- as.numeric()
    PSAE   <- as.numeric()
    TAD    <- as.numeric()
    DW     <- as.numeric()
    DChi2  <- as.numeric()
    TDChi2 <- as.numeric()
    Z      <- as.numeric()
    EM     <- as.numeric()
    ED     <- as.numeric()
    for (i in seq(area_numbers)){
        if (verbose) cat("\n", "area_numbers -->", area_numbers, "\n")
        if (verbose) cat("\n", "loop --> ", i, "\n")
        if (area_numbers == 1){
            area_code_i   <- area_code
            Tx_i          <- Tx
            Tx_complete_i <- Tx_complete
            pop_i         <- pop
        } else {
            area_code_i   <- area_code[i]
            Tx_i          <- as.matrix(Tx[i, ])
            Tx_complete_i <- as.matrix(Tx_complete[i, ])
            pop_i         <- pop[i]
        }

    # Divide initial weights by area population
    if(!(is.logical(pop_i))){
        if(verbose) cat("\ndivide initial weights by area population")
        if(verbose) cat("\nlength(dx)", length(dx))
        if(verbose) cat("\npop: ",      pop_i, "\n")
        dx <- as.numeric(dx)
        dx <- alignPop(dx, pop_i)
    } else {
        if (!(is.logical(align_pop))){
            dx <- alignPop(dx, sum(Tx)*align_pop)
        }
    }
    if(verbose) cat(" length(dx)", length(dx))
    if(verbose) cat(" ... ok")

    # Group initial weights for integrated reweight
    dx_oinput <- dx
    X_oinput  <- X
    if(is.character(group)){
        if(verbose) cat("\ngroup weights for integrated reweighting\n")
        dx <- groupDx(X, dx, group)
        if(verbose) cat(" length(dx-group): ", length(dx), "\n")
        X  <- groupX(X, group)
        if(verbose) cat(" dim(X-group): ",     dim(X),     "\n")
    }else{
        dx_oinput=FALSE
    }
    if(verbose) cat(" ... ok")

    if(verbose) cat("\nGREGWT...")
    model_iter <- GREGWTest(X, dx, Tx_i, X_complete, Tx_complete, pop_i,
                       survey_id = survey_id,
                       use_ginv  = use_ginv,
                       group     = group,
                       bounds    = bounds,
                       epsilon   = epsilon,
                       max_iter  = max_iter,
                       X_input   = X_input,
                       dx_oinput = dx_oinput,
                       area_code = area_code_i,
                       align_pop = align_pop,
                       verbose   = verbose)
    if (verbose) cat(" ... ok")

    model_iter$X                   <- X_oinput
    model_iter$X_complete          <- X_complete
    model_iter$constrains          <- constrains_names
    model_iter$constrains_complete <- constrains_complete
    model_iter$survey              <- survey
    model_iter$Tx                  <- Tx_i
    model_iter$Tx_complete         <- Tx_complete_i
    model_iter$pop                 <- pop_i

    if (error){
        model_iter <- computeError(model_iter, group, verbose=verbose)
        TAE        <- cbind(TAE,    model_iter$TAE)
        SAE        <- cbind(SAE,    model_iter$SAE)
        PSAE       <- cbind(PSAE,   model_iter$PSAE)
        TAD        <- cbind(TAD,    model_iter$TAD)
        DW         <- cbind(DW,     model_iter$DW)
        DChi2      <- cbind(DChi2,  model_iter$DChi2)
        TDChi2     <- cbind(TDChi2, model_iter$TDChi2)
        Z          <- cbind(Z,      model_iter$Z)
        EM         <- cbind(EM,     model_iter$EM)
        ED         <- cbind(ED,     model_iter$ED)
    } else {
        cat("\t|\n")
    }

    if (is.null(dim(final_weights))) {
        final_weights <- model_iter$final_weights
    } else {
        old_colname <- colnames(final_weights)
        idx <- which(colnames(model_iter$final_weights) == 'id')
        cidx <- which(colnames(model_iter$final_weights) != 'id')
        extra_colname <- colnames(model_iter$final_weights)[cidx]
        new_colnames <- c(old_colname, extra_colname)
        fw <- model_iter$final_weights[,-idx]
        final_weights <- cbind(final_weights, fw)
        colnames(final_weights) <- new_colnames
    }
    } # end loop areas

    model                     <- model_iter #TODO: is error implemented
    model$TAE                 <- rowMeans(TAE)
    model$SAE                 <- rowMeans(SAE)
    model$PSAE                <- rowMeans(PSAE)
    model$TAD                 <- mean(TAD)
    model$DW                  <- mean(DW)
    model$DChi2               <- mean(DChi2)
    model$TDChi2              <- mean(TDChi2)
    model$Z                   <- rowMeans(Z)
    model$final_weights       <- final_weights
    model$input_weights       <- model_iter$input_weights
    model$Tx                  <- Tx
    model$Tx_complete         <- Tx_complete
    model$pop                 <- pop
    model$X                   <- X_oinput
    model$X_complete          <- X_complete
    model$constrains          <- constrains_names
    model$constrains_complete <- constrains_complete
    model$survey              <- survey

    if (verbose){
        cat("\nprepare output")
        cat("\n\t|--> dim(X)               = ", dim(X))
        cat("\n\t|--> dim(X_complete)      = ", dim(X_complete))
        if (is.null(dim(Tx))){
            cat("\n\t|--> length(Tx)           = ", length(Tx))
            cat("\n\t|--> length(Tx_complete)  = ", length(Tx_complete))
        } else {
            cat("\n\t|--> dim(Tx)              = ", dim(Tx))
            cat("\n\t|--> dim(Tx_complete)     = ", dim(Tx_complete))
        }
        cat("\n\t|--> length(constrains)   = ", length(constrains_names))
        cat("\n\t|--> length(constrains_c) = ", length(constrains_complete))
        if (is.null(dim(Tx))){
            cat("\n\t|--> pop                  = ", pop)
        } else {
            cat("\n\t|--> dim(pop)             = ", dim(pop))
        }
        cat("\n\t|--> dim(survey)          = ", dim(survey))
    }
    class(model) <- "GREGWT"
    if (output_log){
        sink()
        sink(type="message")
    }
    return(model)
}


GREGWTest <- function(X, dx, Tx, X_complete, Tx_complete, pop,
                      # Optional variables
                      survey_id = "survey_id",
                      use_ginv  = FALSE,
                      group     = FALSE,
                      bounds    = c(-Inf,Inf),
                      epsilon   = 0.001,
                      max_iter  = 10,
                      X_input   = FALSE,
                      dx_oinput = FALSE,
                      align_pop = TRUE,
                      area_code = "/",
                      verbose   = FALSE){

    # Save the aggregation ids
    X_g <- X

    if(verbose) cat("\n\tgroup...")
    if(is.character(group)){
        X <- X[,colnames(X)[colnames(X)!="Group.1"]]
        X <- as.matrix(X)
    }
    if(verbose) cat("ok")

    # Number of attributes in the input sample
    att_num <- dim(X)[2]

    if(verbose){
        cat("\n\tget first lambda")
        cat("\n\tdim(X)",     dim(X))
        cat("\n\tlength(dx)", length(dx))
    }
    # Lambda
    hTx    <- colSums(X * dx, na.rm=T)  # Sample totals
    lambda <- getLambda(X, dx, Tx, hTx, att_num, use_ginv, verbose=verbose)

    if(is.na(sum(lambda))){
        cat("Warning, nans in lambda vector")
        return(list(input_weights=F, final_weights=F))
    }

    if(verbose) cat("ok")

    if(verbose) cat("\n\tmain loop")
    # Truncate weights
    convergence = FALSE
    number_iter = 0
    while(!convergence){
        number_iter = number_iter + 1
        # Get new weights
        wx = dx * (1 + X %*% lambda)  # New weights
        # Truncate weights
        wx[wx<bounds[1]] <- bounds[1]
        wx[wx>bounds[2]] <- bounds[2]
        # Truncate initial wights
        dx[wx<bounds[1]] <- 0
        dx[wx>bounds[2]] <- 0
        # Recompute lambda
        if(verbose) cat("\n\t\tNew Lambda")
        hTx <- colSums(X * as.numeric(wx), na.rm=T)  # Sample totals
        lambdaS <- getLambda(X, dx, Tx, hTx, att_num, use_ginv, verbose=verbose)
        # Save lambda m-1
        lambdaO <- lambda
        # Compute new lambda
        lambda = lambda + lambdaS
        # Compute values for convergence
        if(all(Tx > epsilon)){
            #dlta_tx <- abs(Tx_complete - hTx_complete)
            dlta_tx <- abs(Tx - hTx)
            dlta_l  <- abs(lambdaS-lambdaO)
        }else{
            dlta_tx <- 0
            dlta_l  <- 0
        }
        # compute convergence
        convergence <- (
            all(dlta_tx < epsilon) |
            all(dlta_l <= epsilon) |
            number_iter >= max_iter)
        if(convergence){
            cat("| dTx | ",
                format(max(dlta_tx), digits=2, scientific=T, width=7), "< ",
                format(epsilon, digits=2, scientific=T) ,"-> ")
            cat(format(all(dlta_tx < epsilon), width=5), " ")
            cat("| dAm | ",
                format(max(dlta_l), digits=2, scientific=T, width=7), "<=",
                format(epsilon, digits=2, scientific=T) ,"-> ")
            cat(format(all(dlta_l <= epsilon), width=5), " ")
            cat("| itr: ", format(number_iter, digits=0, width=4), "")}
        }
    if(verbose) cat("ok")
    if(number_iter >= max_iter){
        cat("| NC | ")
    }else{
        cat("| OK | ")}
    cat("AC: ", format(area_code, scientific=FALSE))
    # expand grouped data
    if(is.character(group)){
        if (verbose) cat("Expanding group ")
        weights_g <- expandGroup(X_g, wx, X_complete, group)
        dx_output <- dx_oinput
        wx_output <- weights_g
    }else{
        dx_output <- dx
        wx_output <- wx
    }
    # align population total
    if(align_pop){
        if (verbose) cat("\npopulation alignment")
        if (is.logical(pop)){
            if (!(is.logical(align_pop))){
                wx_output <- alignPop(wx_output, sum(Tx)*align_pop)
                }
        }else{
            wx_output <- alignPop(wx_output, pop)
        }
    }
    if (verbose) cat("\nbind index")
    wx_output <- cbind(survey_id, wx_output) #TODO: why do I need the durvey id?
    colnames(wx_output) <- c("id", format(area_code, scientific=FALSE))
    return(list(input_weights=dx_output, final_weights=wx_output))
}

#TODO: document alignPop

#' @title alignPop
#'
#' @description #' Aligns the survey weights with the total population
#'
#' @param w new computed weights
#' @param p population total
#' @return wo align weights
#' @author M. Esteban Munoz H.
#TODO: make example alignPop
alignPop <- function(w, p){
    wo <- p * w / sum(w)
    return(wo)
}


getLambda <- function(X, dx, Tx, hTx, att_num, use_ginv, verbose=FALSE){
    if(verbose) cat("\n\t\tget A")
    A <- crossprod(as.matrix(dx * X), as.matrix(X))
    if(verbose){
        cat("... ok")
        if(sum(is.na(A)))   cat("!!A contains   NaN's!!")
        if(sum(is.na(Tx)))  cat("!!Tx contains  NaN's!!")
        if(sum(is.na(hTx))) cat("!!hTx contains NaN's!!")
    }
    # A solution for 'A %*% lambda = (Tx - hTx)'
    if(verbose) cat("\n\t\tlength(Tx): ", length(Tx))
    if(verbose) cat("\t\tlength(hTx): ",  length(hTx))
    if(use_ginv){
        if(verbose) cat("\n\t\t\tusing ginv ")
        require('MASS')
        At <- ginv(A)
        lambda = At %*% (Tx - hTx)
        if(verbose) cat("ok")
    }else{
        if(verbose) cat("\n\t\t\tfind a solution for A ")
        lambda <- solve(A, (Tx - hTx))   # Lagrange multipliers
        if(verbose) cat("ok")
    }
    if(verbose) cat("ok")
    if(verbose){
        cat("\n\t\tLambda:")
        cat(lambda)
    }
    return(lambda)}


groupX <- function(X, group){
    X_g <- aggregate(X, by=list(X[,group]), FUN=sum)
    X_g <- X_g[names(X_g)!=group]
    return(X_g)
}


#TODO: divide initial weights by area population
groupDx <- function(X_input, dx, group){
    dx_g <- aggregate(dx, by=list(X_input[,group]), FUN=sum)
    dx_g <- as.matrix(dx_g)
    dx_g <- as.numeric(dx_g[,"x"])
    return(dx_g)
}


#TODO: make example expandGroup
expandGroup <- function(X_g, wx, X_input, group){
    Origin    <- X_g[,"Group.1"]
    Grouped   <- X_input[,group]
    index     <- match(Grouped, Origin, nomatch=0)
    weights_g <- wx[index]
    return(weights_g)
}


getTAE <- function(observed, simulated){
    obs <- as.numeric(observed)
    sim <- as.numeric(simulated)
    TAE <- sum(abs(obs-sim))
    return(TAE)
}


#TODO: delete comments
computeError <- function(model, group, verbose=FALSE){
    if(verbose) cat("\n compute error")
    model$call <- match.call()
    # Weights
    d <- model$input_weights
    w <- model$final_weights[,2]
    # Marginal sums
    Tx <- model$Tx_complete
    if (dim(Tx)[1] > dim(Tx)[2]){
        Tx <- Tx[sort(rownames(Tx)), ]
    } else {
        Tx <- Tx[, sort(colnames(Tx))]
    }
    hTx <- t(colSums(model$X_complete * w)[model$constrains_complete])
    hTx <- hTx[,sort(colnames(hTx))]

    if (verbose){
        #hTx <- colSums(X * as.numeric(wx_output), na.rm=T)  # Sample totals
        cat("\n###############################################\n")
        Tx_diff <- cbind(Tx, hTx)
        colnames(Tx_diff) <- c("Tx", "hTx")
        print(Tx_diff)
        cat("\n###############################################\n")
    }
    # Compute the internal error of the computation
    # Total absolute error (TAE)
    model$TAE <- getTAE(Tx, hTx)
    cat(" | TAE:", format(sum(model$TAE),digits=2,scientific=T,width=7))
    # Standardized absolute error (SAE)
    model$SAE <- abs(Tx-hTx) / model$pop
    # Percentage error (PSAE)
    if(verbose) cat("\n using total population: ", model$pop)
    model$PSAE <- model$SAE * 100
    cat(" | PSAE:", format(sum(model$PSAE),digits=2,scientific=T,width=7))

    # Total absolute distance (TAD)
    model$TAD <- sum(abs(w-d))
    # Weights Distance
    model$DW <- abs(w - d)
    # Chi-squared distance
    model$DChi2 <- 1/2 * (d * w)^2 / d
    # Total Chi-squared distance
    model$TDChi2 <- sum(1/2 * (d * w)^2 / d)

    # Z-statistic
    r = hTx/sum(Tx, na.rm=T)
    p = Tx/sum(Tx, na.rm=T)
    model$Z <- (r-p)/sqrt(p*(1-p)/sum(Tx, na.rm=T))
    # Error in Margin (EM)
    model$EM <- (sum(d) - sum(w)) / sum(d)
    # Error in Distribution (ED)
    model$ED <- abs(sum(d) - sum(w)) / sum(d)

    if (sum(is.na(hTx)) >= length(hTx)-3) {
        return(model)
    }
    ## Correlation Coefficient (Pearson Correlation)
    model$pearson <- cor(cbind(Tx, hTx), use="complete.obs", method="pearson")
    ## Independent samples t-Test
    model$ttest <- t.test(Tx, hTx)
    ## Coefficient of determination
    lm.X <- lm(Tx~hTx)
    model$r2 <- summary(lm.X)$r.squared
    model$r2.adj <- summary(lm.X)$adj.r.squared

    cat(" |\n")
    return(model)
}


print.GREGWT <- function(x, ...){
    require('stargazer')
    cat("\n===========================================================================================")
    cat("\nError measure\t\t\t\t\t\t\t| Value")
    cat("\n-------------------------------------------------------------------------------------------")
    cat("\nMean Weight Distance [mean(abs(w - d))]:\t\t\t| ")
    cat(mean(x$DW))
    cat("\nMean Chi-squared Distance [mean(1/2 * (w * d)^2 / d)]:\t\t| ")
    cat(mean(x$DChi2))
    cat("\nTotal Chi-squared Distance [sum(1/2 * (w * d)^2 / d)]:\t\t| ")
    cat(x$TDChi2)
    cat("\nTotal absolute distance [sum(abs(w - d))]:\t\t\t| ")
    cat(x$TAD)
    cat("\nTotal absolute error [abs(Tx - hTx)]:\t\t\t\t| ")
    cat(x$TAE)
    cat("\nMean Standardized absolute error [TAE / n]:\t\t\t| ")
    cat(mean(x$SAE))
    cat("\nMean Percentage absolute error [SAE * 100]:\t\t\t| ")
    cat(mean(x$PSAE))
    #cat("\nStandardized absolute error [TAE / n]:\n")
    #stargazer(x$SAE, type='text')
    #cat("\nPercentage absolute error [SAE * 100]:\n")
    #stargazer(x$PSAE, type='text')
    cat("\nCorrelation Coefficient (Pearson Correlation):\t\t\t| ")
    cat(x$pearson[1,1])
    #cat("\nIndependent samples t-Test [t.test(Tx, hTx)]:\t")
    #print(x$ttest)
    cat("\nR squared (Tx~hTx):\t\t\t\t\t\t| ")
    cat(x$r2)
    cat("\nAdjusted R squared:\t\t\t\t\t\t| ")
    cat(x$r2.adj)
    cat("\nMean Modified z-statistic [Z = (r-p)/sqrt(p*(1-p)/sum(Tx))]:\t| ")
    cat(mean(x$Z, na.rm=T))
    cat("\nError in margin (EM) [(sum(d)-sum(w))/sum(d)]:\t\t\t| ")
    cat(x$EM)
    cat("\nError in distribution (ED) [abs(sum(d)-sum(w))/sum(d)]:\t\t| ")
    cat(x$ED)
    cat("\nNew Weights. Access via: X$final_weights:\t\t\t| ")
    cat(paste(length(x$final_weights),"New weights"))
    cat("\n-------------------------------------------------------------------------------------------\n")
    cat("\nStandardized absolute error [TAE / n]:\n")
    stargazer(as.data.frame(x$SAE), type='text', summary=F)
    cat("\nPercentage absolute error [SAE * 100]:\n")
    stargazer(as.data.frame(x$PSAE), type='text', summary=F)
    cat("\nModified z-statistic [Z = (r-p)/sqrt(p*(1-p)/sum(Tx))]:\n")
    stargazer(as.data.frame(x$Z), type='text', summary=F)


    return(x)
}


summary.GREGWT <- function(x, ...){
    ttest = x$ttest
    res <- list(
        call=x$call,
        # 1 ######
        TAD = sum(x$TAD),
        M.Weight.D = mean(x$DW),
        M.Chi2.D = mean(x$DChi2),
        T.Chi2.D = x$TDChi2,
        # 2 ######
        TAE = sum(x$TAE),
        SAE = sum(x$SAE),
        PSAE = mean(x$PSAE),
        # 3 ######
        p.val.ttest = ttest$p.value,
        pearson = mean(x$pearson[x$pearson==1]),
        r2 = x$r2,
        r2.adj = x$r2.adj,
        # 4 ######
        Z = mean(x$Z),
        EM = x$EM,
        ED = x$ED
        )
    class(res) <- "summary.GREGWT"
    return(res)
}


print.summary.GREGWT <- function(x, ...){
    TWidth = 10
    cat("Call:\n")
    print(x$call)
    cat("\n================================================\n")
    cat(format("TAD",                                width=TWidth), "|")
    cat(format("M.Weight.D",                         width=TWidth), "|")
    cat(format("M.Chi2.D",                           width=TWidth), "|")
    cat(format("T.Chi2.D",                           width=TWidth), "|")
    cat("\n------------------------------------------------\n")
    cat(format(x$TAD,        digits=2, scientific=T, width=TWidth), "|")
    cat(format(x$M.Weight.D, digits=2, scientific=T, width=TWidth), "|")
    cat(format(x$M.Chi2.D,   digits=2, scientific=T, width=TWidth), "|")
    cat(format(x$T.Chi2.D,   digits=2, scientific=T, width=TWidth), "|")
    cat("\n------------------------------------------------\n")
    cat(format("TAE",                                width=TWidth), "|")
    cat(format("SAE",                                width=TWidth), "|")
    cat(format("PSAE",                               width=TWidth), "|")
    cat(format("",                                   width=TWidth), "|")
    cat("\n------------------------------------------------\n")
    cat(format(x$TAE,        digits=2, scientific=T, width=TWidth), "|")
    cat(format(x$SAE,        digits=2, scientific=T, width=TWidth), "|")
    cat(format(x$PSAE,       digits=2, scientific=T, width=TWidth), "|")
    cat(format("",                                   width=TWidth), "|")
    cat("\n------------------------------------------------\n")
    cat(format("t-Test",                             width=TWidth), "|")
    cat(format("M.Pearson",                          width=TWidth), "|")
    cat(format("R2",                                 width=TWidth), "|")
    cat(format("adj R2",                             width=TWidth), "|")
    cat("\n------------------------------------------------\n")
    cat(format(x$p.val.ttest, digits=2, scientific=T, width=TWidth), "|")
    cat(format(x$pearson,     digits=2, scientific=T, width=TWidth), "|")
    cat(format(x$r2,          digits=2, scientific=T, width=TWidth), "|")
    cat(format(x$r2.adj,      digits=2, scientific=T, width=TWidth), "|")
    cat("\n------------------------------------------------\n")
    cat(format("M.Z",                                width=TWidth), "|")
    cat(format("EM",                                 width=TWidth), "|")
    cat(format("ED",                                 width=TWidth), "|")
    cat(format("",                                   width=TWidth), "|")
    cat("\n------------------------------------------------\n")
    cat(format(x$Z,          digits=2, scientific=T, width=TWidth), "|")
    cat(format(x$EM,         digits=2, scientific=T, width=TWidth), "|")
    cat(format(x$ED,         digits=2, scientific=T, width=TWidth), "|")
    cat(format("",                                   width=TWidth), "|")
    cat("\n================================================\n")
    #printCoefmat(x$Distance)
}


plot.GREGWT <- function(x, ...){
    layout(matrix(c(1,2,3,4),2,2)) # optional 4 graphs/page

    # PSAE
    names_y <- names(x$PSAE)
    names_y <- gsub("G.", "", names_y)
    barplot(x$PSAE,
            names.arg=names_y,
            main="Percentage Error of model constrains",
            ylab="PSAE = (Tx - hTx)/n*100", xlab="")
    abline(h=mean(x$PSAE), col="red")

    # Z-statistic
    x$Z[x$Z == Inf] <- NA
    x$Z[x$Z == -Inf] <- NA
    barplot(x$Z,
            names.arg=names_y,
            main="Z-Statistic of model constrains",
            ylab="Z = (r-p)/sqrt(p*(1-p)/sum(Tx))", xlab="")
    print(mean(Z, na.rm=T))
    print(Z)
    abline(h=mean(Z, na.rm=T), col="red")

    # Final vs. Initial weights
    if (dim(x$final_weights)[2] == 1){
        area_numbers <- 1
        plot(as.numeric(x$final_weights), x$input_weights,
             pch=".",
             ylab="Input Weights [d]",
             xlab="New Weights [w]",
             main="Initial and estimated new weights")
    } else {
        area_numbers <- dim(x$final_weights)[2]
        plot(as.numeric(x$final_weights[,1]), x$input_weights,
             pch=".",
             ylab="Input Weights [d]",
             xlab="New Weights [w]",
             main="Initial and estimated new weights")
        for (i in seq(2, area_numbers)){
            points(as.numeric(x$final_weights[, i]),
                x$input_weights, col=i, pch=".")
        }
    }
    abline(h=0, col="red")

    # Weight distance
    plot(sort(as.numeric(
         x$final_weights) - x$input_weights),
         ylab="Weight distance [w - d]",
         xlab="",
         main="Weight distance")
    abline(h=0,col="red")
    #ED <- abs(sum(x$input_weights)-sum(x$final_weights))/sum(x$input_weights)
    #EM <- sum(x$input_weights)-sum(x$final_weights)/sum(x$input_weights)
    #plot(ED, EM)
}

#' @title logtocsv
#'
#' @description
#' This function will try to read the log output created by function GREGWT
#' with the output_log variable set to TRUE and generate a csv file on the same
#' path called GREGWT_log.csv.
#'
#' @param file_name (default = FALSE) alternative file name to process, if
#' FALSE the function will look for a file name GREGWT.log in the root folder.
#'
#' @author M. Esteban Munoz H.
logtocsv <- function(file_name="GREGWT.log"){
    file_name <- "GREGWT.log"
    data_table <- read.table(file_name)
    if (dim(data_table)[2] == 25){
        col_index <- c(24,8,16,19,21)
        col_names <- c("area_code", "dTx_convergence", "dAm_convergence",
                       "iterations", "convergence")
    } else {
        col_index <- c(24,8,16,19,21,27,30)
        col_names <- c("area_code", "dTx_convergence", "dAm_convergence",
                       "iterations", "convergence", "TAE", "PSAE")
    }
    data_table <- data_table[col_index]
    names(data_table)      <- col_names
    data_table$area_code   <- as.character(data_table$area_code)
    data_table$convergence <- as.character(data_table$convergence)
    data_table$convergence[data_table$convergence == "OK"] <- "TRUE"
    data_table$convergence[data_table$convergence == "NC"] <- "FALSE"
    data_table$convergence <- as.logical(data_table$convergence)
    write.csv(data_table, file="GREGWT_log.csv", row.names=FALSE)
}
emunozh/GREGWT documentation built on May 16, 2019, 5:11 a.m.