R/01_PcAuxData.R

### Title:        PcAuxData Reference Class Definition
### Author:       Kyle M. Lang
### Contributors: Byungkwan Jung, Vibhuti Gupta, Pavel Panko
### Created:      2015-OCT-30
### Modified:     2018-AUG-17
### Note:         PcAuxData is the metadata class for the PcAux package.

##--------------------- COPYRIGHT & LICENSING INFORMATION --------------------##
##  Copyright (C) 2018 Kyle M. Lang <k.m.lang@uvt.nl>                         ##
##                                                                            ##
##  This file is part of PcAux.                                               ##
##                                                                            ##
##  This program is free software: you can redistribute it and/or modify it   ##
##  under the terms of the GNU General Public License as published by the     ##
##  Free Software Foundation, either version 3 of the License, or (at you     ##
##  option) any later version.                                                ##
##                                                                            ##
##  This program is distributed in the hope that it will be useful, but       ##
##  WITHOUT ANY WARRANTY; without even the implied warranty of                ##
##  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General  ##
##  Public License for more details.                                          ##
##                                                                            ##
##  You should have received a copy of the GNU General Public License along   ##
##  with this program. If not, see <http://www.gnu.org/licenses/>.            ##
##----------------------------------------------------------------------------##


################################################################################
#####----------------------- DEFINE CLASS & FIELDS ------------------------#####
################################################################################

PcAuxData <-
    setRefClass("PcAuxData",

                fields = list(
                    call         = "list",
                    data         = "ANY",
                    seed         = "integer",
                    miceIters    = "integer",
                    miceRidge    = "numeric",
                    maxNetWts    = "integer",
                    forcePmm     = "logical",
                    typeVec      = "character",
                    methVec      = "character",
                    nComps       = "numeric",
                    respCounts   = "numeric",
                    initialPm    = "numeric",
                    nomVars      = "character",
                    ordVars      = "character",
                    idVars       = "character",
                    dropVars     = "matrix",
                    nomMaxLev    = "integer",
                    ordMaxLev    = "integer",
                    conMinLev    = "integer",
                    probNoms     = "character",
                    probOrds     = "character",
                    probCons     = "character",
                    levelVec     = "numeric",
                    simMode      = "logical",
                    highPmVars   = "character",
                    emptyVars    = "character",
                    constants    = "character",
                    minRespCount = "integer",
                    verbose      = "integer",
                    groupVars    = "character",
                    intVars      = "character",
                    pcAux        = "list",
                    rSquared     = "list",
                    pcaMemLev    = "integer",
                    maxPower     = "integer",
                    interact     = "ANY",
                    poly         = "ANY",
                    collinThresh = "numeric",
                    minItemPredCor   = "numeric",
                    nGVarCats    = "integer",
                    collinVars   = "data.frame",
                    impFails     = "list",
                    patterns     = "list",
                    frozenGVars  = "ANY",
                    idFills      = "ANY",
                    nImps        = "integer",
                    compFormat   = "character",
                    miDatasets   = "ANY",
                    miceObject   = "ANY",
                    nProcess     = "integer",
                    moderators   = "character",
                    intMeth      = "integer",
                    idCols       = "ANY",
                    dumNoms      = "ANY",
                    facNoms      = "ANY",
                    status       = "list",
                    time         = "list",
                    checkStatus  = "character",
                    useQuickPred = "logical",
                    corPairs     = "data.frame",
                    dumVars      = "character",
                    frozenMods   = "character",
                    loggedEvents = "data.frame",
                    predMat      = "matrix",
                    minPcPredCor   = "numeric",
                    minPcPredCount   = "integer"
                )# END fields
                )# END PcAuxData


################################################################################
#####---------------------------- DEFINE METHODS --------------------------#####
################################################################################

PcAuxData$
    methods(

        ##---------------------------- Constructor ---------------------------##

        initialize = function(data         = data.frame(NULL),
                              nComps       = vector("numeric"),
                              nImps        = 0L,
                              nomVars      = vector("character"),
                              ordVars      = vector("character"),
                              idVars       = vector("character"),
                              dropVars     = matrix(NA, 1, 2),
                              moderators   = vector("character"),
                              groupVars    = vector("character"),
                              seed         = as.integer(NA),
                              forcePmm     = FALSE,
                              simMode      = FALSE,
                              verbose      = 0L,
                              maxPower     = 3L,
                              intMeth      = 0L,
                              nProcess     = 1L,
                              miceIters    = 10L,
                              miceRidge    = 1.0e-5,
                              maxNetWts    = 10000L,
                              nomMaxLev    = 10L,
                              ordMaxLev    = 10L,
                              conMinLev    = 10L,
                              pcaMemLev    = 0L,
                              collinThresh = 0.95,
                              minItemPredCor   = 0.1,
                              nGVarCats    = 3L,
                              checkStatus  = "none",
                              useQuickPred = FALSE,
                              minRespCount = as.integer(
                                  floor(0.05 * nrow(data))
                              ),
                              minPcPredCor    = 0.1,
                              minPcPredCount = 1L
                              )
        {
            "Initialize an object of class PcAuxData"

            ## User-supplied values:
            data       <<- data[ , setdiff(colnames(data), dropVars)]
            dropVars   <<- cbind(dropVars, "user_defined")
            seed       <<- seed
            nComps     <<- nComps
            nomVars    <<- nomVars
            ordVars    <<- ordVars
            idVars     <<- idVars
            simMode    <<- simMode
            verbose    <<- verbose
            groupVars  <<- groupVars
            maxPower   <<- maxPower
            nImps      <<- nImps
            moderators <<- moderators
            intMeth    <<- intMeth
            nProcess   <<- nProcess
            forcePmm   <<- forcePmm
            compFormat <<- ""

            ## Control list parameters:
            miceIters    <<- miceIters
            miceRidge    <<- miceRidge
            maxNetWts    <<- maxNetWts
            nomMaxLev    <<- nomMaxLev
            ordMaxLev    <<- ordMaxLev
            conMinLev    <<- conMinLev
            minRespCount <<- minRespCount
            pcaMemLev    <<- pcaMemLev
            collinThresh <<- collinThresh
            minItemPredCor   <<- minItemPredCor
            nGVarCats    <<- nGVarCats
            checkStatus  <<- checkStatus
            useQuickPred <<- useQuickPred
            minPcPredCor     <<- minPcPredCor
            minPcPredCount <<- minPcPredCount

            ## Structured fields:
            call <<- list(
                prepData    = NULL,
                createPcAux = NULL,
                miWithPcAux = NULL
            )
            pcAux <<- list(
                lin    = data.frame(NULL),
                nonLin = data.frame(NULL)
            )
            rSquared <<- list(
                lin    = vector("numeric"),
                nonLin = vector("numeric")
            )
            impFails <<- list(
                firstPass = vector("character"),
                pmm       = vector("character"),
                groupMean = vector("character"),
                grandMean = vector("character")
            )
            status <<- list(
                prep   = list(),
                create = list(),
                mi     = list()
            )
            time <<- list(
                prep   = vector("numeric"),
                create = vector("numeric"),
                mi     = vector("numeric")
            )

            ## Additional fields:
            typeVec      <<- vector("character")
            methVec      <<- vector("character")
            respCounts   <<- vector("numeric")
            initialPm    <<- vector("numeric")
            probNoms     <<- vector("character")
            probOrds     <<- vector("character")
            probCons     <<- vector("character")
            levelVec     <<- vector("numeric")
            highPmVars   <<- vector("character")
            emptyVars    <<- vector("character")
            constants    <<- vector("character")
            intVars      <<- vector("character")
            interact     <<- data.frame(NULL)
            poly         <<- list()
            collinVars   <<- data.frame(NULL)
            patterns     <<- list()
            frozenGVars  <<- data.frame(NULL)
            idFills      <<- list()
            miDatasets   <<- list()
            miceObject   <<- list()
            idCols       <<- data.frame(NULL)
            dumNoms      <<- list()
            facNoms      <<- data.frame(NULL)
            corPairs     <<- data.frame(NULL)
            dumVars      <<- vector("character")
            frozenMods   <<- vector("character")
            loggedEvents <<- data.frame(NULL)
            predMat      <<- matrix()
        },

        ##--------------- "Overloaded" / Non-Standard Mutators ---------------##

        setCall = function(x, parent)
        {
            if     (parent == "prepData"        ) call[[1]] <<- x
            else if(parent == "createPcAux"     ) call[[2]] <<- x
            else if(parent == "miWithPcAux"     ) call[[3]] <<- x
            else                                  stop("Invalid 'parent' argument.")
        },

        setPoly = function(x, power = NULL)
        {
            "Modify the list of polynomial expansions of 'data'"
            if(is.null(power)) poly          <<- x
            else               poly[[power]] <<- x
        },

        setPcAux = function(x, type = NULL)
        {
            "Modify the list of principal component auxiliaries"
            if     (is.null(type)      ) pcAux        <<- x
            else if(type == "linear"   ) pcAux$lin    <<- x
            else if(type == "nonLinear") pcAux$nonLin <<- x
            else                         stop("Invalid pcAux type.")
        },

        setRSquared = function(x, type = NULL)
        {
            "Modify the list of R-Squared values for the PcAux scores"
            if     (is.null(type)      ) rSquared        <<- x
            else if(type == "linear"   ) rSquared$lin    <<- x
            else if(type == "nonLinear") rSquared$nonLin <<- x
            else                         stop("Invalid rSquared type.")
        },

        setControl = function(x)
        {
            "Assign the control parameters"
            nonInts <- c("minItemPredCor",
                         "collinThresh",
                         "miceRidge",
                         "checkStatus",
                         "useQuickPred",
                         "minPcPredCor")

            for(n in names(x)) {
                if(n %in% nonInts) field(n, x[[n]])
                else               field(n, as.integer(x[[n]]))
            }
        },

        updateImpFails = function(x, type)
        {
            "Update the list of imputation failure records"
            impFails[[type]] <<- x
        },

        setMethVec = function(x, index = NULL)
        {
            "Update the elementary imputation method vector"
            if(is.null(index)) methVec        <<- x
            else               methVec[index] <<- x
        },

        setNComps = function(type)
        {
            "Set the number of PcAux to extract"
            r2 <- rSquared[[type]]
            nc <- nComps[type]
            if(is.infinite(nc)) {
                tmp          <-  which(r2[-length(r2)] == r2[-1])
                nComps[type] <<- ifelse(length(tmp) == 0, length(r2), tmp[1])
            } else if(nc < 1 & nc > 0) {
                nComps[type] <<- sum(r2 < nc) + 1
            } else {
                nComps[type] <<- length(r2)
            }
        },

        setStatus = function(step = "start")
        {
            "Set machine specs and encumbrance"
            session <- list(sessionInfo())
            os      <- as.character(Sys.info()["sysname"])

            if(os != "Windows" & os != "Linux")
                os <- unlist(session)[grep("macOS", unlist(session))]


            if(os == "Windows")
                lookFor <- list(
                    "wmic cpu get Name, Architecture, MaxClockSpeed, NumberOfLogicalProcessors, L3CacheSize, L3CacheSpeed, LoadPercentage",
                    "wmic MemoryChip get Capacity, Speed",
                    "wmic OS get FreePhysicalMemory"
                )

            else if(os == "Linux")
                lookFor <- list(
                    "top -bn1|grep 'load average'",
                    "lscpu| egrep 'Model name|^CPU\\(s|L3 cache'",
                    "free -m"
                )

            else if (os == "macOS")
                lookFor <- list(
                    "top -l1|egrep 'CPU usage|PhysMem'",
                    "system_profiler SPHardwareDataType|egrep 'Processor|Cache'"
                )

            else
                stop("Sorry, this option is not available for your operating system")

            if(step == "start")
                session[[2]] <- rapply(lookFor, system, intern = TRUE)
            else
                session      <- rapply(lookFor, system, intern = TRUE)

            stCall <- sum(sapply(call, function(x) is.null(x)))

            if     (stCall == 2) status$prep[[step]]   <<- session
            else if(stCall == 1) status$create[[step]] <<- session
            else if(stCall == 0) status$mi[[step]]     <<- session
        },

        setTime = function(step = "start")
        {
            "Set the elapsed time between processes"
            stCall <- sum(sapply(call, function(x) is.null(x)))
            if     (stCall == 2) time$prep[[step]] <<- proc.time()["elapsed"]
            else if(stCall == 1) time$create[step] <<- proc.time()["elapsed"]
            else if(stCall == 0) time$mi[step]     <<- proc.time()["elapsed"]
        },

        ##----------------------- "Overloaded" Accessors ---------------------##

        getPoly = function(power = NULL)
        {
            "Retrieve the polynomial expansions of 'data'"
            if(is.null(power)) return(poly         )
            else               return(poly[[power]])
        },

        getPcAux = function(type = NULL)
        {
            "Retrieve the principal component auxiliary scores"
            if     (is.null(type)      ) return(pcAux                )
            else if(type == "linear"   ) return(pcAux$lin            )
            else if(type == "nonLinear") return(pcAux$nonLin         )
            else                         stop  ("Invalid pcAux type.")
        },

        getRSquared = function(type = NULL)
        {
            "Retrieve the R-Squareds for the PcAux scores"
            if     (is.null(type)      ) return(rSquared                )
            else if(type == "linear"   ) return(rSquared$lin            )
            else if(type == "nonLinear") return(rSquared$nonLin         )
            else                         stop  ("Invalid rSquared type.")
        },

        getControl = function()
        {
            "Retrieve the control parameters"
            list(
                miceIters    = miceIters,
                miceRidge    = miceRidge,
                collinThresh = collinThresh,
                minRespCount = minRespCount,
                minItemPredCor   = minItemPredCor,
                maxNetWts    = maxNetWts,
                nomMaxLev    = nomMaxLev,
                ordMaxLev    = ordMaxLev,
                conMinLev    = conMinLev,
                nGVarCats    = nGVarCats,
                pcaMemLev    = pcaMemLev,
                checkStatus  = checkStatus,
                minPcPredCor     = minPcPredCor,
                minPcPredCount = minPcPredCount
            )
        },
        ##-------------- Data Screening and Manipulation Methods -------------##

        removeVars = function(x, reason, recordOnly = FALSE)
        {
            "Remove columns from 'data' and store their meta-data"
            dropCols <-  which(colnames(data) %in% x)
            dropVars <<- rbind(dropVars, cbind(colnames(data)[dropCols], reason))
            if(!recordOnly) data <<- data[ , -dropCols]
        },

        countVarLevels = function()
        {
            "Count the levels for each column in 'data'"
            levelVec <<- as.integer(
                unlist(lapply(data, function(x) length(unique(na.omit(x)))))
            )
            names(levelVec) <<- colnames(data)
        },

        typeData = function()
        {
            "Populate a vector containing each variable's type"
            cn <- colnames(data); nv <- ncol(data)
            ## Default type is continuous:
            typeVec                                  <<- rep("continuous", nv)
            typeVec[cn %in% nomVars & levelVec == 2] <<- "binary"
            typeVec[cn %in% nomVars & levelVec > 2 ] <<- "nominal"
            typeVec[cn %in% ordVars                ] <<- "ordinal"
            typeVec[cn %in% idVars                 ] <<- "id"
            typeVec[levelVec == 1                  ] <<- "constant"
            typeVec[initialPm == 1.0               ] <<- "empty"
            typeVec[cn %in% dropVars[ , 1]         ] <<- "drop"
            names(typeVec                          ) <<- colnames(data       )
        },

        castData = function()
        {
            "Cast all variables to the appropriate measurement level"
            for(i in 1 : ncol(data)) {
                data[ , i] <<-
                    switch(typeVec[i],
                           drop       =              data[ , i],
                           id         =              data[ , i],
                           empty      =              data[ , i],
                           constant   =              data[ , i],
                           binary     = as.factor  ( data[ , i] ),
                           nominal    = as.factor  ( data[ , i] ),
                           ordinal    = as.ordered ( data[ , i] ),
                           continuous = as.numeric ( data[ , i] )
                           )
            }
        },

        centerData = function()
        {
            conNames <- names(typeVec)[typeVec == "continuous"]
            data[ , conNames] <<-
                scale(data[ , conNames], center = TRUE, scale = FALSE)
        },

        checkTypes = function()
        {
            "Check each variable for a sensible number of levels"
            tmpN <- (typeVec == "nominal" | typeVec == "binary") &
                levelVec > nomMaxLev
            tmpC <- typeVec == "continuous" & levelVec < conMinLev
            tmpO <- typeVec == "ordinal"    & levelVec > ordMaxLev

            if(length(tmpN) > 0) probNoms <<- names(levelVec)[tmpN]
            if(length(tmpO) > 0) probOrds <<- names(levelVec)[tmpO]
            if(length(tmpC) > 0) probCons <<- names(levelVec)[tmpC]
        },

        countResponses = function(countMissing = FALSE,
                                  asProportion = FALSE,
                                  strict       = FALSE,
                                  initialPm    = FALSE)
        {
            "Calculate the variable-wise response counts"
            if(asProportion) {
                if(countMissing) {
                    respCounts        <<- colMeans(is.na(data)             )
                    ## Manually name 'respCounts' to hack issue with is.na()
                    ## dropping some column names
                    names(respCounts) <<- colnames(data                    )
                    noMissing         <-  all     (respCounts == 0.0       )
                } else {
                    respCounts        <<- colMeans(!is.na(data)            )
                    names(respCounts) <<- colnames(data                    )
                    noMissing         <-  all     (respCounts == 1.0       )
                }
            } else {
                if(countMissing) {
                    respCounts        <<- colSums (is.na(data)             )
                    names(respCounts) <<- colnames(data                    )
                    noMissing         <-  all     (respCounts == 0         )
                } else {
                    respCounts        <<- colSums (!is.na(data)            )
                    names(respCounts) <<- colnames(data                    )
                    noMissing         <-  all     (respCounts == nrow(data))
                }
            }

            if(strict & noMissing)
                stop(paste0("The data provided are completely ",
                            "observed.\nYou may wish to check ",
                            "your data for coding errors.")
                     )

            ## Store the inital proportion of missing data:
            if(initialPm) initialPm <<- colMeans(is.na(data))
        },

        findHighPmVars = function()
        {
            "Flag variables with few responses"
            tmpNames   <-  setdiff   (names(respCounts), dropVars[ , 1]       )
            tmpCounts  <-  respCounts[tmpNames                                ]
            highPmVars <<- tmpNames  [tmpCounts > 0 & tmpCounts < minRespCount]
            length(highPmVars) > 0 # Find any High PM variables?
        },

        findEmptyVars = function(remove = TRUE)
        {
            "Flag empty variables"
            tmpNames  <-  setdiff(names(typeVec), dropVars[ , 1])
            tmpTypes  <-  typeVec[tmpNames                      ]
            emptyVars <<- tmpNames[tmpTypes == "empty"          ]
            if(length(emptyVars) > 0) {# Find any empty variables?
                removeVars(x = emptyVars, reason = "empty", recordOnly = !remove)
                outFlag <- TRUE
            } else {
                outFlag <- FALSE
            }
            outFlag
        },

        findConstCols = function()
        {
            "Locate and fill constant columns in 'data'"
            creatingPcAux <-  length(pcAux$lin) == 0 # Are we in createPcAux()?
            tmpNames      <-  setdiff (names(typeVec), dropVars[ , 1])
            tmpTypes      <-  typeVec [tmpNames                      ]
            constants     <<- tmpNames[tmpTypes == "constant"        ]

            if(length(constants) > 0) {# Find any constant columns?
                if(creatingPcAux) {
                    removeVars(x = constants, reason = "constant")
                } else {
                    fillConstants()
                }
                outFlag <- TRUE
            } else {
                outFlag <- FALSE
            }
            outFlag
        },

        fillConstants = function()
        {
            "Fill constant columns with the appropriate value"
            tmp1    <- names  (initialPm)[initialPm == 0]
            targets <- setdiff(constants, tmp1          )
            if(length(targets) > 1) {
                tmp2 <- data.frame(
                    lapply(data[ , targets],
                           FUN = function(x) {
                               x[is.na(x)] <- unique(na.omit(x))
                               x
                           })
                )
            } else {
                tmp2              <- data  [ , targets   ]
                tmp2[is.na(tmp2)] <- unique(na.omit(tmp2))
            }
            data[ , targets] <<- tmp2
        },

        fillNomCell = function(name)
        {
            "Fill single missing nominal cells via marginal sampling"
            for(n in name) {
                tmp         <- table(data[ , n])
                impVal      <- sample(levels(data[ , n]), size = 1, prob = tmp)
                data[is.na(data[ , n]), n] <<- impVal
            }
        },

        cleanCollinVars = function(x)
        {
            "Remove one variable from all collinear pairs"
            collinVars     <<- x
            collinVarPairs <- collinVars[ , 1 : 2]
            naCount        <- nrow(data) - respCounts
            sameNaCntVec   <- NULL
            diffNaCntVec   <- NULL
            diffMaxCntVec  <- NULL

            ## First, drop any variable that is collinear with a key moderator
            ## to ensure that all moderators are retained
            if(length(moderators) > 0) {
                ## Find moderators in collinear pairs:
                modScreen <- do.call(cbind,
                                     lapply(collinVarPairs,
                                            function(x, mods) x %in% mods,
                                            mods = moderators)
                                     )
                ## Flag rows with no moderators:
                filter <- rowSums(modScreen) == 0
            } else {
                filter <- TRUE
            }

            if(any(!filter)) {
                ## Gather names of variables that are collinear with moderators:
                dropList <- list()
                for(i in which(!filter)) {
                    tmp <- collinVarPairs[i, !modScreen[i, ]]
                    if(length(tmp) == 1) dropList[[i]] <- tmp
                    else                 warnFun("collinMods",
                                                 map = collinVars[i, ])
                }

                ## Exclude variables collected above:
                ## NOTE: Need to check extent in case only moderators are
                ## collinear and no variables are excluded
                tmp <- unlist(dropList)
                if(length(tmp) > 0)
                    removeVars(x = unique(tmp), reason = "collinear")

                ## Redifine the collinear pairs:
                collinVarPairs <- collinVarPairs[filter, ]
            }

            while(nrow(collinVarPairs) > 0) {
                varCount     <- data.frame(table(unlist(collinVarPairs)))
                maxVarCount  <-
                    varCount[which(varCount$Freq == max(varCount$Freq)), ]
                maxVarCommon <- intersect(names(naCount), maxVarCount[ , 1])

                ## Check for missing value counts if maximum counts are equal
                if(nrow(maxVarCount) > 1) {
                    maxNaCountValue <- max(naCount[maxVarCommon])
                    maxNaVarNames   <-
                        names(which(naCount[maxVarCommon] == maxNaCountValue))
                    maxNaCount      <-
                        data.frame(t(append(maxNaVarNames, maxNaCountValue)))

                    colnames(maxNaCount) <- c("var1", "count")

                    ## Check if missing counts are same
                    if(nrow(maxNaCount) > 1) {
                        maxNaCount     <- maxNaCount[1, ]
                        maxNaCount     <- as.character(maxNaCount$var1)
                        collinVarPairs <-
                            subset(collinVarPairs,
                                   collinVarPairs[ , 1] != maxNaCount)
                        collinVarPairs <-
                            subset(collinVarPairs,
                                   collinVarPairs[ , 2] != maxNaCount)
                        sameNaCntVec   <- append(sameNaCntVec, maxNaCount)
                    }
                    else if(nrow(maxNaCount) == 1) {
                        maxNaCount     <- as.character(maxNaCount$var1)
                        collinVarPairs <-
                            subset(collinVarPairs,
                                   collinVarPairs[ , 1] != maxNaCount)
                        collinVarPairs <-
                            subset(collinVarPairs,
                                   collinVarPairs[ , 2] != maxNaCount)
                        diffNaCntVec   <- append(diffNaCntVec, maxNaCount)
                    }

                    ## Check if maximum counts are not equal
                } else if(nrow(maxVarCount) == 1) {
                    firstVar       <- as.character(maxVarCount$Var1)
                    collinVarPairs <-
                        subset(collinVarPairs, collinVarPairs[ , 1] != firstVar)
                    collinVarPairs <-
                        subset(collinVarPairs, collinVarPairs[ , 2] != firstVar)
                    diffMaxCntVec  <- append(diffMaxCntVec, firstVar)
                }
            }

            varsToRemove <- c(sameNaCntVec, diffNaCntVec, diffMaxCntVec)
            if(!is.null(varsToRemove))
                removeVars(x = unique(varsToRemove), reason = "collinear")
        },

        createMethVec = function(initialImp = FALSE, micemethods = micemethods)
        {
            "Populate a vector of elementary imputation methods"
            cn0 <- setdiff(colnames(data), c(intVars, colnames(poly)))
            cn1 <- setdiff(colnames(data), cn0)

            if(forcePmm) {
                methVec        <<- rep     ("pmm", ncol(data))
                names(methVec) <<- colnames(data             )

                ## Don't use PMM for nominal variables
                binNames <- cn0[typeVec[cn0] == "binary"]
                nomNames <- cn0[typeVec[cn0] == "nominal"]

                tmpIndex <- names(methVec) %in% binNames
                setMethVec(x = "logreg", index = tmpIndex)

                tmpIndex <- names(methVec) %in% nomNames
                setMethVec(x = "polyreg", index = tmpIndex)

                ## Impute binary interaction terms with logistic regression:
                if(intMeth == 1 & initialImp) {
                    facFlag <- unlist(lapply(data[ , cn1], is.factor))
                    if(any(facFlag)) {
                        tmpIndex <- colnames(data) %in% cn1[facFlag]
                        setMethVec(x = "logreg", index = tmpIndex)
                    }
                }
                ## Don't impute ID or Dropped Variables:
                tmpIndex <- names(methVec) %in% dropVars[ , 1]
                setMethVec(x = "", index = tmpIndex)
            } else {
                methVec <<-
                    sapply(typeVec[colnames(data)],
                           FUN = function(x) {
                               switch(x,
                                      continuous = micemethods[1],
                                      ordinal    = micemethods[2],
                                      nominal    = micemethods[3],
                                      binary     = micemethods[4],
                                      "")
                           }
                           )
                ## Work Around: Use PMM for variables with only 1 missing datum:
                setMethVec(x = "pmm", index = colSums(is.na(data)) == 1)
            }
            ## Don't impute fully observed variables:
            setMethVec(x = "", index = colSums(is.na(data)) == 0)
        },

        addVars = function(x, names = NULL)
        {
            "Add columns to 'data'"
            if(is.null(names)) names <-  colnames  (x              )
            oldNames                 <-  colnames  (data           )
            data                     <<- data.frame(data, x        )
            colnames(data)           <<- c         (oldNames, names)
        },

        idToCharacter = function()
        {
            "If any IDs are factors, cast them as character objects"
            if(length(idVars) == 1) {
                if(is.factor(idCols)) idCols <<- as.character(idCols)
            } else {
                for(i in idVars) {
                    if(is.factor(idCols[ , i]))
                        idCols[ , i] <<- as.character(idCols[ , i])
                }
            }
        },

        binGroupVars = function(undo = FALSE)
        {
            "Discretize continuous grouping variables"
            gvTypes  <- typeVec[groupVars                       ]
            conGVars <- names  (gvTypes)[gvTypes == "continuous"]
            if(!undo) {
                ## Define a vector defining the cut-point percentiles:
                probVec <- c(0, (c(1 : (nGVarCats - 1)) / nGVarCats), 1)
                ## Define a function to create the cut-points:
                cutFun  <- function(x, nCats, probs) {
                    cut(x,
                        breaks = quantile(x, probs = probs, na.rm = TRUE),
                        labels = c(1 : nCats)
                        )
                }
                ## Bin the variables:
                frozenGVars <<- data[ , conGVars]
                if(length(conGVars) > 1)
                    data[ , conGVars] <<- data.frame(lapply(X     = frozenGVars,
                                                            FUN   = cutFun,
                                                            nCats = nGVarCats,
                                                            probs = probVec)
                                                     )
                else
                    data[ , conGVars] <<-
                        cutFun(frozenGVars, nGVarCats, probVec)
            }
            else {
                tmp               <-  data[ , conGVars]
                data[ , conGVars] <<- frozenGVars
                frozenGVars       <<- tmp
            }
        },

        createPatterns = function()
        {
            "Create patterns to use for group-mean substitution"
            ## Deal with any continuous grouping variables:
            gVarCheck <- any(typeVec[groupVars] == "continuous")
            if(gVarCheck) binGroupVars()

            ## 'patterns' list a list of vectors made up of cross-tabulated
            ## grouping variables:
            patterns <<-
                lapply(c(length(groupVars) : 2),
                       FUN = function(x, data) {
                           tmpData <- data[ , groupVars[1 : x]]
                           apply(# Merge columns of tmpData into a single vector
                               apply(tmpData, 2, as.character),# Type-cast tmpData
                               1, paste, collapse = "")
                       },
                       data = data)

            patterns[[length(groupVars)]] <<- format(data[ , groupVars[1]])

            if(gVarCheck) binGroupVars(undo = TRUE)
        },

        completeMiData = function()
        {
            "Complete the multiply imputed data sets"
            specialComp <- compFormat %in% c("long", "broad", "repeated")
            if(specialComp) {
                miDatasets <<- complete(miceObject, compFormat)
                pcCols <-
                    grep("^linPC\\d|^nonLinPC\\d", colnames(miDatasets))
                miDatasets <<- miDatasets[ , -pcCols]
            }
            else {
                miDatasets <<- list()
                for(m in 1 : nImps) {
                    miDatasets[[m]] <<- complete(miceObject, m)
                    if(m == 1) pcCols <- grep("^linPC\\d|^nonLinPC\\d",
                                              colnames(miDatasets[[m]])
                                              )
                    miDatasets[[m]] <<- miDatasets[[m]][ , -pcCols]
                }
            }
        },

        transformMiData = function()
        {
            "Format imputed data sets after parallelMice()"

            ## Remove the PcAux:
            pcCols <- grep("^linPC\\d|^nonLinPC\\d", colnames(miDatasets[[1]]))
            for(m in 1 : nImps) miDatasets[[m]] <<- miDatasets[[m]][ , -pcCols]

            ## Impose the requested completion format:
            if(compFormat == "long") {
                .imp <- rep(c(1 : nImps), each = nrow(data))
                .id  <- rep(c(1 : nrow(data)), nImps       )
                miDatasets <<-
                    data.frame(.imp, .id, do.call(rbind.data.frame, miDatasets))
            }
            else if(compFormat %in% c("broad", "repeated")) {
                for(m in 1 : nImps) {
                    colnames(miDatasets[[m]]) <<-
                        paste0(colnames(miDatasets[[m]]), ".", m)
                }

                miDatasets <<- do.call(cbind.data.frame, miDatasets)

                if(compFormat == "repeated") {
                    tmp <-  rep(c(1 : (ncol(data) - length(pcCols))), nImps)
                    miDatasets <<- miDatasets[ , order(tmp)]
                }
            }
        },

        computeInteract = function()
        {
            "Calculate interaction terms"
            if(length(pcAux$lin) > 0) # Do we have linear PcAux?
                pcNames <- setdiff(colnames(pcAux$lin), idVars)

            ## Cast factors to numeric:
            if(length(ordVars) > 0) castOrdVars()
            if(length(nomVars) > 0) castNomVars()

            ## Define moderators and focal predictors:
            if(intMeth < 3) mods <- moderators
            else            mods <- colnames(data)

            if(intMeth == 1) focal <- setdiff(colnames(data), mods)
            else             focal <- pcNames

            ## Generate a list of interacted variable pairs:
            if(length(focal) == 0) {
                ## All possible two-way interactions:
                varCombs <- combn(mods, 2, simplify = FALSE)
            }
            else {
                ## Subset of interactions defined by user:
                varCombs <- list()
                for(m in mods)
                    for(f in focal)
                        varCombs[[paste0(m, f)]] <- c(m, f)
            }

            ## Generate variable names for interaction terms:
            intVars <<- as.character(sapply(varCombs, paste0, collapse = "."))

            ## Compute the interaction terms:
            if(intMeth == 1)
                interact <<- data.frame(
                    lapply(varCombs,
                           function(x, dat) dat[ , x[1]] * dat[ , x[2]],
                           dat = data)
                )
            else
                interact <<- data.frame(
                    lapply(varCombs,
                           function(x, dat, pc) dat[ , x[1]] * pc[ , x[2]],
                           dat = data,
                           pc  = pcAux$lin[ , pcNames])
                )
            colnames(interact) <<- intVars

            ## Remove dummy codes for empty cells:
            levVec   <-  sapply(interact, countLevels)
            interact <<- interact[ , levVec > 1]
            intVars  <<- colnames(interact)

            ## Cast dummy codes as factors:
            dumFlag <-
                sapply(interact,
                       function(x) all(unique(na.omit(x)) %in% c(0, 1))
                       )

            if(sum(dumFlag) > 1)
                interact[ , dumFlag] <<-
                    data.frame(lapply(interact[ , dumFlag], as.factor))
            else if(sum(dumFlag) == 1)
                interact[ , dumFlag] <<- as.factor(interact[ , dumFlag])

            ## If any PcAux are involved, orthogonalize the interaction terms
            ## w.r.t. the linear PcAux scores:
            if(intMeth > 1)
                for(v in 1 : ncol(interact))
                    interact[ , v] <<-
                        .lm.fit(y = interact[ , v],
                                x = as.matrix(pcAux$lin[ , pcNames]))$resid

            ## Undo type-cast of ordered factors:
            if(length(ordVars) > 0) castOrdVars(toNumeric = FALSE)
            if(length(nomVars) > 0) castNomVars(toNumeric = FALSE)
        },

        computePoly = function()
        {
            "Compute polynomial terms"
            ## Cast ordered factors to numeric:
            if(length(ordVars) > 0) castOrdVars()

            dataNames <- setdiff(colnames(data), c(intVars, nomVars))

            ## Construct a formula to define the polynomial transformations:
            form <- as.formula(
                paste0("~",
                       paste0("I(",
                              paste(dataNames,
                                    rep(c(2 : maxPower),
                                        each = length(dataNames)),
                                    sep = "^"),
                              ")",
                              collapse = " + "
                              )
                       )
            )

            ## Make sure missing values are retained in dummy codes:
            oldOpt <- options(na.action = "na.pass")

            ## Create the polynominal terms:
            if(length(dataNames) == 1) # Hack for only one target variable
                poly <<- data.frame(
                    model.matrix(form,
                                 data = as.data.frame(
                                     list(data[ , dataNames]),
                                     col.names = dataNames)
                                 )[ , -1]
                )
            else
                poly <<- data.frame(
                    model.matrix(form, data = data[ , dataNames])
                )[ , -1]

            ## Reset the na.action option:
            options(na.action = oldOpt$na.action)

            ## Revert ordinal variable casting:
            if(length(ordVars) > 0) castOrdVars(toNumeric = FALSE)
        },

                                        #calcRSquared    = function()                                               {
                                        #    "Compute the proportion of variance explained by PcAux"
                                        #    if(length(pcAux$lin) == 0) lv <- "lin"
                                        #    else                       lv <- "nonLin"
                                        #
                                        #    ## Compute the cumulative variance explained:
                                        #    totalVar <- sum(rSquared[[lv]], na.rm = TRUE)
                                        #
                                        #    rSquared[[lv]][1] <<- rSquared[[lv]][1] / totalVar
                                        #
                                        #    for(i in 2 : length(rSquared[[lv]]))
                                        #        rSquared[[lv]][i] <<-
                                        #            rSquared[[lv]][i - 1] + (rSquared[[lv]][i] / totalVar)
                                        #},

        codeNomVars = function()
        {
            "Dummy code nominal factors"
            noms <- colnames(data)[colnames(data) %in%
                                   setdiff(nomVars, dropVars[,1])]

            ## Store factor representations:
            facNoms           <<- data.frame(data[ , noms])
            colnames(facNoms) <<- noms

            for(v in noms) {
                ## Expand factors into dummy codes:

                ## Make sure missing values are retained in dummy codes:
                oldOpt <- options(na.action = "na.pass")

                ## Create/store dummy codes:
                tmp <- data.frame(
                    model.matrix(as.formula(paste0("~", v)), data = data)
                )
                dumNames <- colnames(tmp)

                ## Reset the na.action option:
                options(na.action = oldOpt$na.action)

                ## Remove dummy codes for empty factor levels:
                levVec                 <-  sapply(tmp, countLevels)
                dumNoms[[v]]           <<- data.frame(tmp[ , levVec > 1])
                colnames(dumNoms[[v]]) <<- dumNames[levVec > 1]
            }
            ## Save dummy code names:
            dumVars <<- as.character(unlist(lapply(dumNoms, colnames)))
        },

        castOrdVars = function(toNumeric = TRUE)
        {
            "Cast ordinal factors to numeric variables"
            ## Find ordinal variables that are still on the data set:
            ords <- colnames(data)[colnames(data) %in% ordVars]

            if(toNumeric) {
                ## Cast the ordinal variables as numeric:
                if(length(ords) > 1)
                    data[ , ords] <<-
                        data.frame(lapply(data[ , ords], as.numeric))
                else
                    data[ , ords] <<- as.numeric(data[ , ords])
            }
            else {
                ## Cast back to ordered factors:
                if(length(ords) > 1)
                    data[ , ords] <<-
                        data.frame(lapply(data[ , ords], as.ordered))
                else
                    data[ , ords] <<- as.ordered(data[ , ords])
            }
        },

        castNomVars = function(toNumeric = TRUE)
        {
            "Swap factor and dummy-coded representations of nominal variables"
            if(toNumeric) {# Replace factors in the data with dummy codes
                otherNames     <-  setdiff(colnames(data), nomVars)
                data           <<- data.frame(data[ , otherNames],
                                              do.call(cbind, dumNoms)
                                              )
                colnames(data) <<- c(otherNames, dumVars)

                ## Update the moderators with dummy code names:
                check <- moderators %in% nomVars
                if(any(check)) {
                    frozenMods <<- moderators
                    moderators <<-
                        c(moderators[!check],
                          unlist(lapply(dumNoms[moderators[check]], colnames),
                                 use.names = FALSE)
                          )
                }
            }
            else {# Undo dummy coding
                data <<-
                    data.frame(data[ , setdiff(colnames(data), dumVars)],
                               facNoms)

                ## Revert to original moderator list:
                moderators <<- frozenMods
            }
        }

    )# END PcAuxData$methods()
dsquire/PcAux documentation built on May 2, 2024, 4:48 a.m.