R/imputeProductionTriplet.R

Defines functions imputeProductionTriplet

Documented in imputeProductionTriplet

##' This function imputes the whole production domain.
##'
##' The function will impute production, area harvested and yield at the same
##' time.
##'
##' Transformation in the yield formula is not allowed and will not be taken
##' into account.
##'
##' @param data The data
##' @param processingParameters A list of the parameters for the production
##'     processing algorithms. See defaultProductionParameters() for a starting
##'     point.
##' @param formulaParameters A list holding the names and parmater of formulas.
##'     See \code{productionFormulaParameters}.
##' @param imputationParameters A list holding the imputation parameters, see
##'     \code{getImputationParameters}.
##' @export
##'
##' @import faoswsImputation
##' @import data.table
##'

imputeProductionTriplet = function(data,
                                  processingParameters,
                                  formulaParameters,
                                  imputationParameters, ...){

    originDataType = sapply(data, FUN = typeof)

    ################## ADDED TO HANDLE OUTLIERS

    # List additional arguments
    a <- list(...)

    # TODO: Should be moved to R/
    rollavg <- function(x, order = 3) {
      # order should be > 2
      stopifnot(order >= 3)
      
      non_missing <- sum(!is.na(x))
      
      # For cases that have just two non-missing observations
      order <- ifelse(order > 2 & non_missing == 2, 2, order)
      
      if (non_missing == 1) {
        x[is.na(x)] <- na.omit(x)[1]
      } else if (non_missing >= order) {
        n <- 1
        while(any(is.na(x)) & n <= 10) { # 10 is max tries
          movav <- suppressWarnings(RcppRoll::roll_mean(x, order, fill = 'extend', align = 'right'))
          movav <- data.table::shift(movav)
          x[is.na(x)] <- movav[is.na(x)]
          n <- n + 1
        }
        
        x <- zoo::na.fill(x, 'extend')
      }
      
      return(x)
    }

    # function added in order to keep values in line
    fix_out <- function(data,
                        completeImputationKey,
                        imputationTimeWindow,
                        formula_parameters,
                        imputation_parameters,
                        flagValidTable,
                        THRESHOLD,
                        AVG_YEARS) {

      orig_cols <- names(data)

      mydata <- copy(data)

      if (imputationTimeWindow == "all") {
        val_years <- completeImputationKey@dimensions$timePointYears@keys
      } else {
        val_years <-
          lastYear - 0:switch(imputationTimeWindow, "lastThree" = 2, "lastFive" = 4)
      }

      mydata <-
        flagValidTable[
          mydata,
          on = c("flagObservationStatus" = formula_parameters$productionObservationFlag,
                 "flagMethod" = formula_parameters$productionMethodFlag)
        ]

      setnames(
        mydata,
        c("flagObservationStatus", "flagMethod"),
        c(formula_parameters$productionObservationFlag, formula_parameters$productionMethodFlag)
      )

      mydata[,
        `:=`(
          mean_old =
            mean(
              get(formula_parameters$productionValue)[
                get(imputation_parameters$yearValue) %in% AVG_YEARS
              ]
            ),
          mean_protected =
            mean(
              get(formula_parameters$productionValue)[
                get(imputation_parameters$yearValue) %in% (max(AVG_YEARS) + 1):max(val_years) &
                  sum(Protected[get(imputation_parameters$yearValue) %in% (max(AVG_YEARS) + 1):max(val_years)]) >= 2 &
                  Protected == TRUE
              ]
            )
        ),
        by = c(imputation_parameters$byKey)
      ]

      mydata[is.nan(mean_old), mean_old := NA_real_]
      mydata[is.nan(mean_protected), mean_protected := NA_real_]

      # NOTE: some NA ratios are due to M- series
      mydata[,
        `:=`(
          ratio_old       = get(formula_parameters$productionValue) / mean_old,
          ratio_protected = get(formula_parameters$productionValue) / mean_protected
        )
      ]

      mydata[is.infinite(ratio_old), ratio_old := NA_real_]
      mydata[is.infinite(ratio_protected), ratio_protected := NA_real_]

      mydata[, ratio := ifelse(!is.na(ratio_protected), ratio_protected, ratio_old)]

      mydata[, outlier := FALSE]

      mydata[
        get(imputation_parameters$yearValue) %in% val_years &
          Protected == FALSE,
        outlier := abs(ratio - 1) > THRESHOLD
      ]

      mydata[
        outlier == TRUE,
        (formula_parameters$productionValue) := NA_real_
      ]

      mydata[,
        movav :=
          rollavg(
            get(formula_parameters$productionValue),
            order = 3
          ),
          by = c(imputation_parameters$byKey)
      ]

      mydata[
        outlier == TRUE & !is.na(movav),
        `:=`(
          c(formula_parameters$productionValue,
          formula_parameters$productionObservationFlag,
          formula_parameters$productionMethodFlag),
          # Flags are Ee to differentiate them from Ie
          list(movav, "E", "e")
        )
      ]

      return(mydata[, orig_cols, with = FALSE])
    }
    ################## / ADDED TO HANDLE OUTLIERS

    areaHarvestedImputationParameters = imputationParameters$areaHarvestedParams
    yieldImputationParameters = imputationParameters$yieldParams
    productionImputationParameters = imputationParameters$productionParams

    message("Initializing ... ")
    dataCopy = copy(data)
    
    ##filter out (m-) from the imputation process
    
    
    
    ## Data Quality Checks
    suppressMessages({
        ensureImputationInputs(data = dataCopy,
                               imputationParameters = yieldImputationParameters)
        ensureImputationInputs(data = dataCopy,
                               imputationParameters =
                                   productionImputationParameters)
  
        ensureProductionInputs(dataCopy,
                               processingParameters = processingParameters,
                               formulaParameters = formulaParameters,
                               returnData = FALSE,
                               normalised = FALSE)
    })

    setkeyv(x = dataCopy, cols = c(processingParameters$areaVar,
                                   processingParameters$yearValue))


    dataCopy = computeYield(dataCopy,
                            processingParameters = processingParameters,
                            formulaParameters = formulaParameters)
    ## Check whether all values are missing
    allYieldMissing = all(is.na(dataCopy[[formulaParameters$yieldValue]]))
    allProductionMissing = all(is.na(dataCopy[[formulaParameters$productionValue]]))
    allAreaMissing = all(is.na(dataCopy[[formulaParameters$areaHarvestedValue]]))


    if(!all(allYieldMissing)){
        ## Step two: Impute Yield
        message("Imputing Yield ...")
        n.missYield = sum(is.na(dataCopy[[formulaParameters$yieldValue]]))
        ## if(!missing(yieldFormula))
        ##     yieldFormula =
        ##     as.formula(gsub(yearValue, "yearValue",
        ##                     gsub(yieldValue, "yieldValue",
        ##                          deparse(yieldFormula))))

        dataCopy = imputeVariable(data = dataCopy,
                       imputationParameters = yieldImputationParameters)

        if (!is.null(a$FIX_OUTLIERS) && a$FIX_OUTLIERS == TRUE) {
          dataCopy <- fix_out(data                  = dataCopy,
                              completeImputationKey = a$completeImputationKey,
                              imputationTimeWindow  = a$imputationTimeWindow,
                              formula_parameters    = formulaParameters,
                              imputation_parameters = yieldImputationParameters,
                              flagValidTable        = a$flagValidTable,
                              THRESHOLD             = a$THRESHOLD,
                              AVG_YEARS             = a$AVG_YEARS)
        }

        ## TODO (Michael): Remove imputed zero yield as yield can not be zero by
        ##                 definition. This probably should be handled in the
        ##                 imputation parameter.
        ## Francesca: there is no reson why the zero yields have to be deleted!!
        ## It is the opposite: team B/C do not want to have yield when there is no production
        ## no areaHarvested!
        ##dataCopy =
        ##    removeZeroYield(dataCopy,
        ##                    yieldValue = formulaParameters$yieldValue,
        ##                    yieldObsFlag = formulaParameters$yieldObservationFlag,
        ##                    yieldMethodFlag = formulaParameters$yieldMethodFlag)
        n.missYield2 = length(which(is.na(
            dataCopy[[formulaParameters$yieldValue]])))
        message("Number of values imputed: ", n.missYield - n.missYield2)
        message("Number of values still missing: ", n.missYield2)

        ## Balance production now using imputed yield
        dataCopy =
            balanceProduction(data = dataCopy,
                              processingParameters = processingParameters,
                              formulaParameters = formulaParameters)

        ## NOTE (Michael): Check again whether production is available
        ##                 now after it is balanced.
        allProductionMissing = all(is.na(dataCopy[[formulaParameters$productionValue]]))
    } else {
        warning("The input dataset contains insufficient data to impute yield!")
    }

    if(!all(allProductionMissing)){
        ## step three: Impute production
        message("Imputing Production ...")
        n.missProduction = length(which(is.na(
            dataCopy[[formulaParameters$productionValue]])))

        dataCopy = imputeVariable(data = dataCopy,
                       imputationParameters = productionImputationParameters)

        if (!is.null(a$FIX_OUTLIERS) && a$FIX_OUTLIERS == TRUE) {
          dataCopy <- fix_out(data                  = dataCopy,
                              completeImputationKey = a$completeImputationKey,
                              imputationTimeWindow  = a$imputationTimeWindow,
                              formula_parameters    = formulaParameters,
                              imputation_parameters = productionImputationParameters,
                              flagValidTable        = a$flagValidTable,
                              THRESHOLD             = a$THRESHOLD,
                              AVG_YEARS             = a$AVG_YEARS)
        }

        n.missProduction2 = length(which(is.na(
            dataCopy[[formulaParameters$productionValue]])))
        message("Number of values imputed: ",
            n.missProduction - n.missProduction2)
        message("Number of values still missing: ", n.missProduction2)
    } else {
        warning("The input dataset contains insufficient data to impute production!")
    }

    ## step four: balance area harvested
    message("Imputing Area Harvested ...")
    n.missAreaHarvested =
        length(which(is.na(
            dataCopy[[formulaParameters$areaHarvestedValue]])))

    dataCopy =
        balanceAreaHarvested(data = dataCopy,
                             processingParameters = processingParameters,
                             formulaParameters = formulaParameters)
    allAreaMissing = all(is.na(dataCopy[[formulaParameters$areaHarvestedValue]]))

    if(!all(allAreaMissing)){
        ## HACK (Michael): This is to ensure the area harvested are also
        ##                 imputed. Then we delete all computed yield and
        ##                 then balance again. This causes the yield not
        ##                 comforming to the imputation model.
        ##
        ##                 This whole function should be re-writtened so
        ##                 that an algorithm similar to the EM algorithm
        ##                 estimates and impute the triplet in a conherent
        ##                 way.
        ##
        ##                 Issue #88
       
        dataCopy = imputeVariable(data = dataCopy,
                      imputationParameters = areaHarvestedImputationParameters)

        if (!is.null(a$FIX_OUTLIERS) && a$FIX_OUTLIERS == TRUE) {
          dataCopy <- fix_out(data                  = dataCopy,
                              completeImputationKey = a$completeImputationKey,
                              imputationTimeWindow  = a$imputationTimeWindow,
                              formula_parameters    = formulaParameters,
                              imputation_parameters = areaHarvestedImputationParameters,
                              flagValidTable        = a$flagValidTable,
                              THRESHOLD             = a$THRESHOLD,
                              AVG_YEARS             = a$AVG_YEARS)
        }
        
    ## It was this part that caused the double "i" in methodFlag in the same triplet:
    ## beacuse I was deliting those non-protected yields even if I had used them to compute
    ## as identity the other variables.   
    ##   dataCopy[!is.na(get(formulaParameters$areaHarvestedValue)) &
    ##            !is.na(get(formulaParameters$productionValue)) &
    ##            !(combineFlag(.SD,
    ##                          formulaParameters$yieldObservationFlag,
    ##                          formulaParameters$yieldMethodFlag) %in%
    ##              getProtectedFlag()),
    ##            `:=`(c(formulaParameters$yieldValue,
    ##                   formulaParameters$yieldObservationFlag,
    ##                   formulaParameters$yieldMethodFlag),
    ##                 list(NA, "M", "u"))]
        dataCopy =
            computeYield(dataCopy,
                         processingParameters = processingParameters,
                         formulaParameters = formulaParameters)
        dataCopy = imputeVariable(data = dataCopy,
                       imputationParameters = yieldImputationParameters)

        if (!is.null(a$FIX_OUTLIERS) && a$FIX_OUTLIERS == TRUE) {
          dataCopy <- fix_out(data                  = dataCopy,
                              completeImputationKey = a$completeImputationKey,
                              imputationTimeWindow  = a$imputationTimeWindow,
                              formula_parameters    = formulaParameters,
                              imputation_parameters = yieldImputationParameters,
                              flagValidTable        = a$flagValidTable,
                              THRESHOLD             = a$THRESHOLD,
                              AVG_YEARS             = a$AVG_YEARS)
        }
       
    } ## End of HACK.
    n.missAreaHarvested2 =
        length(which(is.na(
            dataCopy[[formulaParameters$areaHarvestedValue]])))
    message("Number of values imputed: ",
        n.missAreaHarvested - n.missAreaHarvested2)
    message("Number of values still missing: ", n.missAreaHarvested2)


    ## This is to ensure the data type of the output is identical to
    ## the input data.
    dataCopy[, `:=`(colnames(dataCopy),
                    lapply(colnames(dataCopy),
                           FUN = function(x){
                        if(x %in% names(originDataType)){
                            as(.SD[[x]], originDataType[[x]])
                        } else {
                            .SD[[x]]
                        }
                    }))]
    dataCopy
}
SWS-Methodology/faoswsProduction documentation built on March 21, 2023, 8:27 p.m.