R/data_envelope.R

#' Create a Data Envelope
#'
#' A Data Envelope is a simple list data stores probabilistic information about
#' the variables in a \code{data.frame}.
#'
#' @noRd
#'
#' @author David Navega
#'
#' @param x a data.frame
#' @param type a string defining the type of envelope. type = "data" creates an
#' envelope with the marginal probabilities (Default). type = "uniform" assumes
#' uniform marginal probability for the variables.
#'
create_envelope <- function(x, type = "data") {

  x <- as.data.frame(x)
  x_names <- colnames(x)

  x_class <- lapply(x, class)
  x_class <- sapply(x_class, numeric_factor_class)

  envelope <- named_apply(x_names, function(name) {
    # switch variable ----
    switch(
      EXPR = x_class[name],
      factor = {
        # swith envelope type ----
        switch(
          EXPR = type,
          data = {

            value <- levels(x[[name]])
            probability <- m_estimator(x[[name]])

            # return
            rout <- list(
              value = value,
              probability = probability,
              class = "factor"
            )
            return(rout)

          },
          uniform = {

            value <- levels(x[[name]])
            k <- nlevels(x[[name]])
            probability <- rep(x = 1 / k, times = k)

            # return
            rout <- list(
              value = value,
              probability = probability,
              class = "factor"
            )
            return(rout)

          },
          { stop("[-] type = 'data' or type = 'uniform'") }
        )

      },
      numeric = {

         # swith envelope type ----
          switch(
            EXPR = type,
            data = {

              x_range <- range(x[[name]], na.rm = T)
              from <- x_range[1]
              to <- x_range[2]

              kde <- stats::density(x[[name]], from = from, to = to, na.rm = T)


              value <- kde$x
              probability <- kde$y / sum(kde$y)

              # return
              rout <- list(
                value = value,
                probability = probability,
                class = "numeric"
              )
              return(rout)

            },
            uniform = {

              x_range <- range(x[[name]], na.rm = T)
              from <- x_range[1]
              to <- x_range[2]

              value <- seq(from = from, to = to, length.out = 512)
              probability <- rep(x = 1 / 512, times = 512)

              # return
              rout <- list(
                value = value,
                probability = probability,
                class = "numeric"
              )
              return(rout)

            },
            { stop("[-] type = 'data' or type = 'uniform'") }
          )

      }
    )
  })

  # return ----
  rout <- envelope
  return(rout)

}


#' Sample from a Data Envelope
#'
#' @noRd
#'
#' @author David Navega
#'
#' @param object a data envelope object
#' @param n an integer defining the number of observation to sample
#' @param replace a logical indicating if sampling is done with or without
#' replacement. Default is TRUE
#'
sample_envelope <- function(object, n, replace = TRUE) {

  x_names <- names(object)
  sampled_list <- named_apply(x_names, function(name) {

    sub_object <- object[[name]]
    sampled_value <- sample(
      x = sub_object$value,
      prob = sub_object$probability,
      size = n,
      replace = replace
    )

    condition <- sub_object$class == "factor"
    if(condition) {
      sampled_value <- factor(sampled_value, levels = sub_object$value)
    }

    # return
    rout <- sampled_value
    return(rout)

  })

  # return ----
  rout <- as.data.frame(sampled_list)
  colnames(rout) <- x_names
  return(rout)

}
dsnavega/imputeForest documentation built on May 8, 2019, 2:43 p.m.