R/multilabel_binarizer.R

Defines functions multi_label_binarizer

Documented in multi_label_binarizer

#' Multi-label Binarizer
#'
#' @param data Input data frame
#' @param feature Unquoted form of the feature/column to encode
#' @param sep Inner separator of the labels
#' @param as_integer Whether the boolean should be converted to integer or not
#'
#' @return Data frame with additional feature columns
#' @export
#'
#' @examples
multi_label_binarizer <- function(data, feature, sep = "#", as_integer = FALSE) {
  pad_data_with_sep <- function(data, feature, sep = "#") {
    data %>%
      dplyr::mutate(
        # Add separator to start and end of string
        processed_feature = stringr::str_replace_all(
          {{ feature }}, "^|$", sep
        ),
        # Remove duplicate separator
        processed_feature = stringr::str_replace_all(
          processed_feature, rep(sep, 2) %>% paste(sep = "", collapse = ""), sep
        ),
        # Fix NAa
        processed_feature = stringr::str_replace_na(processed_feature)
      ) %>%
      dplyr::select(-{{ feature }})
  }

  # Deparse feature name
  feature_name <- deparse(substitute(feature))

  # Process labels
  labels_raw <- data %>%
    dplyr::pull({{ feature }}) %>%
    stringr::str_split(sep) %>%
    base::unlist() %>%
    base::unique() %>%
    base::sort()

  labels <- paste0(
    # Keep original name
    feature_name, "_",
    # Processed labels
    labels_raw %>%
      # Replace spaces
      stringr::str_replace_all(" ", "_") %>%
      # Replace odd symbols (WIP)
      stringr::str_replace_all("/", "_")
  )

  detection_labels <- labels_raw %>%
    stringr::str_replace_all("^|$", sep)

  # Raw processed feature
  processed_feature <- pad_data_with_sep(data, {{ feature }}, sep) %>%
    dplyr::pull(processed_feature)

  # MLB encoding
  for (i in 1:length(labels)) {
    data[[labels[i]]] <- stringr::str_detect(processed_feature, detection_labels[i])
  }

  # Transform variables into integers
  if (as_integer) {
    data <- data %>%
      dplyr::mutate_at(
        .vars = dplyr::vars(labels),
        .funs = as.integer
      )
  }

  return(data %>% dplyr::select(-{{ feature }}))
}

#' @rdname multi_label_binarizer
#' @export
multi_label_binariser <- multi_label_binarizer
aldomann/encoder documentation built on May 12, 2020, 7:04 p.m.