R/clean.R

#' @title Function to clean training and test data
#'
#' @description \code{clean} Function to clean training and test data.
#'
#' @details Function to clean training and test data.
#'
#' @param \code{x} Input dataframe
#'
#' @return Cleaned dataframe
#'
#' @examples
#'
#'\dontrun{
#'  library(catR)
#'  library(dplyr)
#'  library(readr)
#'
#'  train <- "train.csv" %>%
#'    read_csv %>%
#'    clean
#'
#'}
#'
#' @export

clean <- function(
  x,
  dummy_variables = NULL
  ) {

  out <- tryCatch(
    {

      x[["Name_length"]] <- nchar(x[["Name"]])
      x[["Year"]] <- lubridate::year(x[["DateTime"]])
      x[["Month"]] <- lubridate::month(x[["DateTime"]])
      x[["Day"]] <- lubridate::day(x[["DateTime"]])
      x[["Wday"]] <- lubridate::wday(x[["DateTime"]])
      x[["Hour"]] <- lubridate::hour(x[["DateTime"]])
      x[["Weekend"]] <- ifelse(x[["Wday"]] %in% c(1,7), 1, 0)

      x[["TimeOfDay"]] <- "Unknown"
      x[which(x$Hour >= 5 & x$Hour < 12),"TimeOfDay"] <- "Morning"
      x[which(x$Hour >= 12 & x$Hour < 13),"TimeOfDay"] <- "Noon"
      x[which(x$Hour >= 13 & x$Hour < 18),"TimeOfDay"] <- "Afternoon"
      x[which(x$Hour >= 18 & x$Hour < 21),"TimeOfDay"] <- "Evening"
      x[which(x$Hour >= 21 & x$Hour <= 23),"TimeOfDay"] <- "Night"
      x[which(x$Hour >= 0 & x$Hour < 5),"TimeOfDay"] <- "Night"

      x <- tidyr::separate_(x, "SexuponOutcome", c("Fertility","Gender"), fill = "right")
      x[["Gender"]] <- ifelse(is.na(x[["Gender"]]),"Unknown", x[["Gender"]])

      # Convert age into num of days
      x[["Age_days"]] <- sapply(x[["AgeuponOutcome"]], FUN = convert_age)

      # Convert
      x[is.na(x)] <- 0 # Fill NA with 0

      # Simple Breed
      x[["IsMix"]] <- 0
      x[stringr::str_detect(x$Breed, "Mix"), "IsMix"] <- 1

      # Simple Color
      x[["Simple_Colour"]] <- sapply(x[["Color"]], function(x) strsplit(x, split = '/| ')[[1]][1])

      # Dummy variables

      if (!is.null(dummy_variables)) {

        x <- dummy_variables %>%
          purrr::map(~dummy_vars(train,variable = .x)) %>%
          bind_cols %>%
          bind_cols(x, .)

      }

      return(x)
    },
    warning = function(w) {

      w

    },
    error = function(e) {

      e

    },
    finally = {}
  )

  return(out)
}
glaggle/cadR documentation built on May 17, 2019, 6:39 a.m.