R/transform_customers.R

Defines functions transform_customers

Documented in transform_customers

#
#
#
#' Modify the customers data.
#'
#' Adds customer history to the \emph{customers} data by joining
#'   the 4 data sets. Cleans some variables and also makes some new ones.
#'
#'   \bold{Modifies/ Cleans:}
#'     \itemize{
#'       \item Modifies the \code{top_quality & jewelry_quality} columns in
#'         the \emph{customers} data
#'       \item Cleans the \emph{style} and the \emph{address}
#'       \item Also cleans \code{shipping_period & changed_periods}
#'       }
#'   \bold{Adds:}
#'     \itemize{
#'       \item \code{No_sh_periods}, number of changed shipment periods
#'       \item \code{age}, fixes the birthdate
#'       \item \code{age_group}, groups them into categories
#'       }
#'
#'
#' @param customers \bold{Data frame}, from the \emph{customers.rds}
#'   file
#' @param customer_styles \bold{Data frame}, from the
#'   \emph{customers_styles.rds} file
#' @param addresses \bold{Data frame}, from the \emph{addresses.rds}
#'   file
#' @param income \bold{Data frame}, from the
#'   \emph{median_income_by_zipcode.csv} file
#'
#' @examples
#' customers = readRDS('Data/Main_data/customers.rds')
#' customer_styles <- readRDS('Data/Main_data/customer_styles.rds')
#' addresses <- readRDS('Data/Main_data/addresses.rds')
#' income <- read.csv('Data/Auxiliary_data/median_income_by_zipcode.csv')
#' transformed_customers_data <- transform_customers(customers, customer_styles,
#'                                                   addresses, income)
#'
#'@return Returns the \emph{customers} data \emph{(a data frame)} with the
#'  aforementioned modifications.


## Workflow:
## changes the "top_quality" & "jewelry_quality" variables in customers
## counts the number of changed shipping periods(12/21) and creates
## No_sh_periods
## cleans if there is more than one style for the same customer
## cleans the address
## joins customers, customers_styles & address, cleans the
## shipping_period and changed_periods
## make age, age_group variables, fixes current_status
## and finally adds the median income by state zip codes

# Preprocess individual files --------------------------------------------------

#' @export
transform_customers <- function(customers, customers_styles,
                                address, income){
  customers %<>%  select(-style)

  #Customer quality choice
  customers_styles %<>%
    select(-quiz_version, -created_by_user_id) %>%
    mutate(
      top_quality = case_when(
        top_quality %in% c('average', 'decent', 'decent_3', 'less_than_25') ~ 'less than $25',
        top_quality %in% c('extremely', 'average_3', 'extremely_3', 'more_than_25') ~ 'more than $25',
        TRUE ~ top_quality
      ),
      jewelry_quality = case_when(
        jewelry_quality %in% c('average', 'decent', 'decent_3', 'less_than_25') ~ 'less than $25',
        jewelry_quality %in% c('extremely', 'average_3', 'extremely_3', 'more_than_25') ~ 'more than $25',
        TRUE ~ jewelry_quality
      )
    )

  No_sh_periods <- customers_styles %>%
    group_by(customer_id) %>%
    summarise(
      Changed_periods = n_distinct(shipping_period)
    ) %>% ungroup

  customers_styles %<>%
    group_by(customer_id) %>%
    arrange(as.Date(created_at), id) %>%
    filter(row_number()==1) %>%
    rename(
      Top_size = top_size,
      Bottom_size = bottom_size
    ) %>% ungroup() %>%
    left_join( No_sh_periods,by = "customer_id") %>%
    select(-id)


  address %<>%
    group_by(customer_id) %>%
    arrange(desc(is_validated)) %>%
    slice(1) %>%
    select(customer_id, zip, city, state_id, is_validated)

  # Join the customers with initial information and choices -----------------

  customers %<>%
    left_join(customers_styles, by = c('id'='customer_id')) %>%
    left_join(address, by = c('id'='customer_id')) %>%
    mutate(
      shipping_period = case_when(
        as.Date(created_at.x)!=as.Date(created_at.y) ~ 'Unknown',
        as.Date(created_at.x)==as.Date(created_at.y) ~ as.character(shipping_period)
      ),
      Changed_periods = case_when(
        Changed_periods > 1 ~ as.character(Changed_periods),
        Changed_periods==1 & shipping_period=='Unknown' ~ 'Unknown',
        Changed_periods==1 & shipping_period!='Unknown' ~ as.character(Changed_periods)
      )
    )

  # Fix birthdate and create age groups and current status ------------------

  customers %<>%
    mutate(
      age = -lubridate::year(birthdate) + lubridate::year(Sys.Date()),
      age_group = cut(age, breaks = c(0,25,30,40, 90),
                      labels = c('25 and under', '26 to 30', '31 to 40', 'Over 40'),
                      include.lowest = TRUE)
    ) %>% rename(
      First_ship_period = shipping_period,
      created_at = created_at.x
    ) %>% mutate(
      First_ship_period = case_when(
        First_ship_period == '12' ~ '2 weeks',
        First_ship_period == '21' ~ '4 weeks',
        First_ship_period == 'Unknown' ~ 'Unknown'
      ),
      Status = case_when(
        customer_status %in% c('cancelled', 'inactive') ~ 'Dead',
        !customer_status %in% c('cancelled', 'inactive') ~ 'Alive'
      )
    ) %>% select(-created_at.y)

  #Add income
  #Transform Income
  income %<>% mutate(
    Income = cut(Median,
                 breaks = c(0,30000,50000,80000,10000000),
                 include.lowest = TRUE,
                 labels = c('Under 30K', '30K-50K', '50K-80K', '+80K')),
    Zip = as.character(Zip)
  ) %>% rename(Median_income = Median)

  customers %<>% left_join(income, by = c('zip' = 'Zip'))

  return(customers)

}
shahreyar-abeer/nadinewestr documentation built on May 27, 2019, 1:06 a.m.