#
#
#
#' Modify the customers data.
#'
#' Adds customer history to the \emph{customers} data by joining
#' the 4 data sets. Cleans some variables and also makes some new ones.
#'
#' \bold{Modifies/ Cleans:}
#' \itemize{
#' \item Modifies the \code{top_quality & jewelry_quality} columns in
#' the \emph{customers} data
#' \item Cleans the \emph{style} and the \emph{address}
#' \item Also cleans \code{shipping_period & changed_periods}
#' }
#' \bold{Adds:}
#' \itemize{
#' \item \code{No_sh_periods}, number of changed shipment periods
#' \item \code{age}, fixes the birthdate
#' \item \code{age_group}, groups them into categories
#' }
#'
#'
#' @param customers \bold{Data frame}, from the \emph{customers.rds}
#' file
#' @param customer_styles \bold{Data frame}, from the
#' \emph{customers_styles.rds} file
#' @param addresses \bold{Data frame}, from the \emph{addresses.rds}
#' file
#' @param income \bold{Data frame}, from the
#' \emph{median_income_by_zipcode.csv} file
#'
#' @examples
#' customers = readRDS('Data/Main_data/customers.rds')
#' customer_styles <- readRDS('Data/Main_data/customer_styles.rds')
#' addresses <- readRDS('Data/Main_data/addresses.rds')
#' income <- read.csv('Data/Auxiliary_data/median_income_by_zipcode.csv')
#' transformed_customers_data <- transform_customers(customers, customer_styles,
#' addresses, income)
#'
#'@return Returns the \emph{customers} data \emph{(a data frame)} with the
#' aforementioned modifications.
## Workflow:
## changes the "top_quality" & "jewelry_quality" variables in customers
## counts the number of changed shipping periods(12/21) and creates
## No_sh_periods
## cleans if there is more than one style for the same customer
## cleans the address
## joins customers, customers_styles & address, cleans the
## shipping_period and changed_periods
## make age, age_group variables, fixes current_status
## and finally adds the median income by state zip codes
# Preprocess individual files --------------------------------------------------
#' @export
transform_customers <- function(customers, customers_styles,
address, income){
customers %<>% select(-style)
#Customer quality choice
customers_styles %<>%
select(-quiz_version, -created_by_user_id) %>%
mutate(
top_quality = case_when(
top_quality %in% c('average', 'decent', 'decent_3', 'less_than_25') ~ 'less than $25',
top_quality %in% c('extremely', 'average_3', 'extremely_3', 'more_than_25') ~ 'more than $25',
TRUE ~ top_quality
),
jewelry_quality = case_when(
jewelry_quality %in% c('average', 'decent', 'decent_3', 'less_than_25') ~ 'less than $25',
jewelry_quality %in% c('extremely', 'average_3', 'extremely_3', 'more_than_25') ~ 'more than $25',
TRUE ~ jewelry_quality
)
)
No_sh_periods <- customers_styles %>%
group_by(customer_id) %>%
summarise(
Changed_periods = n_distinct(shipping_period)
) %>% ungroup
customers_styles %<>%
group_by(customer_id) %>%
arrange(as.Date(created_at), id) %>%
filter(row_number()==1) %>%
rename(
Top_size = top_size,
Bottom_size = bottom_size
) %>% ungroup() %>%
left_join( No_sh_periods,by = "customer_id") %>%
select(-id)
address %<>%
group_by(customer_id) %>%
arrange(desc(is_validated)) %>%
slice(1) %>%
select(customer_id, zip, city, state_id, is_validated)
# Join the customers with initial information and choices -----------------
customers %<>%
left_join(customers_styles, by = c('id'='customer_id')) %>%
left_join(address, by = c('id'='customer_id')) %>%
mutate(
shipping_period = case_when(
as.Date(created_at.x)!=as.Date(created_at.y) ~ 'Unknown',
as.Date(created_at.x)==as.Date(created_at.y) ~ as.character(shipping_period)
),
Changed_periods = case_when(
Changed_periods > 1 ~ as.character(Changed_periods),
Changed_periods==1 & shipping_period=='Unknown' ~ 'Unknown',
Changed_periods==1 & shipping_period!='Unknown' ~ as.character(Changed_periods)
)
)
# Fix birthdate and create age groups and current status ------------------
customers %<>%
mutate(
age = -lubridate::year(birthdate) + lubridate::year(Sys.Date()),
age_group = cut(age, breaks = c(0,25,30,40, 90),
labels = c('25 and under', '26 to 30', '31 to 40', 'Over 40'),
include.lowest = TRUE)
) %>% rename(
First_ship_period = shipping_period,
created_at = created_at.x
) %>% mutate(
First_ship_period = case_when(
First_ship_period == '12' ~ '2 weeks',
First_ship_period == '21' ~ '4 weeks',
First_ship_period == 'Unknown' ~ 'Unknown'
),
Status = case_when(
customer_status %in% c('cancelled', 'inactive') ~ 'Dead',
!customer_status %in% c('cancelled', 'inactive') ~ 'Alive'
)
) %>% select(-created_at.y)
#Add income
#Transform Income
income %<>% mutate(
Income = cut(Median,
breaks = c(0,30000,50000,80000,10000000),
include.lowest = TRUE,
labels = c('Under 30K', '30K-50K', '50K-80K', '+80K')),
Zip = as.character(Zip)
) %>% rename(Median_income = Median)
customers %<>% left_join(income, by = c('zip' = 'Zip'))
return(customers)
}
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.