R/Data_prep.R

Defines functions Data_prep

Documented in Data_prep

#' Data preparation for the raw data in order to bring them in the format we want
#' @export
#' @param input is a dataset with three columns (customer_id, iri_week, trans_spend)

Data_prep <- function(input) {
  library(dplyr)
  library(sqldf)
  library(data.table)
  #input$cust <- as.numeric(input$cust)
  colnames(input)[2] <- "date"


  # Remove outliers ---------------------------------------------------------

  input_outl <- input[ , .N, by = c('cust') ]
  lower_bound <- quantile(input_outl$N, 0.025)
  upper_bound <- quantile(input_outl$N, 0.975)
  outlier_ind <- which(input_outl$N < lower_bound | input_outl$N > upper_bound)
  input_outl <- input_outl[-which(rownames(input_outl) %in% outlier_ind),]
  df_input= input %>% inner_join(input_outl, by = 'cust')
  df_input <- subset(df_input, select = c(1:3))


  # Aggregate Data weekly ---------------------------------------------------

  input <- df_input[, .(spend=sum(spend)), by=list(cust, date)]


  # Calculate average spend per customer ------------------------------------

  avg_input_spend <- aggregate(input$spend, by=list(input$cust), FUN=mean)
  colnames(avg_input_spend)[1] <- "cust"
  colnames(avg_input_spend)[2] <- "spend"


  input <- subset(input, select = c("cust","date"))

  # Set the last week -------------------------------------------------------

  input$T_cal <- max(input$date)

  # Create freq, recency and total observ period ----------------------------

  input_train <- sqldf("select cust, count(*)-1 as freq, (max(date) - min(date))/7 as recency, (T_cal - min(date))/7 as Tot_Obs from input group by cust", drv = 'SQLite')
  input_train <- as.matrix(input_train)

  return(list(input_train,avg_input_spend))
  #return(input_train)
}
uogss/CLV documentation built on Jan. 29, 2021, 1:50 p.m.