R/clv_scratchpad.R

Defines functions calc_days_diff itt_sd create_cal_frame

require(BTYD)
require(data.table)
require(modelpipe)
require(lubridate)
require(rquery)
require(rqdatatable)

# params ------------------------------------------------------------------
cal_period <- as.Date("1997-09-30")

# data load ---------------------------------------------------------------
cdnowElog <- system.file("data/cdnowElog.csv", package = "BTYD")
df <- fread(cdnowElog)
df$date <- ymd(df$date)
setnames(df, "sampleid", "custid")



# Functions ---------------------------------------------------------------

calc_days_diff <- function(x, x_1){
  out <- difftime(x, x_1, units = "days") %>% as.numeric()
  return(out)
}

itt_sd <- function(x, x_1){
  n <- calc_days_diff(x, x_1)
  n <- n[!is.na(n)]
  sd_out <- sum((n - mean(n))^2) / (length(n)-1)
  return(sd_out^(1/2))
}

create_cal_frame <- function(cust_id, sales, date, cal_period){

  df <- data.table(cust_id, sales, date)

  testit::assert("Date entry is not a date.", is.Date(date))
  testit::assert("Sales is not numeric.", is.numeric(sales))
  testit::assert("Missing values in data.", nrow(df) == nrow(df[complete.cases(df), ]))


  df <- df %.>%
    rquery::project(.,
                    sales %:=% sum(sales),
                    groupby = c("cust_id", "date"))

  #summarise calibration data
  df_cal <- df[df$date <= cal_period, ]
  df_cal <- df_cal %.>%
    rquery::project(.,
                    total_sales = sum(sales),
                    mean_sales = mean(sales),
                    trans_cnt = length(date),
                    days_last = calc_days_diff(.(cal_period), max(date)),
                    itt_sd = ifelse(length(sales) <= 2, -1, itt_sd(date, shift(date, 1))),
                    groupby = "cust_id")

  #modeling period
  df_mdl <- df[df$date > cal_period, ]
  df_mdl <- df_mdl %.>%
    rquery::project(.,
                    fut_star = sum(sales),
                    fut_trans = length(date),
                    groupby = "cust_id")
  df_out <- merge(df_cal, df_mdl, all.x = T, by = "cust_id")
  df_out[is.na(df_out)] <- 0

  return(df_out)
}

# Data manipulation -------------------------------------------------------

df_cal <- create_cal_frame(df$custid, df$sales, df$date, cal_period)
df_cal <- split_data(df_cal)

train <- df_cal$df_train
train$target <- train$fut_trans
train_sales <- train$fut_star
train <- train[,c("fut_trans", "fut_star") := NULL]

test <- df_cal$df_test
test$target <- test$fut_trans
test_sales <- test$fut_star
test <- test[,c("fut_trans", "fut_star") := NULL]

mdl <- xgb_pois(train,
                test)
prescient/modelpipe documentation built on Dec. 25, 2019, 3:20 a.m.