R/rfmp_derived_variables.R

Defines functions prediction

#### Creating RFMP Derived variables
prediction <-  function(Customer_id,
                        Order_id,
                        product_name,
                        Date_purchase,
                        Quantity,
                        Unit_Price,
                        Amount){
  #data <- data.frame(data2$Customer.Id,data2$Order_id,data2$Item_id,data2$Date, data2$Quantity,data2$Unit.Price, data2$Amount)

  data = data.frame(Customer_id, Order_id,product_name, Date_purchase, Quantity, Unit_Price, Amount)
  names(data)[1] <- "Customer_id"
  names(data)[2] <- "Order_id"
  names(data)[3] <- "Item_id"
  names(data)[4] <- "Date"
  names(data)[5] <- "Quantity"
  names(data)[6] <- "Unit_Price"
  names(data)[7] <- "Amount"
  #recency <- function(Customer_ID, Order_ID, Order_item, Unit_Price)
  data[,4] = as.Date(as.character(data$Date), format="%Y-%m-%d")
  data = na.omit(data)
  data$Ma_Date = max(data$Date)

  #Second_Max = max(Date[Date != max(Date)])
  data <- data %>% group_by(Customer_id) %>% mutate( Total_no_of_orders = length(unique(Order_id)),
                                                     Group_Max = max(Date),Second_Max = max(Date[Date != max(Date)]),
                                                     Recency_of_last_Order = Ma_Date - Group_Max,
                                                     Recency_of_Second_recent_Order = ifelse(is.na(as.Date(as.character(Second_Max), format="%Y-%m-%d")),0, (Ma_Date - Second_Max)),
                                                     Third_Max =  max(Date[Date != max(Date) & Date != Second_Max]),
                                                     Recency_of_Third_recent_Order = ifelse(is.na(as.Date(as.character(Third_Max), format="%Y-%m-%d")),0, (Ma_Date - Third_Max)),
                                                     Fourth_Max =  max(Date[Date != max(Date) & Date != Second_Max & Date != Third_Max]),
                                                     Fifth_Max =  max(Date[Date != max(Date) & Date != Second_Max & Date != Third_Max & Date != Fourth_Max]))





  # Handling missing values
  data$Date = ifelse(is.na(as.Date(as.character(data$Date),format = "%Y-%m-%d")),0,data$Date)
  data$Second_Max <- ifelse(is.na(as.Date(as.character(data$Second_Max), format="%Y-%m-%d")),0,data$Second_Max)
  data$Third_Max <- ifelse(is.na(as.Date(as.character(data$Third_Max), format="%Y-%m-%d")),0,data$Third_Max)
  data$Fourth_Max <- ifelse(is.na(as.Date(as.character(data$Fourth_Max), format="%Y-%m-%d")),0,data$Fourth_Max)
  data$Fifth_Max <- ifelse(is.na(as.Date(as.character(data$Fifth_Max), format="%Y-%m-%d")),0,data$Fifth_Max)
  data$Group_Max <- ifelse(is.na(as.Date(as.character(data$Group_Max), format="%Y-%m-%d")),0,data$Group_Max)

  # Difference in Days, between successive recent order dates
  data$RC_L3_L2_int_days <- data$Third_Max - data$Second_Max
  data$RC_L3_L2_int_days <- ifelse(data$RC_L3_L2_int_days < -365, 0 , data$RC_L3_L2_int_days)
  data$RC_L4_L3_int_days <- data$Fourth_Max - data$Third_Max
  data$RC_L4_L3_int_days <- ifelse(data$RC_L4_L3_int_days < -365, 0 , data$RC_L4_L3_int_days)
  data$RC_L5_L4_int_days <- data$Fifth_Max - data$Fourth_Max
  data$RC_L5_L4_int_days <- ifelse(data$RC_L5_L4_int_days < -365, 0 , data$RC_L5_L4_int_days)

  # Difference in days from the 2nd most recent order
  data$RC_L5_L2_int_days <- ifelse(data$Second_Max == 0 | data$Fifth_Max == 0,0,data$Fifth_Max - data$Second_Max)

  # Average interval days from the 2nd most recent order
  #data$RC_L2-L3_avg_days <-
  data$RC_L2_L4_avg_days <- (data$RC_L3_L2_int_days + data$RC_L4_L3_int_days) / 2
  data$RC_L2_L5_avg_days <- (data$RC_L3_L2_int_days + data$RC_L4_L3_int_days + data$RC_L5_L4_int_days) / 3

  # High-Low Dates
  data$RC_Hi_2_5_int_days <- min(data$RC_L3_L2_int_days, data$RC_L4_L3_int_days,data$RC_L5_L4_int_days)
  data$RC_Lo_2_5_int_days <- max(data$RC_L3_L2_int_days, data$RC_L4_L3_int_days,data$RC_L5_L4_int_days)


  # Finding total recent order amount
  data1 <- sqldf('select Customer_id,sum(Amount) as Recent_Order_Amount,MAX (Date) as Max_date from data group by Customer_id,Order_id', method = "name__class")
  data1 = sqldf("select *,Max(Max_date) from data1 group by Customer_id")
  data <-  merge(data,data1[, c(1,2)], by = "Customer_id", all.x = T)

  # Frequency  Orders in "x" time period
  # data  %>%
  #   filter(Date >= Group_Max - 30) %>%
  #   select(unique.Order_id=Order_id) %>%
  #   group_by(Customer_id) %>%
  #   summarise(count = n())
  data1 = sqldf('select Customer_id,count(DISTINCT Order_id) as Number_of_Orders_last_30_days from data where Date >= Ma_Date - 30 group by Customer_id', method = "name__class")
  data <- merge(data,data1[, c(1,2)], by = "Customer_id", all.x = T, all.y = T)
  data$Number_of_Orders_last_30_days[is.na(data$Number_of_Orders_last_30_days)] = 0
  data1 = sqldf('select Customer_id,count(DISTINCT Order_id) as Number_of_Orders_last_90_days from data where Date >= Ma_Date -90 group by Customer_id', method = "name__class")
  data <- merge(data,data1[, c(1,2)], by = "Customer_id", all.x = T, all.y = T)
  data$Number_of_Orders_last_90_days[is.na(data$Number_of_Orders_last_90_days)] = 0
  data1 = sqldf('select Customer_id,count(DISTINCT Order_id) as Number_of_Orders_last_180_days from data where Date >= Ma_Date - 180 group by Customer_id', method = "name__class")
  data <- merge(data,data1[, c(1,2)], by = "Customer_id", all.x = T, all.y = T)
  data$Number_of_Orders_last_180_days[is.na(data$Number_of_Orders_last_180_days)] = 0
  data1 = sqldf('select Customer_id,count(DISTINCT Order_id) as Number_of_Orders_last_365_days from data where Date >= Ma_Date - 365 group by Customer_id', method = "name__class")
  data <- merge(data,data1[, c(1,2)], by = "Customer_id", all.x = T, all.y = T)
  data$Number_of_Orders_last_365_days[is.na(data$Number_of_Orders_last_365_days)] = 0

  #  Distinct Products in "x" orders
  data3 = sqldf("select Customer_id, count(DISTINCT Item_id) as No_Of_Distinct_Products from data where Date == Second_Max group by Customer_id",method = "name__class")
  data <- merge(data,data3[, c(1,2)], by = "Customer_id", all.x = T, all.y = T)
  data3 = sqldf("select Customer_id, count(DISTINCT Item_id) as No_Of_Distinct_Products from data where Date == Third_Max group by Customer_id",method = "name__class")
  data <- merge(data,data3[, c(1,2)], by = "Customer_id", all.x = T, all.y = T)

  #  Distinct Product in "x" time period
  data1 = sqldf('select Customer_id,count(DISTINCT Item_id) as Number_of_items_last_30_days from data where Date >= Ma_Date - 30 group by Customer_id', method = "name__class")
  data <- merge(data,data1[, c(1,2)], by = "Customer_id", all.x = T, all.y = T)
  data1 = sqldf('select Customer_id,count(DISTINCT Item_id) as Number_of_items_last_90_days from data where Date >= Ma_Date -90 group by Customer_id', method = "name__class")
  data <- merge(data,data1[, c(1,2)], by = "Customer_id", all.x = T, all.y = T)
  data1 = sqldf('select Customer_id,count(DISTINCT Item_id) as Number_of_items_last_180_days from data where Date >= Ma_Date - 180 group by Customer_id', method = "name__class")
  data <- merge(data,data1[, c(1,2)], by = "Customer_id", all.x = T, all.y = T)
  data1 = sqldf('select Customer_id,count(DISTINCT Item_id) as Number_of_items_last_365_days from data where Date >= Ma_Date - 365 group by Customer_id', method = "name__class")
  data <- merge(data,data1[, c(1,2)], by = "Customer_id", all.x = T, all.y = T)


  #  Non-Order activity in "x" time period


  #  Revenues in "x" time period
  data1 = sqldf('select Customer_id,SUM(Amount) as Revenue_from_orders_of_last_30_days from data where Date >= Ma_Date - 30 group by Customer_id', method = "name__class")
  data <- merge(data,data1[, c(1,2)], by = "Customer_id", all.x = T, all.y = T)

  data1 = sqldf('select Customer_id,SUM(Amount) as Revenue_from_orders_of_last_90_days from data where Date >= Ma_Date -90 group by Customer_id', method = "name__class")
  data <- merge(data,data1[, c(1,2)], by = "Customer_id", all.x = T, all.y = T)
  data1 = sqldf('select Customer_id,SUM(Amount) as Revenue_from_orders_of_last_180_days from data where Date >= Ma_Date - 180 group by Customer_id', method = "name__class")
  data <- merge(data,data1[, c(1,2)], by = "Customer_id", all.x = T, all.y = T)
  data1 = sqldf('select Customer_id,SUM(Amount) as Revenue_from_orders_of_last_365_days from data where Date >= Ma_Date - 365 group by Customer_id', method = "name__class")
  data <- merge(data,data1[, c(1,2)], by = "Customer_id", all.x = T, all.y = T)


  #  Revenue in last "x" orders, Revenue from 2nd, 3rd and 4th recent order
  data3 = sqldf("select Customer_id, SUM(Amount) as Revenue_from_2nd_Order from data where Date == Second_Max group by Customer_id",method = "name__class")
  data <- merge(data,data3[, c(1,2)], by = "Customer_id", all.x = T, all.y = T)
  data3 = sqldf("select Customer_id, SUM(Amount) as Revenue_from_3rd_Order from data where Date == Third_Max group by Customer_id",method = "name__class")
  data <- merge(data,data3[, c(1,2)], by = "Customer_id", all.x = T, all.y = T)
  data3 = sqldf("select Customer_id, SUM(Amount) as Revenue_from_4th_Order from data where Date == Fourth_Max group by Customer_id",method = "name__class")
  data <- merge(data,data3[, c(1,2)], by = "Customer_id", all.x = T, all.y = T)

  # handling Missing Values
  for (i in 1:ncol(data)) {
    data[,i] = ifelse(is.na(data[,i]) == T,0, data[,i])
  }
  #  Average revenue in last "x" orders
  #data$MT_L2_L3_avg_dols <- (sum(data$Revenue_from_2nd_Order, data$Revenue_from_3rd_Order) / 2)
  #data$MT_L2_L4_avg_dols <- sum(data$Revenue_from_2nd_Order, data$Revenue_from_3rd_Order, data$Revenue_from_4th_Order) / 3



  data <- data %>% group_by(Customer_id) %>% mutate( avg_of_second_Third_order = sum(unique(Revenue_from_2nd_Order),unique(Revenue_from_3rd_Order))/2,
                                                     avg_of_Third_Fourth_order = sum(unique(Revenue_from_3rd_Order),unique(Revenue_from_4th_Order))/2)

  # Building Product derived variables
  #a <- data %>% group_by(Customer_id, Item_id) %>% summarise(total_value = sum(Amount)) %>% mutate()

  #a = sqldf("select *,SUM(Amount) as Total_Amount from data group by  Customer_id, Item_id")
  library(dplyr)
  library(tidyr)
  data <- data  %>% group_by(Customer_id) %>% mutate(Revenue_of_Category1 = max(Amount),
                                                     Revenue_of_Category2= max(Amount[Amount != max(Amount)]),
                                                     Revenue_of_Category3 =  max(Amount[Amount != max(Amount) & Amount != Revenue_of_Category2]),
                                                     Other_Category = sum(Amount[Amount != Revenue_of_Category1 & Amount != Revenue_of_Category2 & Amount != Revenue_of_Category3]))


  # handling Missing Values
  for (i in 1:ncol(data)) {
    data[,i] = ifelse(is.na(data[,i]) == T,0, data[,i])
  }

  # Replace Infinity values by zero
  data <- do.call(data.frame,lapply(data, function(x) replace(x, is.infinite(x),0)))


  # Selecting variables from the dataset
  #names(data)
  input_data = data[,c( "Quantity" , "Unit_Price" ,
                        "Amount",
                        "Total_no_of_orders",
                        "Second_Max", "Recency_of_last_Order",
                        "Recency_of_Second_recent_Order",
                        "Recency_of_Third_recent_Order" ,       "Fourth_Max"       ,
                        "Fifth_Max"  ,                          "RC_L3_L2_int_days" ,
                        "RC_L4_L3_int_days"  ,                  "RC_L5_L4_int_days"  ,
                        "RC_L5_L2_int_days" ,                   "RC_L2_L4_avg_days"  ,
                        "RC_L2_L5_avg_days"  ,                  "RC_Hi_2_5_int_days"  ,
                        "RC_Lo_2_5_int_days" ,                  "Recent_Order_Amount" ,
                        "Number_of_Orders_last_30_days" ,       "Number_of_Orders_last_90_days",
                        "Number_of_Orders_last_180_days" ,      "Number_of_Orders_last_365_days"  ,
                        "No_Of_Distinct_Products.x"   ,         "No_Of_Distinct_Products.y"  ,
                        "Number_of_items_last_30_days"   ,      "Number_of_items_last_90_days"   ,
                        "Number_of_items_last_180_days"  ,      "Number_of_items_last_365_days"  ,
                        "Revenue_from_orders_of_last_30_days",  "Revenue_from_orders_of_last_90_days" ,
                        "Revenue_from_orders_of_last_180_days", "Revenue_from_orders_of_last_365_days",
                        "Revenue_from_2nd_Order"    ,           "Revenue_from_3rd_Order",
                        "Revenue_from_4th_Order"   ,            "avg_of_second_Third_order" ,
                        "avg_of_Third_Fourth_order"   ,         "Revenue_of_Category1"  ,
                        "Revenue_of_Category2"    ,             "Revenue_of_Category3"  ,
                        "Other_Category")]


}
keshav-valgen/RenewalModelling documentation built on May 29, 2019, 11:44 a.m.