Nothing
################################################## Pareto/NBD estimation, visualization functions
library(hypergeo)
library(optimx)
#' Define general parameters
#'
#' This is to ensure consistency across all functions that require the
#' likelihood function, or the log of it, and to make sure that the same
#' implementation of the hypergeometric function is used everywhere for building
#' \code{A0}.
#'
#' This function is only ever called by either \code{\link{pnbd.LL}} or
#' \code{\link{pnbd.PAlive}} so it returns directly the output that is expected
#' from those calling functions: either the log likelihood for a set of
#' customers, or the probability that a set of customers with characteristics
#' given by \code{x}, \code{t.x} and \code{T.cal}, having estimated a set of
#' \code{params}, is still alive. Either set of customers can be of size 1.
#' @inheritParams pnbd.LL
#' @param func name of the function calling dc.InputCheck; either \code{pnbd.LL}
#' or \code{pnbd.PAlive}.
#' @return A vector of log likelihood values if \code{func} is \code{pnbd.LL},
#' or a vector of probabilities that a customer is still alive if \code{func}
#' is \code{pnbd.PAlive}.
#' @seealso \code{\link{pnbd.LL}}
#' @seealso \code{\link{pnbd.PAlive}}
#' @seealso \code{\link{pnbd.DERT}}
pnbd.generalParams <- function(params,
x,
t.x,
T.cal,
func,
hardie = TRUE) {
# Since pnbd.LL and pnbd.pAlive are the only options
# for func, we don't need a printnames argument
# in the pnbd.generalParams wrapper.
stopifnot(func %in% c('pnbd.LL', 'pnbd.PAlive'))
inputs <- try(dc.InputCheck(params = params,
func = func,
printnames = c("r", "alpha", "s", "beta"),
x = x,
t.x = t.x,
T.cal = T.cal))
if('try-error' == class(inputs)) return(str(inputs)$message)
x <- inputs$x
t.x <- inputs$t.x
T.cal <- inputs$T.cal
r <- params[1]
alpha <- params[2]
s <- params[3]
beta <- params[4]
maxab <- max(alpha, beta)
absab <- abs(alpha - beta)
param2 <- s + 1
if (alpha < beta) {
param2 <- r + x
}
a <- alpha + T.cal
b <- maxab + t.x
c <- beta + T.cal
d <- maxab + T.cal
w <- r + s + x
if(hardie == TRUE) {
F1 <- h2f1(a = w,
b = param2,
c = w + 1,
z = absab / b)
F2 <- h2f1(a = w,
b = param2,
c = w + 1,
z = absab / d)
} else {
F1 <- Re(hypergeo(A = w,
B = param2,
C = w + 1,
z = absab / b))
F2 <- Re(hypergeo(A = w,
B = param2,
C = w + 1,
z = absab / d))
}
A0 <- F1/(b^w) - F2/(d^w)
# You only ever call this function from two other
# places: pnbd.LL or pnbd.PAlive.
if(func == 'pnbd.LL') {
# this returns the log likelihood for one random customer
part1 <- r * log(alpha) +
s * log(beta) +
lgamma(r + x) -
lgamma(r)
part2 <- 1 / (a^(w - s) * c^s)
return(part1 + log(part2) + log(1 + (s/w) * A0 / part2))
}
else if(func == 'pnbd.PAlive') {
# This returns the probability that a random customer is still alive
return(1 / (1 + s/w * a^(w - s) * c^s * A0))
} else {
return(NULL)
}
}
#' Use Bruce Hardie's Gaussian hypergeometric implementation
#'
#' In benchmarking \code{\link{pnbd.LL}} runs more quickly and
#' it returns the same results if it uses this helper instead of
#' \code{\link[hypergeo]{hypergeo}}, which is the default. But \code{h2f1}
#' is such a barebones function that in some edge cases it will keep
#' going until you get a segfault, where \code{\link[hypergeo]{hypergeo}}
#' would have failed with a proper error message.
#'
#' @param a, counterpart to A in \code{\link[hypergeo]{hypergeo}}
#' @param b, counterpart to B in \code{\link[hypergeo]{hypergeo}}
#' @param c, counterpart to C in \code{\link[hypergeo]{hypergeo}}
#' @param z, counterpart to z in \code{\link[hypergeo]{hypergeo}}
#' @seealso \code{\link[hypergeo]{hypergeo}}
#' @references Fader, Peter S., and Bruce G.S. Hardie. "A Note on Deriving the Pareto/NBD Model and
#' Related Expressions." November. 2005. Web. \url{http://www.brucehardie.com/notes/008/}
h2f1 <- function(a, b, c, z) {
lenz <- length(z)
j = 0
uj <- 1:lenz
uj <- uj/uj
y <- uj
lteps <- 0
while (lteps < lenz) {
lasty <- y
j <- j + 1
uj <- uj * (a + j - 1) * (b + j - 1)/(c + j - 1) * z/j
y <- y + uj
lteps <- sum(y == lasty)
}
return(y)
}
#' Pareto/NBD Log-Likelihood
#'
#' Calculates the log-likelihood of the Pareto/NBD model.
#'
#' @param params Pareto/NBD parameters - a vector with r, alpha, s, and beta, in
#' that order. r and alpha are unobserved parameters for the NBD transaction
#' process. s and beta are unobserved parameters for the Pareto (exponential
#' gamma) dropout process.
#' @param x number of repeat transactions in the calibration period T.cal, or a
#' vector of transaction frequencies.
#' @param t.x time of most recent repeat transaction, or a vector of recencies.
#' @param T.cal length of calibration period, or a vector of calibration period
#' lengths.
#' @param hardie if TRUE, use \code{\link{h2f1}} instead of
#' \code{\link[hypergeo]{hypergeo}}.
#'
#' @seealso \code{\link{pnbd.EstimateParameters}}
#'
#' @return A vector of log-likelihoods as long as the longest input vector (x,
#' t.x, or T.cal).
#' @references Fader, Peter S., and Bruce G.S. Hardie. "A Note on Deriving the
#' Pareto/NBD Model and Related Expressions." November. 2005. Web.
#' \url{http://www.brucehardie.com/notes/008/}
#'
#' @examples
#' # Returns the log likelihood of the parameters for a customer who
#' # made 3 transactions in a calibration period that ended at t=6,
#' # with the last transaction occurring at t=4.
#' pnbd.LL(params, x=3, t.x=4, T.cal=6, hardie = TRUE)
#'
#' # We can also give vectors as function parameters:
#' set.seed(7)
#' x <- sample(1:4, 10, replace = TRUE)
#' t.x <- sample(1:4, 10, replace = TRUE)
#' T.cal <- rep(4, 10)
#' pnbd.LL(params, x, t.x, T.cal, hardie = TRUE)
pnbd.LL <- function(params,
x,
t.x,
T.cal,
hardie = TRUE) {
pnbd.generalParams(params = params,
x = x,
t.x = t.x,
T.cal = T.cal,
func = 'pnbd.LL',
hardie = hardie)
}
#' Pareto/NBD P(Alive)
#'
#' Uses Pareto/NBD model parameters and a customer's past transaction behavior
#' to return the probability that they are still alive at the end of the
#' calibration period.
#'
#' P(Alive | X=x, t.x, T.cal, r, alpha, s, beta)
#'
#' x, t.x, and T.cal may be vectors. The standard rules for vector operations
#' apply - if they are not of the same length, shorter vectors will be recycled
#' (start over at the first element) until they are as long as the longest
#' vector. It is advisable to keep vectors to the same length and to use single
#' values for parameters that are to be the same for all calculations. If one of
#' these parameters has a length greater than one, the output will be a vector
#' of probabilities.
#'
#' @inheritParams pnbd.LL
#'
#' @return Probability that the customer is still alive at the end of the
#' calibration period. If x, t.x, and/or T.cal has a length greater than one,
#' then this will be a vector of probabilities (containing one element
#' matching each element of the longest input vector).
#' @references Fader, Peter S., and Bruce G.S. Hardie. "A Note on Deriving the
#' Pareto/NBD Model and Related Expressions." November. 2005. Web.
#' \url{http://www.brucehardie.com/notes/008/}
#'
#' @examples
#' data(cdnowSummary)
#' cbs <- cdnowSummary$cbs
#' params <- pnbd.EstimateParameters(cbs, hardie = TRUE)
#'
#' pnbd.PAlive(params, x=0, t.x=0, T.cal=39, TRUE)
#' # 0.2941633; P(Alive) of a customer who made no repeat transactions.
#'
#' pnbd.PAlive(params, x=23, t.x=39, T.cal=39, TRUE)
#' # 1; P(Alive) of a customer who has the same recency and total
#' # time observed.
#'
#' pnbd.PAlive(params, x=5:20, t.x=30, T.cal=39, TRUE)
#' # Note the "increasing frequency paradox".
#'
#' # To visualize the distribution of P(Alive) across customers:
#' p.alives <- pnbd.PAlive(params, cbs[,"x"], cbs[,"t.x"], cbs[,"T.cal"], TRUE)
#' plot(density(p.alives))
pnbd.PAlive <- function(params,
x,
t.x,
T.cal,
hardie = TRUE) {
pnbd.generalParams(params = params,
x = x,
t.x = t.x,
T.cal = T.cal,
func = 'pnbd.PAlive',
hardie = hardie)
}
#' Pareto/NBD Discounted Expected Residual Transactions
#'
#' Calculates the discounted expected residual transactions of a customer, given
#' their behavior during the calibration period.
#'
#' DERT(d | r, alpha, s, beta, X = x, t.x, T.cal)
#'
#' x, t.x, T.cal may be vectors. The standard rules for vector operations apply
#' - if they are not of the same length, shorter vectors will be recycled (start
#' over at the first element) until they are as long as the longest vector. It
#' is advisable to keep vectors to the same length and to use single values for
#' parameters that are to be the same for all calculations. If one of these
#' parameters has a length greater than one, the output will be also be a
#' vector.
#'
#' @inheritParams pnbd.LL
#' @param d the discount rate to be used. Make sure that it matches up with your
#' chosen time period (do not use an annual rate for monthly data, for
#' example).
#'
#' @return The number of discounted expected residual transactions for a
#' customer with a particular purchase pattern during the calibration period.
#' @references Fader, Peter S., Bruce G.S. Hardie, and Ka L. Lee. "RFM and CLV:
#' Using Iso-Value Curves for Customer Base Analysis." Journal of Marketing
#' Research Vol.42, pp.415-430. November. 2005.
#' \url{http://www.brucehardie.com/papers.html}
#' @references See equation 2.
#' @references Note that this paper refers to what this package is calling
#' discounted expected residual transactions (DERT) simply as discounted
#' expected transactions (DET).
#'
#' @examples
#' # elog <- dc.ReadLines(system.file("data/cdnowElog.csv", package="BTYD2"),2,3)
#' # elog[, 'date'] <- as.Date(elog[, 'date'], format = '%Y%m%d')
#' # cal.cbs <- dc.ElogToCbsCbt(elog)$cal$cbs
#' # params <- pnbd.EstimateParameters(cal.cbs, hardie = TRUE)
#' params <- c(0.5629966, 12.5590370, 0.4081095, 10.5148048)
#'
#' # 15% compounded annually has been converted to 0.0027 compounded continuously,
#' # as we are dealing with weekly data and not annual data.
#' d <- 0.0027
#'
#' # calculate the discounted expected residual transactions of a customer
#' # who made 7 transactions in a calibration period that was 77.86
#' # weeks long, with the last transaction occurring at the end of
#' # the 35th week.
#' pnbd.DERT(params,
#' x = 7,
#' t.x = 35,
#' T.cal = 77.86,
#' d,
#' hardie = TRUE)
#'
#' # We can also use vectors to compute DERT for several customers:
#' pnbd.DERT(params,
#' x = 1:10,
#' t.x = 30,
#' T.cal = 77.86,
#' d,
#' hardie = TRUE)
pnbd.DERT <- function(params,
x,
t.x,
T.cal,
d,
hardie = TRUE) {
loglike <- try(pnbd.LL(params = params,
x = x,
t.x = t.x,
T.cal = T.cal,
hardie = hardie))
if('try-error' %in% class(loglike)) return(loglike)
# This is the remainder of the original pnbd.DERT function def.
# No need to get too clever here. Revert to explicit assignment
# of params to r, alpha, s, beta the old-school way.
r <- params[1]
alpha <- params[2]
s <- params[3]
beta <- params[4]
z <- d * (beta + T.cal)
tricomi.part.1 = ((z)^(1 - s))/(s - 1) *
genhypergeo(U = c(1),
L = c(2 - s),
z = z,
check_mod = FALSE)
tricomi.part.2 = gamma(1 - s) *
genhypergeo(U = c(s),
L = c(s),
z = z,
check_mod = FALSE)
tricomi = tricomi.part.1 + tricomi.part.2
result <- exp(r * log(alpha) +
s * log(beta) +
(s - 1) * log(d) +
lgamma(r + x + 1) +
log(tricomi) -
lgamma(r) -
(r + x + 1) * log(alpha + T.cal) -
loglike)
return(result)
}
#' Pareto/NBD Log-Likelihood
#'
#' Calculates the log-likelihood of the Pareto/NBD model.
#'
#' @inheritParams pnbd.LL
#' @param cal.cbs calibration period CBS (customer by sufficient statistic). It
#' must contain columns for frequency ("x"), recency ("t.x"), and total time
#' observed ("T.cal"). Note that recency must be the time between the start of
#' the calibration period and the customer's last transaction, not the time
#' between the customer's last transaction and the end of the calibration
#' period. If your data is compressed (see \code{\link{dc.compress.cbs}}), a
#' fourth column labelled "custs" (number of customers with a specific
#' combination of recency, frequency and length of calibration period) will
#' make this function faster.
#' @seealso \code{\link{pnbd.EstimateParameters}}
#' @seealso \code{\link{pnbd.LL}}
#' @return The log-likelihood of the provided data.
#' @references Fader, Peter S., and Bruce G.S. Hardie. "A Note on Deriving the
#' Pareto/NBD Model and Related Expressions." November. 2005. Web.
#' \url{http://www.brucehardie.com/notes/008/}
#'
#' @examples
#' data(cdnowSummary)
#' cal.cbs <- cdnowSummary$cbs
#' # cal.cbs already has column names required by method
#'
#' # random assignment of parameters
#' params <- c(0.5, 8, 0.7, 10)
#' # returns the log-likelihood of the given parameters
#' pnbd.cbs.LL (params, cal.cbs, TRUE)
#'
#' # compare the speed and results to the following:
#' cal.cbs.compressed <- dc.compress.cbs(cal.cbs)
#' pnbd.cbs.LL (params, cal.cbs.compressed, TRUE)
pnbd.cbs.LL <- function(params,
cal.cbs,
hardie = TRUE) {
dc.check.model.params(printnames = c("r", "alpha", "s", "beta"),
params = params,
func = "pnbd.cbs.LL")
# Check that you have the right columns.
# They should be 'x', 't.x', 'T.cal' and optionally 'custs'
# in this order. They stand for, respectively
# -- x: frequency
# -- t.x: recency
# -- T.cal: observed calendar time
# -- custs: number of customers with this (x, t.x, T.cal) combo
foo <- colnames(cal.cbs)
stopifnot(foo[1] == 'x' &
foo[2] == 't.x' &
foo[3] == 'T.cal')
x <- cal.cbs[,'x']
t.x <- cal.cbs[,'t.x']
T.cal <- cal.cbs[,'T.cal']
if ("custs" %in% foo) {
custs <- cal.cbs[, "custs"]
} else {
custs <- rep(1, length(x))
}
return(sum(custs * pnbd.LL(params, x, t.x, T.cal, hardie)))
}
#' Pareto/NBD Parameter Estimation
#'
#' The best-fitting parameters are determined using the
#' \code{\link{pnbd.cbs.LL}} function. The sum of the log-likelihood for each
#' customer (for a set of parameters) is maximized in order to estimate
#' parameters.
#'
#' A set of starting parameters must be provided for this method. If no
#' parameters are provided, (1,1,1,1) is used as a default. It may be useful to
#' use starting values for r and s that represent your best guess of the
#' heterogeneity in the buy and die rate of customers. It may be necessary to
#' run the estimation from multiple starting points to ensure that it converges.
#' To compare the log-likelihoods of different parameters, use
#' \code{\link{pnbd.cbs.LL}}.
#'
#' The lower bound on the parameters to be estimated is always zero, since
#' Pareto/NBD parameters cannot be negative. The upper bound can be set with the
#' max.param.value parameter.
#'
#' This function may take some time to run. It uses \code{\link[optimx]{optimx}}
#' for maximum likelihood estimation, not \code{\link[stats]{optim}}.
#'
#' @param cal.cbs calibration period CBS (customer by sufficient statistic). It
#' must contain columns for frequency ("x"), recency ("t.x"), and total time
#' observed ("T.cal"). Note that recency must be the time between the start of
#' the calibration period and the customer's last transaction, not the time
#' between the customer's last transaction and the end of the calibration
#' period. If your data is compressed (see \code{\link{dc.compress.cbs}}), a
#' fourth column labelled "custs" (number of customers with a specific
#' combination of recency, frequency and length of calibration period) will
#' make this function faster.
#' @param par.start initial Pareto/NBD parameters - a vector with r, alpha, s,
#' and beta, in that order. r and alpha are unobserved parameters for the NBD
#' transaction process. s and beta are unobserved parameters for the Pareto
#' (exponential gamma) dropout process.
#' @param max.param.value the upper bound on parameters.
#' @param method the optimization method(s).
#' @param hardie if TRUE, have \code{\link{pnbd.LL}} use \code{\link{h2f1}}
#' instead of \code{\link[hypergeo]{hypergeo}}.
#' @param hessian set it to TRUE if you want the Hessian matrix, and then you
#' might as well have the complete \code{\link[optimx]{optimx}} object
#' returned.
#'
#' @return Unnamed vector of estimated parameters by default, \code{optimx}
#' object with everything if \code{hessian} is TRUE.
#' @seealso \code{\link{pnbd.cbs.LL}}
#' @references Fader, Peter S.; Hardie, and Bruce G.S.. "Overcoming the BG/NBD
#' Model's #NUM! Error Problem." December. 2013. Web.
#' \url{http://brucehardie.com/notes/027/bgnbd_num_error.pdf}
#'
#' @examples
#' data(cdnowSummary)
#'
#' cal.cbs <- cdnowSummary$cbs
#' # cal.cbs already has column names required by method
#'
#' # starting-point parameters
#' startingparams <- c(0.5, 6, 0.9, 8)
#'
#' # estimated parameters
#' est.params <- pnbd.EstimateParameters(cal.cbs = cal.cbs,
#' par.start = startingparams,
#' method = 'L-BFGS-B',
#' hardie = TRUE)
#'
#' # complete object returned by \code{\link[optimx]{optimx}}
#' optimx.set <- pnbd.EstimateParameters(cal.cbs = cal.cbs,
#' par.start = startingparams,
#' hardie = TRUE,
#' hessian = TRUE)
#'
#' # log-likelihood of estimated parameters
#' pnbd.cbs.LL(est.params, cal.cbs, TRUE)
pnbd.EstimateParameters <- function(cal.cbs,
par.start = c(1, 1, 1, 1),
max.param.value = 10000,
method = 'L-BFGS-B',
hardie = TRUE,
hessian = FALSE) {
dc.check.model.params(printnames = c("r", "alpha", "s", "beta"),
params = par.start,
func = "pnbd.EstimateParameters")
## helper function to be optimized
pnbd.eLL <- function(params, cal.cbs, hardie) {
params <- exp(params)
params[params > max.param.value] <- max.param.value
return(-1 * pnbd.cbs.LL(params = params,
cal.cbs = cal.cbs,
hardie = hardie))
}
logparams <- log(par.start)
if(hessian == TRUE) {
return(optimx(par = logparams,
fn = pnbd.eLL,
method = method,
cal.cbs = cal.cbs,
hardie = hardie,
hessian = hessian))
}
results <- optimx(par = logparams,
fn = pnbd.eLL,
method = method,
cal.cbs = cal.cbs,
hardie = hardie)
params <- exp(unname(unlist(results[method, c('p1', 'p2', 'p3', 'p4')])))
return(params)
}
#' Generalized Pareto/NBD Probability Mass Function
#'
#' Generalized probability mass function for the Pareto/NBD.
#'
#' P(X(t.start, t.end)=x | r, alpha, s, beta). Returns the probability that a
#' customer makes x repeat transactions in the time interval (t.start, t.end].
#'
#' It is impossible for a customer to make a negative number of repeat
#' transactions. This function will return an error if it is given negative
#' times or a negative number of repeat transactions. This function will also
#' return an error if t.end is less than t.start.
#'
#' \code{t.start}, \code{t.end}, and \code{x} may be vectors. The standard rules
#' for vector operations apply - if they are not of the same length, shorter
#' vectors will be recycled (start over at the first element) until they are as
#' long as the longest vector. It is advisable to keep vectors to the same
#' length and to use single values for parameters that are to be the same for
#' all calculations. If one of these parameters has a length greater than one,
#' the output will be a vector of probabilities.
#'
#' @param params Pareto/NBD parameters - a vector with r, alpha, s, and beta, in
#' that order. r and alpha are unobserved parameters for the NBD transaction
#' process. s and beta are unobserved parameters for the Pareto (exponential
#' gamma) dropout process.
#' @param t.start start of time period for which probability is being
#' calculated. It can also be a vector of values.
#' @param t.end end of time period for which probability is being calculated. It
#' can also be a vector of values.
#' @param x number of repeat transactions by a random customer in the period
#' defined by (t.start, t.end]. It can also be a vector of values.
#' @param hardie if TRUE, use \code{\link{h2f1}} instead of
#' \code{\link[hypergeo]{hypergeo}}.
#'
#' @return Probability of x transaction occuring between t.start and t.end
#' conditional on model parameters. If t.start, t.end, and/or x has a length
#' greater than one, a vector of probabilities will be returned.
#' @references Fader, Peter S., and Bruce G.S. Hardie. "Deriving an Expression
#' for P (X(t) = x) Under the Pareto/NBD Model." Sept. 2006. Web.
#' \url{http://www.brucehardie.com/notes/012/}
#' @references Fader, Peter S., Bruce G.S. Hardie, and Kinshuk Jerath. "Deriving
#' an Expression for P (X(t, t + tau) = x) Under the Pareto/NBD Model." Sept.
#' 2006. Web. \url{http://www.brucehardie.com/notes/013/}
#'
#' @examples
#' # probability that a customer will make 10 repeat transactions in the
#' # time interval (1,2]
#' data("cdnowSummary")
#' cal.cbs <- cdnowSummary$cbs
#' params <- pnbd.EstimateParameters(cal.cbs = cal.cbs,
#' method = "L-BFGS-B",
#' hardie = TRUE)
#' pnbd.pmf.General(params, t.start=1, t.end=2, x=10, hardie = TRUE)
#' # probability that a customer will make no repeat transactions in the
#' # time interval (39,78]
#' pnbd.pmf.General(params,
#' t.start = 39,
#' t.end = 78,
#' x = 0,
#' hardie = TRUE)
pnbd.pmf.General <- function(params,
t.start,
t.end,
x,
hardie = TRUE) {
if (any(t.start > t.end)) {
stop("Error in pnbd.pmf.General: t.start > t.end.")
}
inputs <- try(dc.InputCheck(params = params,
func = "pnbd.pmf.General",
printnames = c("r", "alpha", "s", "beta"),
t.start = t.start,
t.end = t.end,
x = x))
if('try-error' == class(inputs)) return(str(inputs)$message)
t.start <- inputs$t.start
t.end <- inputs$t.end
x <- inputs$x
max.length <- nrow(inputs)
r <- params[1]
alpha <- params[2]
s <- params[3]
beta <- params[4]
equation.part.0 <- rep(0, max.length)
equation.part.0[x == 0] <- 1 - exp(s * log(beta) - s * log(beta + t.start))
## (t.end - t.start)^x is left outside the exp() in equation.part.1
## because when t.end = t.start and x = 0, exp() gets us into trouble:
## -- exp(0 * log(0)) = NaN;
## -- doing directly 0^0 instead gets us 0^0=1
## -- 1 is much better than NaN.
equation.part.1 <- exp(lgamma(r + x) -
lgamma(r) -
lfactorial(x) +
r * log(alpha) -
r * log(alpha + t.end - t.start) -
x * log(alpha + t.end - t.start) +
s * log(beta) -
s * log(beta + t.end)) *
(t.end - t.start)^x
equation.part.2 <- r * log(alpha) + s * log(beta) + lbeta(r + x, s + 1) - lbeta(r, s)
# Marshal the parameters of the hypergeometric and the
# denominator in the expressions of B1 and B2 shown in
# http://www.brucehardie.com/notes/013/Pareto_NBD_interval_pmf_rev.pdf
B1B2 <- function(hardie,
r,
alpha,
s,
beta,
x,
t.start,
t.end = 0,
ii = 0) {
myalpha <- alpha
mybeta <- beta + t.start
maxab <- max(myalpha, mybeta) + t.end
absab <- abs(myalpha - mybeta)
param2 <- s + 1
if (myalpha < mybeta) {
# < same as <= in the case of
# the hypergeometric, because =
# case is trivial.
param2 <- r + x
}
w <- r + s + ii
a <- w
b <- param2
c <- r + s + x + 1
z <- absab/maxab
den <- maxab^w
if(hardie == TRUE) return(h2f1(a, b, c, z)/den)
return(Re(hypergeo(a, b, c, z))/den)
}
B.1 <- mapply(x = inputs$x,
t.start = inputs$t.start,
B1B2,
hardie = hardie,
r = r,
alpha = alpha,
s = s,
beta = beta)
#B.1 <- B1B2(hardie, r, alpha, s, beta, x, t.start)
equation.part.2.summation <- rep(NA, max.length)
## In the paper, for i=0 we have t^i / i * B(r+s, i).
## the denominator reduces to:
## i * Gamma (r+s) * Gamma(i) / Gamma (r+s+i) :
## Gamma (r+s) * Gamma(i+1) / Gamma(r+s+i) :
## Gamma (r+s) * Gamma(1) / Gamma(r+s) :
## 1
## The 1 represents this reduced piece of the equation.
for (i in 1:max.length) {
ii <- c(1:x[i])
equation.part.2.summation[i] <- B1B2(hardie, r, alpha, s, beta, x[i], t.start[i], t.end[i], 0)
if (x[i] > 0) {
equation.part.2.summation[i] <- equation.part.2.summation[i] +
sum((t.end[i] - t.start[i])^ii/(ii * beta(r + s, ii)) *
B1B2(hardie, r, alpha, s, beta, x[i], t.start[i], t.end[i], ii))
}
}
return(equation.part.0 +
equation.part.1 +
exp(equation.part.2 +
log(B.1 - equation.part.2.summation)))
}
#' Pareto/NBD Conditional Expected Transactions
#'
#' Uses Pareto/NBD model parameters and a customer's past transaction behavior
#' to return the number of transactions they are expected to make in a given
#' time period.
#'
#' E\[X(T.cal, T.cal + T.star) | x, t.x, r, alpha, s, beta\]
#'
#' \code{T.star}, \code{x}, \code{t.x}, and \code{T.cal} may be vectors. The
#' standard rules for vector operations apply - if they are not of the same
#' length, shorter vectors will be recycled (start over at the first element)
#' until they are as long as the longest vector. It is advisable to keep vectors
#' to the same length and to use single values for parameters that are to be the
#' same for all calculations. If one of these parameters has a length greater
#' than one, the output will be a vector of probabilities.
#'
#' @param params Pareto/NBD parameters - a vector with r, alpha, s, and beta, in
#' that order. r and alpha are unobserved parameters for the NBD transaction
#' process. s and beta are unobserved parameters for the Pareto (exponential
#' gamma) dropout process.
#' @param T.star length of time for which we are calculating the expected number
#' of transactions.
#' @param x number of repeat transactions in the calibration period T.cal, or a
#' vector of calibration period frequencies.
#' @param t.x time of most recent repeat transaction, or a vector of recencies.
#' @param T.cal length of calibration period, or a vector of calibration period
#' lengths.
#' @param hardie if TRUE, have \code{\link{pnbd.PAlive}} use \code{\link{h2f1}}
#' instead of \code{\link[hypergeo]{hypergeo}}.
#'
#' @return Number of transactions a customer is expected to make in a time
#' period of length t, conditional on their past behavior. If any of the input
#' parameters has a length greater than 1, this will be a vector of expected
#' number of transactions.
#' @references Fader, Peter S., and Bruce G.S. Hardie. "A Note on Deriving the
#' Pareto/NBD Model and Related Expressions." November. 2005. Web.
#' \url{http://www.brucehardie.com/notes/008/}
#' @seealso \code{\link{pnbd.Expectation}}
#'
#' @examples
#' params <- c(0.55, 10.56, 0.61, 11.64)
#' # Number of transactions a customer is expected to make in 2 time
#' # intervals, given that they made 10 repeat transactions in a time period
#' # of 39 intervals, with the 10th repeat transaction occurring in the 35th
#' # interval.
#' pnbd.ConditionalExpectedTransactions(params,
#' T.star = 2,
#' x = 10,
#' t.x = 35,
#' T.cal = 39,
#' hardie = TRUE)
#'
#' # We can also compare expected transactions across different
#' # calibration period behaviors:
#' pnbd.ConditionalExpectedTransactions(params,
#' T.star = 2,
#' x = 5:20,
#' t.x = 25,
#' T.cal = 39,
#' hardie = TRUE)
pnbd.ConditionalExpectedTransactions <- function(params,
T.star,
x,
t.x,
T.cal,
hardie = TRUE) {
inputs <- try(dc.InputCheck(params = params,
func = "pnbd.ConditionalExpectedTransactions",
printnames = c("r", "alpha", "s", "beta"),
T.star = T.star,
x = x,
t.x = t.x,
T.cal = T.cal))
if('try-error' == class(inputs)) return(str(inputs)$message)
T.star <- inputs$T.star
x <- inputs$x
t.x <- inputs$t.x
T.cal <- inputs$T.cal
r <- params[1]
alpha <- params[2]
s <- params[3]
beta <- params[4]
P1 <- (r + x) * (beta + T.cal)/((alpha + T.cal) * (s - 1))
P2 <- (1 - ((beta + T.cal)/(beta + T.cal + T.star))^(s - 1))
P3 <- pnbd.PAlive(params = params,
x = x,
t.x = t.x,
T.cal = T.cal,
hardie = hardie)
return(P1 * P2 * P3)
}
#' Pareto/NBD Probability Mass Function
#'
#' Probability mass function for the Pareto/NBD.
#'
#' P(X(t)=x | r, alpha, s, beta). Returns the probability that a customer makes
#' x repeat transactions in the time interval (0, t].
#'
#' Parameters \code{t} and \code{x} may be vectors. The standard rules for
#' vector operations apply - if they are not of the same length, the shorter
#' vector will be recycled (start over at the first element) until it is as long
#' as the longest vector. It is advisable to keep vectors to the same length and
#' to use single values for parameters that are to be the same for all
#' calculations. If one of these parameters has a length greater than one, the
#' output will be a vector of probabilities.
#'
#' @param params Pareto/NBD parameters - a vector with r, alpha, s, and beta, in
#' that order. r and alpha are unobserved parameters for the NBD transaction
#' process. s and beta are unobserved parameters for the Pareto (exponential
#' gamma) dropout process.
#' @param t length end of time period for which probability is being computed.
#' May also be a vector.
#' @param x number of repeat transactions by a random customer in the period
#' defined by t. May also be a vector.
#' @param hardie if TRUE, have \code{\link{pnbd.pmf.General}} use
#' \code{\link{h2f1}} instead of \code{\link[hypergeo]{hypergeo}}.
#'
#' @return Probability of X(t)=x conditional on model parameters. If t and/or x
#' has a length greater than one, a vector of probabilities will be returned.
#' @references Fader, Peter S., and Bruce G.S. Hardie. “Deriving an Expression
#' for P (X(t) = x) Under the Pareto/NBD Model.” Sept. 2006. Web.
#' \url{http://www.brucehardie.com/notes/012/}
#'
#' @examples
#' params <- c(0.55, 10.56, 0.61, 11.64)
#' # probability that a customer will make 10 repeat transactions in the
#' # time interval (0,2]
#' pnbd.pmf(params, t=2, x=10, hardie = TRUE)
#' # probability that a customer will make no repeat transactions in the
#' # time interval (0,39]
#' pnbd.pmf(params, t=39, x=0, hardie = TRUE)
#'
#' # Vectors may also be used as arguments:
#' pnbd.pmf(params = params,
#' t = 30,
#' x = 11:20,
#' hardie = TRUE)
pnbd.pmf <- function(params,
t,
x,
hardie = TRUE) {
inputs <- try(dc.InputCheck(params = params,
func = "pnbd.pmf",
printnames = c("r", "alpha", "s", "beta"),
t = t,
x = x))
if('try-error' == class(inputs)) return(str(inputs)$message)
pnbd.pmf.General(params = params,
t.start = 0,
t.end = inputs$t,
x = inputs$x,
hardie = hardie)
}
#' Pareto/NBD Expectation
#'
#' Returns the number of repeat transactions that a randomly chosen customer
#' (for whom we have no prior information) is expected to make in a given time
#' period.
#'
#' E(X(t) | r, alpha, s, beta)
#'
#' @inheritParams pnbd.LL
#' @param t The length of time for which we are calculating the expected number
#' of repeat transactions.
#' @return Number of repeat transactions a customer is expected to make in a
#' time period of length t.
#' @references Fader, Peter S., and Bruce G.S. Hardie. "A Note on Deriving the
#' Pareto/NBD Model and Related Expressions." November. 2005. Web.
#' \url{http://www.brucehardie.com/notes/008/}
#' @seealso [`pnbd.ConditionalExpectedTransactions`]
#' @examples
#' params <- c(0.55, 10.56, 0.61, 11.64)
#'
#' # Number of repeat transactions a customer is expected to make in 2 time intervals.
#' pnbd.Expectation(params = params,
#' t = 2)
#'
#' # We can also compare expected transactions over time:
#' pnbd.Expectation(params = params,
#' t = 1:10)
#' @md
pnbd.Expectation <- function(params, t) {
dc.check.model.params(printnames = c("r", "alpha", "s", "beta"),
params = params,
func = "pnbd.Expectation")
if (any(t < 0) || !is.numeric(t))
stop("t must be numeric and may not contain negative numbers.")
r = params[1]
alpha = params[2]
s = params[3]
beta = params[4]
return((r * beta)/(alpha * (s - 1)) * (1 - (beta/(beta + t))^(s - 1)))
}
#' Pareto/NBD Expected Cumulative Transactions
#'
#' Calculates the expected cumulative total repeat transactions by all customers
#' for the calibration and holdout periods.
#'
#' The function automatically divides the total period up into n.periods.final
#' time intervals. n.periods.final does not have to be in the same unit of time
#' as the T.cal data. For example: - if your T.cal data is in weeks, and you
#' want cumulative transactions per week, n.periods.final would equal T.star. -
#' if your T.cal data is in weeks, and you want cumulative transactions per day,
#' n.periods.final would equal T.star * 7.
#'
#' The holdout period should immediately follow the calibration period. This
#' function assume that all customers' calibration periods end on the same date,
#' rather than starting on the same date (thus customers' birth periods are
#' determined using max(T.cal) - T.cal rather than assuming that it is 0).
#'
#' @inheritParams pnbd.LL
#' @param T.tot End of holdout period. Must be a single value, not a vector.
#' @param n.periods.final Number of time periods in the calibration and holdout
#' periods. See details.
#' @return Vector of expected cumulative total repeat transactions by all
#' customers.
#' @seealso [`pnbd.Expectation`]
#' @examples
#' data(cdnowSummary)
#'
#' cal.cbs <- cdnowSummary$cbs
#' # cal.cbs already has column names required by method
#'
#' params <- c(0.55, 10.56, 0.61, 11.64)
#'
#' # Returns a vector containing cumulative repeat transactions for 546 days.
#' # All parameters are in weeks; the calibration period lasted 39 weeks
#' # and the holdout period another 39.
#' pnbd.ExpectedCumulativeTransactions(params = params,
#' T.cal = cal.cbs[,"T.cal"],
#' T.tot = 78,
#' n.periods.final = 546)
#' @md
pnbd.ExpectedCumulativeTransactions <- function(params,
T.cal,
T.tot,
n.periods.final) {
inputs <- try(dc.InputCheck(params = params,
func = "pnbd.ExpectedCumulativeTransactions",
printnames = c("r", "alpha", "s", "beta"),
T.cal = T.cal,
T.tot = T.tot,
n.periods.final = n.periods.final))
if('try-error' == class(inputs)) return(str(inputs)$message)
stopit <- "must be a single numeric value and may not be negative."
if (length(T.tot) > 1) stop(paste("T.tot", stopit, sep = " "))
if (length(n.periods.final) > 1) stop(paste("n.periods.final", stopit, sep = " "))
## Divide up time into equal intervals
intervals <- seq(T.tot/n.periods.final,
T.tot,
length.out = n.periods.final)
cust.birth.periods <- max(T.cal) - T.cal
expected.transactions <- sapply(intervals,
function(interval) {
if (interval <= min(cust.birth.periods))
return(0)
sum(pnbd.Expectation(params,
interval - cust.birth.periods[cust.birth.periods <= interval]))
})
return(expected.transactions)
}
#' Pareto/NBD Plot Frequency in Calibration Period
#'
#' Plots a histogram and returns a matrix comparing the actual and expected
#' number of customers who made a certain number of repeat transactions in the
#' calibration period, binned according to calibration period frequencies.
#'
#' This function requires a censor number, which cannot be higher than the
#' highest frequency in the calibration period CBS. The output matrix will have
#' (censor + 1) bins, starting at frequencies of 0 transactions and ending at a
#' bin representing calibration period frequencies at or greater than the censor
#' number. The plot may or may not include a bin for zero frequencies, depending
#' on the plotZero parameter.
#'
#' @param params Pareto/NBD parameters - a vector with r, alpha, s, and beta, in
#' that order. r and alpha are unobserved parameters for the NBD transaction
#' process. s and beta are unobserved parameters for the Pareto (exponential
#' gamma) dropout process.
#' @param cal.cbs calibration period CBS (customer by sufficient statistic). It
#' must contain columns for frequency ("x") and total time observed ("T.cal").
#' @param censor integer used to censor the data. See details.
#' @param hardie if TRUE, have \code{\link{pnbd.pmf}} use \code{\link{h2f1}}
#' instead of \code{\link[hypergeo]{hypergeo}}.
#' @param plotZero if FALSE, the histogram will exclude the zero bin.
#' @param xlab descriptive label for the x axis.
#' @param ylab descriptive label for the y axis.
#' @param title title placed on the top-center of the plot.
#'
#' @return Calibration period repeat transaction frequency comparison matrix
#' (actual vs. expected).
#'
#' @examples
#' data(cdnowSummary)
#' cal.cbs <- cdnowSummary$cbs
#' # cal.cbs already has column names required by method
#'
#' # parameters estimated using pnbd.EstimateParameters
#' est.params <- cdnowSummary$est.params
#' # the maximum censor number that can be used
#' max(cal.cbs[,"x"])
#'
#' pnbd.PlotFrequencyInCalibration(params = est.params,
#' cal.cbs = cal.cbs,
#' censor = 7,
#' hardie = TRUE)
pnbd.PlotFrequencyInCalibration <- function(params,
cal.cbs,
censor,
hardie = TRUE,
plotZero = TRUE,
xlab = "Calibration period transactions",
ylab = "Customers",
title = "Frequency of Repeat Transactions") {
dc.check.model.params(printnames = c("r", "alpha", "s", "beta"),
params = params,
func = "pnbd.PlotFrequencyInCalibration")
stopifnot("x" %in% colnames(cal.cbs) | "T.cal" %in% colnames(cal.cbs))
x <- cal.cbs[, "x"]
T.cal <- cal.cbs[, "T.cal"]
if (censor > max(x))
stop("censor too big (> max freq) in PlotFrequencyInCalibration.")
n.x <- rep(0, max(x) + 1)
custs = nrow(cal.cbs)
for (ii in unique(x)) {
n.x[ii + 1] <- sum(ii == x)
}
n.x.censor <- sum(n.x[(censor + 1):length(n.x)])
n.x.actual <- c(n.x[1:censor], n.x.censor)
T.value.counts <- table(T.cal)
T.values <- as.numeric(names(T.value.counts))
n.T.values <- length(T.values)
total.probability <- 0
n.x.expected <- rep(0, length(n.x.actual))
for (ii in 1:(censor)) {
this.x.expected <- 0
for (T.idx in 1:n.T.values) {
T <- T.values[T.idx]
if (T == 0)
next
n.T <- T.value.counts[T.idx]
expected.given.x.and.T <- n.T * pnbd.pmf(params, T, ii - 1, hardie)
this.x.expected <- this.x.expected + expected.given.x.and.T
total.probability <- total.probability + expected.given.x.and.T/custs
}
n.x.expected[ii] <- this.x.expected
}
n.x.expected[censor + 1] <- custs * (1 - total.probability)
col.names <- paste(rep("freq", length(censor + 1)), (0:censor), sep = ".")
col.names[censor + 1] <- paste(col.names[censor + 1], "+", sep = "")
censored.freq.comparison <- rbind(n.x.actual, n.x.expected)
colnames(censored.freq.comparison) <- col.names
cfc.plot <- censored.freq.comparison
if (plotZero == FALSE)
cfc.plot <- cfc.plot[, -1]
n.ticks <- ncol(cfc.plot)
if (plotZero == TRUE) {
x.labels <- 0:(n.ticks - 1)
x.labels[n.ticks] <- paste(n.ticks - 1, "+", sep = "")
} else {
x.labels <- 1:(n.ticks)
x.labels[n.ticks] <- paste(n.ticks, "+", sep = "")
}
ylim <- c(0, ceiling(max(cfc.plot) * 1.1))
barplot(cfc.plot, names.arg = x.labels, beside = TRUE, ylim = ylim, main = title,
xlab = xlab, ylab = ylab, col = 1:2)
legend("topright", legend = c("Actual", "Model"), col = 1:2, lwd = 2)
return(censored.freq.comparison)
}
#' Pareto/NBD Plot Frequency vs. Conditional Expected Frequency
#'
#' Plots the actual and conditional expected number transactions made by
#' customers in the holdout period, binned according to calibration period
#' frequencies. Also returns a matrix with this comparison and the number of
#' customers in each bin.
#'
#' This function requires a censor number, which cannot be higher than the
#' highest frequency in the calibration period CBS. The output matrix will have
#' (censor + 1) bins, starting at frequencies of 0 transactions and ending at a
#' bin representing calibration period frequencies at or greater than the censor
#' number.
#'
#' @param params Pareto/NBD parameters - a vector with r, alpha, s, and beta, in
#' that order. r and alpha are unobserved parameters for the NBD transaction
#' process. s and beta are unobserved parameters for the Pareto (exponential
#' gamma) dropout process.
#' @param T.star length of the holdout period. It must be a scalar for this
#' plot's purposes: you have one holdout period of a given length.
#' @param cal.cbs calibration period CBS (customer by sufficient statistic). It
#' must contain columns for frequency ("x"), recency ("t.x"), and total time
#' observed ("T.cal"). Note that recency must be the time between the start of
#' the calibration period and the customer's last transaction, not the time
#' between the customer's last transaction and the end of the calibration
#' period.
#' @param x.star vector of transactions made by each customer in the holdout
#' period.
#' @param censor integer used to censor the data. See details.
#' @param hardie if TRUE, have
#' \code{\link{pnbd.ConditionalExpectedTransactions}} use \code{\link{h2f1}}
#' instead of \code{\link[hypergeo]{hypergeo}}.
#' @param xlab descriptive label for the x axis.
#' @param ylab descriptive label for the y axis.
#' @param xticklab vector containing a label for each tick mark on the x axis.
#' @param title title placed on the top-center of the plot.
#'
#' @return Holdout period transaction frequency comparison matrix (actual vs. expected).
#'
#' @examples
#' data(cdnowSummary)
#'
#' cal.cbs <- cdnowSummary$cbs
#' # cal.cbs already has column names required by method
#'
#' # number of transactions by each customer in the 39 weeks
#' # following the calibration period
#' x.star <- cal.cbs[,"x.star"]
#'
#' # parameters estimated using pnbd.EstimateParameters
#' est.params <- cdnowSummary$est.params
#' # the maximum censor number that can be used
#' max(cal.cbs[,"x"])
#'
#' # plot conditional expected holdout period frequencies,
#' # binned according to calibration period frequencies
#' pnbd.PlotFreqVsConditionalExpectedFrequency(params = est.params,
#' T.star = 39,
#' cal.cbs = cal.cbs,
#' x.star = x.star,
#' censor = 7,
#' hardie = TRUE)
pnbd.PlotFreqVsConditionalExpectedFrequency <- function(params,
T.star,
cal.cbs,
x.star,
censor,
hardie = TRUE,
xlab = "Calibration period transactions",
ylab = "Holdout period transactions",
xticklab = NULL,
title = "Conditional Expectation") {
dc.check.model.params(printnames = c("r", "alpha", "s", "beta"),
params = params,
func = "pnbd.PlotFreqVsConditionalExpectedFrequency")
# Check that you have the right columns.
# They should be 'x', 't.x', 'T.cal' and optionally 'custs'
# in this order. They stand for, respectively
# -- x: frequency
# -- t.x: recency
# -- T.cal: observed calendar time
# -- custs: number of customers with this (x, t.x, T.cal) combo
foo <- colnames(cal.cbs)
stopifnot(foo[1] == 'x' &
foo[2] == 't.x' &
foo[3] == 'T.cal')
x <- cal.cbs[,'x']
t.x <- cal.cbs[,'t.x']
T.cal <- cal.cbs[,'T.cal']
if (censor > max(x))
stop("censor too big (> max freq) in PlotFreqVsConditionalExpectedFrequency.")
if (length(T.star) > 1 || T.star < 0 || !is.numeric(T.star))
stop("T.star must be a positive scalar.")
if (any(x.star < 0) || !is.numeric(x.star))
stop("x.star must be numeric and may not contain negative numbers.")
n.bins <- censor + 1
bin.size <- rep(0, n.bins)
# First, find the right sequence of
# transaction counts. It may have gaps,
# for which n.this.bin = 0, which is
# kinda hard to divide by. So:
for (cc in 0:censor) {
if (cc < censor) {
this.bin <- which(x == cc)
} else {
this.bin <- which(x >= cc)
}
n.this.bin <- length(this.bin)
bin.size[cc + 1] <- n.this.bin
}
# Now you got the right list of net bins:
# those with at least 1 customer in them.
bin.size <- bin.size[bin.size > 0]
n.bins <- length(bin.size)
names(bin.size) <- names(table(x))[1:n.bins]
xvals <- as.integer(names(bin.size))
transaction.actual <- rep(0, n.bins)
transaction.expected <- rep(0, n.bins)
for(cc in 1:n.bins) {
n.this.bin <- bin.size[cc]
if (cc < n.bins) {
this.bin <- which(x == xvals[cc])
} else {
this.bin <- which(x >= xvals[cc])
}
transaction.actual[cc] <- sum(x.star[this.bin])/n.this.bin
transaction.expected[cc] <- sum(pnbd.ConditionalExpectedTransactions(params,
T.star,
x[this.bin],
t.x[this.bin],
T.cal[this.bin],
hardie))/n.this.bin
}
col.names <- paste("freq", names(bin.size), sep = ".")
col.names[n.bins] <- paste(col.names[n.bins], '+', sep = '')
comparison <- rbind(transaction.actual, transaction.expected, bin.size)
colnames(comparison) <- col.names
x.labels <- sapply(colnames(comparison), function(x) gsub('freq.', '', x))
actual <- comparison[1, ]
expected <- comparison[2, ]
ylim <- c(0, ceiling(max(c(actual, expected)) * 1.1))
plot(actual, type = "l", xaxt = "n",
col = 1, ylim = ylim, xlab = xlab, ylab = ylab,
main = title)
lines(expected, lty = 2, col = 2)
axis(1, at = 1:ncol(comparison), labels = x.labels)
legend("topleft", legend = c("Actual", "Model"), col = 1:2, lty = 1:2, lwd = 1)
return(comparison)
}
#' Pareto/NBD Plot Actual vs. Conditional Expected Frequency by Recency
#'
#' Plots the actual and conditional expected number of transactions made by
#' customers in the holdout period, binned according to calibration period
#' recencies. Also returns a matrix with this comparison and the number of
#' customers in each bin.
#'
#' This function does bin customers exactly according to recency; it bins
#' customers according to integer units of the time period of cal.cbs.
#' Therefore, if you are using weeks in your data, customers will be binned as
#' follows: customers with recencies between the start of the calibration period
#' (inclusive) and the end of week one (exclusive); customers with recencies
#' between the end of week one (inclusive) and the end of week two (exlusive);
#' etc.
#'
#' The matrix and plot will contain the actual number of transactions made by
#' each bin in the holdout period, as well as the expected number of
#' transactions made by that bin in the holdout period, conditional on that
#' bin's behavior during the calibration period.
#'
#' @param params Pareto/NBD parameters - a vector with r, alpha, s, and beta, in
#' that order. r and alpha are unobserved parameters for the NBD transaction
#' process. s and beta are unobserved parameters for the Pareto (exponential
#' gamma) dropout process.
#' @param cal.cbs calibration period CBS (customer by sufficient statistic). It
#' must contain columns for frequency ("x"), recency ("t.x"), and total time
#' observed ("T.cal"). Note that recency must be the time between the start of
#' the calibration period and the customer's last transaction, not the time
#' between the customer's last transaction and the end of the calibration
#' period.
#' @param T.star length of the holdout period. It must be a scalar for this
#' plot's purposes: you have one holdout period of a given length.
#' @param x.star vector of transactions made by each customer in the holdout
#' period.
#' @param hardie if TRUE, have
#' \code{\link{pnbd.ConditionalExpectedTransactions}} use \code{\link{h2f1}}
#' instead of \code{\link[hypergeo]{hypergeo}}.
#' @param xlab descriptive label for the x axis.
#' @param ylab descriptive label for the y axis.
#' @param xticklab vector containing a label for each tick mark on the x axis.
#' @param title title placed on the top-center of the plot.
#'
#' @return Matrix comparing actual and conditional expected transactions in the holdout period.
#'
#' @examples
#' data(cdnowSummary)
#'
#' cal.cbs <- cdnowSummary$cbs
#' # cal.cbs already has column names required by method
#'
#' # number of transactions by each customer in the 39 weeks following
#' # the calibration period
#' x.star <- cal.cbs[,"x.star"]
#'
#' # parameters estimated using pnbd.EstimateParameters
#' est.params <- cdnowSummary$est.params
#'
#' # plot conditional expected holdout period transactions, binned according to
#' # calibration period recencies
#' pnbd.PlotRecVsConditionalExpectedFrequency(params = est.params,
#' cal.cbs = cal.cbs,
#' T.star = 39,
#' x.star = x.star,
#' hardie = TRUE)
pnbd.PlotRecVsConditionalExpectedFrequency <- function(params,
cal.cbs,
T.star,
x.star,
hardie = TRUE,
xlab = "Calibration period recency",
ylab = "Holdout period transactions",
xticklab = NULL,
title = "Actual vs. Conditional Expected Transactions by Recency") {
# No use for inputs, other than as error check.
inputs <- try(dc.InputCheck(params = params,
func = "pnbd.PlotRecVsConditionalExpectedFrequency",
printnames = c("r", "alpha", "s", "beta"),
T.star = T.star,
x.star = x.star))
if('try-error' == class(inputs)) return(str(inputs)$message)
if (length(T.star) > 1)
stop("T.star must be a scalar.")
# Check that you have the right columns.
# They should be 'x', 't.x', 'T.cal' and optionally 'custs'
# in this order. They stand for, respectively
# -- x: frequency
# -- t.x: recency
# -- T.cal: observed calendar time
# -- custs: number of customers with this (x, t.x, T.cal) combo
foo <- colnames(cal.cbs)
stopifnot(foo[1] == 'x' &
foo[2] == 't.x' &
foo[3] == 'T.cal')
x <- cal.cbs[,'x']
t.x <- cal.cbs[,'t.x']
T.cal <- cal.cbs[,'T.cal']
t.values <- sort(unique(t.x))
n.recs <- length(t.values)
transaction.actual <- rep(0, n.recs)
transaction.expected <- rep(0, n.recs)
rec.size <- rep(0, n.recs)
for (tt in 1:n.recs) {
this.t.x <- t.values[tt]
this.rec <- which(t.x == this.t.x)
n.this.rec <- length(this.rec)
rec.size[tt] <- n.this.rec
transaction.actual[tt] <- sum(x.star[this.rec])/n.this.rec
transaction.expected[tt] <- sum(pnbd.ConditionalExpectedTransactions(params,
T.star,
x[this.rec],
t.x[this.rec],
T.cal[this.rec],
hardie))/n.this.rec
}
comparison <- rbind(transaction.actual, transaction.expected, rec.size)
colnames(comparison) <- round(t.values, 3)
bins <- seq(1, ceiling(max(t.x)))
n.bins <- length(bins)
actual <- rep(0, n.bins)
expected <- rep(0, n.bins)
bin.size <- rep(0, n.bins)
x.labels <- NULL
if (is.null(xticklab) == FALSE) {
x.labels <- xticklab
} else {
x.labels <- 1:(n.bins)
}
point.labels <- rep("", n.bins)
point.y.val <- rep(0, n.bins)
for (ii in 1:n.bins) {
if (ii < n.bins) {
this.bin <- which(as.numeric(colnames(comparison)) >= (ii - 1) &
as.numeric(colnames(comparison)) < ii)
} else if (ii == n.bins) {
this.bin <- which(as.numeric(colnames(comparison)) >= ii - 1)
}
actual[ii] <- sum(comparison[1, this.bin])/length(comparison[1, this.bin])
expected[ii] <- sum(comparison[2, this.bin])/length(comparison[2, this.bin])
bin.size[ii] <- sum(comparison[3, this.bin])
}
ylim <- c(0, ceiling(max(c(actual, expected)) * 1.1))
plot(actual,
type = "l",
xaxt = "n",
col = 1,
ylim = ylim,
xlab = xlab,
ylab = ylab,
main = title)
lines(expected,
lty = 2,
col = 2)
axis(1, at = 1:n.bins, labels = x.labels)
legend("topleft",
legend = c("Actual", "Model"),
col = 1:2,
lty = 1:2,
lwd = 1)
return(rbind(actual, expected, bin.size))
}
#' Pareto/NBD Plot Discounted Expected Residual Transactions
#'
#' Plots discounted expected residual transactions for different combinations of
#' calibration period frequency and recency.
#'
#' The length of the calibration period \code{T.cal} must be a single value, not
#' a vector.
#'
#' @inheritParams pnbd.DERT
#' @param type must be either "persp" (perspective - 3 dimensional) or
#' "contour". Determines the type of plot produced by this function.
#'
#' @return A matrix with discounted expected residual transaction values for
#' every combination of calibration period frequency \code{x} and calibration
#' period recency \code{t.x}.
#'
#' @references Fader, Peter S., Bruce G.S. Hardie, and Ka L. Lee. "RFM and CLV:
#' Using Iso-Value Curves for Customer Base Analysis." Journal of Marketing
#' Research Vol.42, pp.415-430. November. 2005.
#' \url{http://www.brucehardie.com/papers.html}
#' @references Note that this paper refers to what this package is calling
#' discounted expected residual transactions (DERT) simply as discounted
#' expected transactions (DET).
#'
#' @examples
#' # The RFM and CLV paper uses all 78 weeks of the cdnow data to
#' # estimate parameters. These parameters can be estimated as follows:
#' # elog <- dc.ReadLines(system.file("data/cdnowElog.csv", package="BTYD2"),2,3)
#' # elog[, 'date'] <- as.Date(elog[, 'date'], format = '%Y%m%d')
#' # cal.cbs <- dc.ElogToCbsCbt(elog)$cal$cbs
#' # pnbd.EstimateParameters(cal.cbs, hardie = TRUE)
#'
#' # (The final function was run several times with its own output as
#' # input for starting parameters, to ensure that the result converged).
#'
#' params <- c(0.5629966, 12.5590370, 0.4081095, 10.5148048)
#'
#' # 15% compounded annually has been converted to 0.0027 compounded continously,
#' # as we are dealing with weekly data and not annual data.
#' d <- 0.0027
#'
#' pnbd.Plot.DERT(params = params,
#' x = 0:14,
#' t.x = 0:77,
#' T.cal = 77.86,
#' d = d,
#' hardie = TRUE,
#' type = "persp")
#' pnbd.Plot.DERT(params = params,
#' x = 0:14,
#' t.x = 0:77,
#' T.cal = 77.86,
#' d = d,
#' hardie = TRUE,
#' type="contour")
pnbd.Plot.DERT <- function(params,
x,
t.x,
T.cal,
d,
hardie = TRUE,
type = "persp") {
# No use for inputs, other than as error check.
inputs <- try(dc.InputCheck(params = params,
func = "pnbd.Plot.DERT",
printnames = c("r", "alpha", "s", "beta"),
x = x,
t.x = t.x,
T.cal = T.cal))
if('try-error' == class(inputs)) return(str(inputs)$message)
if (length(T.cal) > 1) stop("T.cal must be a single numeric value and may not be negative.")
if (!(type %in% c("persp", "contour"))) {
stop("The plot type in pnbd.Plot.DERT must be either 'persp' or 'contour'.")
}
DERT <- matrix(NA, length(t.x), length(x))
rownames(DERT) <- t.x
colnames(DERT) <- x
for (i in 1:length(t.x)) {
for (j in 1:length(x)) {
DERT[i, j] <- pnbd.DERT(params,
x[j],
t.x[i],
T.cal,
d,
hardie)
}
}
if (type == "contour") {
if (max(DERT, na.rm = TRUE) <= 10) {
levels <- 1:max(DERT, na.rm = TRUE)
} else if (max(DERT, na.rm = TRUE) <= 20) {
levels <- c(1, seq(2, max(DERT, na.rm = TRUE), 2))
} else {
levels <- c(1, 2, seq(5, max(DERT, na.rm = TRUE), 5))
}
contour(x = t.x,
y = x,
z = DERT,
levels = levels,
xlab = "Recency",
ylab = "Frequency",
main = "Iso-Value Representation of DERT")
}
if (type == "persp") {
persp(x = t.x,
y = x,
z = DERT,
theta = -30,
phi = 20,
axes = TRUE,
ticktype = "detailed",
nticks = 5,
main = "DERT as a Function of Frequency and Recency",
shade = 0.5,
xlab = "Recency",
ylab = "Frequency",
zlab = "Discounted expected residual transactions")
}
return(DERT)
}
#' Pareto/NBD Tracking Cumulative Transactions Plot
#'
#' Plots the actual and expected cumulative total repeat transactions by all
#' customers for the calibration and holdout periods, and returns this
#' comparison in a matrix.
#'
#' actual.cu.tracking.data does not have to be in the same unit of time as the
#' T.cal data. T.tot will automatically be divided into periods to match the
#' length of actual.cu.tracking.data. See
#' [`pnbd.ExpectedCumulativeTransactions`].
#'
#' The holdout period should immediately follow the calibration period. This
#' function assume that all customers' calibration periods end on the same date,
#' rather than starting on the same date (thus customers' birth periods are
#' determined using max(T.cal) - T.cal rather than assuming that it is 0).
#'
#' @inheritParams pnbd.ExpectedCumulativeTransactions
#' @param actual.cu.tracking.data A vector containing the cumulative number of
#' repeat transactions made by customers for each period in the total time
#' period (both calibration and holdout periods). See details.
#' @param xlab Descriptive label for the x axis.
#' @param ylab Descriptive label for the y axis.
#' @param xticklab Vector containing a label for each tick mark on the x axis.
#' @param title Title placed on the top-center of the plot.
#' @return Matrix containing actual and expected cumulative repeat transactions.
#'
#' @examples
#' data(cdnowSummary)
#'
#' cal.cbs <- cdnowSummary$cbs
#' # cal.cbs already has column names required by method
#'
#' # Cumulative repeat transactions made by all customers across calibration
#' # and holdout periods
#' cu.tracking <- cdnowSummary$cu.tracking
#'
#' # parameters estimated using pnbd.EstimateParameters
#' est.params <- cdnowSummary$est.params
#'
#' # All parameters are in weeks; the calibration period lasted 39
#' # weeks and the holdout period another 39.
#' pnbd.PlotTrackingCum(params = est.params,
#' T.cal = cal.cbs[,"T.cal"],
#' T.tot = 78,
#' actual.cu.tracking.data = cu.tracking)
#' @md
pnbd.PlotTrackingCum <- function(params,
T.cal,
T.tot,
actual.cu.tracking.data,
n.periods.final = NA,
xlab = "Week",
ylab = "Cumulative Transactions",
xticklab = NULL,
title = "Tracking Cumulative Transactions") {
# No use for inputs, other than as error check, so suppress
# any warnings about incompatible vector lengths here:
inputs <- suppressWarnings(try(dc.InputCheck(params = params,
func = "pnbd.PlotTrackingCum",
printnames = c("r", "alpha", "s", "beta"),
T.cal = T.cal,
T.tot = T.tot,
actual.cu.tracking.data = actual.cu.tracking.data,
n.periods.final = n.periods.final)))
if('try-error' == class(inputs)) return(str(inputs)$message)
inputs <- NULL
if (length(T.tot) > 1) stop("T.tot must be a single numeric value and may not be negative.")
actual <- actual.cu.tracking.data
if(is.na(n.periods.final)) n.periods.final <- length(actual)
expected <- pnbd.ExpectedCumulativeTransactions(params = params,
T.cal = T.cal,
T.tot = T.tot,
n.periods.final = n.periods.final)
cu.tracking.comparison <- rbind(actual, expected)
ylim <- c(0, max(c(actual, expected)) * 1.05)
plot(actual,
type = "l",
xaxt = "n",
xlab = xlab,
ylab = ylab,
col = 1,
ylim = ylim,
main = title)
lines(expected, lty = 2, col = 2)
if (is.null(xticklab) == FALSE) {
if (ncol(cu.tracking.comparison) != length(xticklab)) {
stop("Plot error, xticklab does not have the correct size")
}
axis(1, at = 1:ncol(cu.tracking.comparison), labels = xticklab)
} else {
axis(1, at = 1:length(actual), labels = 1:length(actual))
}
abline(v = max(T.cal),
lty = 2)
legend("bottomright",
legend = c("Actual", "Model"),
col = 1:2,
lty = 1:2,
lwd = 1)
return(cu.tracking.comparison)
}
#' Pareto/NBD Tracking Incremental Transactions Comparison
#'
#' Plots the actual and expected incremental total repeat transactions by all
#' customers for the calibration and holdout periods, and returns this
#' comparison in a matrix.
#'
#' actual.inc.tracking.data does not have to be in the same unit of time as the
#' T.cal data. T.tot will automatically be divided into periods to match the
#' length of actual.inc.tracking.data. See
#' [`pnbd.ExpectedCumulativeTransactions`].
#'
#' The holdout period should immediately follow the calibration period. This
#' function assume that all customers' calibration periods end on the same date,
#' rather than starting on the same date (thus customers' birth periods are
#' determined using max(T.cal) - T.cal rather than assuming that it is 0).
#'
#' @inheritParams pnbd.PlotTrackingCum
#' @param actual.inc.tracking.data A vector containing the incremental number of
#' repeat transactions made by customers for each period in the total time
#' period (both calibration and holdout periods). See details.
#' @return Matrix containing actual and expected incremental repeat transactions.
#' @examples
#' data(cdnowSummary)
#' cal.cbs <- cdnowSummary$cbs
#' # cal.cbs already has column names required by method
#'
#' # Cumulative repeat transactions made by all customers across calibration
#' # and holdout periods
#' cu.tracking <- cdnowSummary$cu.tracking
#' # make the tracking data incremental
#' inc.tracking <- dc.CumulativeToIncremental(cu.tracking)
#'
#' # parameters estimated using pnbd.EstimateParameters
#' est.params <- cdnowSummary$est.params
#'
#' # All parameters are in weeks; the calibration period lasted 39
#' # weeks and the holdout period another 39.
#' pnbd.PlotTrackingInc(params = est.params,
#' T.cal = cal.cbs[,"T.cal"],
#' T.tot = 78,
#' actual.inc.tracking.data = inc.tracking)
#' @md
pnbd.PlotTrackingInc <- function(params,
T.cal,
T.tot,
actual.inc.tracking.data,
n.periods.final = NA,
xlab = "Week",
ylab = "Transactions",
xticklab = NULL,
title = "Tracking Weekly Transactions") {
# No use for inputs, other than as error check, so suppress
# any warnings about incompatible vector lengths here:
inputs <- suppressWarnings(try(dc.InputCheck(params = params,
func = "pnbd.PlotTrackingInc",
printnames = c("r", "alpha", "s", "beta"),
T.cal = T.cal,
T.tot = T.tot,
actual.inc.tracking.data = actual.inc.tracking.data,
n.periods.final = n.periods.final)))
if('try-error' == class(inputs)) return(str(inputs)$message)
inputs <- NULL
if (length(T.tot) > 1) stop("T.tot must be a single numeric value and may not be negative.")
actual <- actual.inc.tracking.data
if(is.na(n.periods.final)) n.periods.final <- length(actual)
expected <- pnbd.ExpectedCumulativeTransactions(params = params,
T.cal = T.cal,
T.tot = T.tot,
n.periods.final = n.periods.final)
expected <- dc.CumulativeToIncremental(expected)
ylim <- c(0, max(c(actual, expected)) * 1.05)
plot(actual,
type = "l",
xaxt = "n",
xlab = xlab,
ylab = ylab,
col = 1,
ylim = ylim,
main = title)
lines(expected, lty = 2, col = 2)
if (is.null(xticklab) == FALSE) {
if (length(actual) != length(xticklab)) {
stop("Plot error, xticklab does not have the correct size")
}
axis(1, at = 1:length(actual), labels = xticklab)
} else {
axis(1, at = 1:length(actual), labels = 1:length(actual))
}
abline(v = max(T.cal),
lty = 2)
legend("topright",
legend = c("Actual", "Model"),
col = 1:2,
lty = 1:2,
lwd = 1)
return(rbind(actual, expected))
}
#' Plot Pareto/NBD Rate Heterogeneity
#'
#' A helper for plotting either the estimated gamma distribution of mu
#' (customers' propensities to drop out), or the estimated gamma distribution of
#' lambda (customers' propensities to purchase).
#'
#' @inheritParams pnbd.LL
#' @param func A string that is either "pnbd.PlotDropoutRateHeterogeneity" or
#' "pnbd.PlotTransactionRateHeterogeneity".
#' @param lim The upper-bound of the x-axis. A number is chosen by the function
#' if none is provided.
#' @return Depending on the value of `func`, either the distribution of
#' customers' propensities to purchase or the distribution of customers'
#' propensities to drop out.
#' @seealso [`pnbd.PlotDropoutRateHeterogeneity`]
#' @seealso [`pnbd.PlotTransactionRateHeterogeneity`]
#' @md
pnbd.PlotRateHeterogeneity <- function(params,
func,
lim = NULL) {
stopifnot(func %in% c("pnbd.PlotDropoutRateHeterogeneity",
"pnbd.PlotTransactionRateHeterogeneity"))
dc.check.model.params(printnames = c("r", "alpha", "s", "beta"),
params = params,
func = func)
shape_rate <- list(pnbd.PlotTransactionRateHeterogeneity = c(shape = params[1],
rate = params[2]),
pnbd.PlotDropoutRateHeterogeneity = c(shape = params[3],
rate = params[4]))
xlab_main <- list(pnbd.PlotTransactionRateHeterogeneity = c(xlab = "Transaction Rate",
main = "Heterogeneity in Transaction Rate"),
pnbd.PlotDropoutRateHeterogeneity = c(xlab = "Dropout Rate",
main = "Heterogeneity in Dropout Rate"))
shape <- shape_rate[[func]]['shape']
rate <- shape_rate[[func]]['rate']
rate.mean <- round(shape/rate, 4)
rate.var <- round(shape/rate^2, 4)
if (is.null(lim)) {
lim = qgamma(0.99, shape = shape, rate = rate)
}
x.axis.ticks <- seq(0, lim, length.out = 100)
heterogeneity <- dgamma(x.axis.ticks,
shape = shape,
rate = rate)
plot(x.axis.ticks,
heterogeneity,
type = "l",
xlab = xlab_main[[func]]['xlab'],
ylab = "Density",
main = xlab_main[[func]]['main'])
mean.var.label <- paste("Mean:", rate.mean, " Var:", rate.var)
mtext(mean.var.label, side = 3)
return(rbind(x.axis.ticks, heterogeneity))
}
#' Pareto/NBD Plot Transaction Rate Heterogeneity
#'
#' Plots and returns the estimated gamma distribution of lambda (customers'
#' propensities to purchase).
#'
#' This returns the distribution of each customer's Poisson parameter, which
#' determines the level of their purchasing (using the Pareto/NBD assumption
#' that purchasing on the individual level can be modeled with a Poisson
#' distribution).
#'
#' @inheritParams pnbd.PlotRateHeterogeneity
#' @return Distribution of customers' propensities to purchase.
#' @examples
#' params <- c(0.55, 10.56, 0.61, 11.64)
#' pnbd.PlotTransactionRateHeterogeneity(params)
#' params <- c(3, 10.56, 0.61, 11.64)
#' pnbd.PlotTransactionRateHeterogeneity(params)
pnbd.PlotTransactionRateHeterogeneity <- function(params,
lim = NULL) {
pnbd.PlotRateHeterogeneity(params = params,
func = "pnbd.PlotTransactionRateHeterogeneity",
lim = lim)
}
#' Pareto/NBD Plot Dropout Rate Heterogeneity
#'
#' Plots and returns the estimated gamma distribution of mu (customers'
#' propensities to drop out).
#'
#' This returns the distribution of each customer's exponential parameter that
#' determines their lifetime (using the Pareto/NBD assumption that a customer's
#' lifetime can be modeled with an exponential distribution).
#'
#' @inheritParams pnbd.PlotRateHeterogeneity
#' @return Distribution of customers' propensities to drop out.
#' @examples
#' params <- c(0.55, 10.56, 0.61, 11.64)
#' pnbd.PlotDropoutRateHeterogeneity(params)
#' params <- c(0.55, 10.56, 3, 11.64)
#' pnbd.PlotDropoutRateHeterogeneity(params)
pnbd.PlotDropoutRateHeterogeneity <- function(params,
lim = NULL) {
pnbd.PlotRateHeterogeneity(params = params,
func = "pnbd.PlotDropoutRateHeterogeneity",
lim = lim)
}
Any scripts or data that you put into this service are public.
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.