Nothing
BinUplift <- function(data, treat, outcome, x, n.split = 10, alpha = 0.05, n.min = 30){
# Univariate quantization for uplift.
#
# Args:
# data: a data frame containing the treatment, the outcome and the predictors.
# treat: name of a binary (numeric) vector representing the treatment
# assignment (coded as 0/1).
# outcome: name of a binary response (numeric) vector (coded as 0/1).
# x: name of the explanatory variable to categorize.
# ... and default parameters.
#
# Returns:
# The cut-offs for x.
# Error handling
# First, we need to check that the parameters are well filled and we create
if (n.split <= 0 ) {
stop("Number of splits must be positive")
}
if (alpha < 0 | alpha > 1 ) {
stop("alpha must be between 0 and 1")
}
if (sum(is.na(data[[outcome]])) > 0 ) {
stop("Dependent variable contains missing values: remove the observations and proceed")
}
if (sum(is.na(data[[treat]])) > 0 ) {
stop("Treatment variable contains missing values: remove the observations and proceed")
}
if (length(unique(data[[x]])) < 3) {
stop("Independent variable must contain at least 3 different unique values")
}
if (sum(is.na(data[[x]])) > 0 ) {
warning("Independent variable contains missing values: remove the observations and proceed")
}
# Keep complete cases only for the variable to categorize
data <- data[is.na(data[[x]])==FALSE,]
# Second, we need to define the subfunctions that create the splits and
# choose the best split based on z-test
BinUpliftStump <- function(data, outcome, treat, x, n.split){
x.cut <- unique(quantile(data[[x]], seq(0, 1, 1/n.split)))
splits <- matrix(data = NA, nrow = length(x.cut), ncol = 16)
colnames(splits) <- c("x.cut", "n.lt", "n.lc", "p.lt", "p.lc", "u.l",
"n.rt", "n.rc", "p.rt", "p.rc", "u.r", "diff", "p.t", "p.c",
"p.t.n.t", "p.c.n.c")
for(i in 1:length(x.cut)){
index.l <- data[[x]] < x.cut[i]
index.r <- data[[x]] >= x.cut[i]
splits[i,1] <- x.cut[i]
left <- data[index.l,]
right <- data[index.r,]
# uplift in left node
splits[i, 2] <- sum(left[[treat]]==1) # n.lt
splits[i, 3] <- sum(left[[treat]]==0) # n.lc
splits[i, 4] <- sum(left[[treat]]==1 & left[[outcome]]==1)/splits[i, 2] # p.lt
splits[i, 5] <- sum(left[[treat]]==0 & left[[outcome]]==1)/splits[i, 3] # p.lc
splits[i, 6] <- splits[i, 4] - splits[i, 5] # u.l
# uplift in right node
splits[i, 7] <- sum(right[[treat]]==1) # n.rt
splits[i, 8] <- sum(right[[treat]]==0) # n.rc
splits[i, 9] <- sum(right[[treat]]==1 & right[[outcome]]==1)/splits[i, 7] # p.rt
splits[i, 10] <- sum(right[[treat]]==0 & right[[outcome]]==1)/splits[i, 8] # p.rc
splits[i, 11] <- splits[i, 9] - splits[i, 10] # u.r
splits[i, 12] <- abs(splits[i, 6] - splits[i, 11]) # diff
#probabilities of success in treatment and control groups
splits[i, 13] <- sum(data[[treat]]==1 & data[[outcome]]==1)/sum(data[[treat]]==1) # p.t
splits[i, 14] <- sum(data[[treat]]==0 & data[[outcome]]==1)/sum(data[[treat]]==0) # p.c
#marginal counts needed for odds ratio
splits[i, 15] <- sum(data[[treat]]==1 & data[[outcome]]==1) #p.t.n.t
splits[i, 16] <- sum(data[[treat]]==0 & data[[outcome]]==1) #p.c.n.c
}
return(splits)
}
BinUpliftTest <- function(splits, alpha, n.min){
z.a <- qnorm(0.5*alpha, mean = 0, sd = 1, lower.tail = FALSE, log.p = FALSE)
test.splits <- data.frame(splits)
test.splits$sign <- 0
test.splits <- test.splits[complete.cases(test.splits),]
# We need the number of observations per group for the variance
test.splits$n.t <- test.splits$n.lt + test.splits$n.rt
test.splits$n.c <- test.splits$n.lc + test.splits$n.rc
# We need the number of responses in the left node per group for the odds ratio
test.splits$z_t <- test.splits$n.lt * test.splits$p.lt
test.splits$z_c <- test.splits$n.lc * test.splits$p.lc
test.splits$odds_ratio_t <- (test.splits$z_t / (test.splits$n.lt - test.splits$z_t)) /
((test.splits$p.t * test.splits$n.t - test.splits$z_t)/(test.splits$n.rt - (test.splits$p.t * test.splits$n.t - test.splits$z_t)))
test.splits$odds_ratio_c <- (test.splits$z_c / (test.splits$n.lc - test.splits$z_c)) /
((test.splits$p.c * test.splits$n.c - test.splits$z_c)/(test.splits$n.rc - (test.splits$p.c * test.splits$n.c - test.splits$z_c)))
# Extra checks in case odds_ratio is Inf of <= 0
test.splits <- test.splits[is.finite(test.splits$odds_ratio_t) == TRUE,]
test.splits <- test.splits[is.finite(test.splits$odds_ratio_c) == TRUE,]
test.splits <- test.splits[test.splits$odds_ratio_t > 0,]
test.splits <- test.splits[test.splits$odds_ratio_c > 0,]
#Error handling in case there are no possible splits
if (nrow(test.splits) == 0) {
return(test.splits)
}
# We can now compute the numerator and denominator of the test statistic
test.splits$esp_t <- 0
test.splits$esp_c <- 0
test.splits$var_t <- 0
test.splits$var_c <- 0
for (i in 1:nrow(test.splits)){
test.splits[i,]$esp_t <- BiasedUrn::meanFNCHypergeo(m1=test.splits[i,]$n.lt,
m2=test.splits[i,]$n.rt,
n=test.splits[i,]$p.t.n.t,
odds=test.splits$odds_ratio_t[i])
test.splits[i,]$esp_c <- BiasedUrn::meanFNCHypergeo(m1=test.splits[i,]$n.lc,
m2=test.splits[i,]$n.rc,
n=test.splits[i,]$p.c.n.c,
odds=test.splits$odds_ratio_c[i])
test.splits[i,]$var_t <- BiasedUrn::varFNCHypergeo(m1=test.splits[i,]$n.lt,
m2=test.splits[i,]$n.rt,
n=test.splits[i,]$p.t.n.t,
odds=test.splits$odds_ratio_t[i])
test.splits[i,]$var_c <- BiasedUrn::varFNCHypergeo(m1=test.splits[i,]$n.lc,
m2=test.splits[i,]$n.rc,
n=test.splits[i,]$p.c.n.c,
odds=test.splits$odds_ratio_c[i])
}
test.splits$z.num <- (test.splits$p.lt - test.splits$p.lc - test.splits$p.rt + test.splits$p.rc)
test.splits$z.den <- sqrt( (test.splits$n.t^2)*test.splits$var_t / ((test.splits$n.lt^2)*(test.splits$n.rt^2)) +
(test.splits$n.c^2)*test.splits$var_c / ((test.splits$n.lc^2)*(test.splits$n.rc^2)) )
test.splits$z.obs <- abs(test.splits$z.num / test.splits$z.den)
test.splits$sign <- 1*(test.splits$z.obs > z.a)
argmax.size <- test.splits[test.splits$n.lt > n.min & test.splits$n.lc > n.min & test.splits$n.rt > n.min & test.splits$n.rc > n.min,]
argmax.cut <- argmax.size[which.max(argmax.size$z.obs),]
argmax.cut$x.pos <- which(test.splits$x.cut == argmax.cut$x.cut)
return(argmax.cut)
}
# Now we can call the recursive partitionning subfunction that performs the
# splits based on the two (2) previous subfunctions
BinUpliftTree <- function(data, outcome, treat, x, n.split, alpha, n.min){
stump <- BinUpliftStump(data, outcome, treat, x, n.split)
best <- BinUpliftTest(stump, alpha, n.min)
if (best$sign == 0 || nrow(best) == 0) {
return("oups..no significant split")
} else if (best$sign == 1) {
# printing the cuts in order
print(paste("The variable", x, "has been cut at:"))
print(best$x.cut)
# the x.cuts at the left and at the right of the x.cut optimal
l.cuts <- best$x.pos - 1
r.cuts <- n.split-best$x.pos + 1
l.index <- data[[x]] < best$x.cut
r.index <- data[[x]] >= best$x.cut
l.create <- data[l.index,]
r.create <- data[r.index,]
l.stump <- BinUpliftStump(l.create, outcome, treat, x, l.cuts)
r.stump <- BinUpliftStump(r.create, outcome, treat, x, r.cuts)
l.best <- BinUpliftTest(l.stump, alpha, n.min)
r.best <- BinUpliftTest(r.stump, alpha, n.min)
# possible paths
if (l.best$sign == 0 || nrow(l.best) == 0) {
if (r.best$sign == 0 || nrow(r.best) == 0) {
return (best)
} else if (r.best$sign == 1) {
return (rbind(best, BinUpliftTree(r.create, outcome, treat, x, r.cuts, alpha, n.min)))
}
} else if (l.best$sign == 1) {
if (r.best$sign == 0 || nrow(r.best) == 0){
return (rbind(BinUpliftTree(l.create, outcome, treat, x, l.cuts, alpha, n.min), best))
} else if (r.best$sign == 1) {
return (rbind(BinUpliftTree(l.create, outcome, treat, x, l.cuts, alpha, n.min), best, BinUpliftTree(r.create, outcome, treat, x, r.cuts, alpha, n.min)))
}
}
}
}
# Now, if the variable we want to bin is not continuous, we need to change
# it to a continuous one to be able to split
# We sort the categories by uplift and create an ordinal variable 1,2,...,K
# where K is the number of the categories. Then, we can split that variable
# with n.split=K-1
BinUpliftCatRank <- function(data, outcome, treat, x){
splits <- matrix(data = NA, nrow = length(levels(data[[x]])), ncol=6)
colnames(splits) <- c("cat", "n.t", "n.c", "p.t", "p.c", "u")
splits <- as.data.frame(splits)
for(i in 1:nrow(splits)){
splits[i, 1] <- levels(data[[x]])[[i]]
# uplift in each category
splits[i, 2] <- sum(data[[treat]]==1 & data[[x]]== splits[i, 1]) # n.t
splits[i, 3] <- sum(data[[treat]]==0 & data[[x]]== splits[i, 1]) # n.c
splits[i, 4] <- sum(data[[treat]]==1 & data[[x]]== splits[i, 1] & data[[outcome]]==1)/splits[i, 2] # p.t
splits[i, 5] <- sum(data[[treat]]==0 & data[[x]]== splits[i, 1] & data[[outcome]]==1)/splits[i, 3] # p.c
splits[i, 6] <- splits[i, 4] - splits[i, 5] # u
}
splits$cat.rank <- rank(splits$u, ties.method = "random")
x.rank <- splits[, c(1,7)]
x.merge <- merge(x.rank, data, by.x = 'cat', by.y = 'x')
names(x.merge)[names(x.merge) == 'cat.rank'] <- 'x'
x.rank <- x.rank[order(x.rank$cat.rank),]
x.rank$cat <- paste0("'", x.rank$cat, "'")
res <- list(x.rank, x.merge)
return(res)
}
# Finally, we can grow the tree and return it as out.tree depending on nature of the
# variable we want to bin
if (is.factor(data[[x]])==TRUE) {
out.cat <- BinUpliftCatRank(data, outcome, treat, x)[[2]]
out.link <- BinUpliftCatRank(data, outcome, treat, x)[[1]]
out.tree <- BinUpliftTree(out.cat, outcome, treat, x, length(unique(out.cat$x))-1, alpha, n.min)
out.tree.cat <- list("out.tree" = out.tree, "out.link" = out.link)
return(out.tree.cat)
} else if (is.factor(data[[x]])==FALSE) {
out.tree <- BinUpliftTree(data, outcome, treat, x, n.split, alpha, n.min)
class(out.tree) <- "BinUplift"
return(out.tree)
}
}
# END FUN
Any scripts or data that you put into this service are public.
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.