John Mount 2019-01-01
In this example we fit a piecewise linear function to example data. Please see here for a discussion of the methodology.
library("RcppDynProg")
library("ggplot2")
set.seed(2018)
g <- 100
d <- data.frame(
x = 0.05*(1:(3*g))) # ordered in x
d$y_ideal <- sin((0.3*d$x)^2)
d$y_observed <- d$y_ideal + 0.25*rnorm(length(d$y_ideal))
plt1 <- ggplot(data= d, aes(x = x)) +
geom_line(aes(y = y_ideal), linetype=2) +
geom_point(aes(y = y_observed)) +
ylab("y") +
ggtitle("raw data",
subtitle = "dots: observed values, dashed line: unobserved true values")
print(plt1)
x_cuts <- solve_for_partition(d$x, d$y_observed, penalty = 1)
print(x_cuts)
## x pred group what
## 1 0.05 -0.1570880 1 left
## 2 4.65 1.1593754 1 right
## 3 4.70 1.0653666 2 left
## 4 6.95 -0.9770792 2 right
## 5 7.00 -1.2254925 3 left
## 6 9.20 0.8971391 3 right
## 7 9.25 1.3792437 4 left
## 8 11.10 -1.1542021 4 right
## 9 11.15 -1.0418353 5 left
## 10 12.50 1.1519490 5 right
## 11 12.55 1.3964906 6 left
## 12 13.75 -1.2045219 6 right
## 13 13.80 -1.3791405 7 left
## 14 15.00 1.0195679 7 right
d$estimate <- approx(x_cuts$x, x_cuts$pred, xout = d$x, method = "linear", rule = 2)$y
d$group <- as.character(findInterval(d$x, x_cuts[x_cuts$what=="left", "x"]))
print(sum((d$y_observed - d$y_ideal)^2))
## [1] 20.42462
print(sum((d$estimate - d$y_ideal)^2))
## [1] 3.536541
print(sum((d$estimate - d$y_observed)^2))
## [1] 20.53796
plt2 <- ggplot(data= d, aes(x = x)) +
geom_line(aes(y = y_ideal), linetype=2) +
geom_point(aes(y = y_observed, color = group)) +
geom_line(aes(y = estimate, color = group)) +
ylab("y") +
ggtitle("RcppDynProg piecewise linear estimate",
subtitle = "dots: observed values, segments: observed group means, dashed line: unobserved true values") +
theme(legend.position = "none")
print(plt2)
use_vtreat <- requireNamespace("vtreat", quietly = TRUE)
Here we show many of the vtreat
custom variable coders. An application of these coders can be found here.
spline_variable <- function(varName, x, y, w = NULL) {
if(is.null(w)) {
w <- numeric(n) + 1
}
nknots <- 100
spline <- stats::smooth.spline(x, y,
w = w,
nknots = nknots,
keep.data = FALSE,
keep.stuff = FALSE,
cv = TRUE)$fit
estimate <- stats::predict(spline, x)$y
return(estimate)
}
customCoders = list('c.PiecewiseV.num' = vtreat::solve_piecewise,
'n.PiecewiseV.num' = vtreat::solve_piecewise,
'c.knearest.num' = vtreat::square_window,
'n.knearest.num' = vtreat::square_window,
'c.spline.num' = spline_variable,
'n.spline.num' = spline_variable,
'c.PiecewiseLin.num' = RcppDynProg::piecewise_linear,
'n.PiecewiseLin.num' = RcppDynProg::piecewise_linear,
'c.PiecewiseC.num' = RcppDynProg::piecewise_constant,
'n.PiecewiseC.num' = RcppDynProg::piecewise_constant)
codeRestriction = c("PiecewiseV",
"knearest",
"spline",
"PiecewiseLin",
"PiecewiseC")
trt <- vtreat::designTreatmentsN(
d,
'x', 'y_observed',
customCoders = customCoders,
codeRestriction = codeRestriction,
verbose = FALSE)
knitr::kable(trt$scoreFrame[, c("varName", "rsq", "sig")])
| varName | rsq| sig| |:----------------|----------:|----:| | x_PiecewiseV | 0.8064672| 0| | x_knearest | 0.7415207| 0| | x_spline | 0.8486192| 0| | x_PiecewiseLin | 0.8380624| 0| | x_PiecewiseC | 0.6652028| 0|
dt <- vtreat::prepare(trt, d)
dt$y_observed <- NULL
d <- cbind(d, dt)
print(sum((d$x_PiecewiseV - d$y_ideal)^2))
## [1] 5.336369
print(sum((d$x_PiecewiseV - d$y_observed)^2))
## [1] 25.18626
print(sum((d$x_knearest - d$y_ideal)^2))
## [1] 3.825642
print(sum((d$x_knearest - d$y_observed)^2))
## [1] 22.8239
print(sum((d$x_PiecewiseLin - d$y_ideal)^2))
## [1] 3.536541
print(sum((d$x_PiecewiseLin - d$y_observed)^2))
## [1] 20.53796
print(sum((d$x_spline - d$y_ideal)^2))
## [1] 1.002791
print(sum((d$x_spline - d$y_observed)^2))
## [1] 19.3061
plt3 <- ggplot(data= d, aes(x = x)) +
geom_line(aes(y = y_ideal), linetype=2) +
geom_point(aes(y = y_observed)) +
geom_line(aes(y = x_PiecewiseV)) +
ylab("y") +
ggtitle("vtreat PiecewiseV estimate",
subtitle = "dots: observed values, segments: observed group means, dashed line: unobserved true values") +
theme(legend.position = "none")
print(plt3)
plt4 <- ggplot(data= d, aes(x = x)) +
geom_line(aes(y = y_ideal), linetype=2) +
geom_point(aes(y = y_observed)) +
geom_line(aes(y = x_knearest)) +
ylab("y") +
ggtitle("vtreat k-nearest estimate",
subtitle = "dots: observed values, segments: observed group means, dashed line: unobserved true values") +
theme(legend.position = "none")
print(plt4)
plt5 <- ggplot(data= d, aes(x = x)) +
geom_line(aes(y = y_ideal), linetype=2) +
geom_point(aes(y = y_observed)) +
geom_line(aes(y = x_PiecewiseLin)) +
ylab("y") +
ggtitle("vtreat RcppDynProg piecewise linear estimate",
subtitle = "dots: observed values, segments: observed group means, dashed line: unobserved true values") +
theme(legend.position = "none")
print(plt5)
plt6 <- ggplot(data= d, aes(x = x)) +
geom_line(aes(y = y_ideal), linetype=2) +
geom_point(aes(y = y_observed)) +
geom_line(aes(y = x_PiecewiseC)) +
ylab("y") +
ggtitle("vtreat RcppDynProg piecewise constant estimate",
subtitle = "dots: observed values, segments: observed group means, dashed line: unobserved true values") +
theme(legend.position = "none")
print(plt6)
plt7 <- ggplot(data= d, aes(x = x)) +
geom_line(aes(y = y_ideal), linetype=2) +
geom_point(aes(y = y_observed)) +
geom_line(aes(y = x_spline)) +
ylab("y") +
ggtitle("spline estimate",
subtitle = "dots: observed values, segments: observed group means, dashed line: unobserved true values") +
theme(legend.position = "none")
print(plt7)
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.