# Manual/visual test script for differentially private ERM, classification, and
# regression classes (requires the package class definitions and ggplot2).
###############################################
### TESTING ERM.CMS VIA LOGISTIC REGRESSION ###
###############################################
# Manual test: instantiate EmpiricalRiskMinimizationDP.CMS with the sigmoid
# mapping and cross-entropy loss (i.e. logistic regression), fit it on a
# 2-class synthetic dataset, and visually inspect the decision boundary.
### Build dataset
N <- 200
D <- 2
K <- 2
X <- data.frame()
y <- data.frame()
# May be helpful to set seed for testing
for (j in (1:K)){
t <- seq(-.25, .25, length.out = N)
if (j==1) m <- stats::rnorm(N,-.2, .1) # Soft margin
if (j==2) m <- stats::rnorm(N, .2, .1)
# if (j==1) m <- stats::rnorm(N,-.2, .05) # Hard margin
# if (j==2) m <- stats::rnorm(N, .2, .05)
Xtemp <- data.frame(x1 = 3*t , x2 = m - t)
ytemp <- data.frame(matrix(j-1, N, 1))
X <- rbind(X, Xtemp)
y <- rbind(y, ytemp)
}
# Box constraints on the data domain (each feature assumed in [-1, 1])
upper.bounds <- c(1, 1)
lower.bounds <- c(-1,-1)
data <- cbind(X, y)
y <- as.matrix(data[,3])
colnames(data) <- c(colnames(X), 'label')
### To verify in unit circle
# circleFun <- function(center = c(0,0),diameter = 1, npoints = 100){
# r = diameter / 2
# tt <- seq(0,2*pi,length.out = npoints)
# xx <- center[1] + r * cos(tt)
# yy <- center[2] + r * sin(tt)
# return(data.frame(x = xx, y = yy))
# }
# cir <- circleFun(c(0,0),2,npoints = 100)
# Plot the raw data, colored by class label
ggplot2::ggplot(data) + ggplot2::geom_point(ggplot2::aes(x=x1, y=x2, color = as.character(label)),
size = 2, show.legend = F) +
ggplot2::scale_colour_discrete(name = "Label") + ggplot2::coord_fixed(ratio = 1) +
ggplot2::theme_bw(base_size = 12) + ggplot2::xlim(-.8, .8) + ggplot2::ylim(-.7, .7)
# Uncomment exactly one of the following privacy/regularization scenarios:
### No DP, no regularization
# gamma <- 0
# eps <- Inf
### No DP, regularization
# gamma <- .01
# eps <- Inf
### DP, no regularization
# gamma <- 0
# eps <- 1
### DP, regularization
gamma <- 1
eps <- 1
pm <- 'objective'
c <- 1/4 # Necessary constant for logistic regression if objective used
# pm <- 'output'
# c <- NULL
ermdp <- EmpiricalRiskMinimizationDP.CMS$new(mapXy.sigmoid, loss.cross.entropy,
'l2', eps, gamma, pm, c,
mapXy.gr.sigmoid,
loss.gr.cross.entropy)
ermdp$fit(X, y, upper.bounds, lower.bounds)
theta <- ermdp$coeff
# Decision boundary grid
grid <- expand.grid(seq(-.8, .8, length.out = 100),
seq(-.7, .7, length.out = 100))
Z <- round(ermdp$predict(grid)) # round sigmoid outputs to hard 0/1 labels
gridPred = cbind(grid, Z)
colnames(gridPred)[3] <- 'label'
gridPred <- data.frame(gridPred)
# Decision boundary visualization
ggplot2::ggplot() + ggplot2::geom_point(data = data, ggplot2::aes(x=x1, y=x2, color = as.character(label)),
size = 2, show.legend = F) +
ggplot2::geom_tile(data = gridPred, ggplot2::aes(x = grid[, 1],y = grid[, 2],
fill=as.character(Z)), alpha = 0.3,
show.legend = F) + ggplot2::coord_fixed(ratio = 1) +
ggplot2::theme_bw(base_size = 12)+ ggplot2::xlim(-.8,.8) + ggplot2::ylim(-.7,.7)
### END TESTING LOGISTIC REGRESSION USING ERM ###
#########################################
### TESTING LOGISTIC REGRESSION CLASS ###
#########################################
# Manual test: fit the LogisticRegressionDP class on the same style of 2-class
# synthetic dataset and visually inspect the decision boundary. Two dataset
# variants are provided; the second (commented out) has a bias and requires
# add.bias=TRUE plus scaling.
### Build dataset (no bias, satisfies constraints)
N <- 200
D <- 2
K <- 2
X <- data.frame()
y <- data.frame()
# May be helpful to set seed for testing
for (j in (1:K)){
t <- seq(-.25, .25, length.out = N)
if (j==1) m <- stats::rnorm(N,-.2, .1) # Soft margin
if (j==2) m <- stats::rnorm(N, .2, .1)
# if (j==1) m <- stats::rnorm(N,-.2, .05) # Hard margin
# if (j==2) m <- stats::rnorm(N, .2, .05)
Xtemp <- data.frame(x1 = 3*t , x2 = m - t)
ytemp <- data.frame(matrix(j-1, N, 1))
X <- rbind(X, Xtemp)
y <- rbind(y, ytemp)
}
upper.bounds <- c(1, 1)
lower.bounds <- c(-1,-1)
data <- cbind(X, y)
y <- as.matrix(data[,3])
colnames(data) <- c(colnames(X), 'label')
### To verify in unit circle
# circleFun <- function(center = c(0,0),diameter = 1, npoints = 100){
# r = diameter / 2
# tt <- seq(0,2*pi,length.out = npoints)
# xx <- center[1] + r * cos(tt)
# yy <- center[2] + r * sin(tt)
# return(data.frame(x = xx, y = yy))
# }
# cir <- circleFun(c(0,0),2,npoints = 100)
# Plot the raw data, colored by class label
ggplot2::ggplot(data) + ggplot2::geom_point(ggplot2::aes(x=x1, y=x2, color = as.character(label)),
size = 2, show.legend = F) +
ggplot2::scale_colour_discrete(name = "Label") + ggplot2::coord_fixed(ratio = 1) +
ggplot2::theme_bw(base_size = 12) + ggplot2::xlim(-.8, .8) + ggplot2::ylim(-.7, .7)
### End build dataset
### Build dataset (with bias, needs scaling)
# N <- 200 # number of points per class
# D <- 2 # dimensionality, we use 2D data for easy visulization
# K <- 2 # number of classes, binary for logistic regression
# X <- data.frame() # data matrix (each row = single example, can view as xy coordinates)
# y <- data.frame() # class labels
#
# # May be helpful to set seed for testing
#
# for (j in (1:K)){
# # t, m are parameters of parametric equations x1, x2
# t <- seq(0,1,length.out = N)
# # add randomness
# m <- stats::rnorm(N, j+0.5, 0.25)
# Xtemp <- data.frame(x1 = 3*t , x2 = m - t)
# ytemp <- data.frame(matrix(j-1, N, 1))
# X <- rbind(X, Xtemp)
# y <- rbind(y, ytemp)
# }
#
# upper.bounds <- c(3,3)
# lower.bounds <- c(0,0)
#
# data <- cbind(X,y)
# y <- as.matrix(data[,3])
# colnames(data) <- c(colnames(X), 'label')
#
# ggplot2::ggplot(data) + ggplot2::geom_point(ggplot2::aes(x=x1, y=x2, color = as.character(label)), size = 2) +
# ggplot2::scale_colour_discrete(name ="Label") +
# ggplot2::ylim(0, 3) + ggplot2::coord_fixed(ratio = 1) +
# ggplot2::ggtitle('Data to be classified') +
# ggplot2::theme_bw(base_size = 12) +
# ggplot2::theme(legend.position=c(0.85, 0.87))
### End build dataset
# Uncomment exactly one of the following privacy/regularization scenarios:
### No DP, no regularization
# gamma <- 0
# eps <- Inf
### No DP, regularization
# gamma <- .01
# eps <- Inf
### DP, no regularization
# gamma <- 0
# eps <- 1
### DP, regularization
gamma <- 1
eps <- 1
pm <- 'objective'
# pm <- 'output'
lrdp <- LogisticRegressionDP$new('l2', eps, gamma, pm)
### No bias, satisfies constraints
lrdp$fit(X,y,upper.bounds=upper.bounds,lower.bounds=lower.bounds)
### With bias, needs scaling
# lrdp$fit(X,y,upper.bounds=upper.bounds,lower.bounds=lower.bounds,add.bias=TRUE)
# lrdp$predict(X, add.bias=TRUE)
theta <- lrdp$coeff
# Decision boundary grid
### No bias, satisfies constraints
grid <- expand.grid(seq(-.8, .8, length.out = 100),
seq(-.7, .7, length.out = 100))
Z <- lrdp$predict(grid)
### With bias, needs scaling
# grid <- expand.grid(seq(0, 3, length.out = 100), seq(0, 3, length.out = 100))
# Z <- lrdp$predict(grid, add.bias=TRUE)
gridPred = cbind(grid, Z)
colnames(gridPred)[3] <- 'label'
gridPred <- data.frame(gridPred)
### No bias, satisfies constraints
ggplot2::ggplot() + ggplot2::geom_point(data = data, ggplot2::aes(x=x1, y=x2, color = as.character(label)),
size = 2, show.legend = F) +
ggplot2::geom_tile(data = gridPred, ggplot2::aes(x = grid[, 1],y = grid[, 2],
fill=as.character(Z)), alpha = 0.3,
show.legend = F) + ggplot2::coord_fixed(ratio = 1) +
ggplot2::theme_bw(base_size = 12)+ ggplot2::xlim(-.8,.8) + ggplot2::ylim(-.7,.7)
### With bias, needs scaling
# ggplot2::ggplot() + ggplot2::geom_point(data = data,
# ggplot2::aes(x=x1, y=x2, color = as.character(label)),
# size = 2, show.legend = F) +
# ggplot2::geom_tile(data = gridPred,
# ggplot2::aes(x = grid[, 1],y = grid[, 2], fill=as.character(Z)),
# alpha = 0.3, show.legend = F)+
# ggplot2::ggtitle('Decision Boundary for Logistic Regression') +
# ggplot2::coord_fixed(ratio = 1) +
# ggplot2::theme_bw(base_size = 12)
### END TESTING LOGISTIC REGRESSION CLASS ###
#
#########################################
### TESTING SVM CLASS (LINEAR CASE) ###
#########################################
# Manual test: fit the svmDP class with a linear kernel on the 2-class
# synthetic dataset and visually inspect the decision boundary.
### Build dataset (no bias, satisfies constraints)
N <- 200
D <- 2
K <- 2
X <- data.frame()
y <- data.frame()
# May be helpful to set seed for testing
for (j in (1:K)){
t <- seq(-.25, .25, length.out = N)
if (j==1) m <- stats::rnorm(N,-.2, .1) # Soft margin
if (j==2) m <- stats::rnorm(N, .2, .1)
# if (j==1) m <- stats::rnorm(N,-.2, .05) # Hard margin
# if (j==2) m <- stats::rnorm(N, .2, .05)
Xtemp <- data.frame(x1 = 3*t , x2 = m - t)
ytemp <- data.frame(matrix(j-1, N, 1))
X <- rbind(X, Xtemp)
y <- rbind(y, ytemp)
}
upper.bounds <- c(1, 1)
lower.bounds <- c(-1,-1)
data <- cbind(X, y)
y <- as.matrix(data[,3])
colnames(data) <- c(colnames(X), 'label')
### To verify in unit circle
# circleFun <- function(center = c(0,0),diameter = 1, npoints = 100){
# r = diameter / 2
# tt <- seq(0,2*pi,length.out = npoints)
# xx <- center[1] + r * cos(tt)
# yy <- center[2] + r * sin(tt)
# return(data.frame(x = xx, y = yy))
# }
# cir <- circleFun(c(0,0),2,npoints = 100)
# Plot the raw data, colored by class label
ggplot2::ggplot(data) + ggplot2::geom_point(ggplot2::aes(x=x1, y=x2, color = as.character(label)),
size = 2, show.legend = F) +
ggplot2::scale_colour_discrete(name = "Label") + ggplot2::coord_fixed(ratio = 1) +
ggplot2::theme_bw(base_size = 12) + ggplot2::xlim(-.8, .8) + ggplot2::ylim(-.7, .7)
### End build dataset
### Build dataset (with bias, needs scaling)
# N <- 200 # number of points per class
# D <- 2 # dimensionality, we use 2D data for easy visulization
# K <- 2 # number of classes, binary for logistic regression
# X <- data.frame() # data matrix (each row = single example, can view as xy coordinates)
# y <- data.frame() # class labels
#
# # May be helpful to set seed for testing
#
# for (j in (1:K)){
# # t, m are parameters of parametric equations x1, x2
# t <- seq(0,1,length.out = N)
# # add randomness
# m <- stats::rnorm(N, j+0.5, 0.25)
# Xtemp <- data.frame(x1 = 3*t , x2 = m - t)
# ytemp <- data.frame(matrix(j-1, N, 1))
# X <- rbind(X, Xtemp)
# y <- rbind(y, ytemp)
# }
#
# upper.bounds <- c(3,3)
# lower.bounds <- c(0,0)
#
# data <- cbind(X,y)
# y <- as.matrix(data[,3])
# colnames(data) <- c(colnames(X), 'label')
#
# ggplot2::ggplot(data) + ggplot2::geom_point(ggplot2::aes(x=x1, y=x2, color = as.character(label)), size = 2) +
# ggplot2::scale_colour_discrete(name ="Label") +
# ggplot2::ylim(0, 3) + ggplot2::coord_fixed(ratio = 1) +
# ggplot2::ggtitle('Data to be classified') +
# ggplot2::theme_bw(base_size = 12) +
# ggplot2::theme(legend.position=c(0.85, 0.87))
### End build dataset
# Uncomment exactly one of the following privacy/regularization scenarios:
### No DP, no regularization
# gamma <- 0
# eps <- Inf
### No DP, regularization
# gamma <- .01
# eps <- Inf
### DP, no regularization
# gamma <- 0
# eps <- 1
### DP, regularization
gamma <- 1
eps <- 1
pm <- 'objective'
# pm <- 'output'
svmdp <- svmDP$new('l2', eps, gamma, pm, kernel='linear')
### No bias, satisfies constraints
svmdp$fit(X,y,upper.bounds=upper.bounds,lower.bounds=lower.bounds)
### With bias, needs scaling
# svmdp$fit(X,y,upper.bounds=upper.bounds,lower.bounds=lower.bounds,add.bias=TRUE)
# svmdp$predict(X, add.bias=TRUE)
theta <- svmdp$coeff
# Decision boundary grid
### No bias, satisfies constraints
grid <- expand.grid(seq(-.8, .8, length.out = 100),
seq(-.7, .7, length.out = 100))
Z <- svmdp$predict(grid)
### With bias, needs scaling
# grid <- expand.grid(seq(0, 3, length.out = 100), seq(0, 3, length.out = 100))
# Z <- svmdp$predict(grid, add.bias=TRUE)
gridPred = cbind(grid, Z)
colnames(gridPred)[3] <- 'label'
gridPred <- data.frame(gridPred)
### No bias, satisfies constraints
ggplot2::ggplot() + ggplot2::geom_point(data = data, ggplot2::aes(x=x1, y=x2, color = as.character(label)),
size = 2, show.legend = F) +
ggplot2::geom_tile(data = gridPred, ggplot2::aes(x = grid[, 1],y = grid[, 2],
fill=as.character(Z)), alpha = 0.3,
show.legend = F) + ggplot2::coord_fixed(ratio = 1) +
ggplot2::theme_bw(base_size = 12)+ ggplot2::xlim(-.8,.8) + ggplot2::ylim(-.7,.7)
### With bias, needs scaling
# ggplot2::ggplot() + ggplot2::geom_point(data = data,
# ggplot2::aes(x=x1, y=x2, color = as.character(label)),
# size = 2, show.legend = F) +
# ggplot2::geom_tile(data = gridPred,
# ggplot2::aes(x = grid[, 1],y = grid[, 2], fill=as.character(Z)),
# alpha = 0.3, show.legend = F)+
# ggplot2::ggtitle('Decision Boundary for Linear SVM') +
# ggplot2::coord_fixed(ratio = 1) +
# ggplot2::theme_bw(base_size = 12)
### END TESTING SVM CLASS ###
#######################################################
### TESTING GAUSSIAN KERNEL SVM AND COMPARING TO REGULAR SVM ### (Linear)
#######################################################
# Manual test: fit svmDP with a Gaussian (RBF) kernel approximation on the
# linearly separable dataset and visually inspect the decision boundary.
# (The non-private e1071 comparison fit follows in the next section.)
# Build dataset
N <- 200
D <- 2
K <- 2
X <- data.frame()
y <- data.frame()
# May be helpful to set seed for testing
for (j in (1:K)){
t <- seq(-.25,.25,length.out = N)
# add randomness
if (j==1) m <- stats::rnorm(N,-.2,.1) # Soft margin
if (j==2) m <- stats::rnorm(N, .2,.1)
# if (j==1) m <- stats::rnorm(N,-.2, .05) # Hard margin
# if (j==2) m <- stats::rnorm(N, .2, .05)
Xtemp <- data.frame(x1 = 3*t , x2 = m - t)
ytemp <- data.frame(matrix(j-1, N, 1))
X <- rbind(X, Xtemp)
y <- rbind(y, ytemp)
}
upper.bounds <- c(1,1)
lower.bounds <- c(-1,-1)
data <- cbind(X,y)
y <- as.matrix(data[,3])
colnames(data) <- c(colnames(X), 'label')
# To verify in unit circle
# circleFun <- function(center = c(0,0),diameter = 1, npoints = 100){
# r = diameter / 2
# tt <- seq(0,2*pi,length.out = npoints)
# xx <- center[1] + r * cos(tt)
# yy <- center[2] + r * sin(tt)
# return(data.frame(x = xx, y = yy))
# }
# cir <- circleFun(c(0,0),2,npoints = 100)
# Plot the raw data, colored by class label
ggplot2::ggplot(data) + ggplot2::geom_point(ggplot2::aes(x=x1, y=x2, color = as.character(label)),
size = 2, show.legend = F) +
ggplot2::scale_colour_discrete(name ="Label") + #ggplot2::ylim(-3, 3) +
ggplot2::coord_fixed(ratio = 1) +
ggplot2::theme_bw(base_size = 12) + ggplot2::xlim(-.8,.8) + ggplot2::ylim(-.7,.7)
# Uncomment exactly one of the following scenarios
# (D here is the kernel approximation dimension passed to svmDP$new):
# Hard-margin
# eps <- Inf
# # gamma <- 1
# gamma <- 0
# D <- 20
# Soft-margin
# eps <- Inf
# gamma <- .01
# D <- 20
# With DP
eps <- 1
gamma <- 1
D <- 5
pm <- 'objective'
# pm <- 'output'
ksvmdp <- svmDP$new('l2', eps, gamma, pm, kernel='Gaussian', D)
ksvmdp$fit(X,y) # Note bounds unnecessary for Gaussian kernel
theta <- ksvmdp$coeff
# Grid
grid <- expand.grid(seq(-.8, .8, length.out = 100), seq(-.7,.7, length.out = 100))
Z <- ksvmdp$predict(grid)
gridPred = cbind(grid, Z)
colnames(gridPred)[3] <- 'label'
gridPred <- data.frame(gridPred)
# decision boundary visualization
ggplot2::ggplot() + ggplot2::geom_point(data = data,
ggplot2::aes(x=x1, y=x2, color = as.character(label)),
size = 2, show.legend = F) +
ggplot2::geom_tile(data = gridPred,
ggplot2::aes(x = grid[, 1],y = grid[, 2], fill=as.character(Z)),
alpha = 0.3, show.legend = F)+
ggplot2::coord_fixed(ratio = 1) +
ggplot2::theme_bw(base_size = 12) + ggplot2::xlim(-.8,.8) + ggplot2::ylim(-.7,.7)
# Standard (non-private) SVM via e1071 for visual comparison with the DP fit.
# BUG FIX: y is a numeric 0/1 matrix here, so e1071::svm() would silently fit
# a *regression* model; sign() of its fitted values (roughly in (0,1)) then
# labels essentially the whole grid as class 1, making the comparison
# meaningless. Passing a factor response forces C-classification instead.
stdSVM <- e1071::svm(as.matrix(X), as.factor(y), kernel="radial")
# Grid
grid <- expand.grid(seq(-.8, .8, length.out = 100), seq(-.7,.7, length.out = 100))
# predict() on a classification fit returns a factor with levels "0"/"1";
# convert back to numeric labels to match the DP sections above.
Z <- as.numeric(as.character(predict(stdSVM, newdata=grid)))
gridPred = cbind(grid, Z)
colnames(gridPred)[3] <- 'label'
gridPred <- data.frame(gridPred)
# decision boundary visualization
ggplot2::ggplot() + ggplot2::geom_point(data = data,
ggplot2::aes(x=x1, y=x2, color = as.character(label)),
size = 2, show.legend = F) +
ggplot2::geom_tile(data = gridPred,
ggplot2::aes(x = grid[, 1],y = grid[, 2], fill=as.character(Z)),
alpha = 0.3, show.legend = F)+
ggplot2::coord_fixed(ratio = 1) +
ggplot2::theme_bw(base_size = 12) + ggplot2::xlim(-.8,.8) + ggplot2::ylim(-.7,.7)
### END TESTING AND COMPARING KERNEL SVM ###
#######################################################
### TESTING GAUSSIAN KERNEL SVM ON NONLINEAR DATASET ### (Nonlinear)
#######################################################
# Manual test: fit svmDP with a Gaussian kernel on a radially-separable
# ("disk vs ring") dataset where a linear boundary cannot work.
# Build dataset
N <- 400 # number of points
D <- 2 # dimensionality, we use 2D data for easy visulization
K <- 2 # number of classes, binary for logistic regression
X <- data.frame() # data matrix (each row = single example, can view as xy coordinates)
y <- data.frame() # class labels
# May be helpful to set seed for testing
for (i in (1:N)){
Xtemp <- data.frame(x1 = stats::rnorm(1,sd=.28) , x2 = stats::rnorm(1,sd=.28))
# Label by squared distance from origin: inside radius sqrt(.15) -> class 0
if (sum(Xtemp^2)<.15) ytemp <- data.frame(y=0)
else ytemp <- data.frame(y=1)
X <- rbind(X, Xtemp)
y <- rbind(y, ytemp)
}
upper.bounds <- c(1,1)
lower.bounds <- c(-1,-1)
data <- cbind(X,y)
y <- as.matrix(data[,3])
colnames(data) <- c(colnames(X), 'label')
# To verify in unit circle
# circleFun <- function(center = c(0,0),diameter = 1, npoints = 100){
# r = diameter / 2
# tt <- seq(0,2*pi,length.out = npoints)
# xx <- center[1] + r * cos(tt)
# yy <- center[2] + r * sin(tt)
# return(data.frame(x = xx, y = yy))
# }
# cir <- circleFun(c(0,0),2,npoints = 100)
# Plot the raw data, colored by class label
ggplot2::ggplot(data) + ggplot2::geom_point(ggplot2::aes(x=x1, y=x2, color = as.character(label)), size = 2,
show.legend = F) +
ggplot2::ylim(-1, 1) + ggplot2::xlim(-1,1)+
ggplot2::coord_fixed(ratio = 1) +
ggplot2::theme_bw(base_size = 12)
# Looser privacy budget and larger approximation dimension for the harder task
eps <- 5
gamma <- 1
D <- 20
# pm <- 'objective'
pm <- 'output'
ksvmdp <- svmDP$new('l2', eps, gamma, pm, kernel='Gaussian', D)
# Note bounds unnecessary with Gaussian kernel
ksvmdp$fit(X,y)
theta <- ksvmdp$coeff
# Grid
grid <- expand.grid(seq(-1, 1, length.out = 100), seq(-1, 1, length.out = 100))
Z <- ksvmdp$predict(grid)
gridPred = cbind(grid, Z)
colnames(gridPred)[3] <- 'label'
gridPred <- data.frame(gridPred)
# decision boundary visualization
ggplot2::ggplot() + ggplot2::geom_point(data = data,
ggplot2::aes(x=x1, y=x2, color = as.character(label)),
size = 2, show.legend = F) + ggplot2::ylim(-1, 1) + ggplot2::xlim(-1,1)+
ggplot2::geom_tile(data = gridPred,
ggplot2::aes(x = grid[, 1],y = grid[, 2], fill=as.character(Z)),
alpha = 0.3, show.legend = F)+
ggplot2::coord_fixed(ratio = 1) +
ggplot2::theme_bw(base_size = 12)
### END TESTING KERNEL SVM ON NONLINEAR DATASET ###
###################################
### TESTING WEIGHTED LINEAR SVM ###
###################################
## NOTE: Tested by comparing with Fig. 3 in paper "Weighted support vector
## machine for data classification"
## To test, run (1) without adding outliers or weighting, (2) with outliers
## but no weighting, and (3) with outliers and weighting outliers less than
## other datapoints. Should see "typical" weights for case 3 approach case 1
## as the outlier weights get smaller relative to normal weights. Noise
## should also be present due to DP.
# Fixed seed so the dataset (and the two injected outliers) are reproducible
set.seed(42)
### Build dataset (no bias, satisfies constraints)
N <- 200
D <- 2
K <- 2
X <- data.frame()
y <- data.frame()
# May be helpful to set seed for testing
for (j in (1:K)){
t <- seq(-.1, .1, length.out = N)
# if (j==1) m <- stats::rnorm(N,-.2, .1) # Soft margin
# if (j==2) m <- stats::rnorm(N, .2, .1)
if (j==1) m <- stats::rnorm(N,-.3, .1) # Hard margin
if (j==2) m <- stats::rnorm(N, .3, .1)
Xtemp <- data.frame(x1 = 3*t , x2 = m - t)
ytemp <- data.frame(y = matrix(j-1, N, 1))
X <- rbind(X, Xtemp)
y <- rbind(y, ytemp)
}
# Add outliers (two class-0 points placed deep inside class-1 territory)
outliersX <- data.frame(x1 = c(-0.27, -0.23), x2 = c(0.58, 0.59))
outliersy <- data.frame(y = matrix(0, 2, 1))
X <- rbind(X, outliersX)
y <- rbind(y, outliersy)
# Define weights/weights upper bound ("wub"); outliers (last two rows) are
# down-weighted by a factor of 20 relative to the typical points
wub <- 2
weights <- rep(wub, nrow(y))
weights[(nrow(y)-1):nrow(y)] <- wub/20
upper.bounds <- c(1, 1)
lower.bounds <- c(-1,-1)
data <- cbind(X, y)
y <- as.matrix(data[,3])
colnames(data) <- c(colnames(X), 'label')
### To verify in unit circle
# circleFun <- function(center = c(0,0),diameter = 1, npoints = 100){
# r = diameter / 2
# tt <- seq(0,2*pi,length.out = npoints)
# xx <- center[1] + r * cos(tt)
# yy <- center[2] + r * sin(tt)
# return(data.frame(x = xx, y = yy))
# }
# cir <- circleFun(c(0,0),2,npoints = 100)
# Plot the raw data, colored by class label
ggplot2::ggplot(data) + ggplot2::geom_point(ggplot2::aes(x=x1, y=x2, color = as.character(label)),
size = 2, show.legend = F) +
ggplot2::scale_colour_discrete(name = "Label") + ggplot2::coord_fixed(ratio = 1) +
ggplot2::theme_bw(base_size = 12) + ggplot2::xlim(-.8, .8) + ggplot2::ylim(-.7, .7)
### End build dataset
# Re-randomize so the DP noise below differs between runs
set.seed(NULL)
### No DP, no regularization
# gamma <- 0
# eps <- Inf
### No DP, regularization
# gamma <- .01
# eps <- Inf
### DP, no regularization
# gamma <- 0
# eps <- 1
### DP, regularization
gamma <- 5
eps <- 1
# pm <- 'objective'
pm <- 'output'
wsvmdp <- svmDP$new('l2', eps, gamma, pm, kernel='linear')
### No bias, satisfies constraints
wsvmdp$fit(X,y,upper.bounds=upper.bounds,lower.bounds=lower.bounds,
weights=weights, weights.upper.bound=wub)
theta <- wsvmdp$coeff
# Decision boundary grid
### No bias, satisfies constraints
grid <- expand.grid(seq(-.8, .8, length.out = 100),
seq(-.7, .7, length.out = 100))
Z <- wsvmdp$predict(grid)
gridPred = cbind(grid, Z)
colnames(gridPred)[3] <- 'label'
gridPred <- data.frame(gridPred)
### No bias, satisfies constraints
ggplot2::ggplot() + ggplot2::geom_point(data = data, ggplot2::aes(x=x1, y=x2, color = as.character(label)),
size = 2, show.legend = F) +
ggplot2::geom_tile(data = gridPred, ggplot2::aes(x = grid[, 1],y = grid[, 2],
fill=as.character(Z)), alpha = 0.3,
show.legend = F) + ggplot2::coord_fixed(ratio = 1) +
ggplot2::theme_bw(base_size = 12)+ ggplot2::xlim(-.8,.8) + ggplot2::ylim(-.7,.7)
### END TESTING WEIGHTED LINEAR SVM CLASS ###
############################################
### TESTING WEIGHTED GAUSSIAN KERNEL SVM ###
############################################
## NOTE: Tested by comparing with Fig. 3 in paper "Weighted support vector
## machine for data classification"
## To test, run (1) without adding outliers or weighting, (2) with outliers
## but no weighting, and (3) with outliers and weighting outliers less than
## other datapoints. Should see "typical" weights for case 3 approach case 1
## as the outlier weights get smaller relative to normal weights. Noise
## should also be present due to DP.
# Fixed seed so the dataset (and the two injected outliers) are reproducible
set.seed(42)
### Build dataset (no bias, satisfies constraints)
N <- 200
D <- 2
K <- 2
X <- data.frame()
y <- data.frame()
# May be helpful to set seed for testing
for (j in (1:K)){
t <- seq(-.1, .1, length.out = N)
# if (j==1) m <- stats::rnorm(N,-.2, .1) # Soft margin
# if (j==2) m <- stats::rnorm(N, .2, .1)
if (j==1) m <- stats::rnorm(N,-.3, .1) # Hard margin
if (j==2) m <- stats::rnorm(N, .3, .1)
Xtemp <- data.frame(x1 = 3*t , x2 = m - t)
ytemp <- data.frame(y = matrix(j-1, N, 1))
X <- rbind(X, Xtemp)
y <- rbind(y, ytemp)
}
# Add outliers (two class-0 points placed deep inside class-1 territory)
outliersX <- data.frame(x1 = c(-0.27, -0.23), x2 = c(0.58, 0.59))
outliersy <- data.frame(y = matrix(0, 2, 1))
X <- rbind(X, outliersX)
y <- rbind(y, outliersy)
# Define weights/weights upper bound ("wub"); outliers (last two rows) are
# down-weighted by a factor of 20 relative to the typical points
wub <- 2
weights <- rep(wub, nrow(y))
weights[(nrow(y)-1):nrow(y)] <- wub/20
# No weights
# wub <- NULL
# weights <- NULL
upper.bounds <- c(1, 1)
lower.bounds <- c(-1,-1)
data <- cbind(X, y)
y <- as.matrix(data[,3])
colnames(data) <- c(colnames(X), 'label')
### To verify in unit circle
# circleFun <- function(center = c(0,0),diameter = 1, npoints = 100){
# r = diameter / 2
# tt <- seq(0,2*pi,length.out = npoints)
# xx <- center[1] + r * cos(tt)
# yy <- center[2] + r * sin(tt)
# return(data.frame(x = xx, y = yy))
# }
# cir <- circleFun(c(0,0),2,npoints = 100)
# Plot the raw data, colored by class label
ggplot2::ggplot(data) + ggplot2::geom_point(ggplot2::aes(x=x1, y=x2, color = as.character(label)),
size = 2, show.legend = F) +
ggplot2::scale_colour_discrete(name = "Label") + ggplot2::coord_fixed(ratio = 1) +
ggplot2::theme_bw(base_size = 12) + ggplot2::xlim(-.8, .8) + ggplot2::ylim(-.7, .7)
### End build dataset
# Re-randomize so the kernel approximation / DP noise differ between runs
set.seed(NULL)
### No DP, no regularization
# gamma <- 0
# eps <- Inf
### No DP, regularization
gamma <- 1
eps <- Inf
### DP, no regularization
# gamma <- 0
# eps <- 1
### DP, regularization
# gamma <- .01
# eps <- 1
# D is the kernel approximation dimension passed to svmDP$new
D <- 5
# pm <- 'objective'
pm <- 'output'
wsvmdp <- svmDP$new('l2', eps, gamma, pm, kernel='Gaussian', D)
### No bias, satisfies constraints
wsvmdp$fit(X,y,upper.bounds=upper.bounds,lower.bounds=lower.bounds,
weights=weights, weights.upper.bound=wub)
theta <- wsvmdp$coeff
# Decision boundary grid
### No bias, satisfies constraints
grid <- expand.grid(seq(-.8, .8, length.out = 100),
seq(-.7, .7, length.out = 100))
Z <- wsvmdp$predict(grid)
gridPred = cbind(grid, Z)
colnames(gridPred)[3] <- 'label'
gridPred <- data.frame(gridPred)
### No bias, satisfies constraints
ggplot2::ggplot() + ggplot2::geom_point(data = data, ggplot2::aes(x=x1, y=x2, color = as.character(label)),
size = 2, show.legend = F) +
ggplot2::geom_tile(data = gridPred, ggplot2::aes(x = grid[, 1],y = grid[, 2],
fill=as.character(Z)), alpha = 0.3,
show.legend = F) + ggplot2::coord_fixed(ratio = 1) +
ggplot2::theme_bw(base_size = 12)+ ggplot2::xlim(-.8,.8) + ggplot2::ylim(-.7,.7)
### END TESTING WEIGHTED GAUSSIAN KERNEL SVM CLASS ###
############################################################
### TESTING PARAMETER TUNING FUNCTION FOR CLASSIFICATION ###
############################################################
# Manual test: tune_classification_model should pick one of three candidate
# svmDP models (differing only in gamma) under a shared privacy budget, and
# the winner's boundary should look reasonable.
# Build dataset
N <- 200 # number of points per class
D <- 2 # dimensionality, we use 2D data for easy visulization
K <- 2 # number of classes, binary for logistic regression
X <- data.frame() # data matrix (each row = single example, can view as xy coordinates)
y <- data.frame() # class labels
# May be helpful to set seed for testing
for (j in (1:K)){
t <- seq(-.25,.25,length.out = N)
if (j==1) m <- stats::rnorm(N,-.2,.1)
if (j==2) m <- stats::rnorm(N, .2,.1)
Xtemp <- data.frame(x1 = 3*t , x2 = m - t)
ytemp <- data.frame(matrix(j-1, N, 1))
X <- rbind(X, Xtemp)
y <- rbind(y, ytemp)
}
upper.bounds <- c(1,1)
lower.bounds <- c(-1,-1)
data <- cbind(X,y)
y <- as.matrix(data[,3])
colnames(data) <- c(colnames(X), 'label')
# With weights (first half of the rows down-weighted to exercise the
# weighted code path of the tuner)
weights <- rep(1,length(y))
weights[1:floor(length(y)/2)] <- 0.5
wub <- 1
# To verify in unit circle
# circleFun <- function(center = c(0,0),diameter = 1, npoints = 100){
# r = diameter / 2
# tt <- seq(0,2*pi,length.out = npoints)
# xx <- center[1] + r * cos(tt)
# yy <- center[2] + r * sin(tt)
# return(data.frame(x = xx, y = yy))
# }
# cir <- circleFun(c(0,0),2,npoints = 100)
# Plot the raw data, colored by class label
ggplot2::ggplot(data) +
ggplot2::geom_point(ggplot2::aes(x=x1, y=x2, color = as.character(label)), size = 2) +
ggplot2::scale_colour_discrete(name ="Label") +
ggplot2::coord_fixed(ratio = 1) +
ggplot2::theme_bw(base_size = 12) +
ggplot2::theme(legend.position=c(0.85, 0.87))
eps <- 1
# Candidate models: same eps, widely spread regularization strengths
# lrdp1 <- LogisticRegressionDP$new("l2", eps, 100)
# lrdp2 <- LogisticRegressionDP$new("l2", eps, 1)
# lrdp3 <- LogisticRegressionDP$new("l2", eps, .0001)
svmdp1 <- svmDP$new("l2", eps, 100, perturbation.method='output')
svmdp2 <- svmDP$new("l2", eps, 1, perturbation.method='output')
svmdp3 <- svmDP$new("l2", eps, .0001, perturbation.method='output')
models <- c(svmdp1, svmdp2, svmdp3)
model <- tune_classification_model(models, X, y, upper.bounds, lower.bounds,
weights=weights, weights.upper.bound=wub)
theta <- model$coeff
# Grid
grid <- expand.grid(seq(-.8, .8, length.out = 100), seq(-.7,.7, length.out = 100))
Z <- model$predict(grid)
gridPred = cbind(grid, Z)
colnames(gridPred)[3] <- 'label'
gridPred <- data.frame(gridPred)
# decision boundary
ggplot2::ggplot() + ggplot2::geom_point(data = data,
ggplot2::aes(x=x1, y=x2, color = as.character(label)),
size = 2, show.legend = F) +
ggplot2::geom_tile(data = gridPred,
ggplot2::aes(x = grid[, 1],y = grid[, 2], fill=as.character(Z)),
alpha = 0.3, show.legend = F)+
ggplot2::coord_fixed(ratio = 1) +
ggplot2::theme_bw(base_size = 12)
# Inspect which gamma the tuner selected
model$gamma
### END TESTING PARAMETER TUNING FUNCTION ###
###############################################
### TESTING ERMDP.KST VIA LINEAR REGRESSION ###
###############################################
# Manual test: fit EmpiricalRiskMinimizationDP.KST with the linear mapping and
# squared-error loss (i.e. DP linear regression) on noisy 1-D data, then
# visually compare the fitted line against the scatter.
# Build dataset
n <- 500
p <- 2 # for easy visualization
X <- data.frame(X=seq(-1,1,length.out = n))
true.theta <- c(-.3,.5) # Normal case (within search space bounds - shows variation)
# true.theta <- c(1,2) # Too large coeff case (outside search space bounds - no variation)
y <- true.theta[1] + as.matrix(X)%*%true.theta[2:length(true.theta)] + stats::rnorm(n=n,sd=.1)
# Clip responses to [-p, p] so the data satisfy the assumed bounds
y[y< -p] <- -p
y[y>p] <- p
data <- cbind(X,y)
y <- as.matrix(data[,2])
colnames(data) <- c(colnames(X), 'y')
# Includes everything
ub <- c(1,2)
lb <- c(-1,-2)
# Limits some values
# ub <- c(1, .75)
# lb <- c(-1, -.75)
ggplot2::ggplot(data) + ggplot2::geom_point(ggplot2::aes(x=X, y=y), size = 2) +
ggplot2::ylim(min(y), max(y)) + #ggplot2::coord_fixed(ratio = 1) +
ggplot2::ggtitle('Data to be regressed') +
ggplot2::theme_bw(base_size = 12)
eps <- 1
delta <- .01
# Coefficient domain: l2-ball constraint with its Jacobian, as required by the
# KST constructor (constraint function <= 0 on the feasible set)
domain <- list("constraints"=function(coeff) coeff%*%coeff - length(coeff),
"jacobian"=function(coeff) 2*coeff)
# zeta/lambda: Lipschitz and smoothness constants for the squared-error loss
# on this bounded domain -- presumably derived from |y| <= p; confirm against
# the class documentation
zeta <- 2*p^(3/2)
lambda <- p
gamma <- 1
ermdp.kst <- EmpiricalRiskMinimizationDP.KST$new(mapXy.linear, loss.squared.error,
'l2', eps, delta, domain, zeta,
lambda, gamma, mapXy.gr.linear,
loss.gr.squared.error)
ermdp.kst$fit(X,y,ub,lb,add.bias=TRUE)
theta <- ermdp.kst$coeff
grid <- seq(-1,1,length.out=100)
gridData <- data.frame(X=grid,y=theta[1]+theta[2]*grid)
# Regression line visualization
ggplot2::ggplot() + ggplot2::geom_point(data = data,
ggplot2::aes(x=X, y=y),
size = 2, show.legend = F) +
ggplot2::geom_line(data=gridData, ggplot2::aes(x=X,y=y), color='red') + ggplot2::ylim(min(y), max(y)) +
ggplot2::ggtitle('Regression Line for Linear Regression') +
ggplot2::theme_bw(base_size = 12)
# Print fitted coefficients for comparison with true.theta
theta
### END TESTING ERMDP.KST WITH LINEAR REGRESSION ###
#################################
### TESTING LINEAR REGRESSION ###
#################################
# Manual test: fit the LinearRegressionDP class (with a user-supplied l2
# regularizer and its gradient) on the same noisy 1-D data and visually
# compare the fitted line against the scatter.
# Build dataset
n <- 500
X <- data.frame(X=seq(-1,1,length.out = n))
true.theta <- c(-.3,.5) # Normal case
# true.theta <- c(1,2) # Too large coeff case
y <- true.theta[1] + as.matrix(X)%*%true.theta[2:length(true.theta)] + stats::rnorm(n=n,sd=.1)
p <- length(true.theta)
# Clip responses to [-p, p] so the data satisfy the assumed bounds
y[y< -p] <- -p
y[y> p] <- p
data <- cbind(X,y)
y <- as.matrix(data[,2])
colnames(data) <- c(colnames(X), 'y')
ub <- c(1, p)
lb <- c(-1, -p)
ggplot2::ggplot(data) + ggplot2::geom_point(ggplot2::aes(x=X, y=y), size = 2) +
ggplot2::ylim(min(y), max(y)) + #ggplot2::coord_fixed(ratio = 1) +
ggplot2::ggtitle('Data to be regressed') +
ggplot2::theme_bw(base_size = 12)
# l2 regularizer coeff'coeff/2 and its gradient, passed explicitly
regularizer <- function(coeff) coeff%*%coeff/2
regularizer.gr <- function(coeff) coeff
eps <- 1
delta <- 0.01
gamma <- 1
linrdp <- LinearRegressionDP$new(regularizer, eps, delta, gamma, regularizer.gr)
linrdp$fit(X,y,ub,lb,add.bias=TRUE)
theta <- linrdp$coeff
grid <- seq(-1,1,length.out=100)
gridData <- data.frame(X=grid,y=theta[1]+theta[2]*grid)
# Regression line visualization
ggplot2::ggplot() + ggplot2::geom_point(data = data,
ggplot2::aes(x=X, y=y),
size = 2, show.legend = F) +
ggplot2::geom_line(data=gridData, ggplot2::aes(x=X,y=y), color='red') + ggplot2::ylim(min(y), max(y)) +
ggplot2::ggtitle('Regression Line for Linear Regression') +
ggplot2::theme_bw(base_size = 12)
# Print fitted coefficients for comparison with true.theta
theta
### END TESTING LINEAR REGRESSION ###
###############################################################
### TESTING PARAMETER TUNING FUNCTION FOR LINEAR REGRESSION ###
###############################################################
# Manual test: tune_linear_regression_model should pick one of three candidate
# LinearRegressionDP models (differing only in gamma), and the winner's
# regression line should track the data.
# Build dataset
n <- 500
X <- data.frame(X=seq(-1,1,length.out = n))
true.theta <- c(-.3,.5) # Normal case
# true.theta <- c(1,2) # Too large coeff case
y <- true.theta[1] + as.matrix(X)%*%true.theta[2:length(true.theta)] + stats::rnorm(n=n,sd=.1)
p <- length(true.theta)
# Clip responses to [-p, p] so the data satisfy the assumed bounds
y[y< -p] <- -p
y[y> p] <- p
data <- cbind(X,y)
y <- as.matrix(data[,2])
colnames(data) <- c(colnames(X), 'y')
ub <- c(1, p)
lb <- c(-1, -p)
ggplot2::ggplot(data) + ggplot2::geom_point(ggplot2::aes(x=X, y=y), size = 2) +
ggplot2::ylim(min(y), max(y)) + #ggplot2::coord_fixed(ratio = 1) +
ggplot2::ggtitle('Data to be regressed') +
ggplot2::theme_bw(base_size = 12)
eps <- 1
delta <- 0.01
# Candidate models: same privacy budget, widely spread regularization strengths
linrdp1 <- LinearRegressionDP$new("l2", eps, delta, 100)
linrdp2 <- LinearRegressionDP$new("l2", eps, delta, 1)
linrdp3 <- LinearRegressionDP$new("l2", eps, delta, .0001)
models <- c(linrdp1, linrdp2, linrdp3)
model <- tune_linear_regression_model(models, X, y, ub, lb, add.bias=TRUE)
theta <- model$coeff
grid <- seq(-1,1,length.out=100)
gridData <- data.frame(X=grid,y=theta[1]+theta[2]*grid)
# Regression line visualization
ggplot2::ggplot() + ggplot2::geom_point(data = data,
ggplot2::aes(x=X, y=y),
size = 2, show.legend = F) +
ggplot2::geom_line(data=gridData, ggplot2::aes(x=X,y=y), color='red') + ggplot2::ylim(min(y), max(y)) +
ggplot2::ggtitle('Regression Line for Linear Regression') +
ggplot2::theme_bw(base_size = 12)
# Inspect which gamma the tuner selected
model$gamma
### END TESTING PARAMETER TUNING FUNCTION FOR LINEAR REGRESSION ###
# NOTE: the following lines are rdrr.io website boilerplate that was pasted in
# with the script; commented out so the file parses.
# Any scripts or data that you put into this service are public.
# Add the following code to your website.
# For more information on customizing the embed code, read Embedding Snippets.