# inst/benchmark/clustered_SVM_Repeat.R

require(LiblineaR)
require(e1071)
require(SwarmSVM)

######## Data Preparation
# Each data set is downloaded as an .RData file into a fresh temporary file and
# load()ed into the global environment. The code below relies on every file
# defining a list named after the data set (svmguide1, ijcnn1, usps, mnist).
# The temporary files are not removed afterwards.

# svmguide1: element 1 is used as the training set, element 2 as the
# validation set. The `.t` copy must be taken first, because the list variable
# is overwritten by its own first element on the following line.
local.file.name = tempfile()
download.file("http://www.sfu.ca/~hetongh/data/svmguide1.RData",local.file.name)
load(local.file.name)
svmguide1.t = svmguide1[[2]]
svmguide1 = svmguide1[[1]]

# ijcnn1: same layout and same ordering constraint as above.
local.file.name = tempfile()
download.file("http://www.sfu.ca/~hetongh/data/ijcnn1.RData",local.file.name)
load(local.file.name)
ijcnn1.t = ijcnn1[[2]]
ijcnn1 = ijcnn1[[1]]

# usps: same layout and same ordering constraint as above.
local.file.name = tempfile()
download.file("http://www.sfu.ca/~hetongh/data/usps.RData",local.file.name)
load(local.file.name)
usps.t = usps[[2]]
usps = usps[[1]]

# mnist: a list of six elements read as three (train, validation) pairs.
# NOTE(review): the names suggest digit 3 vs 8, digit 4 vs 9 and odd vs even
# tasks -- confirm against the data source.
local.file.name = tempfile()
download.file("http://www.sfu.ca/~hetongh/data/mnist.RData",local.file.name)
load(local.file.name)
mnist38 = mnist[[1]]
mnist38.t = mnist[[2]]
mnist49 = mnist[[3]]
mnist49.t = mnist[[4]]
mnistoe = mnist[[5]]
mnistoe.t = mnist[[6]]

######## Repeat Length
# Number of repeated fits per data set; passed as `rep.len` to the repeat*
# functions below.
rep.len = 10

########################
######## Cluster SVM
########################

# Estimate the validation score of SwarmSVM::clusterSVM by k-fold
# cross-validation.
#
# Arguments:
#   x      Matrix-like feature object with one observation per row.
#   y      Labels, one entry per row of `x`.
#   nfold  Number of folds; a single whole number between 2 and nrow(x).
#   ...    Further arguments forwarded unchanged to SwarmSVM::clusterSVM
#          (for example lambda, centers, cluster.method).
#
# Returns the mean over folds of the `valid.score` element of the fitted
# clusterSVM objects.
#
# The split is drawn with sample(), so call set.seed() beforehand for a
# reproducible result. For a given seed the folds are the same as those of the
# previous implementation: the first nfold - 1 folds each take
# nrow(x) %/% nfold shuffled indices and the last fold takes the remainder.
clusterSVM.cv = function(x, y, nfold = 5, ...) {
  n = nrow(x)
  if (is.null(n)) {
    stop("'x' must be a matrix-like object with rows", call. = FALSE)
  }
  # nfold = 1 used to fail with a cryptic indexing error and nfold > n used to
  # produce empty folds; reject both up front with a clear message.
  if (length(nfold) != 1 || is.na(nfold) || nfold != round(nfold) ||
      nfold < 2 || nfold > n) {
    stop("'nfold' must be a single whole number between 2 and nrow(x)",
         call. = FALSE)
  }
  if (length(y) != n) {
    stop("'y' must have one entry per row of 'x'", call. = FALSE)
  }

  ind = sample(n)
  fold.size = n %/% nfold
  # Label each position of the shuffled index vector with its fold number.
  fold.id = c(rep(seq_len(nfold - 1), each = fold.size),
              rep(nfold, n - (nfold - 1) * fold.size))
  folds = unname(split(ind, fold.id))

  score = numeric(nfold)
  for (i in seq_len(nfold)) {
    test.ind = folds[[i]]
    train.ind = setdiff(seq_len(n), test.ind)
    # drop = FALSE keeps a one-row subset a matrix instead of a bare vector.
    csvm.obj = SwarmSVM::clusterSVM(x = x[train.ind, , drop = FALSE],
                                    y = y[train.ind],
                                    valid.x = x[test.ind, , drop = FALSE],
                                    valid.y = y[test.ind],
                                    ...)
    score[i] = csvm.obj$valid.score
  }
  return(mean(score))
}

# Benchmark SwarmSVM::clusterSVM on one training / validation pair.
#
# Arguments:
#   train    Training data; coerced with as.matrix(). Column 1 is used as the
#            label and the remaining columns as features.
#   valid    Validation data with the same column layout as `train`.
#   rep.len  Number of repeated fits used for the summary statistics.
#
# Steps:
#   1. After set.seed(1024), lambda is chosen from c(1, 5, 10, 20, 50, 100) by
#      5-fold cross-validation (clusterSVM.cv) on the training data. The
#      lambda with the largest mean score wins; ties keep the earliest value.
#   2. The model is refitted `rep.len` times with seed = 1, ..., rep.len and
#      scored on `valid`.
#
# Returns a named numeric vector rounded to 7 decimals: mean and standard
# deviation of `valid.score`, mean and standard deviation of
# `time$total.time`, and the selected lambda.
# NOTE(review): the first element is labelled 'Average Error' but holds the
# mean `valid.score`, which is maximised during tuning -- presumably an
# accuracy. The label is kept unchanged so code indexing by name still works.
repeatClusterSVM = function(train, valid, rep.len = 10) {
  train = as.matrix(train)
  valid = as.matrix(valid)
  # drop = FALSE keeps the features a matrix even with one feature column.
  train.x = train[, -1, drop = FALSE]
  train.y = train[, 1]
  valid.x = valid[, -1, drop = FALSE]
  valid.y = valid[, 1]

  best.score = -Inf
  best.lambda = NA_real_
  set.seed(1024)
  for (lmd in c(1, 5, 10, 20, 50, 100)) {
    temp.score = clusterSVM.cv(x = train.x, y = train.y, lambda = lmd,
                               centers = 8, iter.max = 1000, verbose = 0,
                               cluster.method = "mlKmeans")
    if (temp.score > best.score) {
      best.score = temp.score
      best.lambda = lmd
    }
  }

  score = numeric(rep.len)
  total.time = numeric(rep.len)
  # seq_len() yields no iterations for rep.len = 0, unlike 1:rep.len.
  for (i in seq_len(rep.len)) {
    csvm.obj = SwarmSVM::clusterSVM(x = train.x, y = train.y, seed = i,
                                    valid.x = valid.x, valid.y = valid.y,
                                    centers = 8, iter.max = 1000, verbose = 0,
                                    lambda = best.lambda,
                                    cluster.method = "mlKmeans")
    score[i] = csvm.obj$valid.score
    total.time[i] = csvm.obj$time$total.time
  }
  # Explicit garbage collection once the repeated fits are done.
  gc()
  result = c(mean(score), sd(score), mean(total.time), sd(total.time), best.lambda)
  names(result) = c('Average Error', 'Standard Deviation',
                    'Average Time', 'Standard Deviation', 'Best Lambda')
  result = round(result, 7)
  return(result)
}

# Run the Cluster SVM benchmark on every data set. The commented lines under
# each call appear to be the output recorded from an earlier run (the named
# vector returned by repeatClusterSVM); they are not produced or checked by
# this script.
repeatClusterSVM(svmguide1, svmguide1.t, rep.len)
# Average Error Standard Deviation       Average Time Standard Deviation        Best Lambda 
# 0.8143000          0.0091225          0.1589000          0.0143562         20.0000000 

repeatClusterSVM(ijcnn1, ijcnn1.t, rep.len)
# Average Error Standard Deviation       Average Time Standard Deviation        Best Lambda 
# 0.9444030          0.0022965          3.6295000          0.1568306          1.0000000 

repeatClusterSVM(usps, usps.t, rep.len)
# Average Error Standard Deviation       Average Time Standard Deviation        Best Lambda 
# 0.9552566          0.0011697          2.2441000          0.2853292          1.0000000 

repeatClusterSVM(mnist38, mnist38.t, rep.len)
# Average Error Standard Deviation       Average Time Standard Deviation        Best Lambda 
# 0.9855847          0.0012162         10.4385000          1.9813294          5.0000000 

repeatClusterSVM(mnist49, mnist49.t, rep.len)
# Average Error Standard Deviation       Average Time Standard Deviation        Best Lambda 
# 0.980663           0.001575          12.641600           1.430985           1.000000 

repeatClusterSVM(mnistoe, mnistoe.t, rep.len)
# Average Error Standard Deviation       Average Time Standard Deviation        Best Lambda 
# 0.960820           0.000319          61.526800           9.283494         100.000000


########################
######## LibLinear
########################

# Number of repeated fits passed to repeatLiblineaR below (same value as the
# one assigned in the data preparation section).
rep.len = 10

# Benchmark a linear SVM (LiblineaR, type = 1) on one training / validation
# pair.
#
# Arguments:
#   train    Training data; coerced with as.matrix(). Column 1 is used as the
#            label and the remaining columns as features.
#   valid    Validation data with the same column layout as `train`.
#   rep.len  Number of repeated fits used for the summary statistics.
#
# Steps:
#   1. After set.seed(1024), the cost is chosen from c(0.01, 0.1, 1, 10, 100)
#      using LiblineaR's built-in 5-fold cross-validation (cross = 5). The
#      cost with the largest returned value wins; ties keep the earliest cost.
#   2. The model is refitted `rep.len` times (set.seed(i) before each fit) and
#      scored on `valid` as the fraction of correctly predicted labels. The
#      recorded elapsed time covers both the fit and the prediction.
#
# Returns a named numeric vector rounded to 7 decimals: mean and standard
# deviation of the validation score, mean and standard deviation of the
# elapsed time, and the selected cost.
# NOTE(review): the first element is labelled 'Average Error' although it is
# the mean fraction of correct predictions. The label is kept unchanged so
# code indexing by name still works.
repeatLiblineaR = function(train, valid, rep.len = 10) {
  train = as.matrix(train)
  valid = as.matrix(valid)
  # drop = FALSE keeps the features a matrix even with one feature column.
  train.x = train[, -1, drop = FALSE]
  train.y = train[, 1]
  valid.x = valid[, -1, drop = FALSE]
  valid.y = valid[, 1]

  best.score = -Inf
  best.cost = NA_real_
  set.seed(1024)
  for (cst in c(0.01, 0.1, 1, 10, 100)) {
    # With cross > 0, LiblineaR returns the cross-validated score, not a model.
    temp.score = LiblineaR::LiblineaR(data = train.x, target = train.y,
                                      type = 1, verbose = FALSE, cost = cst,
                                      cross = 5)
    if (temp.score > best.score) {
      best.score = temp.score
      best.cost = cst
    }
  }

  score = numeric(rep.len)
  total.time = numeric(rep.len)
  # seq_len() yields no iterations for rep.len = 0, unlike 1:rep.len.
  for (i in seq_len(rep.len)) {
    set.seed(i)
    time.stamp = proc.time()
    liblinear.obj = LiblineaR::LiblineaR(data = train.x, target = train.y,
                                         type = 1, verbose = FALSE,
                                         cost = best.cost)
    preds = predict(liblinear.obj, valid.x)$prediction
    score[i] = sum(preds == valid.y) / length(valid.y)
    # Element 3 of a proc.time() difference is the elapsed time.
    total.time[i] = (proc.time() - time.stamp)[3]
  }
  result = c(mean(score), sd(score), mean(total.time), sd(total.time), best.cost)
  names(result) = c('Average Error', 'Standard Deviation',
                    'Average Time', 'Standard Deviation', 'Best Cost')
  result = round(result, 7)
  return(result)
}

# Run the LiblineaR benchmark on every data set. The commented lines under
# each call appear to be the output recorded from an earlier run (the named
# vector returned by repeatLiblineaR); they are not produced or checked by
# this script.
repeatLiblineaR(svmguide1, svmguide1.t, rep.len)
# Average Error Standard Deviation       Average Time Standard Deviation          Best Cost 
# 0.8007000          0.0031265          3.6246000          0.0455417        100.0000000 

repeatLiblineaR(ijcnn1, ijcnn1.t, rep.len)
# Average Error Standard Deviation       Average Time Standard Deviation          Best Cost 
# 0.9210532          0.0008342         51.0651000          0.8289324        100.0000000 

repeatLiblineaR(usps, usps.t, rep.len)
# Average Error Standard Deviation       Average Time Standard Deviation          Best Cost 
# 0.9377180          0.0000000          2.2539000          0.3196156         10.0000000 

repeatLiblineaR(mnist38, mnist38.t, rep.len)
# Average Error Standard Deviation       Average Time Standard Deviation          Best Cost 
# 0.7561492          0.0002125          0.7620000          0.0664011          1.0000000

repeatLiblineaR(mnist49, mnist49.t, rep.len)
# Average Error Standard Deviation       Average Time Standard Deviation          Best Cost 
# 0.9432446          0.0000000          0.7340000          0.0776645          1.0000000 

repeatLiblineaR(mnistoe, mnistoe.t, rep.len)
# Average Error Standard Deviation       Average Time Standard Deviation          Best Cost 
# 0.9021700          0.0000823         27.9471000          2.8855612         10.0000000 


########################
######## Kernel SVM
########################

# Estimate the classification accuracy of e1071::svm by k-fold
# cross-validation.
#
# Arguments:
#   x      Matrix-like feature object with one observation per row.
#   y      Labels, one entry per row of `x`; converted with as.factor() for
#          fitting.
#   nfold  Number of folds; a single whole number between 2 and nrow(x).
#   ...    Further arguments forwarded unchanged to e1071::svm (for example
#          kernel, gamma, cost).
#
# Returns the mean over folds of the fraction of held-out observations whose
# predicted label equals the true label.
#
# The split is drawn with sample(), so call set.seed() beforehand for a
# reproducible result. For a given seed the folds are the same as those of the
# previous implementation: the first nfold - 1 folds each take
# nrow(x) %/% nfold shuffled indices and the last fold takes the remainder.
svm.cv = function(x, y, nfold = 5, ...) {
  n = nrow(x)
  if (is.null(n)) {
    stop("'x' must be a matrix-like object with rows", call. = FALSE)
  }
  # nfold = 1 used to fail with a cryptic indexing error and nfold > n used to
  # produce empty folds; reject both up front with a clear message.
  if (length(nfold) != 1 || is.na(nfold) || nfold != round(nfold) ||
      nfold < 2 || nfold > n) {
    stop("'nfold' must be a single whole number between 2 and nrow(x)",
         call. = FALSE)
  }
  if (length(y) != n) {
    stop("'y' must have one entry per row of 'x'", call. = FALSE)
  }

  ind = sample(n)
  fold.size = n %/% nfold
  # Label each position of the shuffled index vector with its fold number.
  fold.id = c(rep(seq_len(nfold - 1), each = fold.size),
              rep(nfold, n - (nfold - 1) * fold.size))
  folds = unname(split(ind, fold.id))

  score = numeric(nfold)
  for (i in seq_len(nfold)) {
    test.ind = folds[[i]]
    train.ind = setdiff(seq_len(n), test.ind)
    # drop = FALSE keeps a one-row subset a matrix instead of a bare vector.
    svm.obj = e1071::svm(x = x[train.ind, , drop = FALSE],
                         y = as.factor(y[train.ind]), ...)
    preds = predict(svm.obj, x[test.ind, , drop = FALSE], probability = FALSE)
    # `preds` is a factor; `==` compares its level labels with the raw labels.
    score[i] = sum(preds == y[test.ind]) / length(y[test.ind])
  }
  return(mean(score))
}

# Number of repeated fits passed to repeatSVM below (same value as the one
# assigned in the data preparation section).
rep.len = 10

# Benchmark an RBF-kernel SVM (e1071::svm) on one training / validation pair.
#
# Arguments:
#   train    Training data. Column 1 is used as the label and the remaining
#            columns as features.
#   valid    Validation data with the same column layout as `train`.
#   rep.len  Number of repeated fits used for the summary statistics.
#
# Steps:
#   1. After set.seed(1024), gamma and cost are each searched over
#      c(0.01, 0.1, 1, 10, 100) with 5-fold cross-validation (svm.cv). A
#      progress line with the elapsed time and score is printed for every
#      pair. The pair with the largest score wins; ties keep the earliest.
#   2. The model is refitted `rep.len` times (set.seed(i) before each fit) and
#      scored on `valid` as the fraction of correctly predicted labels. The
#      recorded elapsed time covers both the fit and the prediction.
#
# Returns a named numeric vector rounded to 7 decimals: mean and standard
# deviation of the validation score, mean and standard deviation of the
# elapsed time, and the selected gamma and cost.
# NOTE(review): the first element is labelled 'Average Error' although it is
# the mean fraction of correct predictions. The label is kept unchanged so
# code indexing by name still works.
repeatSVM = function(train, valid, rep.len = 10) {
  # drop = FALSE keeps the features two-dimensional with one feature column.
  train.x = train[, -1, drop = FALSE]
  train.y = train[, 1]
  valid.x = valid[, -1, drop = FALSE]
  valid.y = valid[, 1]

  best.score = -Inf
  best.gamma = NA_real_
  best.cost = NA_real_
  set.seed(1024)
  for (gm in c(0.01, 0.1, 1, 10, 100)) {
    for (cst in c(0.01, 0.1, 1, 10, 100)) {
      cat('Begin cv on',gm,'\t',cst)
      tp = proc.time()
      temp.score = svm.cv(x = train.x, y = train.y, nfold = 5,
                          gamma = gm, cost = cst, kernel = "radial")
      cat('\t\tTime:',(proc.time()-tp)[3],'\t\tScore:',temp.score,'\n')
      if (temp.score > best.score) {
        best.score = temp.score
        best.gamma = gm
        best.cost = cst
      }
    }
  }

  score = numeric(rep.len)
  total.time = numeric(rep.len)
  # seq_len() yields no iterations for rep.len = 0, unlike 1:rep.len.
  for (i in seq_len(rep.len)) {
    set.seed(i)
    time.stamp = proc.time()
    svm.obj = e1071::svm(x = train.x, y = as.factor(train.y),
                         kernel = "radial", gamma = best.gamma, cost = best.cost)
    preds = predict(svm.obj, valid.x)
    # `preds` is a factor; `==` compares its level labels with the raw labels,
    # so no numeric recoding is needed. The former misspelled, unused
    # `presd = as.numeric(preds) - 1` assignment was removed.
    score[i] = sum(preds == valid.y) / length(valid.y)
    # Element 3 of a proc.time() difference is the elapsed time.
    total.time[i] = (proc.time() - time.stamp)[3]
  }
  result = c(mean(score), sd(score), mean(total.time), sd(total.time), best.gamma, best.cost)
  names(result) = c('Average Error', 'Standard Deviation',
                    'Average Time', 'Standard Deviation', 'Best Gamma', 'Best Cost')
  result = round(result, 7)
  return(result)
}

# Run the kernel SVM benchmark on every data set. The commented lines under
# each call appear to be the output recorded from an earlier run (the named
# vector returned by repeatSVM); they are not produced or checked by this
# script. Each call tunes over a 5 x 5 grid of gamma and cost values with
# 5-fold cross-validation before the repeated fits.
repeatSVM(svmguide1, svmguide1.t, rep.len)
# Average Error Standard Deviation       Average Time Standard Deviation         Best Gamma          Best Cost 
# 8.7875e-01         0.0000e+00         8.5260e-01         2.5906e-03         1.0000e+01         1.0000e+02 

repeatSVM(ijcnn1, ijcnn1.t, rep.len)
# Average Error Standard Deviation       Average Time Standard Deviation         Best Gamma          Best Cost 
# 0.9903164          0.0000000        131.4755000          2.2816740         10.0000000         10.0000000 

repeatSVM(usps, usps.t, rep.len)
# Average Error Standard Deviation       Average Time Standard Deviation         Best Gamma          Best Cost 
# 0.9706029          0.0000000          9.3162000          0.0553530          1.0000000        100.0000000  

repeatSVM(mnist38, mnist38.t, rep.len)
# Average Error Standard Deviation       Average Time Standard Deviation         Best Gamma          Best Cost 
# 0.9949597          0.0000000         77.8332000          0.3907411          1.0000000         10.0000000

repeatSVM(mnist49, mnist49.t, rep.len)
# Average Error Standard Deviation       Average Time Standard Deviation         Best Gamma          Best Cost 
# 0.9929684          0.0000000         57.6431000          0.1407752          1.0000000        100.0000000 

# No result was recorded for this data set (see the note below).
repeatSVM(mnistoe, mnistoe.t, rep.len)
# Not finished
# It is too long to tune the parameters

# Try the SwarmSVM package in your browser
#
# Any scripts or data that you put into this service are public.
#
# SwarmSVM documentation built on Dec. 28, 2022, 1:24 a.m.