# File: R/zzz.R
#
# Defines functions: .onLoad

# Setting up the spam checker SVM model
#' @import caret
#' @import kernlab
#' @importFrom utils read.csv
#' @importFrom stats na.fail

.onLoad <- function(libname, pkgname) {
  # Package load hook: fits the spam-classification SVM when the package loads.
  #
  # Reads the bundled `extdata/spambase.csv` and `extdata/names.csv`, labels
  # the data columns from the first column of the names file, draws a random
  # sample of at most 1000 rows, and trains a radial-kernel SVM with caret on
  # an 80% partition of that sample.
  #
  # Arguments:
  #   libname  Library directory supplied by R's load mechanism (unused here).
  #   pkgname  Package name; used to locate the bundled data files.
  #
  # Side effects: `headers` and `svmModel` are assigned with `<<-`, so they
  # are written to an enclosing environment rather than to local variables.
  # NOTE(review): presumably other functions of the package read these two
  # names -- confirm before changing how they are stored.

  # Fail immediately with a clear error if a bundled file is missing, instead
  # of passing on the empty string that system.file() returns by default.
  data_path <- system.file("extdata", "spambase.csv",
                           package = pkgname, mustWork = TRUE)
  names_path <- system.file("extdata", "names.csv",
                            package = pkgname, mustWork = TRUE)

  spambase <- read.csv(data_path, header = FALSE)
  header_table <- read.csv(names_path, header = FALSE)
  headers <<- header_table

  # The first column of the names file holds one column name per row.
  colnames(spambase) <- as.character(header_table[[1]])

  # Cleanup steps: turn the response column `y` into a two-level factor.
  spambase$y <- as.factor(spambase$y)
  levels(spambase$y) <- c("ham", "spam")

  # Work on a random subset; never request more rows than are available.
  sample_size <- min(1000L, nrow(spambase))
  spambaseSample <- spambase[sample(nrow(spambase), sample_size), ]

  trainIndex <- caret::createDataPartition(spambaseSample$y, p = .8,
                                           list = FALSE, times = 1)
  dataTrain <- spambaseSample[trainIndex, ]

  # Estimate a value for the kernel width (sigma) from the training data.
  sigDist <- kernlab::sigest(y ~ ., data = dataTrain, frac = 1)

  # Tuning grid: sigma is fixed to the first value returned by sigest(), and
  # the cost parameter C is searched over 2^-2 ... 2^7.
  svmTuneGrid <- data.frame(.sigma = sigDist[1], .C = 2^(-2:7))

  svmModel <<- caret::train(
    y ~ .,
    data = dataTrain,
    method = "svmRadial",
    preProc = c("center", "scale"),
    tuneGrid = svmTuneGrid,
    trControl = caret::trainControl(method = "repeatedcv", repeats = 5,
                                    classProbs = TRUE)
  )
}
# megahf/spamfilter documentation built on May 29, 2019, 4:42 a.m.