#' This script won't be part of the build package.
#' Its purpose is to transfer the source data to an R data.frame.
#'
#' The original dataset needed to be adjusted beforehand because it contained REAL and Integer
#' categories for describing the dimensions.
#'
library(foreign)
# Read the adjusted ARFF export of the spambase dataset into a data.frame.
spambase <- read.arff(file = "data-raw/dataset_44_spambase_adjusted.arff")

# Duplicated rows are deliberately kept. Removing them would weight all
# observations equally, but the clustering quality is higher with the
# duplicates included, which suggests they have a positive impact on the
# cluster analysis. To drop them anyway, enable the following line:
# spambase <- spambase[!duplicated(spambase), ]

# Restore the integer datatype of the two run-length columns, which is lost
# while reading the file.
for (int_col in c("capital_run_length_longest", "capital_run_length_total")) {
  spambase[[int_col]] <- as.integer(spambase[[int_col]])
}

# Short names for the 57 feature columns, in file order; any columns after
# these keep the names they were read with.
# NOTE(review): "adresses" looks like a misspelling of "addresses"; it is left
# unchanged because code using the dataset may refer to the column by this name.
feature_names <- c(
  "make", "address", "all", "3d", "our", "over", "remove", "internet",
  "order", "mail", "receive", "will", "people", "report", "adresses",
  "free", "business", "email", "you", "credit", "your", "font", "000",
  "money", "hp", "hpl", "george", "650", "lab", "labs", "telnet", "857",
  "data", "415", "85", "technology", "1999", "parts", "pm", "direct",
  "cs", "meeting", "original", "project", "re", "edu", "table",
  "conference", ";", "(", "[", "!", "$", "#", "CAP_avg", "CAP_longest",
  "CAP_total"
)
names(spambase)[seq_along(feature_names)] <- feature_names

# Write the data.frame to file for the package, replacing any earlier version.
# NOTE(review): newer devtools releases appear to have moved use_data() to the
# usethis package -- confirm against the installed devtools version.
devtools::use_data(spambase, overwrite = TRUE)
# Add the following code to your website.
# For more information on customizing the embed code, read Embedding Snippets.