# data-raw/datasets.R

# bodyfat (gaussian) ------------------------------------------------------

# Download the LIBSVM "bodyfat" regression file to a scratch location, parse
# it, and delete the scratch copy once it has been read.
bodyfat_url <-
  "https://www.csie.ntu.edu.tw/~cjlin/libsvmtools/datasets/regression/bodyfat"
bodyfat_path <- tempfile()
download.file(bodyfat_url, bodyfat_path)

# NOTE(review): unlike the later sections, SparseM is not attached before this
# point; confirm that as.matrix() dispatches as intended on the parsed object
# when this section is run on its own.
bodyfat_raw <- e1071::read.matrix.csr(bodyfat_path, fac = FALSE)
unlink(bodyfat_path)

# Names for the 14 feature columns of the parsed file.
# NOTE(review): "foream" looks like a misspelling of "forearm"; it is kept
# unchanged because it is the column name stored in the dataset.
bodyfat_vars <- c(
  "siri_1956",
  "age",
  "weight",
  "height",
  "neck",
  "chest",
  "abdomen",
  "hip",
  "thigh",
  "knee",
  "ankle",
  "biceps",
  "foream",
  "wrist"
)

bodyfat_df <- as.data.frame(as.matrix(bodyfat_raw$x))
names(bodyfat_df) <- bodyfat_vars

# The first column (the Siri 1956 equation) is the response; the remaining
# columns are the predictors.
bodyfat <- list(x = bodyfat_df[, -1], y = bodyfat_df[, 1])

usethis::use_data(bodyfat, overwrite = TRUE)

# abalone (poisson) -------------------------------------------------------

library(SparseM)
library(Matrix)
library(caret)

# Download the LIBSVM "abalone" regression file, parse it, and remove the
# downloaded copy once it has been read.
temp_file <- tempfile()

download.file(
  "https://www.csie.ntu.edu.tw/~cjlin/libsvmtools/datasets/regression/abalone",
  temp_file
)

tmp <- e1071::read.matrix.csr(temp_file, fac = FALSE)
unlink(temp_file)

# The first feature is treated as categorical; model.matrix() expands it into
# indicator columns, and the intercept column is dropped afterwards.
tmp_x <- as.data.frame(as.matrix(tmp$x))
tmp_x$V1 <- as.factor(tmp_x$V1)

tmp_x <- as.data.frame(model.matrix(~ ., tmp_x))[, -1]

# "sex" and "infant" are the two indicator columns generated from V1; the
# remaining names label the other features in their original order.
colnames(tmp_x) <- c("sex",
                     "infant",
                     "length",
                     "diameter",
                     "height",
                     "weight_whole",
                     "weight_shucked",
                     "weight_viscera",
                     "weight_shell")

# randomly select a subset (5%) of the rows; the seed is fixed so that
# re-running this script regenerates the same dataset
set.seed(1)
part <- caret::createDataPartition(tmp$y, p = 0.05, list = FALSE)

abalone <- list(x = tmp_x[part, ], y = tmp$y[part])

usethis::use_data(abalone, overwrite = TRUE)

# heart (binomial) --------------------------------------------------------

library(SparseM)
library(Matrix)
library(fastDummies)
library(tidyverse)

# Download the LIBSVM "heart" binary-classification file and parse it.
temp_file <- tempfile()

download.file(
  "https://www.csie.ntu.edu.tw/~cjlin/libsvmtools/datasets/binary/heart",
  temp_file
)

tmp <- e1071::read.matrix.csr(temp_file, fac = FALSE)
# remove the downloaded copy once parsed, as the bodyfat and abalone sections do
unlink(temp_file)

x <- as.data.frame(as.matrix(tmp$x))
colnames(x) <- c("age",
                 "sex",
                 "chest_pain",
                 "bp",
                 "chol",
                 "glucose",
                 "ecg",
                 "hr",
                 "angina",
                 "old_peak",
                 "slope",
                 "vessels",
                 "thal")

# Recode the categorical features as factors. `cp` is added next to the
# numeric `chest_pain` column (which is dropped by select() below); listing
# code 4 first makes it the reference level of `cp`.
x2 <- x %>%
  mutate(sex = factor(sex, labels = c("male", "female")),
         cp = factor(
           chest_pain,
           levels = c(4, 1, 2, 3),
           labels = c("asymtompatic", "typical", "atypical", "nonanginal")
         ),
         ecg = factor(ecg, labels = c("normal", "abnormal", "estes")),
         angina = as.factor(angina),
         glucose = factor(glucose, labels = c("low", "high")),
         slope = factor(slope,
                        levels = c(1, 2, 3),
                        labels = c("upsloping", "flat", "downsloping")),
         thal = factor(thal, labels = c("normal", "fixed", "reversible")))

# Expand the factors into indicator columns, then keep and rename the columns
# that make up the final design matrix; the intercept and the numeric
# `chest_pain` column are not selected and are therefore dropped.
x3 <- model.matrix(~ ., x2) %>%
  as.data.frame() %>%
  select(age,
         bp,
         chol,
         hr,
         old_peak,
         vessels,
         sex = sexfemale,
         angina = angina1,
         glucose_high = glucosehigh,
         cp_typical = cptypical,
         cp_atypical = cpatypical,
         cp_nonanginal = cpnonanginal,
         ecg_abnormal = ecgabnormal,
         ecg_estes = ecgestes,
         slope_flat = slopeflat,
         slope_downsloping = slopedownsloping,
         thal_fixed = thalfixed,
         thal_reversible = thalreversible)

# store the design matrix in sparse format
x4 <- Matrix::Matrix(as.matrix(x3), sparse = TRUE)

# response
y <- factor(tmp$y, labels = c("absence", "presence"))

heart <- list(x = x4, y = y)

usethis::use_data(heart, overwrite = TRUE)

# wine (multiclass) -------------------------------------------------------

library(Matrix)
library(SparseM)

# Download the wine CSV, read it, and remove the downloaded copy afterwards.
temp_file <- tempfile(fileext = ".csv")

download.file(
  "https://raw.githubusercontent.com/hadley/rminds/master/1-data/wine.csv",
  temp_file
)

tmp <- read.csv(temp_file)
# remove the downloaded copy once read, as the bodyfat and abalone sections do
unlink(temp_file)

# The first column is used as the (factor) response; every other column goes
# into the feature matrix.
x <- as.matrix(tmp[, -1])
y <- as.factor(tmp[, 1])

wine <- list(x = x, y = y)

usethis::use_data(wine, overwrite = TRUE)


# Student (multi-task) ----------------------------------------------------

# Download the UCI student-performance archive and unpack it into its own
# scratch directory so that everything extracted can be removed again.
tmp_file <- tempfile(fileext = ".zip")
tmp_dir <- tempfile("student_")
dir.create(tmp_dir)

# mode = "wb": the archive is binary and must not be written in text mode
download.file(
  "https://archive.ics.uci.edu/ml/machine-learning-databases/00320/student.zip",
  tmp_file,
  mode = "wb"
)

unzip(tmp_file, exdir = tmp_dir)

# stringsAsFactors = TRUE is set explicitly: relevel() below requires a
# factor, and read.table() only returns character columns as factors by
# default in R versions before 4.0.0.
d1 <- read.table(
  file.path(tmp_dir, "student-mat.csv"),
  sep = ";",
  header = TRUE,
  stringsAsFactors = TRUE
)
d2 <- read.table(
  file.path(tmp_dir, "student-por.csv"),
  sep = ";",
  header = TRUE,
  stringsAsFactors = TRUE
)

# both the archive and the extracted files are no longer needed
unlink(tmp_file)
unlink(tmp_dir, recursive = TRUE)

# Columns used both to match rows across the two files and as the predictors.
student_keys <- c("school",
                  "sex",
                  "age",
                  "address",
                  "famsize",
                  "Pstatus",
                  "Medu",
                  "Fedu",
                  "Mjob",
                  "Fjob",
                  "reason",
                  "nursery",
                  "internet")

d3 <- merge(d1, d2, by = student_keys, suffixes = c("_math", "_port"))

# two responses: the G3 column from each of the two files
y <- with(d3, cbind(G3_math, G3_port))

x1 <- d3[, student_keys, drop = FALSE]

# make "LE3" the reference level so the generated indicator marks the other
# family-size level
x1$famsize <- relevel(x1$famsize, ref = "LE3")

# expand factors into indicator columns and drop the intercept
x2 <- model.matrix(~ ., x1)[, -1]

# NOTE(review): "nusery" and "portugese" look like misspellings of "nursery"
# and "portuguese"; they are kept unchanged because they are the column names
# stored in the dataset.
colnames(x2) <- c("school_ms",
                  "sex",
                  "age",
                  "urban",
                  "large_family",
                  "cohabitation",
                  "Medu",
                  "Fedu",
                  "Mjob_health",
                  "Mjob_other",
                  "Mjob_services",
                  "Mjob_teacher",
                  "Fjob_health",
                  "Fjob_other",
                  "Fjob_services",
                  "Fjob_teacher",
                  "reason_home",
                  "reason_other",
                  "reason_rep",
                  "nusery",
                  "internet")
colnames(y) <- c("math", "portugese")

student <- list(x = x2, y = y)

usethis::use_data(student, overwrite = TRUE)
# jolars/prague documentation built on March 4, 2020, 7:13 p.m.