# inst/doc/discoverYourData.R

## ----libLoading, results='hold', message=F, warning=F-------------------------
# Attach the packages this script relies on. library() stops with an error
# when a package is missing, whereas require() only returns FALSE and lets
# the script run on until a later, less obvious failure.
library(xgboost)
library(Matrix)
library(data.table)

# vcd is installed on demand. It is attached explicitly afterwards so that it
# is on the search path even when it had to be installed during this run.
if (!requireNamespace("vcd", quietly = TRUE)) {
  install.packages("vcd")
}
library(vcd)

# Cap data.table at two threads.
data.table::setDTthreads(2)

## ----results='hide'-----------------------------------------------------------
# Load the Arthritis data set and copy it into a data.table without keeping
# the row names as a column.
data("Arthritis")
df <- data.table::data.table(Arthritis, keep.rownames = FALSE)

## -----------------------------------------------------------------------------
# First rows of the table.
head(df, n = 6L)

## -----------------------------------------------------------------------------
# Column types and a preview of the values.
str(df)

## -----------------------------------------------------------------------------
# Add AgeDiscret by reference: Age divided by 10, rounded to a whole number,
# stored as a factor.
head(
  df[, AgeDiscret := as.factor(round(Age / 10))]
)

## -----------------------------------------------------------------------------
# Add AgeCat by reference: "Old" when Age is above 30, "Young" otherwise.
head(
  df[, AgeCat := as.factor(ifelse(Age > 30, "Old", "Young"))]
)

## ----results='hide'-----------------------------------------------------------
# Remove the ID column by reference.
df[, ID := NULL]

## -----------------------------------------------------------------------------
# Levels of the Treatment factor.
levels(df[["Treatment"]])

## ----warning=FALSE,message=FALSE----------------------------------------------
# Build a sparse design matrix from every column except Improved, then drop
# the first column of the result.
sparse_matrix <- sparse.model.matrix(
  Improved ~ .,
  data = df
)[, -1]
head(sparse_matrix)

## -----------------------------------------------------------------------------
# Logical label: TRUE where Improved equals "Marked".
output_vector <- df[["Improved"]] == "Marked"

## -----------------------------------------------------------------------------
# Fit a binary logistic model on the sparse matrix: 10 rounds, trees of depth
# at most 4, learning rate 1, two threads.
bst <- xgboost(
  data = sparse_matrix,
  label = output_vector,
  objective = "binary:logistic",
  nrounds = 10,
  max_depth = 4,
  eta = 1,
  nthread = 2
)


## -----------------------------------------------------------------------------
# Feature importance table for the fitted model, labelled with the matrix
# column names.
importance <- xgb.importance(
  model = bst,
  feature_names = colnames(sparse_matrix)
)
head(importance)

## -----------------------------------------------------------------------------
# Importance table for the fitted model. The `data` and `label` arguments of
# xgb.importance() are deprecated (they only trigger a warning), so they are
# not passed.
importanceRaw <- xgb.importance(
  feature_names = colnames(sparse_matrix),
  model = bst
)

# Cleaning for better display.
# `:=` modifies a data.table by reference, so work on a copy: importanceRaw
# keeps all of its columns and importanceClean is an independent table without
# Cover and Frequency.
importanceClean <- copy(importanceRaw)[, `:=`(Cover = NULL, Frequency = NULL)]

head(importanceClean)

## ----fig.width=8, fig.height=5, fig.align='center'----------------------------
# Plot the importance table computed above.
xgb.plot.importance(importance)

## ----warning=FALSE, message=FALSE---------------------------------------------
# Chi-squared test between the raw Age column and the outcome vector.
c2 <- chisq.test(df$Age, output_vector)
print(c2)

## ----warning=FALSE, message=FALSE---------------------------------------------
# Same test with AgeDiscret (Age divided by 10 and rounded, as a factor).
c2 <- chisq.test(df$AgeDiscret, output_vector)
print(c2)

## ----warning=FALSE, message=FALSE---------------------------------------------
# Same test with AgeCat ("Old" when Age > 30, "Young" otherwise).
c2 <- chisq.test(df$AgeCat, output_vector)
print(c2)

## ----warning=FALSE, message=FALSE---------------------------------------------
# Load the agaricus train/test sets from the xgboost package.
data("agaricus.train", package = "xgboost")
data("agaricus.test", package = "xgboost")
train <- agaricus.train
test <- agaricus.test

# Random forest: a single round that grows 1000 parallel trees, with
# subsample and colsample_bytree both set to 0.5.
bst <- xgboost(
  data = train$data,
  label = train$label,
  objective = "binary:logistic",
  nrounds = 1,
  num_parallel_tree = 1000,
  max_depth = 4,
  subsample = 0.5,
  colsample_bytree = 0.5,
  nthread = 2
)

# Boosting: three rounds. This overwrites the `bst` fitted just above.
bst <- xgboost(
  data = train$data,
  label = train$label,
  objective = "binary:logistic",
  nrounds = 3,
  max_depth = 4,
  nthread = 2
)

# Try the xgboost package in your browser

# Any scripts or data that you put into this service are public.

# xgboost documentation built on May 29, 2024, 5:11 a.m.