``` {r eval = FALSE}
library(ff)
library(bit)
library(xgboost)
library(dplyr)
library(kgRainPredictR)

ffload("data/train", overwrite = TRUE, rootpath = "data/")

N <- nrow(train)
b_na <- bit(N)
b_out <- bit(N)
# Flag missing reflectivity and outlier rainfall values, one chunk of rows at a time
for (i in chunk(1, N, 10000)) {
  b_na[i] <- is.na(train$Ref[i])
  b_out[i] <- (train$Expected[i] > 69)
}
df_tr <- as.data.frame(train[, c("Id", "minutes_past", "Ref", "Expected")]) %>%
  filter(Expected <= 69) # remove outliers
```
``` {r eval=FALSE}
# Collapse by Id and create summary statistics
df_tr <- group_by(df_tr, Id) %>%
  summarize(ref_mean = mean(time_difference(minutes_past) * interpolate(Ref), na.rm = TRUE),
            Expected = max(Expected))
# Split the training Ids into chunks of manageable size
ids <- unique(df_tr$Id)
n_ids <- length(ids)
chunk_size <- 10000
num_chunks <- ceiling(n_ids / chunk_size)
dat <- list()
for (i in 1:num_chunks) {
  # Ids belonging to the i-th chunk
  chunk_ids <- ids[((i - 1) * chunk_size + 1):min(i * chunk_size, n_ids)]
  # Flag the rows of the ff training data whose Id falls in this chunk
  b <- bit(N)
  for (j in chunk(1, N, 10000)) b[j] <- train$Id[j] %in% chunk_ids
  # Record the chunk assignment on the collapsed training frame
  df_tr[df_tr$Id %in% chunk_ids, "chunkId"] <- i
  dat[[i]] <- as.data.frame(train[b, c("Id", "Expected")])
}
labels <- df_tr$Expected
bst <- xgboost(data = as.matrix(df_tr[, c("Id", "ref_mean")]),
               label = labels,
               max.depth = 3,
               eta = .7,
               nround = 10,
               objective = "reg:linear",
               missing = NA,
               base_score = 0,
               verbose = 1)
xgb.save(bst, 'bst_no_na.save')
# Clean up: free memory before prediction
rm(df_tr)
rm(b)
rm(labels)
```
At this point we have a trained model saved to a file. We've also removed the training data from memory so that as much memory as possible is free for prediction.
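If the session is restarted (or `bst` is also removed to reclaim memory), the saved model can be reloaded before scoring the test set. A minimal sketch, reading back the file written by `xgb.save()` above:

``` {r eval = FALSE}
# Restore the model persisted with xgb.save()
bst <- xgb.load("bst_no_na.save")
```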
``` {r eval = FALSE}
ffload("data/test", overwrite = TRUE, rootpath = "data/") df_te <- as.data.frame(test[,c("Id", "minutes_past", "Ref")]) %>% group_by(Id) %>% summarize(ref_mean = mean(time_difference(minutes_past) * Ref, na.rm = TRUE))
res <- data.frame( Id = df_te$Id, Expected = predict(bst, as.matrix(df_te), missing = NA) )
res <- group_by(res, Id) %>% summarize(Expected = mean(Expected)) createSubmission(res, "submission.csv")
```