forage: Obtain various outputs using a trained tree ensemble on new data

Description Usage Arguments Details Value Examples

View source: R/forage.R

Description

Obtain terminalNodesMatrix, dissimilarity, proximity, outlyingness or depth on new data using a tree ensemble. Currently, ensembles from the 'ranger' and 'randomForest' packages are supported. See Details for an explanation of the various outputs.

Usage

1
2
3
forage(object, newdata, what = "dissimilarity",
  method = "terminalNodes", context = "observations", classes = NULL,
  ...)

Arguments

object

Object of class 'ranger' or 'randomForest'

newdata

A dataframe

what

(string) Type of output. The following are implemented: terminalNodesMatrix, dissimilarity, proximity, outlyingness, depth. Default is 'dissimilarity'.

method

(string) Method used to obtain the output. The following are implemented: terminalNodes. Default is 'terminalNodes'.

context

(string) Specify whether output should be computed for 'observations' or 'trees'. Default is 'observations'

classes

(factor) Required when 'what' is 'outlyingness'

...

Currently not in use.

Details

Value

The following are returned depending on 'what':

Examples

  1
  2
  3
  4
  5
  6
  7
  8
  9
 10
 11
 12
 13
 14
 15
 16
 17
 18
 19
 20
 21
 22
 23
 24
 25
 26
 27
 28
 29
 30
 31
 32
 33
 34
 35
 36
 37
 38
 39
 40
 41
 42
 43
 44
 45
 46
 47
 48
 49
 50
 51
 52
 53
 54
 55
 56
 57
 58
 59
 60
 61
 62
 63
 64
 65
 66
 67
 68
 69
 70
 71
 72
 73
 74
 75
 76
 77
 78
 79
 80
 81
 82
 83
 84
 85
 86
 87
 88
 89
 90
 91
 92
 93
 94
 95
 96
 97
 98
 99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
## Not run: 
library("magrittr")
library("ggplot2")

# Split iris into a 70% training set and a 30% test set ----
# The seed fixes which rows are drawn, so the split is reproducible.
set.seed(1)
train_rows <- sample.int(n = nrow(iris), size = floor(0.7 * nrow(iris)))
iris_train <- iris[train_rows, , drop = FALSE]
iris_test  <- iris[-train_rows, , drop = FALSE]

# Grow the random forest ensembles used in the rest of the example ----
# Supervised forest: Species is modelled from all remaining columns.
model_ranger_supervised <- ranger::ranger(
  Species ~ .,
  data = iris_train,
  seed = 1
)
model_ranger_supervised

# Forest returned by synthetic_forest() (defined elsewhere in the package),
# fitted on the same training data; its prediction error is printed below.
model_ranger_unsupervised <- synthetic_forest(iris_train)
model_ranger_unsupervised$prediction.error

# Terminal nodes matrix of the test data, for both forests ----
tn_supervised <- forage(
  model_ranger_supervised,
  newdata = iris_test,
  what = "terminalNodesMatrix"
)
dim(tn_supervised)

tn_unsupervised <- forage(
  model_ranger_unsupervised,
  newdata = iris_test,
  what = "terminalNodesMatrix"
)
dim(tn_unsupervised)

# Dissimilarity between test observations ----
# (use what = "proximity" to obtain the proximity instead)
di_supervised <- forage(
  model_ranger_supervised,
  newdata = iris_test,
  what = "dissimilarity"
)
attr(di_supervised, "Size")
di_supervised %>%
  as.matrix() %>%
  image()

di_unsupervised <- forage(
  model_ranger_unsupervised,
  newdata = iris_test,
  what = "dissimilarity"
)
attr(di_unsupervised, "Size")
di_unsupervised %>%
  as.matrix() %>%
  image()

# Average-linkage clustering cut into three groups, cross-tabulated
# against the (integer-coded) species of the test observations.
hclust(di_unsupervised, method = "average") %>%
  cutree(k = 3) %>%
  table(unclass(iris_test$Species))

# Explore outliers in the training data ----
di_unsupervised_train <- forage(
  model_ranger_unsupervised,
  newdata = iris_train,
  what = "dissimilarity"
)

# 'classes' must be supplied when what = "outlyingness".
outIndex <- forage(
  model_ranger_unsupervised,
  newdata = iris_train,
  what = "outlyingness",
  classes = iris_train$Species
)

# Quick and rough outlier exploration: flag the observations whose
# outlyingness is among the values boxplot.stats() reports in $out.
outs <- outIndex %in% grDevices::boxplot.stats(outIndex)$out
which(outs)

# seq_len() rather than 1:nrow(): the latter yields c(1, 0) for zero rows.
labels <- seq_len(nrow(iris_train))
# NOTE(review): this blanks the labels of the *flagged* observations, so the
# plot below labels only the unflagged points. If the intent is to label the
# outliers, this should be labels[!outs] <- NA -- confirm with the author.
labels[outs] <- NA

# Embed the dissimilarities in two dimensions; the seed makes the t-SNE
# layout reproducible.
set.seed(1)
to <- Rtsne::Rtsne(di_unsupervised_train, is_distance = TRUE, perplexity = 10)
to$Y %>%
  as.data.frame() %>%
  ggplot(aes(V1, V2, color = iris_train$Species, size = as.integer(outs))) +
  geom_point(alpha = 0.5) +
  geom_label(aes(label = labels))

# Look at the depth of the terminal nodes of the observations across trees ----
# NOTE(review): no seed is passed to synthetic_forest() in this call, so the
# specific observation and cluster numbers quoted further down may not
# reproduce exactly -- confirm whether synthetic_forest() seeds internally.
depth_observations <- forage(
  synthetic_forest(iris_train, splitrule = "extratrees"),
  iris_train,
  what = "depth"
)

# Divide every column by its own maximum so depths become ratios that are
# comparable across columns (presumably one column per tree).
depth_observations_sweep <- sweep(
  depth_observations,
  2,
  matrixStats::colMaxs(depth_observations),
  "/"
)

# Row-wise median of the depth ratios: one summary value per observation.
avg_depth  <- matrixStats::rowMedians(depth_observations_sweep)
depthframe <- data.frame(
  index      = seq_len(nrow(iris_train)),
  depthratio = avg_depth
)

# Per-species centre and spread. These are the median and the MAD, so the
# columns are named 'center' and 'spread' rather than 'mean' and 'sd'.
averages <- depthframe %>%
  dplyr::mutate(Species = iris_train$Species) %>%
  dplyr::group_by(Species) %>%
  dplyr::summarise(center = median(depthratio), spread = mad(depthratio))

# Solid lines mark each species' centre; dashed lines mark
# centre +/- 1.5 * spread.
depthframe %>%
  ggplot(aes(index, depthratio)) +
  geom_point(aes(color = iris_train$Species)) +
  geom_label(aes(label = index, color = iris_train$Species)) +
  geom_hline(
    aes(yintercept = center, colour = Species),
    data = averages
  ) +
  geom_hline(
    aes(yintercept = center + 1.5 * spread, colour = Species),
    data = averages,
    linetype = 2
  ) +
  geom_hline(
    aes(yintercept = center - 1.5 * spread, colour = Species),
    data = averages,
    linetype = 2
  ) +
  scale_x_continuous(breaks = seq_len(nrow(iris_train))) +
  coord_flip()
# we might want to examine points of a class
# which have very small or large depth compared to the class average.

# Hierarchical clustering of the observations on their scaled depth profiles.
hierar <- stats::dist(depth_observations_sweep, method = "manhattan") %>% hclust()
hierar %>% plot()
hierar %>% cutree(h = 199) %>% table()
# observe that observation '29' which is a global outlier forms the only
# singleton cluster(8) at height 199.

# Dissimilarity (rand index) matrix between trees ----
# (use what = "proximity" to obtain the proximity instead)
di_supervised_trees <- forage(
  model_ranger_supervised,
  newdata = iris_train,
  what = "dissimilarity",
  context = "trees"
)
di_supervised_trees %>%
  as.matrix() %>%
  image()
di_supervised_trees %>%
  as.matrix() %>%
  density() %>%
  plot()
# indication of low 'correlation' between trees.

## End(Not run)

talegari/forager documentation built on May 3, 2019, 4:01 p.m.