library(dutchtabletopsdata)
library(purrr)
library(dplyr)
library(tidyr)
library(tibble)
library(ggplot2)
library(ggrepel)
library(randomForest)
knitr::opts_chunk$set(fig.width = 9, fig.height = 9, warning = FALSE, echo = FALSE, cache = TRUE, dev = "pdf")

What are random forests? Why use them?

Random forests are an example of an ensemble machine learning method that works by running many iterations of the same type of model and combining their results in an effort to avoid overfitting. The core model of random forests are, appropriately, decision trees. A single decision tree works by finding a split in one of the predictor variables (say, all horizontal paintings over here, all horiztonal ones over here) that does the best job at sorting out paintings based on the given repsonse variable (e.g., which artist painted it.), techinically known as asessing its node purity. One variable split likely does a mediocre job of predicting the response variable, thus the process is repeated, finding the best variable splits for each one of the branches produced by the first split, then the best splits for all the brances produced by that step, and so forth. Single decision trees can keep producing branches until they have perfectly memorized the data. Much like an undergraduate who aces the slide ID portion of the art history exam, but bombs the unknown image section, a single tree will perfectly reproduce the data you give it, but does poorly when given new cases that might have slightly different attributes than those it learned from.

Random forests work by creating hundreds or thousands of decision trees, giving each a slightly different subset of the original data to learn. Then, having built each of these trees, the model polls this "forest" in order to classify new data into different categories. In the case of a model learning to recognize artists based on the motifs used in paintings, some paintings may elicit an overwhelmingly lopsided decision by the forest when it is particularly sure of which attribution to give. Other paintings, however, may provoke more confusion - their attribution, in other words, may not be supportable based on motif information alone.

Random forests are particularly appropriate for our use for several reasons:

  1. They work relatively well on small samples of data that have a large number of descriptive variables
  2. Random forests performs well on systems where there are complex interactions between variables (e.g., lemon plus wine but without oysters tends to be a painitng by...)

Building the random forests

There are three main parameters to any of these models:

  1. Predictor variables: what variables will the model use to make that prediction?
  2. Motif variables only
  3. Composition variables only
    • With dimension informaion? Without?
  4. All available variables
  5. Response variable: what categorical or scalar value are we attempting to predict?
  6. An unqualified artist attribution (e.g "Pieter Claesz")
  7. A qualified attribution ("after Pieter Claesz", "possibly Pieter Claesz")
  8. Some other variable, e.g. general compositional scheme
  9. Data subset: What subset of the available observations (in this case, paintings) will the model use to make predictions? Just those paintings with definitve attributions? All available paintings? Paintings from a given period of time?

Because we may be interested in any number of combinations of these three parameters, we'll produce random forests for every combination up front, allowing easier exploration on the back end.

# Model artists with only n or more paintings
n_ptg <- 5
cleared_artists <- unique(filter(group_by(dt_compiled, artist), n() >= n_ptg)$artist)
cleared_artist_union <- unique(filter(group_by(dt_compiled, artist_union), n() >= n_ptg)$artist_union)

dt_valid <- dt_compiled %>% 
  filter(artist %in% cleared_artists, artist_union %in% cleared_artist_union)

# Model specifications
model_predictors <- list(
  motif_only = motif_vars(),
  composition_only = comp_vars()
  # comp_nodim = setdiff(comp_vars(), c("height", "width")),
  # combined = c(motif_vars(), comp_vars())
)

model_response <- list(
  simple = "artist",
  # qualified = "artist_union",
  comp_disp = "compositional_disposition",
  year = "year_early"
)

model_subsets <- list(
  definite_only = which(dt_valid$artist_relationship == "Definite" & dt_valid$artist_union %in% cleared_artist_union & dt_valid$artist %in% cleared_artists)
)

model_tests <- list(
  qualified_only = which(dt_valid$artist_relationship != "Definite" & dt_valid$artist_union %in% cleared_artist_union & dt_valid$artist %in% cleared_artists)
)

# Produce all combinations (the dot product) of these three model parameters
model_params <- list(
  predictors = model_predictors, 
  response = model_response, 
  subsets = model_subsets, 
  tests = model_tests) %>% 
  cross_named_lists()

# Actually run the models
rfl <- map(model_params, function(x) run_rf(dt_valid, response = x$response, predictors = x$predictors, ntree = 5000, portion = x$subsets, test = x$tests))

# Add the resulting random forest type ("classification" or "regression") to the
# model name
names(rfl) <- map2_chr(rfl, names(rfl), function(x, y) {
  type <- x$type
  paste(y, type, sep = ".")
})

Which artists were predictable by their chosen motifs? Their chosen compositions?

One core question: are certain artists more or less predictable by the motifs they choose, and/or by the compositions they use to arrange those motifs?

# Produce a long table of model error: how often was a given model "wrong" about
# the category in which it placed an artwork?
rfl_df <- rfl %>% 
  keep(function(x) x$type == "classification") %>% 
  map_df(error_only, .id = "model_type") %>% 
  separate(model_type, into = c("predictors", "response", "subset", "test", "forest_type"), sep = "\\.")
per_artist_error <- rfl_df %>% 
  filter(response == "simple" & subset == "definite_only" & predictors %in% c("motif_only", "composition_only")) %>% 
  select(-subset, -response, -forest_type, -test) %>% 
  ungroup() %>% 
  spread(predictors, class.error) %>% 
  rename(artist = actual) %>% 
  mutate(
    artist_relationship = factor("Definite", levels = c("Definite", "Possible", "After")),
    artist = factor(artist, levels = sort(unique(artist))))

pae <- ggplot(per_artist_error, aes(x = composition_only, y = motif_only, color = artist, shape = artist_relationship)) +
  theme_error(xname = "Composition", yname = "Motif") +
  geom_point(size = 5) +
  geom_label_repel(aes(label = artist)) +
  scale_shape_discrete(guide = FALSE)
pae
known_only_motif <- rfl$motif_only.simple.definite_only.qualified_only.classification
known_only_comp <- rfl$composition_only.simple.definite_only.qualified_only.classification

mvotes <- known_only_motif$votes %>% 
  as.data.frame() %>% 
  rownames_to_column(var = "painting_code") %>%
  mutate(predicted = known_only_motif$predicted) %>% 
  gather(candidate, votes, -painting_code, -predicted) %>%
  inner_join(dt_compiled, by = "painting_code") %>% 
  group_by(painting_code) %>% 
  filter(row_number(desc(votes)) == 1) %>% 
  ungroup()

anames <- levels(known_only_motif$y)
acols <- RColorBrewer::brewer.pal(8, name = "Set1")

artist_radial <- function(aname, acolor) {
  set.seed(100)
  adata <- mvotes %>% 
    filter(artist == aname)

  bdata <- setdiff(mvotes, adata)

  ggplot(bdata, aes(x = factor(predicted, levels = anames), y = votes)) +
    geom_jitter(alpha = 0.5, size = 2) +
    geom_jitter(data = adata, color = acolor, size = 5, alpha = 0.7) +
    scale_x_discrete(drop = FALSE) +
    coord_polar() +
    ylim(0, 1) +
    labs(t = "Vote proportion", x = NULL, title = aname) +
    theme_bw()

  ggsave(filename = paste0(aname, "_radial.pdf"), height = 7, width = 7)
}

rad_plots <- set_names(map2(anames, acols, artist_radial), anames)
rad_plots$`Floris van Dijck`
rad_plots$`Cornelis Mahu`

Several points of note:

  1. Gerrit Heda is a mercurial painter: of the "Definite" attributions, the model is mistaken more that 75% of the time when trying to attribute them on either motif or compositional predictors.
  2. The other painters are relatively more predictable, though Cornelis Mahu's compositional variety means the model mistakes his works for others almost half the time.
  3. Pieter Claesz apparently has an exceptionally stable set of motifs - 9 out of 10 times the model can accurate predict his works based on motif information alone.
  4. Floris van Dijck has compositions that are almost just as predictable.

Things get complex when looking at the difference between unqualified Willem Claesz Heda and Pieter Claesz attributions, and those attributions that are "after" the style of WCH or PC.

test_votes <- function(model) {

  votes <- model$test$votes

  stopifnot(!is.null(votes))

  votes %>% 
    as.data.frame() %>% 
    rownames_to_column("painting_code") %>% 
  inner_join(select(dt_compiled, painting_code, artist, artist_relationship), by = "painting_code") %>% 
  gather(predicted, ratio, -painting_code, -artist, -artist_relationship) %>% 
  group_by(painting_code, artist, artist_relationship) %>% 
  filter(min_rank(desc(ratio)) == 1) %>% 
  ungroup() %>% 
  select(-ratio) %>% 
  mutate(correct = artist == predicted) %>% 
  group_by(artist, artist_relationship) %>% 
  summarize(error = 1 - mean(correct))
}

test_errors <- bind_rows(
  motif_only = test_votes(known_only_motif),
  composition_only = test_votes(known_only_comp),
  .id = "type") %>% 
  spread(type, error) %>% 
  ungroup() %>% 
  filter(artist_relationship != "Co-painter")

all_error <- bind_rows(
  old = per_artist_error, 
  new = test_errors) %>% 
  mutate(
    artist = factor(artist, levels = sort(unique(artist))),
    artist_relationship = factor(artist_relationship, c("Definite", "Possible", "After")))

all_quals_error <- ggplot(all_error, aes(x = composition_only, y = motif_only, color = artist, shape = artist_relationship, alpha = artist_relationship != "Definite")) +
  theme_error("Composition", "Motif") +
  geom_point(size = 5) +
  scale_alpha_discrete(c(1, 0.1), guide = FALSE)
  geom_label_repel(aes(label = artist))
all_quals_error

Local variable importance

Having indentified which artists are readily predictable by particular motifs or compositional variants, we can then identify what those most distinctive motifs are, either in their presence, or their absence. A downside of random forest models is that they are more challenging to interpret than a traditional classifier such as a logarithmic regression. In logarithmic regression, each explanatory variable is treated independently[^indvar], with the model assigning an explicit coefficient to each variable. The larger the coefficient (either positive or negative), the greater the effect that variable has on the probability of an observation being slotted into a particular class. In a single decision tree, however, variables have the potential for complex dependent interactions (If, for example, A is larger than B, then C has a positive effect, but if A is less than B, C instead has a negative effect) not easily summed up by a single metric. Random forests - an ensemble of decision trees - compound this complexity. The very nuance that gives them their predictive power also makes them more challenging to interpret.

[^indvar]: Save for interaction variables, though those must be explicitly defined ahead of time; @james2013, 115.

To visualize the inner workings of this tree, we can rank the variables by an empirically-measured importance metric[^impmetric], and then see how the value of those variables actually impacts the confidence of the model when it tries to make predictions about a given artist. In cases where the presence of a motif (e.g. "leafy vegatation" == TRUE) results in more of the trees voting in favor of the correct artist, we can conclude that variable is a positive indicator of a given classification. On the other hand, when its presence results in almost no trees voting for the given artist, we can identify it as a negative indicator of that attribution.

[^impmetric]: The random forest model as implemented by @liaw2002 calculates variable inportance for a given class by averaging the mean decrease in accuracy for a given class (in this case, for a given artist) across all trees that occurs when the given variable is permuted - its values put in to random order, rendering it meaningless. The theory goes: the more important that variable is in the behavior of the model, the more negative the impact when that variable is effectively removed from the data.

pc_terms <- top_n_importance_names(known_only_motif, class = "Pieter Claesz")

pc_motif_votes <- rf_effects(known_only_motif, class = "Pieter Claesz", terms = pc_terms) %>% 
  left_join(dt_motif_labels, by = c("term" = "motif_code")) %>% 
  mutate(metric_value = ifelse(metric_value, "present", "absent"))

ggplot(pc_motif_votes, aes(x = metric_value, y = votes, color = is_true_positive, shape = metric_value, alpha = metric_value)) +
  facet_wrap(~ wrapped_motif_label, scales = "free_x") +
  geom_jitter() +
  scale_color_brewer(palette = "Set1") +
  scale_shape_discrete(guide = FALSE) +
  scale_alpha_discrete(guide = FALSE) +
  theme_bw() +
  theme(legend.position = "top") +
  labs(title = "Pieter Claesz - Important Motifs", x = NULL, y = "Model consensus", color = "Work actually by PC?")

ggsave(filename = "pc_motif.pdf", height = 7, width = 9)

fvs_terms <- top_n_importance_names(known_only_motif, class = "Floris van Schooten")

fvs_motif_votes <- rf_effects(known_only_motif, class = "Floris van Schooten", terms = fvs_terms) %>% 
  left_join(dt_motif_labels, by = c("term" = "motif_code")) %>% 
  mutate(metric_value = ifelse(metric_value, "present", "absent"))

ggplot(fvs_motif_votes, aes(x = metric_value, y = votes, color = is_true_positive, shape = metric_value, alpha = metric_value)) +
  facet_wrap(~ wrapped_motif_label, scales = "free_x") +
  geom_jitter() +
  scale_color_brewer(palette = "Set1") +
  scale_shape_discrete(guide = FALSE) +
  scale_alpha_discrete(guide = FALSE) +
  theme_bw() +
  theme(legend.position = "top") +
  labs(title = "Floris van Schooten - Important Motifs", x = NULL, y = "Model consensus", color = "Work actually by FvS?")

ggsave(filename = "fvs_motif.pdf", height = 7, width = 9)

wch_motif_votes <- rf_effects(known_only_motif, class = "Willem Claesz Heda") %>% 
  left_join(dt_motif_labels, by = c("term" = "motif_code")) %>% 
  mutate(metric_value = ifelse(metric_value, "present", "absent"))

ggplot(wch_motif_votes, aes(x = metric_value, y = votes, color = is_true_positive, shape = metric_value)) +
  facet_wrap(~ wrapped_motif_label, scales = "free_x") +
  geom_jitter() +
  scale_color_brewer(palette = "Set1") +
  scale_shape_discrete(guide = FALSE) +
  scale_alpha_discrete(guide = FALSE) +
  theme_bw() +
  theme(legend.position = "top") +
  labs(title = "Willem Claesz Heda - Important Motifs", x = NULL, y = "Model consensus", color = "Work actually by FvS?")
ng3m <- rf_contested_obs(known_only_motif) %>% 
  filter(painting_code == "NG 3") %>% 
  select(-contest_rank, -vote_div) %>% 
  gather(artist, votes, -painting_code) 

ggplot(ng3m, aes(x = fct_reorder(artist, votes), y = votes, fill = artist == "Nicolaes Gillis")) + 
  geom_bar(stat = "identity") + 
  coord_flip() + theme_bw() + 
  theme(legend.position = "none") + 
  labs(x = "Predicted artist") +
  ylim(0, 1)

ggsave(filename = "ng3m.pdf", width = 5, height = 4)


wch44m <- rf_contested_obs(known_only_motif) %>% 
  filter(painting_code == "WCH44") %>% 
  select(-contest_rank, -vote_div) %>% 
  gather(artist, votes, -painting_code) 

ggplot(wch44m, aes(x = fct_reorder(artist, votes), y = votes, fill = artist == "Willem Claesz Heda")) + 
  geom_bar(stat = "identity") + 
  coord_flip() + theme_bw() + 
  theme(legend.position = "none") + 
  labs(x = "Predicted artist") +
  ylim(0, 1)

ggsave(filename = "wch44m.pdf", width = 5, height = 4)


wch44c <- rf_contested_obs(known_only_comp) %>% 
  filter(painting_code == "WCH44") %>% 
  select(-contest_rank, -vote_div) %>% 
  gather(artist, votes, -painting_code) 

ggplot(wch44c, aes(x = fct_reorder(artist, votes), y = votes, fill = artist == "Willem Claesz Heda")) + 
  geom_bar(stat = "identity") + 
  coord_flip() + theme_bw() + 
  theme(legend.position = "none") + 
  labs(x = "Predicted artist") +
  ylim(0, 1)

ggsave(filename = "wch44c.pdf", width = 5, height = 4)
single_rf <- rfl$motif_only.simple.definite_only.qualified_only.classification

rfl_pca <- map(set_names(levels(single_rf$predicted)), function(x) {
  rf_local_importance(single_rf) %>% 
    semi_join(dt_compiled %>% filter(artist == x[1]), by = c("rowname" = "painting_code")) %>% 
    column_to_rownames("rowname") %>% 
    prcomp()

})

join_loadings <- function(pca) {
  pca_loadings_df(pca) %>% 
    left_join(dt_motif_labels, by = c("rowname" = "motif_code")) %>% 
    mutate(varlab = ifelse(is.na(wrapped_motif_label), rowname, motif_label))
}

rfl_loadings <- map(rfl_pca, join_loadings)
rfl_observations <- map(rfl_pca, pca_obs_df)

artist_motif_plot <- function(l, o, m = 12, hue = "red") {
  l %>% 
    filter(min_rank(desc(power)) <= m) %>% 
    ggplot(aes(x = PC1, y = PC2)) +
    geom_text(data = o, aes(label = rownames), alpha = 0.5) +
    geom_segment(aes(xend = 0, yend = 0), color = hue) +
    geom_label_repel(aes(label = varlab))
}

rfl_plots <- map2(rfl_loadings, rfl_observations, artist_motif_plot) %>% 
  map2(., names(rfl_observations), function(x, y) x + ggtitle(y))
mds_motif <- cmdscale(dist(known_only_motif$proximity)) %>%
  as.data.frame() %>% 
  rownames_to_column(var = "painting_code") %>% 
  inner_join(dt_painting_artist, by = "painting_code") %>% 
  mutate_at(vars(V1, V2), scales::rescale)

ggplot(mds_motif, aes(x = V1, y = V2)) +
  geom_point(alpha = 0.5) +
  geom_point(data = filter(mds_motif, artist1 == "Pieter Claesz"), color = "red")


mdlincoln/dutchtabletops documentation built on May 22, 2019, 4:08 p.m.