knitr::opts_chunk$set(warning = FALSE, message = FALSE, fig.width = 7, fig.height = 5, dev = "svg")
library(londonauctions)
library(tidyr)
library(dplyr)
library(ggplot2)
library(broom)
library(readr)
n_days <- 7

yday_months_x <- scale_x_continuous(breaks = cumsum(c(1, 31, 28, 31, 30, 31, 30, 31, 31, 30, 31, 30)), labels = c(month.name), limits = c(0, 365))
yday_months_y <- scale_y_continuous(breaks = cumsum(c(1, 31, 28, 31, 30, 31, 30, 31, 31, 30, 31, 30)), labels = c(month.name), limits = c(0, 365))
wday_days_x <- scale_x_continuous(breaks = seq(1,7,1), minor_breaks = NULL, labels = c("Sunday", "Monday", "Tuesday", "Wednesday", "Thursday", "Friday", "Saturday"))

Getty Sales Catalogs: What are the known unknowns?

if (!dir.exists("../bas_export"))
  dir.create("../bas_export")

write_csv(getty_catalogs, path = "../bas_export/1_getty_catalogs.csv")

getty_distribution <- ggplot(getty_catalogs, aes(x = year, fill = ifelse(is_input, "Indexed at item level", "Not yet indexed"))) +
    geom_histogram(binwidth = 1) +
    scale_fill_brewer(type = "qual", palette = 6) +
    theme_bw() +
    annotate("rect", xmin = 1780, xmax = 1835, ymin = -Inf, ymax = Inf, alpha = 0.2) +
    annotate("text", x = 1800, y = 175, label = "1780-1835") +
    labs(x = NULL, y = "number of catalogs", fill = NULL) +
    theme(legend.justification = c(0, 1), legend.position = c(0, 1))

dualplot(getty_distribution, "getty_distribution")

Filters

Because the price_factor variable is dependent on all the other prices in the database, we must make filtering decisions about including problematic records before binning sales in to price quintiles. Do we want to include or exclude problematic prices? What about certain transaction types? We can do that before setting up the pricing categories.

# Filter out sales whose prices are uncertain or which are doubled, while NOT 
# YET discarding those that had no price to begin with; these can be handled in
# later analysis.
usable_sales <- sales %>% 
  filter(
    amt_is_uncertain == FALSE & 
      has_double_price == FALSE & 
      transaction_type %in% c("sold", "bought in", "sold or bought in"))

Price factor

Due to inflation and deflation, it is difficult to compare absolute prices across the period of study. A simple solution to this problem is to categorize artworks into groups of most expensive and least expensive within their own years. The following function will group artowrks by year and split them into 5 buckets within their year, based on their price relative to other

usable_sales <- usable_sales %>% 
  group_by(year) %>% 
  mutate(price_factor = ntile(transaction_amt, n = 5)) %>% 
  ungroup()

We may wish to break down the analysis of seasonal effect exhibited by specific subsets of these data after calculating the price factor. A prime candidate for this is comparing auction houses, as they would still set their schedule relative to each other in part based on these overall price factors.

all_sales <- usable_sales
christies_sales <- usable_sales %>% filter(auction_house == "Christie's")
other_sales <- usable_sales %>% filter(!auction_house %in% c("Christie's"))

Auctions per year

per_year_sales <- usable_sales %>% count(year)
write_csv(per_year_sales, "../bas_export/2_usable_sales.csv")

per_year <- ggplot(per_year_sales, aes(x = year, y = n)) +
  geom_bar(stat = "identity") +
  theme_bw() +
  theme(legend.position = "bottom") +
  labs(x = "Year", y = "Painting lots auctioned per year", fill = "has transaction price?")

dualplot(per_year, "per_year")

Auctions per month

monthly_sales <- usable_sales %>% 
  mutate(month = factor(month, labels = month.name)) %>% 
  count(month)

write_csv(monthly_sales, "../bas_export/3_usable_sales.csv")

per_month <- ggplot(monthly_sales, aes(x = month, y = n)) +
  theme_bw() +
  geom_bar(stat = "identity") +
  theme(legend.position = "bottom") +
  labs(x = "Year", y = "Paintings auctioned per month", fill = "has transaction price?") +
  theme(axis.text.x = element_text(angle = 45, hjust = 1))

dualplot(per_month, "per_month")

We are interested in measuring how temporally-concentrated sales were in a given year. Simply plotting out the number of sales on each day for a number of years within our period of study suggests that auctioneers in later years increasingly concentrated their sales within a shorter period.

usable_sales %>% 
  filter(year %in% seq(1800, 1830, 10), price_factor == 5) %>% 
  count(year, yday) %>% 
  ggplot(aes(x = yday, y = n)) +
  geom_bar(stat = "identity") +
  facet_wrap(~ year, ncol = 1) +
  theme_bw() +
  labs(x = "Day of the year", y = "Number of sales") +
  yday_months_x +
  theme(axis.text.x = element_text(angle = 45, hjust = 1))

To focus our analysis on the spread of the most successful sales for each year, we look for the top $n$ sale days for each year and calculate their coefficient of variation.[^cv] Here is the same visualization of sale days, with the seven top sales days for each year marked in red:

top_examples <- usable_sales %>% 
  filter(year %in% seq(1800, 1830, 10), price_factor == 5) %>% 
  count(year, yday) %>% 
  mutate(
    # In the case that multiple days tie for the same number of sales, all will
    # be included in the calculation
    rank = min_rank(desc(n)),
    is_top = rank <= 7) %>% 
  ungroup()

cv_examples <- top_examples %>% 
  filter(is_top == TRUE) %>% 
  group_by(year) %>% 
  summarize(cv = sd(yday)/mean(yday))

write_csv(top_examples, "../bas_export/4_top_examples.csv")

vis_top_example <- ggplot(top_examples, aes(x = yday, y = n, fill = is_top)) +
  geom_bar(stat = "identity") +
  geom_text(data = cv_examples, x = 250, y = 50, aes(label = paste("CV =", format(cv, digits = 2)), fill = NULL)) +
  scale_fill_manual(values = c("black", "red")) +
  facet_wrap(~ year, ncol = 1) +
  theme_bw() +
  theme(legend.position = "none") +
  labs(x = "Day of the year (with top seven days highlighted)", y = "Number of sales") +
  yday_months_x +
  theme(axis.text.x = element_text(angle = 45, hjust = 1))

dualplot(vis_top_example, "vis_top_example", h = 7)

[^cv]: The coefficient of variation for a given set of numbers is the ratio of the set's standard deviation to its mean. It shows the extent of variability in the set relative to the mean value

From these example years, we see that the top-most days become increasingly concentrated between 1790 and 1830. The coefficient of variation (the standard deviation of each day by the mean of all days) confirms this visual impression. Years with more concentrated top sales days have a lower coefficient of variation; their top sales days are simply closer to eachother in the calendar year.

We want to measure if the CV of yearly top sales days is significantly changing. We also want to nuance this measurement with two additional facets: the price factor of the artwork (did it fall into the most expensive or least expensive of artworks sold that year?) and three different ways of measuring the auction performance of a given day: the number of lots sold, the sum of all recorded transaction amounts per day, and the average lot price for that day.

First, we'll calculate the number of sales, total amount, and average amount for every day of every year, grouping sales by their price factor:

Next, we will filter these results to include only the top 7 days of each year, again grouped by price factor.

Next, we will calculate the CV of these top days for each year and price factor. Finally, we will run a linear regression model on these results. This is a statistical tool for measuirng the correlation between variables in a dataset. In this case, we wish to measure the correlation between the year (1780-1840) and the coefficient of variation of top sales days. The linear regression gives us a firm statistical footing to claim whether or not the data we have support the hypothesis that auctioneers increasingly scheduled top sales days close to eachother in the calendar year.

The following facetted scatterplot presents the variation of top days, overlaid with the results of our linear regression models. The years of the study period (1780-1840) are along the x-axis of the plot, while the y-axis is the coefficient of variation for sale days in that year. The dots represent the actual measured points from the dataset, while the straight lines illustrate the results of the linear regression model, providing a "line of best fit" for our data that helps to visualize the strength of the relationship between the year and the spread of top sales days in that year.

Because we also want to measure the effect of different measurement methods and the relative price of the artworks being sold, we grouped our cacluations by price factor and measurement method. In this figure, the groups are vizualized with a faceted grid: each column represents the different price factors of artworks, while each row represents a different measurement method.

# Join the significance calculations back onto the top-N-days data, so that we 
# can use ggplot to color each facet based on the value. Plot the change in 
# distribution of these top days between 1780-1840, coloring the plots based on
# the magnitude of the predicted change.
cv_results_points <- bind_rows(
  "all sales" = all_sales, 
  "Christie's only" = christies_sales, 
  "other houses" = other_sales, 
  .id = "set") %>% 
  group_by(set) %>% 
  do(all_peaks(., n_days)) %>% 
  ungroup() %>% 
  filter(type == "count_cv") %>% 
  select(-type)

cv_results_lm <- cv_results_points %>% 
  group_by(set, price_factor) %>% 
  do(tidy(lm(cv ~ year, data = .))) %>% 
  filter(term == "year")

cv_results_estimate <- cv_results_points %>% 
  group_by(set, price_factor) %>% 
  do(augment(lm(cv ~ year, data = .)))

cv_combined <- cv_results_estimate %>% inner_join(cv_results_lm, by = c("set", "price_factor"))
# Underlying plot data to be shared with BAS visualization contractor
write_csv(cv_combined, path = "../bas_export/5_cv_combined.csv")

cv_results_plot <- ggplot(cv_combined, aes(x = year, color = estimate)) +
  geom_point(aes(y = cv), size = 3, alpha = 0.5) +
  geom_ribbon(aes(ymin = .fitted - .se.fit, ymax = .fitted + .se.fit), color = NA, alpha = 0.4) +
  geom_line(aes(y = .fitted)) +
  facet_grid(set ~ price_factor) +
  ggtitle("price quintile") +
  theme_bw() +
  scale_color_gradient(low = "red", high = "black") +
  theme(
    plot.title = element_text(size = rel(1)),
    legend.position = "none",
    axis.text.x = element_text(angle = 45, hjust = 1)
  ) +
  labs(
    x = "Year",
    y = paste0("Coefficient of variation of the top ", n_days, " days per year")
  )

dualplot(cv_results_plot, "cv_results", h = 7, w = 9)

A benefit of this facetting is the ability to quickly discern that, in this case, sales in the highest price bracket show a strong trend towards clustering, regardless of measurement method. Similarly, we find that the spread of top days for lower 80% of sales (those in price factors 1 through 4) show little or no change in spread over the same time period.

Royal Academy Dates

Did Christie's navigate with houses like Fosters and Phillips to find slots on the calendar year? And how did they position themselves relative to the royal acadmey?

houses <- usable_sales %>% 
  mutate(
    # Label top three houses, and an 'Other' category
    auction_house = ifelse(auction_house %in% c("Christie's", "Foster (Edward)", "Phillips (Harry)"), auction_house, "Other"),
    # Arrange facets so 'Other' is displayed at left
    auction_house = factor(auction_house, levels = c("Christie's", "Phillips (Harry)", "Foster (Edward)", "Other"))
  ) %>% 
  # Actually, we don't want to show other... it's too confusing
  filter(price_factor %in% c(5), auction_house != "Other") %>% 
  group_by(price_factor, auction_house) %>% 
  do(peak_n(df = ., n_days = 7)) %>% 
  ungroup()

# Report out ratios all top days that fall after August 1st.
houses %>% 
  mutate(is_after_august = yday >= 214) %>% 
  count(auction_house, is_after_august) %>% 
  spread(is_after_august, n) %>% 
  mutate(total = `FALSE` + `TRUE`,
         ratio = `TRUE` / total)

write_csv(houses, path = "../bas_export/6_houses.csv")
write_csv(ra_dates, path = "../bas_export/6_ra_dates.csv")

house_dist <- ggplot(houses, aes(x = year)) +
  geom_point(aes(y = yday, color = auction_house, size = metric), alpha = 0.2) +
  geom_line(data = ra_dates, aes(x = exhib_year, y = exhib_start_day), size = 0.7, color = "black") +
  geom_line(data = ra_dates, aes(x = exhib_year, y = exhib_end_day), size = 0.7, color = "black") +
  yday_months_y +
  theme_bw() +
  scale_color_brewer(type = "qual", palette = 6, guide = "none") +
  xlim(1780, 1835) +
  theme(legend.position = "bottom") +
  labs(x = NULL, y = paste("date of top", n_days, "days"), size = "number of sales") +
  facet_grid(price_factor ~ auction_house)

dualplot(house_dist, "house_dist", h = 6)

Did the RA exhibition become more popular over this period? We can look at the takings per day for the RA, which we have available up until

Auctions per day of the week

all_ratios <- usable_sales %>% 
  count(wday) %>% 
  mutate(ratio = n/sum(n))

select_ratios <- usable_sales %>% 
  filter(auction_house %in% c("Christie's", "Phillips (Harry)", "Foster (Edward)")) %>% 
  mutate(auction_house = factor(auction_house, levels = c("Christie's", "Phillips (Harry)", "Foster (Edward)"))) %>% 
  count(auction_house, wday) %>% 
  group_by(auction_house) %>% 
  mutate(ratio = n/sum(n))

write_csv(all_ratios, path = "../bas_export/7_all_ratios.csv")
write_csv(select_ratios, path = "../bas_export/7_select_ratios.csv")

weekday_dist <- ggplot(select_ratios, aes(x = wday, y = ratio)) +
  geom_bar(data = all_ratios, fill = "gray", stat = "identity") +
  geom_bar(aes(fill = auction_house), stat = "identity", alpha = 0.7) +
  scale_fill_brewer(type = "qual", palette = 6, guide = "none") +
  facet_wrap(~ auction_house, nrow = 1) +
  theme_bw() +
  labs(x = "day of the week", y = "ratio of sales") +
  theme(axis.text.x = element_text(angle = 45, hjust = 1)) +
  wday_days_x

dualplot(weekday_dist, "weekday_dist")

Parliament Dates

We can also plot the parliament session dates for the same period. Around 1780-1800, the top sales days were generally spread from the start of parliament until the end; however these became increasingly clustered towards the close of parliament. Unsurprisingly, the Parliament meeting dates set the outer bounds of the auction season. However within those bounds, more interesting activity was afoot.

par_plot <- parliament_dates %>% 
  mutate(pre_1780 = par_end_year < 1780) %>%
  ggplot(aes(x = par_end_year, y = par_end_day)) + 
  annotate("rect", xmin = 1780, xmax = 1835, ymin = -Inf, ymax = Inf, alpha = 0.2) +
  annotate("text", x = 1807, y = 300, label = "1780-1835") +
  geom_point() +
  geom_hline(yintercept = 192) +
  geom_hline(yintercept = 147) +
  yday_months_y + 
  geom_smooth(method = "lm", se = FALSE, size = 1) +
  theme_bw() +
  labs(x = NULL, y = "Session closing day")

parliament_dates %>% 
  mutate(pre_1780 = par_end_year < 1780) %>% 
  group_by(pre_1780) %>% 
  summarize(med = median(par_end_day))

dualplot(par_plot, "par_plot")


mdlincoln/londonauctions documentation built on May 22, 2019, 4:11 p.m.