library(dplyr)
library(readr)
library(stringr)
library(lubridate)
library(tidyr)
library(assertr)

Paintings sales

The Getty Provenance Index staff has supplied us with a text export of their database for all sales of paintings made by auction houses in London. We will first load this table into the R data processing environment, and make a working copy of it.

# Read in the original data file as one long column of text
one_col <- data_frame(text = read_lines(system.file("extdata", "britishsales1.txt", package = "londonauctions")))

two_col <- data_frame(text = read_lines(system.file("extdata", "britishsales2.txt", package = "londonauctions")))

three_col <- data_frame(text = read_lines(system.file("extdata", "britishsales3.txt", package = "londonauctions")))


comb_col <- bind_rows(one_col, two_col, three_col)

# These are the variables we want to preserve for each record
req_vars <- c("--RECORD NUMBER--", "BBF Number", "Lot Number", "Edit Status", "Lot Sale Date", "Sale Begin Date", "Sale End Date", "Transaction", "Price", "Auct.House Auth.", "Verbatim Artist", "Title", "Verbatim Seller", "Verbatim Buyer")

sales <- comb_col %>%
  # Split the single column of text into two cols: variable and value
  separate(text, into = c("variable", "value"), sep = 17) %>%
  # Trim any whitespace from either column
  mutate(variable = str_trim(variable), value = str_trim(value)) %>%
  # Only keep those rows that are in the req_vars list
  filter(variable %in% req_vars) %>%
  # To group a set of rows with a particular object, create an id column that
  # increments +1 each time it hits a "--RECORD NUMBER--" row
  mutate(objnum = cumsum(variable == "--RECORD NUMBER--")) %>%
  # If any fields occur multiple times in the same record, we will collapse them
  # in to one field, delimited with a semicolon
  group_by(objnum, variable) %>% 
  summarize(value = paste(sort(unique(value)), collapse = "; ")) %>% 
  # Now spread() the table into one column per variable, and one row per observation
  spread(variable, value) %>% 
  # And rename column variables to be more standardized
  select(cat_no = `BBF Number`, lot_no = `Lot Number`, auction_house = `Auct.House Auth.`, is_edited = `Edit Status`, sale_date = `Lot Sale Date`, sale_begin = `Sale Begin Date`, sale_end = `Sale End Date`, transaction_type = Transaction, transaction = Price, artists = `Verbatim Artist`, title = `Title`, sellers = `Verbatim Seller`, buyers = `Verbatim Buyer`, objnum) %>% 
  # Double-check that we now have only one row per object number
  verify(n_distinct(.$objnum) == nrow(.)) %>% 
  ungroup()

# Original number of records:
nrow(sales)

Having loaded the data, we perform several cleaning operations to extract sturctured information from the semi-structured text entered in many fields of the Getty database.

Dates

The original data contained several different variations on dates for each sale. While most were in the format YYYY MMM DD, many lots are listed with the full range of dates their sales took place in, and the exact day of the lot sale specified after this. If an exact date was specified for a lot, we assigned that date. If only a range was given, we used the first date of the given range.

# Use R's date-parsing functions to convert these into standard date formats.
sales$date <- ymd(sales$sale_date)

# From this standard date field, we will derive additional integer values for
# year, month, week of the year, day of the year, day of the month, day of the
# week.

sales$year <- year(sales$date) %>% as.integer()
sales$month <- month(sales$date) %>% as.integer()
sales$week <- week(sales$date) %>% as.integer()
sales$yday <- yday(sales$date) %>% as.integer()
sales$mday <- mday(sales$date) %>% as.integer()
sales$wday <- wday(sales$date) %>% as.integer()
sales <- sales %>% filter(between(year, 1780, 1835))

# Number of records:
nrow(sales)

Transaction type

The Getty data have a transaction type field that describes if a lot was sold, bought in, passed, withdrawn, unknown, or some uncertain combination of any of those. A few fields are also marked with ? or [?] denoting further uncertainty. We break these markers out into a second column.

# Clean up transaction_type column

# Everything to lowercase
sales$transaction_type <- tolower(sales$transaction_type)

# Create a separate column denoting uncertainty
sales$type_is_uncertain <- str_detect(sales$transaction_type, "\\?")
# Remove ? and [?] from the transaction_type field
sales$transaction_type <- str_replace(sales$transaction_type, "\\?|\\[\\?\\]", "") %>% str_trim()

Transaction amounts

Before parsing the numbers contained in the sales column, we must parse textual information stored alongside them:

# Create a separate column denoting uncertainty
sales$amt_is_uncertain <- str_detect(sales$transaction, "\\?")
# Create a separate column denoting multiple prices
sales$has_double_price <- str_detect(sales$transaction, "\\[or\\]")

# How many sales have uncertain amounts?
nrow(sales %>% filter(amt_is_uncertain == TRUE))

# How many sales have more than one price?
nrow(sales %>% filter(has_double_price == TRUE))

# How many records have no transaction information at all?
nrow(sales %>% filter(is.na(transaction)))

# Summary table that shows the top most frequently ocurring transaction types,
# and compresses the rest into an "other" field
sale_share <- sales %>%
  count(transaction_type) %>% 
  mutate(share = n/sum(n) * 100) %>% 
  arrange(desc(share))

top_share <- sale_share %>% slice(1:3)
other_share <- sale_share %>% 
  slice(4:nrow(.)) %>% 
  summarize(transaction_type = "other", n = sum(n), share = sum(share))

bind_rows(top_share, other_share) %>% 
  knitr::kable(digits = 2, format.args = list(big.mark = ","))

nrow(sales)

There are very few sales with uncertain prices. We will exclude these. There are also comparatively few lots with more than one price attached. We will also exclude these, so as not to interfere with the later price quantile ranking.

Auction Houses

# Number of auction houses to count individually; the rest will be classified as "other"
n_houses <- 10

sales %>% 
  count(auction_house, sort = TRUE) %>% 
  mutate(auction_house = c(auction_house[1:n_houses], rep("other", nrow(.) - n_houses))) %>% 
  group_by(auction_house) %>% 
  summarize(sales = sum(n)) %>% 
  arrange(desc(sales)) %>% 
  mutate(share = sales/sum(sales)) %>% 
  knitr::kable(digits = 2, format.args = list(big.mark = ","))
# Detect the price number within the transaction amount field
sales$transaction_amt <- sales$transaction %>% 
  # Replace some puncutation where the decimal place is marked by a comma instead of a period
  str_replace("[,]", ".") %>% 
  # Extract numbers that may or may not have decimal places
  str_extract("\\d+(?:\\.\\d+)?") %>% 
  # Convert strings to numeric information
  as.numeric()

# Provisional sum of sale prices:
sum(sales$transaction_amt, na.rm = TRUE)

In many cases, several lots are grouped with the same transaction, so the same price has been assigned to multiple lots. This is marked one of several ways in the documentation:

# Locate transactions with two or three lots as indicated by an & symbol, and
# divide acordingly
num_and <- str_count(sales$transaction, "&")
sales$transaction_amt <- ifelse(
  num_and == 0,
  sales$transaction_amt,
  ifelse(
    num_and == 1,
    sales$transaction_amt/2,
    sales$transaction_amt/3)
)

# Provisional sum of sale prices:
sum(sales$transaction_amt, na.rm = TRUE)

# For lots marked with bracket notation, determine the trailing letter and 
# divide the transaction amount accordingly. Where there is no trailing letter,
# preserve the same value.
sales$bracket_letter <- str_match(sales$transaction, "\\[a-([a-z])\\]")[,2]
sales$bracket_letter[is.na(sales$bracket_letter)] <- "none"
sales$transaction_amt <- ifelse(sales$bracket_letter == "c", sales$transaction_amt/3, sales$transaction_amt)
sales$transaction_amt <- ifelse(sales$bracket_letter == "d", sales$transaction_amt/4, sales$transaction_amt)
sales$transaction_amt <- ifelse(sales$bracket_letter == "e", sales$transaction_amt/5, sales$transaction_amt)
sales$transaction_amt <- ifelse(sales$bracket_letter == "f", sales$transaction_amt/6, sales$transaction_amt)
sales$transaction_amt <- ifelse(sales$bracket_letter == "g", sales$transaction_amt/7, sales$transaction_amt)
sales$transaction_amt <- ifelse(sales$bracket_letter == "h", sales$transaction_amt/8, sales$transaction_amt)
sales$transaction_amt <- ifelse(sales$bracket_letter == "i", sales$transaction_amt/9, sales$transaction_amt)
sales$transaction_amt <- ifelse(sales$bracket_letter == "j", sales$transaction_amt/10, sales$transaction_amt)
sales$transaction_amt <- ifelse(sales$bracket_letter == "k", sales$transaction_amt/11, sales$transaction_amt)

# Provisional sum of sale prices:
sum(sales$transaction_amt, na.rm = TRUE)
# Select pertinent columns of data and save for analyses
sales <- sales %>% 
  select(cat_no, lot_no, objnum, auction_house, date, transaction_type, type_is_uncertain, amt_is_uncertain, has_double_price, transaction_amt, year, month, week, yday, mday, wday, transaction, artists, title, sellers, buyers)

Royal Academy dates

Royal Academy Archivist Mark Pomeroy. Years 1769--1982 are ready for parsing --- the remainder are incomplete. Here we convert the year, month, and day fields in the original table into formatted date columns ready for processing.

raw_ra <- read_csv(system.file("extdata", "ra_dates.csv", package = "londonauctions"), col_types = "iiiiin")

ra_dates <- raw_ra %>%
  mutate(
    start_string = paste(start_day, start_month, year, sep = "-"),
    end_string = paste(end_day, end_month, year, sep = "-"),
    exhib_start = start_string %>% dmy(),
    exhib_end = end_string %>% dmy(),
    exhib_start_day = yday(exhib_start) %>% as.integer(),
    exhib_end_day = yday(exhib_end) %>% as.integer(),
    exhib_year = year(exhib_start) %>% as.integer()
  ) %>%
  select(contains("exhib"), takings)

Parliament dates

parliament_dates <- read_csv(system.file("extdata", "parliament_dates.csv", package = "londonauctions")) %>% 
  select(
    par_start = start_date,
    par_end = end_date,
    par_start_year = start_year,
    par_end_year = end_year,
    par_start_day = start_yday,
    par_end_day = end_yday
  )

Getty Catalogue Input

They Getty has not yet input all of its historical sales catalogs. This table describes the data entry progress of known sale catalogs as of 8/19/2015.

getty_catalogs <- read_csv(system.file("extdata", "catalog_completion.csv", package = "londonauctions"), col_types = "ccc") %>% 
  mutate(
    is_input = ifelse(is_input == "yes", TRUE, FALSE),
    year = as.integer(str_match(sale_begin, "^(\\d{4})/")[,2]))
devtools::use_data(getty_catalogs, parliament_dates, ra_dates, sales, overwrite = TRUE)


mdlincoln/londonauctions documentation built on May 22, 2019, 4:11 p.m.