# R/tidy_exhibitions.R

#' Prepare exhibition data frame
#'
#' Cleans the scraped exhibition table, derives start/end month and year
#' columns from `exh_period`, builds a distinct start date for every
#' exhibition of an artist within the same month, and appends the manually
#' collected Zwack exhibitions.
#'
#' @param .df.list a list of data frames containing exhibitions retrieved from
#'   artist-info.com; its `"exhibitions"` element is the table that is tidied.
#' @param .zwack.path path to the Excel file with the manually collected Zwack
#'   exhibitions. Defaults to the previously hard-coded location, which is
#'   resolved relative to the working directory.
#'
#' @return a data frame: the tidied scraped exhibitions followed by the
#'   manually collected ones
#' @export
#'
#' @examples
#' \dontrun{
#' exhibitions <- tidy_exhibitions(df.list)
#' }
tidy_exhibitions <- function(.df.list,
                             .zwack.path = "data/Zwack-exh-manual-collected.xlsx"){

  # fail early with a readable message instead of an obscure error deep inside
  # the pipeline
  if (!is.list(.df.list) || !is.data.frame(.df.list[["exhibitions"]])) {
    stop("`.df.list` must be a list with a data frame named \"exhibitions\".",
         call. = FALSE)
  }

  # read from the argument (not from a global object called `df.list`)
  exhibitions <- .df.list[["exhibitions"]] %>%

    # exclude incomplete observations
    filter(!is.na(solo_group)) %>%

    # remove duplicates when disregarding manually created exhibition id
    distinct(exh_title, exh_period, solo_group, exh_performer, artist_id, exh_place_id,
             .keep_all = TRUE) %>%

    # convert IDs to characters for consistency in all data frames
    mutate_at(vars(contains("id")), as.character) %>%

    # parse month and year from exhibition period
    mutate(
      # extract beginning date: everything before the first "-"
      exh_start_Ym = exh_period   %>% str_remove("-.+") %>% parse_monthYear(),
      exh_start_Y  = exh_start_Ym %>% str_extract("[0-9]{4}") %>% as.integer(),

      # extract ending date: everything after the last "-"
      exh_end_Ym   = exh_period   %>% str_remove(".+-") %>% parse_monthYear(),
      exh_end_Y    = exh_end_Ym   %>% str_extract("[0-9]{4}") %>% as.integer()
    ) %>%

    # the place id is the one id column that is kept as integer
    mutate(exh_place_id = as.integer(exh_place_id)) %>%

    # create column to make edgelist in strict chronological order
    group_by(artist_id, exh_start_Ym) %>%
    # zero-based position within the group: the first exhibition of an artist
    # in a month gets offset 0, the second offset 1, and so on
    mutate(count = row_number() - 1) %>%
    ungroup() %>%
    mutate(# first day of the start month plus `count` days; days() makes the
           # unit explicit (month() is an accessor, not a time span, and
           # rejects 0 in current lubridate versions)
           exh_start_Ymd = exh_start_Ym %>% paste0("01") %>%
             lubridate::ymd() + lubridate::days(count),
           # convert back to integer (YYYYMMDD)
           exh_start_Ymd = exh_start_Ymd %>% as.character() %>%
             str_remove_all("-") %>% as.integer()
             ) %>%

    select(-count, - exh_period) %>%

    # remove strange data
    filter(id != "18933-1" # group exhibition that is listed as solo with 554 artists
    )

  # manually collected exhibitions; rows without an artist-info id are dropped
  zwack <- readxl::read_xlsx(.zwack.path) %>%
    filter(artist_id != "not in artistinfo") %>%
    select(- contains("artfacts"), - exh_period) %>%
    mutate(exh_start_Ym = str_extract(exh_start_Ymd, "[0-9]{6}"),
           exh_start_Y  = str_extract(exh_start_Ymd, "[0-9]{4}"),
           exh_end_Ym   = str_extract(exh_end_Ymd,   "[0-9]{6}"),
           exh_end_Y    = str_extract(exh_end_Ymd,   "[0-9]{4}")) %>%
    # the pattern also matches the *_Ym and *_Ymd columns, so all of the
    # derived date columns end up as integers
    mutate_at(vars(dplyr::matches("exh_[a-z]+_Y")), as.integer)


  exhibitions <- bind_rows(exhibitions, zwack)

  # Herzog, Walther
  # exhibitions[exhibitions$artist_id == "14563", ]$artist_id <- "14562"


  return(exhibitions)

}
# Framus94/HierarchiesAndCareers documentation built on June 5, 2019, 8:52 a.m.