R/getTatts.R

Defines functions getTatts

Documented in getTatts

#' getTatts - Download historic sale results data from Tattersalls and
#' Tattersalls Ireland.
#'
#' \code{getTatts} downloads historic sale results data from the Tattersalls and
#' Tattersalls Ireland websites.
#'
#' \code{getTatts} downloads historic sale data from the Tattersalls
#' \url{http://tattersalls.com} and Tattersalls Ireland
#' \url{http://tattersalls.ie} websites, based on the supplied URL. Data is
#' scraped from the sales results tables on the Tattersalls and Tattersalls
#' Ireland websites. Various options may be specified such as a \code{filename}
#' and output format. Valid output formats are \code{csv}, \code{Rds} and
#' \code{sqlite}. Only an \code{Rds} file is generated by default. Valid URLs
#' for Tattersalls and Tattersalls Ireland sales have been tested as far back as
#' 2009. The demo directory contains a complete set of URLs and function calls
#' to generate results data for all sales back to 2009.
#'
#' @param url A string containing the universal resource locator for a
#'   Tattersalls' website page of historic bloodstock sale data. Required, no
#'   default set.
#' @param catalogue A string containing the universal resource locator for a
#'   sale catalogue, usually containing pedigree information in PDF format.
#'   Optional, no default set.
#' @param auctioneer A string containing the name of the company conducting the
#'   auction sale. Required, no default set.
#' @param country A string containing the abbreviated country code for the
#'   location of the sale. e.g. UK. Required, no default set.
#' @param currency A string containing the abbreviated currency code for the
#'   currency of sale bids and payments. e.g. GNS. Required, no default set.
#' @param date A string containing the date of the sale. Multi-day sales should
#'   only have the first day's date entered. The date should be entered in the
#'   format yyyy-mm-dd. Required, no default set.
#' @param csv A Boolean defining the data output format, in this case a CSV
#'   file. Required. Defaults to FALSE. May be changed to TRUE. Multiple output
#'   formats are possible.
#' @param rds A Boolean defining the data output format, in this case an Rds
#'   file. Required. Defaults to TRUE. May be changed to FALSE. Multiple output
#'   formats are possible.
#' @param sqlite A Boolean defining the data output format, in this case an
#'   SQLite file. Required. Defaults to FALSE. May be changed to TRUE. Multiple
#'   output formats are possible. Requires the RSQLite library to be installed,
#'   which is only optional for pinhooker package installation. Prior to
#'   attempting SQLite output, please ensure the RSQLite package is installed.
#' @param sale A string containing the name of the sale. e.g. Breeding Stock
#'   Sale. Required, no default set.
#' @param filename A string containing the output file name, without file
#'   extension. Required. Defaults to 'bloodstockSalesData'. Files are output to
#'   the current working directory.
#'
#' @return If all parameters are valid, data will be downloaded from the
#'   Tattersalls website, normalised and output, as the specified file fomats,
#'   in the current working directory directory.
#'
#' @examples
#'   getTatts(url =
#'   "http://db.tattersalls.com/4DCGI/Entry/List?30%096%09=%20DEM%2015%09by8",
#'   catalogue = "http://docs.tattersalls.com/cat/december/2015decmares.pdf",
#'   csv = FALSE, rds = TRUE, sqlite = FALSE, auctioneer = "Tattersalls",
#'   country = "UK", currency = "GNS", date = "2015-11-30", sale = "December
#'   Mares Sale", filename = "tattsSaleData")
#'
#' @export

getTatts <-
  function(url, catalogue = "", auctioneer, country, currency, date, csv = FALSE, rds = TRUE, sqlite = FALSE, sale = "", filename = "bloodstockSalesData") {
    # Scrape URL
    tattsData <- xml2::read_html(url)

    # Extract horse name, sire and dam data
    nameSexSireDam <-
      as.list(
        tattsData %>% rvest::html_nodes(
          ".row2+ .odd .lhs+ td , .row2+ .even .lhs+ td , .odd:nth-child(3) .lhs+ td"
        ) %>%
          rvest::html_text()
      )
    nameSexSireDam <- unlist(nameSexSireDam)
    nameSexSireDam <-
      as.data.frame(nameSexSireDam, stringsAsFactors = FALSE)

    # Extract Consignor and Purchaser data
    consignorPurchaser <-
      as.list(tattsData %>% rvest::html_nodes(".row2 .lhs+ td") %>%
                rvest::html_text())
    consignorPurchaser <- unlist(consignorPurchaser)
    consignorPurchaser <-
      as.data.frame(consignorPurchaser, stringsAsFactors = FALSE)

    # Extract Lot numbers
    lot <-
      as.list(
        tattsData %>% rvest::html_nodes(".row2+ .odd .lhs , .row2+ .even .lhs , .odd:nth-child(3) .lhs") %>%
          rvest::html_text()
      )

    lot <- unlist(lot)
    lot <- as.data.frame(lot, stringsAsFactors = FALSE)
    lot <- plyr::rename(lot,c("lot" = "Lot"))

    # Check for "Private Sales"
    if (grepl("Private Sales", lot$Lot[1]) == TRUE) {
      lot <- lot[-1, , drop = FALSE]
      nameSexSireDam <- nameSexSireDam[-1, , drop = FALSE]
      consignorPurchaser <- consignorPurchaser[-1, , drop = FALSE]
    }

    # Extract prices, remove commas, set NA to zero, set class to integer
    price <-
      as.list(
        tattsData %>% rvest::html_nodes(
          ".row2+ .odd .rhsr , .row2+ .even .rhsr , .odd:nth-child(3) .rhsr"
        ) %>%
          rvest::html_text()
      )

    price <- unlist(price)
    price <- as.data.frame(price, stringsAsFactors = FALSE)
    price <- plyr::rename(price,c("price" = "Price"))
    price$Price <- gsub(",", "", price$Price)
    price$Price[price$Price == ""] <- "0"
    price$Price <- as.integer(price$Price)

    # Separate name info, save as new data frame, rename column
    name <-
      sub("(.*?)).*", "\\1)", nameSexSireDam$nameSexSireDam) %>%
      as.data.frame(stringsAsFactors = FALSE)
    name <- plyr::rename(name, c("." = "Name"))

    # Separate sex info, save as new data frame, rename column
    sexInterim <-
      sub(".*?/ (.*?)", "\\1", nameSexSireDam$nameSexSireDam) # Account for horses with no names
    sexInterim <- sub(".*?) (.*?) .*", "\\1", sexInterim)
    sexInterim <- sub("\\.", " ", sexInterim)
    sex <- sub('.*(?=.{2}$)', '', sexInterim, perl = T) %>%
      as.data.frame(stringsAsFactors = FALSE)
    sex <- plyr::rename(sex, c("." = "Sex"))

    # Expand sex names to match other imported data sets
    sex$Sex[sex$Sex == "F."] <- "Filly"
    sex$Sex[sex$Sex == "C."] <- "Colt"
    sex$Sex[sex$Sex == "M."] <- "Mare"
    sex$Sex[sex$Sex == "G."] <- "Gelding"
    sex$Sex[sex$Sex == "H."] <- "Hack"
    sex$Sex[sex$Sex == "R."] <- "Rig"

    # Extract Colour info, save as new data frame rename column
    colour <- sub("(.*?) .*", "\\1", sexInterim) %>%
      as.data.frame(stringsAsFactors = FALSE)
    colour <- plyr::rename(colour, c("." = "Colour"))

    # Separate Sire info, save as new data frame, rename column
    sire <-
      sub(".*?by (.*?) x.*", "\\1", nameSexSireDam$nameSexSireDam)
    sire <-
      sub("(.*?)).*", "\\1)", sire) %>% # Accounting for horses with no names
      as.data.frame(stringsAsFactors = FALSE)
    sire <- plyr::rename(sire, c("." = "Sire"))

    # Separate Dam info, save as new data frame, rename column
    dam <- sub(".*?x (.*?)", "\\1", nameSexSireDam$nameSexSireDam)
    dam <-
      sub(".*?x (.*?)", "\\1", dam) # Doing it twice to account for Sire's with x in their name

    # Copy Sex info to blank cells
    dam <-
      sub(".*?/(.*?)).*", "\\1)", dam) %>% # Accounting for horses with no names
      as.data.frame(stringsAsFactors = FALSE)
    dam <- plyr::rename(dam, c("." = "Dam"))

    # Separate Purchaser info, save as new data frame, rename column
    purchaser <-
      sub(".*?Purchaser: (.*?)\n.*", "\\1", consignorPurchaser$consignorPurchaser) %>%
      as.data.frame(stringsAsFactors = FALSE)
    purchaser <- plyr::rename(purchaser, c("." = "Purchaser"))

    # Clean up Withdrawn, Vendor purchased and Not Sold to match other imported data sets
    purchaser$Purchaser[purchaser$Purchaser == "Lot Withdrawn"] <-
      "Withdrawn"
    purchaser$Purchaser[purchaser$Purchaser == "Lot Not Sold"] <-
      paste("Not Sold (", price$Price[purchaser$Purchaser == "Lot Not Sold"],")", sep = "")
    purchaser$Purchaser[purchaser$Purchaser == "Vendor"] <-
      paste("Vendor (", price$Price[purchaser$Purchaser == "Vendor"],")", sep = "")
    price$Price[grepl("Vendor", purchaser$Purchaser) == TRUE] <- 0
    price$Price[grepl("Not Sold", purchaser$Purchaser) == TRUE] <- 0

    # Separate Consignor info, save as new data frame, rename column
    consignor <-
      sub(".*?Consignor: (.*?) —.*", "\\1", consignorPurchaser$consignorPurchaser) %>%
      as.data.frame(stringsAsFactors = FALSE)
    consignor <- plyr::rename(consignor, c("." = "Consignor"))

    saleData <-
      cbind(lot, name, sex, colour, sire, dam, consignor, purchaser, price)

    # Clean up horses with no names
    saleData$Name[saleData$Name == saleData$Sire] <- ""

    # Create empty dataframe with correct column names.
    allCols <-
      data.frame(
        Lot = integer(), Name = character(), Foaled = integer(), Sex = character(), Type = character(), Colour = character(), Sire = character(), Dam = character(), Consignor = character(), Stabling = character(), Purchaser = character(), coveringSire = character(), Catalogue = character(), Price = integer(), stringsAsFactors =
          FALSE
      )

    # Bind empty dataframe with XLS data
    saleData <- plyr::rbind.fill(allCols, saleData)

    # Create new columns with data input from function options
    saleData$Auctioneer <- auctioneer
    saleData$Country <- country
    saleData$Currency <- currency
    saleData$saleDate <- date
    saleData$Catalogue <- catalogue
    saleData$Sale <- sale

    # Reset column data types
    saleData$Price <- as.integer(saleData$Price)
    saleData$saleDate <- as.Date(saleData$saleDate, "%Y-%m-%d")

    # Check to see if CSV file exists. Then write CSV.
    if (isTRUE(csv)) {
      if (!isTRUE(file.exists(paste(filename,".csv", sep = "")))) {
        write.csv(
          saleData, paste(filename,".csv", sep = ""), row.names = FALSE, na =
            ""
        )
      } else {
        saleDataSaved <-
          read.csv(
            paste(filename,".csv", sep = ""), sep = ",", stringsAsFactors =
              FALSE, as.is = TRUE
          )
        saleData$saleDate <- as.character(saleData$saleDate)
        saleDataFinal <- rbind(saleDataSaved, saleData)
        write.csv(
          saleDataFinal, paste(filename,".csv", sep = ""), row.names = FALSE, na =
            ""
        )
      }
    }

    # Check to see if RDS file exists. Then write RDS.
    if (isTRUE(rds)) {
      if (!isTRUE(file.exists(paste(filename,".rds", sep = "")))) {
        saleData[is.na(saleData)] <- ""
        saveRDS(saleData, paste(filename,".rds", sep = ""))
      } else {
        saleDataSaved <- readRDS(paste(filename,".rds", sep = ""))
        saleData[is.na(saleData)] <- ""
        saleDataFinal <- rbind(saleDataSaved, saleData)
        saveRDS(saleDataFinal, paste(filename,".rds", sep = ""))
      }
    }

    # Check to see if SQLite file exists. Then write SQLite file.
    if (isTRUE(sqlite)) {
      if (!requireNamespace("RSQLite", quietly = TRUE)) {
        stop(
          "The package RSQlite is required to generate the SQLite data file. Please install it and run the script again.",
          call. = FALSE
        )
      }
      if (!isTRUE(file.exists(paste(filename,".sqlite", sep = "")))) {
        saleData[is.na(saleData)] <- ""
        saleData$saleDate <- as.character(saleData$saleDate)
        con <-
          dbConnect(SQLite(), paste(filename,".sqlite", sep = ""))
        dbWriteTable(
          con, name = filename, value = transform(saleData, saleDate), row.names =
            FALSE, append = TRUE
        )
        dbDisconnect(con)
      } else {
        con <-
          dbConnect(SQLite(), paste(filename,".sqlite", sep = ""))
        sql1 <- paste("SELECT * FROM ",filename, sep = "")
        saleDataSaved <- dbGetQuery(con, sql1)
        saleData[is.na(saleData)] <- ""
        saleData$saleDate <- as.character(saleData$saleDate)
        saleDataFinal <- rbind(saleDataSaved, saleData)
        dbWriteTable(
          con, name = filename, value = transform(saleDataFinal, saleDate), row.names =
            FALSE, overwrite = TRUE
        )
        dbDisconnect(con)
      }
    }
  }
phillc73/pinhooker documentation built on Feb. 18, 2021, 9:21 p.m.