# (rdrr.io page-scrape artifact; not part of the package source)
## Deprecated for now - requires significant work following changes to webpage
#' Scrape Bath Rugby match dates, kick-off times and results
#'
#' Web scraping function to gather dates and times of Bath Rugby matches, and
#' whether or not Bath won, from the
#' \href{http://www.bathrugby.com/fixtures-results/results-tables/results-match-reports/}{Bath Rugby website}.\cr\cr
#' \emph{Note: This function's code is heavily commented so that if you want
#' to, you can use it as a tutorial/guide for writing similar functions of
#' your own! You can view the commented code on the GitHub repo
#' \href{https://github.com/owenjonesuob/BANEScarparking/blob/master/R/web_scraping.R}{here}}.
#'
#' @param x A vector of years of seasons to retrieve records from.
#' @return A data frame of kick-off date-times and match outcomes.
#' @examples
#' # Return matches from 2016/17 season
#' rugby <- get_rugby(2017)
#' \donttest{
#' # Return matches from 2014/15, 2015/16 seasons
#' seasons <- c(2015, 2016)
#' rugby <- get_rugby(seasons)
#' }
#' @export
get_rugby <- function(x) {
### TO FIX
# NOTE(review): this scraper is disabled -- the Bath Rugby site layout
# changed, so the early return below short-circuits the function and makes
# everything after it unreachable. The dead code is deliberately kept as a
# heavily-commented web-scraping tutorial; do not delete it when "fixing".
return("Results not available, for now :(")
# Set up data frame, to be added to shortly
rugby <- data.frame(GMT = character(0), HomeWin = logical(0))
# For each season:
for (i in 1:length(x)) {
# Put together the link to the webpage for the season's fixtures
address <- paste0("https://www.bathrugby.com/fixtures-results/",
"results-tables/results-match-reports/",
"?seasonEnding=", x[i])
# "Parse" page so we can work with it in R
parsedpage <- RCurl::getURL(address,
.opts = RCurl::curlOptions(followlocation = TRUE)) %>%
XML::htmlParse()
# Scrape required information from page (first, home match dates):
# xpathSApply returns a vector (simplified from a list, hence SApply
# rather than Apply) of HTML "nodes" satisfying certain conditions...
dates <- XML::xpathSApply(parsedpage,
# First, (visually) find the information we're
# interested in on the webpage (just using a
# browser). Then use the developer console
# (launch with F12, or Cmd+Option+I on Mac) to
# inspect the HTML.
#
# Now to grab the bits of HTML we are
# interested in. Define conditions using XPath
# query language:
# * //dd finds all nodes of type "dd"
# * [@class='...'] takes the subset of nodes
# with "class" attribute "..."
# * / finds all "child" nodes of currently
# selected nodes
paste0("//dd[@class='fixture homeFixture']",
"/span[@class='fixtureDate']"),
# We want the value (in this case, the string)
# contained between the HTML tags of these
# nodes (the 'contents' of the node): the XML
# function xmlValue does this for us!
XML::xmlValue) %>%
# We currently have a string such as " 6 Sep 2014" or "13 Sep 2014",
# but this would be more useful as a date-time!
# See documentation of strptime for % abbreviations to use in the
# "format" string. In this case we're telling the as.POSIXct
# function that:
# '%n': there might be a space, or a few spaces
# '%e': day number, as a decimal
# ' ' : there's definitely a space here
# '%b': abbreviated month name
# ' ' : another space
# '%Y': 4-digit year
as.POSIXct(format = "%n%e %b %Y", tz = "UTC") %>%
# Reverse order of elements (to restore chronological order)
rev()
# Now, kick-off times:
KO <- XML::xpathSApply(parsedpage,
paste0("//dd[@class='fixture homeFixture']",
"/span[@class='fixtureTime']"),
XML::xmlValue) %>%
# Values are currently strings of form "Kick Off 00:00"
# Take 10th character onwards:
substring(10) %>%
# Convert to hour-minute time
lubridate::hm() %>%
rev()
# Combine date and time (just use +, R handles this for us!)
GMT <- dates + KO
# Results of the matches: did Bath win? (Maybe people hang around in
# town for longer afterwards if so.)
HomeWin <- XML::xpathSApply(parsedpage,
paste0("//dd[@class='fixture homeFixture']",
"/span[@class='fixtureResult']"),
XML::xmlValue) %>%
# Values are "Bath won ..." or "Bath lost ...".
# grepl returns TRUE if element contains a string, FALSE if it
# doesn't. Note: The "." is the pipe's "placeholder" argument: by
# default, %>% passes on the previous step's result as the FIRST
# parameter. Here, we need to pass it as the second argument, so
# have to use "." to represent it.
grepl("won", .) %>%
rev()
# Stick the GMT and HomeWin vectors together as columns of a data frame
toAdd <- data.frame(GMT, HomeWin)
# Attach this dataframe to the existing data frame of all matches
rugby <- rbind(rugby, toAdd)
}
# Return the complete data frame
rugby
}
#' Scrape the number of advertised events in Bath for each day
#'
#' Web scraping function to retrieve the number of events advertised at
#' \url{http://www.bath.co.uk/events} for each day in a specified range of
#' months.\cr\cr
#' \emph{Note: Have a look at this package's GitHub repo - in particular,
#' \href{https://github.com/owenjonesuob/BANEScarparking/blob/master/R/web_scraping.R}{here}
#' - to see the code for this function, along with comments which
#' explain the process followed. See \code{\link{get_rugby}} for a similar
#' function with more detailed commentary!}
#'
#' @param from A date or date-time object, or YYYY-MM-DD string: the first day
#' from which to get an event count.
#' @param to A date or date-time object, or YYYY-MM-DD string: the last day
#' from which to get an event count.
#' @return A data frame of daily event counts for each day in the specified
#' range.
#' @examples
#' # Return event count for 01 January 2015
#' events <- get_events("2015-01-01", "2015-01-01")
#' \donttest{
#' # Return daily event counts from 01 Oct 2014 to 17 Jul 2015
#' events <- get_events("2014-10-01", "2015-07-17")
#'
#' # Return daily event counts for all months in date range of parking records
#' raw_data <- get_all_crude()
#' df <- refine(raw_data)
#'
#' events <- get_events(min(df$LastUpdate), max(df$LastUpdate))
#' }
#' @seealso \code{\link{get_events_detail}}
#' @export
get_events <- function(from, to) {
  # First day of each month in the requested range, as "YYYY-MM-DD" strings.
  # NOTE(review): the original substr(., 0, nchar(.) + 2) was a no-op (its
  # end index always exceeded the string length), so the full date was kept
  # in the URL; as.character() makes that explicit. The variable name hints
  # that a "YYYY-MM" truncation may once have been intended -- confirm
  # against the (since-redesigned) site before changing the URLs.
  year_month <- seq(as.Date(from), as.Date(to), by = "months") %>%
    as.character()

  # Web addresses of the monthly "past events" listing pages
  addresses <- paste0("http://www.bath.co.uk/events/", year_month)

  # One integer vector of daily event counts per month, named by month
  event_count <- vector("list", length(addresses))
  names(event_count) <- year_month

  # Console progress bar; guarantee it is closed even if scraping fails
  # (the original never closed it, leaving the connection open)
  pb <- utils::txtProgressBar(min = 0, max = length(addresses),
                              initial = 0, style = 3)
  on.exit(close(pb), add = TRUE)

  # For each year-month:
  for (i in seq_along(addresses)) {
    # Fetch the page (following redirects) and parse it for XPath queries
    parsedpage <- RCurl::getURL(addresses[i],
                                .opts = RCurl::curlOptions(followlocation = TRUE)) %>%
      XML::htmlParse()

    # Calendar cells belonging to the current month (one per day)
    day_events <- XML::xpathApply(parsedpage,
                                  paste0("//td[contains(@class, ",
                                         "'tribe-events-thismonth')]"))

    # Number of events on each day of this month
    daily_event_count <- vector("integer", length(day_events))

    # For each day:
    for (j in seq_along(day_events)) {
      # A "View all x events" node appears when a day has more events than
      # fit in its calendar cell
      view_more <- XML::xpathSApply(day_events[[j]],
                                    "div[@class='tribe-events-viewmore']",
                                    XML::xmlValue)

      if (length(view_more) > 0) {
        # Isolate x from "View all x events": strip every non-digit
        # character (the caret ^ means "all except"), then convert the
        # remaining string "x" to a number
        daily_event_count[j] <- view_more %>%
          gsub("[^0-9]", "", .) %>%
          as.numeric()
      } else {
        # No "view more" node: count the divs in the cell. There is one
        # more div than events, because the day number (not an event!)
        # also sits in a div
        daily_event_count[j] <- XML::xpathApply(day_events[[j]],
                                                "div") %>%
          length() - 1
      }
    }

    # Store this month's daily counts and update the progress bar
    event_count[[i]] <- daily_event_count
    utils::setTxtProgressBar(pb, i)
  }

  # Trim the last month down to the "to" day...
  lec <- length(event_count)
  event_count[[lec]] <- event_count[[lec]][seq_len(lubridate::day(to))]
  # ...and the first month up from the "from" day
  event_count[[1]] <- event_count[[1]][lubridate::day(from):length(event_count[[1]])]

  # Pair every date in the range with its daily count ("unlist" flattens
  # the per-month list into one long vector)
  all_dates <- seq(as.Date(from), as.Date(to), by = "days")
  events <- data.frame(Date = all_dates, count = unlist(event_count))

  # Drop unnecessary numeric row names and return the complete data frame
  rownames(events) <- NULL
  events
}
#' Scrape the details of advertised events in Bath for each day
#'
#' Web scraping function to retrieve the detail for events advertised at
#' \url{http://www.bath.co.uk/events} for each day in a specified range of
#' dates.
#' @author Ryan Kenning (@@rkenning)
#' @param from A date or date-time object, or string, of the first
#' date for which to find events.
#' @param to A date or date-time object, or string, of the last
#' date for which to find events.
#' @return A data frame of daily event details for each day in the specified
#' range of months.
#' @examples
#' # Return event details for 01 January 2015
#' events <- get_events_detail("2015-01-01", "2015-01-01")
#' \dontrun{
#' # Return daily event details from 01 Oct 2014 to 08 Oct 2014
#' events <- get_events_detail("2014-10-01", "2014-10-08")
#' }
#' @seealso \code{\link{get_events}}
#' @export
get_events_detail <- function(from, to) {
  # One "YYYY-MM-DD" string per day in the requested range.
  # NOTE(review): the original substr(., 0, nchar(.) + 6) was a no-op (its
  # end index always exceeded the string length); as.character() makes the
  # intent explicit without changing the generated URLs.
  year_month_day_seq <- seq(as.Date(from), as.Date(to), by = "days") %>%
    as.character()

  # Web addresses of the daily "past events" pages
  addresses <- paste0("http://www.bath.co.uk/events/", year_month_day_seq)

  # Empty data frame with the full column set; per-day results are rbind-ed
  # onto it below
  events <- data.frame(year_month_day = character(0),
                       time_slot = character(0),
                       event_name = character(0),
                       event_location_name = character(0),
                       event_postcode = character(0),
                       event_street = character(0),
                       event_locality = character(0),
                       event_start = character(0),
                       event_end = character(0),
                       stringsAsFactors = FALSE)

  # Return a node's text, or NA when the node is missing or empty.
  # Hoisted out of the scraping loops (the original redefined it on every
  # timeslot iteration).
  html_text_na <- function(x, ...) {
    txt <- try(rvest::html_text(x, ...))
    if (inherits(txt, "try-error") || (length(txt) == 0)) {
      return(NA)
    }
    txt
  }

  # Console progress bar; guarantee it is closed even if scraping fails
  pb <- utils::txtProgressBar(min = 0, max = length(year_month_day_seq),
                              initial = 0, style = 3)
  on.exit(close(pb), add = TRUE)

  # For each day, loop through the URLs:
  for (ymd in seq_along(addresses)) {
    # Fetch and parse the day's HTML page
    webpage <- xml2::read_html(addresses[ymd])

    # Timeslot blocks on the page. The first block is skipped, as in the
    # original implementation -- presumably a header/"all day" block;
    # TODO(review): confirm against the page markup.
    event_time_slot <- webpage %>%
      rvest::html_nodes(".tribe-events-day-time-slot")

    # Guard: the original 2:length(...) mis-iterates (e.g. 2:0) when a page
    # has fewer than two timeslot blocks
    if (length(event_time_slot) >= 2) {
      for (i in 2:length(event_time_slot)) {
        # Header text of the current timeslot (e.g. a time of day)
        time_slot_val <- event_time_slot[i] %>%
          rvest::html_node(xpath = ".//h5") %>%
          rvest::html_text()

        # One node per event advertised in this timeslot
        event_detail_nodes <- event_time_slot[i] %>%
          rvest::html_nodes(xpath = ".//div[contains(@class, 'type-tribe_events')]")

        n_events <- length(event_detail_nodes)
        if (n_events > 0) {
          # Preallocate one value per event
          event_name <- character(n_events)
          event_location_name <- character(n_events)
          event_postcode <- character(n_events)
          event_street <- character(n_events)
          event_locality <- character(n_events)
          event_start <- character(n_events)
          event_end <- character(n_events)

          # Extract each event's details; html_node (singular) combined
          # with html_text_na yields NA for missing fields instead of
          # erroring (html_nodes can return zero matches, which would
          # break the scalar assignment)
          for (j in seq_len(n_events)) {
            event_name[j] <- rvest::html_node(event_detail_nodes[j], xpath = ".//a[@class='url']") %>%
              html_text_na(trim = TRUE)
            event_location_name[j] <- rvest::html_node(event_detail_nodes[j], xpath = ".//div[@class='tribe-events-venue-details']//a/text()") %>%
              html_text_na()
            event_postcode[j] <- rvest::html_node(event_detail_nodes[j], xpath = ".//*[contains(concat( ' ', @class, ' ' ), 'tribe-postal-code')]") %>%
              html_text_na()
            event_street[j] <- rvest::html_node(event_detail_nodes[j], xpath = ".//*[contains(concat( ' ', @class, ' ' ), 'tribe-street-address')]") %>%
              html_text_na()
            event_locality[j] <- rvest::html_node(event_detail_nodes[j], xpath = ".//*[contains(concat( ' ', @class, ' ' ), 'tribe-locality')]") %>%
              html_text_na()
            event_start[j] <- rvest::html_node(event_detail_nodes[j], xpath = ".//*[contains(concat( ' ', @class, ' ' ), 'tribe-event-date-start')]") %>%
              html_text_na()
            event_end[j] <- rvest::html_node(event_detail_nodes[j], xpath = ".//*[contains(concat( ' ', @class, ' ' ), 'tribe-event-date-end')]") %>%
              html_text_na()
          }

          # Bind this timeslot's rows onto the accumulator. The first
          # column is named explicitly so the columns line up with
          # `events` -- the original's unnamed year_month_day_seq[ymd]
          # produced a mismatched column name and broke rbind()
          new_events <- data.frame(year_month_day = year_month_day_seq[ymd],
                                   time_slot = rep(time_slot_val, n_events),
                                   event_name, event_location_name,
                                   event_postcode, event_street,
                                   event_locality, event_start, event_end,
                                   stringsAsFactors = FALSE)
          events <- rbind(events, new_events)
        }
      }
    }

    # Update the progress bar
    utils::setTxtProgressBar(pb, value = ymd)
  }

  # Return the accumulated data frame
  events
}
## Deprecated for now - requires significant work following changes to webpage
# #' Scrape daily weather records for Bath
# #'
# #' This function scrapes the \href{https://www.wunderground.com/}{Wunderground}
# #' website to get daily weather summaries for Bath over a given date range.
# #'
# #' @param from A date or date-time object, or YYYY-MM-DD string: the first day
# #' from which to get a weather summary.
# #' @param to A date or date-time object, or YYYY-MM-DD string: the last day
# #' from which to get a weather summary.
# #' @return A data frame of daily weather summaries for each day in the specified
# #' range.
# #' @examples
# #' # Return weather summary for 01 January 2015
# #' weather <- get_daily_weather("2015-01-01", "2015-01-01")
# #' \dontrun{
# #' # Return daily weather summaries from 01 Oct 2014 to 17 Jul 2016
# #' weather <- get_daily_weather("2014-10-01", "2016-07-17")
# #'
# #'
# #' # Return daily weather records for all days in date range of parking records
# #'
# #' library(lubridate)
# #'
# #' raw_data <- get_all_crude()
# #' df <- refine(raw_data)
# #'
# #' weather <- get_daily_weather(min(df$LastUpdate), max(df$LastUpdate))
# #' }
# #' @export
#
# get_daily_weather <- function(from, to) {
#
# # If we've been asked for more than 398 records, chop into chunks (table
# # on the site only shows up to 398 records at once!)
# from_dates <- seq(lubridate::as_date(from), lubridate::as_date(to), by = "395 day")
# to_dates <- from_dates + 394
# # Last one needs to be up to "to" - and add 1 extra day, else single-day
# # requests don't work
# to_dates[length(to_dates)] <- lubridate::as_date(to) + 1
#
#
# # Initialise an empty data frame
# weather <- data.frame()
#
#
# for (i in seq_along(from_dates)) {
#
# # Make start date into correct format, and extract numbers from end date
# from_day <- gsub("-", "/", as.character(from_dates[i]))
# day_end <- lubridate::day(to_dates[i])
# month_end <- lubridate::month(to_dates[i])
# year_end <- lubridate::year(to_dates[i])
#
#
#
#
# # Set up the URL query (EGTG is closest location to Bath)
# url <- paste0("https://www.wunderground.com/history/airport/EGTG/",
# from_day,
# "/CustomHistory.html?dayend=", day_end,
# "&monthend=", month_end,
# "&yearend=", year_end,
# "&req_city=&req_state=&req_statename=&reqdb.zip=&reqdb.magic=&reqdb.wmo=")
#
# # Get the HTML content of the webpage
# wg <- xml2::read_html(url)
#
# # Find the second table on the webpage (the massive weather table)
# wgtab <- rvest::html_table(wg, header = FALSE)[[2]]
#
# # Set column names
# names(wgtab) <- c("GMT", "max_temp_c", "mean_temp_c",
# "min_temp_c", "max_dewpoint_c", "mean_dewpoint_c",
# "min_dewpoint_c", "max_humidity", "mean_humidity",
# "min_humidity", "max_pressure_hpa",
# "mean_pressure_hpa", "min_pressure_hpa",
# "max_visibility_km", "mean_visibility_km", "min_visibility_km",
# "max_wind_kmph", "mean_wind_kmph",
# "max_gust_kmph", "precip_mm", "precip_type")
#
# # The GMT column of the table currently contains years, months AND days;
# # we'll use regex to separate them out into new columns
# # First, if GMT contains a 1 or 2-digit number then this must be a day
# # number, so put this number into a new column; or put in an NA if GMT
# # did NOT contain a 1 or 2-digit number
# wgtab <- dplyr::mutate(wgtab, day = ifelse(grepl("^\\d{1,2}$", GMT), GMT, NA),
# # Do the same for 3-character strings (month names)
# month = ifelse(grepl("^[A-Za-z]{3}$", GMT), GMT, NA),
# # Do the same for 4-digit numbers (year numbers)
# year = ifelse(grepl("^\\d{4}$", GMT), GMT, NA)) %>%
# # Whenever there's an NA in the month or year columns (i.e. most of the
# # time!) fill in each block of NAs with the first previous non-NA value;
# # e.g. Jan becomes Jan
# # NA Jan
# # NA Jan
# # Feb Feb
# # NA Feb
# dplyr::mutate(month = zoo::na.locf(month, na.rm = FALSE),
# year = zoo::na.locf(year, na.rm = FALSE)) %>%
# # Wherever there's an NA in the day column, this was a row containing
# # (unnecessary) headers, rather than data; so we drop these rows
# dplyr::filter(!is.na(day)) %>%
# # Now we stick the year-month-day columns together and convert to a date
# dplyr::mutate(Date = lubridate::ymd(paste(year, month, day, sep = "-"))) %>%
# # Then we can drop the old GMT column and the year-month-day columns
# dplyr::select(-day, -month, -year, -GMT) %>%
# # Re-order the columns to get Date on the left (currently on the right)
# dplyr::select(Date, dplyr::everything()) %>%
# # Lastly, the max_gust_kmph column is always empty, and precip_type
# # column needs splitting into separate columns
# dplyr::mutate("was_rainy" = grepl("Rain", precip_type),
# "was_snowy" = grepl("Snow", precip_type),
# "was_foggy" = grepl("Fog", precip_type)) %>%
# dplyr::select(-max_gust_kmph, -precip_type)
#
#
# # Add this set of records to the table
# weather <- dplyr::bind_rows(weather, wgtab)
#
# }
#
# # Return the table! (Trim the last row, because remember we had to ask for 1
# # extra day)
# weather[1:(nrow(weather)-1), ]
# }
# (rdrr.io page-scrape artifacts; not part of the package source)
# Any scripts or data that you put into this service are public.
# Add the following code to your website.
# For more information on customizing the embed code, read Embedding Snippets.