knitr::opts_chunk$set(
  collapse = TRUE,
  comment = "#>",
  fig.path = "man/figures/README-",
  out.width = "100%"
)

scrappy: A Simple Web Scraper

r badger::badge_cran_release("scrappy", "black") r badger::badge_devel("villegar/scrappy", "yellow") r badger::badge_github_actions("villegar/scrappy")

The goal of scrappy is to provide simple functions to scrape data from different websites for academic purposes.

Installation

You can install the released version of scrappy from CRAN with:

install.packages("scrappy")

And the development version from GitHub with:

# install.packages("devtools")
devtools::install_github("villegar/scrappy")

Example

NOTE: To run the following examples on your computer, you need to download and install Mozilla Firefox (https://www.mozilla.org/en-GB/firefox/new/). Alternatively, you can use a different browser by changing the value of the browser argument in the call to RSelenium::rsDriver, as shown in the sketch below.
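
For example, a Chrome-based session could be started like this (a minimal sketch; it assumes a compatible ChromeDriver is installed and that the chosen port is free):

# Create an RSelenium session using Chrome instead of Firefox
# (alternative to the Firefox call used in the examples below)
rD <- RSelenium::rsDriver(browser = "chrome", port = 4549L, verbose = FALSE)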

# Create RSelenium session
rD <- RSelenium::rsDriver(browser = "firefox", port = 4549L, verbose = FALSE)

# Call scrappy
out_newa <- scrappy::newa_nrcc(
  client = rD$client,
  year = 2020,
  month = 12, # December
  station = "gbe", # Geneva (Bejo) station
  save_file = FALSE # Don't save output
)

out_gmaps <- scrappy::google_maps(
  client = rD$client,
  name = "Sefton Park",
  max_reviews = 20
)

out_gpp <- scrappy::find_a_gp(rD$client, postcode = "L69 3GL")

# Stop server
rD$server$stop()
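
If the browser window or the underlying Java process keeps running after stop(), a common RSelenium workaround (not a scrappy function) is to remove the driver object and trigger garbage collection:

# Remove the driver object and free its resources
rm(rD)
gc()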

NEWA @ Cornell University

The Network for Environment and Weather Applications at Cornell University. Website: http://newa.cornell.edu

# Create RSelenium session
rD <- RSelenium::rsDriver(browser = "firefox", port = 4549L, verbose = FALSE)

# Call scrappy
out <- scrappy::newa_nrcc(
  client = rD$client,
  year = 2020,
  month = 12, # December
  station = "gbe", # Geneva (Bejo) station
  save_file = FALSE # Don't save output to a CSV file
)
# Stop server
rD$server$stop()

Partial output from the previous example:

knitr::kable(head(out_newa, 10))
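
If you would rather keep a local copy of the table, the same call with save_file = TRUE should write it to a CSV file (a minimal sketch; the output file name and location are determined by the package, so check its documentation for details):

scrappy::newa_nrcc(
  client = rD$client,
  year = 2020,
  month = 12,
  station = "gbe",
  save_file = TRUE # Write the retrieved table to a CSV file
)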

Google Maps

Extract the reviews for Sefton Park in Liverpool (only the 20 most recent):

# Create RSelenium session
rD <- RSelenium::rsDriver(browser = "firefox", port = 4549L, verbose = FALSE)

# Call scrappy
out <- scrappy::google_maps(
  client = rD$client,
  name = "Sefton Park",
  max_reviews = 20
)
# Stop server
rD$server$stop()

Output after removing the original authors' names and the URLs to their profiles:

`%>%` <- scrappy::`%>%`
out_gmaps %>%
  dplyr::mutate(
    author = paste0("Author ", seq_along(author)),
    author_url = ""
  ) %>%
  knitr::kable()
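
To keep the scraped reviews for later analysis, the data frame can be written to disk with base R (write.csv comes from the utils package, not scrappy; the file name here is just an example):

# Save the scraped reviews to a CSV file
utils::write.csv(out_gmaps, "sefton_park_reviews.csv", row.names = FALSE)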

NHS GP practices by postcode

# Create RSelenium session
rD <- RSelenium::rsDriver(browser = "firefox", port = 4549L, verbose = FALSE)

# Retrieve GP practices near L69 3GL
# (Waterhouse building, University of Liverpool)
out <- scrappy::find_a_gp(rD$client, postcode = "L69 3GL")

# Stop server
rD$server$stop()

Output from the previous example:

knitr::kable(out_gpp)

