```r
knitr::opts_chunk$set(
  collapse = TRUE,
  comment = "#>"
)
```
```r
library(covid19interventions)
library(dplyr)
library(rvest)
library(RSelenium)
```
```r
# website with stay at home / shelter in place orders
url <- xml2::read_html("https://www.nytimes.com/interactive/2020/us/coronavirus-stay-at-home-order.html")

# identify elements to scrape at the state level
td_tags <- c('h3', 'p.l-order')
sah_ <- list()

# loop through multiple elements to scrape
for (i in td_tags){
  sah_[[i]] <- url %>%
    html_nodes('div.state-wrap.statewide') %>%
    html_nodes(i) %>%
    html_text()
}

# list to data frame conversion
names(sah_) <- c('state', 'order')
sah <- as.data.frame(sah_)
sah["county"] <- 'All'

# States that did not implement a statewide stay at home order:
# pull the county level info with the state attached
no_sah <- html_nodes(url, "div.state-wrap")

for (section in no_sah) {
  state <- html_nodes(section, "h3") %>% html_text()
  place <- html_nodes(section, "p.l-place") %>% html_text()
  order <- html_nodes(section, "p.l-order") %>% html_text()

  # counties appear under the l-place class for every state;
  # if a state has implemented SAH in one or more counties,
  # extract the county level SAH data
  if (length(place) >= 1){
    sah <- sah %>% add_row(state = state, order = order, county = place)
  }
}

# Format columns
sah$state <- unlist(lapply(sah$state, function(x){
  paste(unlist(strsplit(as.character(x), " About "))[1], collapse = " ")}))
sah$county <- unlist(lapply(sah$county, function(x){
  paste(unlist(strsplit(as.character(x), " About "))[1], collapse = " ")}))
sah$order_type <- unlist(lapply(sah$order, function(x){
  paste(unlist(strsplit(as.character(x), ", effective "))[1], collapse = "")}))
sah$date_time <- unlist(lapply(sah$order, function(x){
  paste(unlist(strsplit(as.character(x), ", effective "))[2], collapse = "")}))
sah$time <- unlist(lapply(sah$date_time, function(x){
  paste(unlist(strsplit(as.character(x), " at "))[2], collapse = "")}))
sah$date <- unlist(lapply(sah$date_time, function(x){
  paste(unlist(strsplit(as.character(x), " at "))[1], collapse = "")}))

# Fix date & filter columns
sah$order_date <- as.Date(sah$date, format = "%B %d")
sah <- sah[c("state", "county", "order_type", "order_date", "time")]

save(sah, file = "./data/sah.rda")
```
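Note that `as.Date()` with a `"%B %d"` format fills in the year from the current system date, so `order_date` only lands in 2020 if the code happens to run in 2020. A minimal sketch of one way to pin the year explicitly (this would replace the `as.Date()` call above, before the `date` column is dropped):

```r
# append the known year before parsing so order_date does not depend on the system clock
sah$order_date <- as.Date(paste(sah$date, "2020"), format = "%B %d %Y")
```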
```r
# website with closures in Georgia
schools <- xml2::read_html("https://www.wsbtv.com/weather/school-closings/")

# identify elements to scrape
td_tags <- c('td.name', 'td.county', 'td.date')
closures_ <- list()

# the html_table way does not work because of the description column in some rows,
# so loop through multiple elements to scrape each column of interest
for (i in td_tags){
  closures_[[i]] <- schools %>%
    html_nodes(i) %>%
    html_text()
}

# list to data frame conversion
names(closures_) <- c('name', 'county', 'date')
closures <- as.data.frame(closures_)

# Format date columns
closures$start_date <- unlist(lapply(closures$date, function(x){
  paste(unlist(strsplit(as.character(x), " "))[2:3], collapse = " ")}))
closures$end_date <- unlist(lapply(closures$date, function(x){
  paste(unlist(strsplit(as.character(x), " "))[4:5], collapse = " ")}))

# remove old date column
closures <- closures[, -3]

# add state column
closures$state <- 'Georgia'

# assign types to each row based on UArizona codes
closures$type <- ''
closures$type <- ifelse(grepl("School|Academy", closures$name, fixed = FALSE), "Pub-school-closed", closures$type)
closures$type <- ifelse(grepl("Church", closures$name, fixed = FALSE), "Churches-closed", closures$type)
closures$type <- ifelse(grepl("College|University", closures$name, fixed = FALSE), "Uni-school-closed", closures$type)
closures$type <- ifelse(grepl("Government|City", closures$name, fixed = FALSE), "Govt-fac-closed", closures$type)
```
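The `start_date` and `end_date` columns are left as character strings by the split above. A minimal sketch of converting them to `Date`, assuming the scraped strings look like `"Mar 16"` and that the closures fall in 2020:

```r
# assuming the split produces strings like "Mar 16": pin the year and parse
closures$start_date <- as.Date(paste(closures$start_date, "2020"), format = "%b %d %Y")
closures$end_date   <- as.Date(paste(closures$end_date, "2020"), format = "%b %d %Y")

# tally the assigned closure types; "" marks rows that matched none of the patterns
table(closures$type, useNA = "ifany")
```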
```{bash, eval=FALSE}
# start the Selenium standalone server (download the .jar from the link in the next chunk)
java -jar selenium-server-standalone-3.9.1.jar
```
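If the server is started by hand as above, a session can be opened against it directly with `remoteDriver()` instead of letting `rsDriver()` launch its own server. A minimal sketch, assuming the server is listening on the default port 4444:

```r
library(RSelenium)

# connect to the already-running standalone server (default port 4444)
remDr <- remoteDriver(remoteServerAddr = "localhost",
                      port = 4444L,
                      browserName = "firefox")
remDr$open()
remDr$navigate("https://www.wsbtv.com/weather/school-closings/")
remDr$getTitle()  # quick check that the page loaded
```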
```r
# Selenium way
library(RSelenium)
# .jar file here: http://selenium-release.storage.googleapis.com/index.html?path=3.9/

# URL with js-rendered content to be scraped
driver <- rsDriver(browser = c("firefox"))
remDr <- driver[["client"]]
remDr$open()
remDr$navigate("https://www.wsbtv.com/weather/school-closings/")

# test
# remDr$maxWindowSize()
# remDr$screenshot(display = TRUE)

# filter to county schools through the search box
searchbox <- remDr$findElement(using = "class name", 'filter-table')
searchbox$sendKeysToElement(list("County Schools", key = "enter"))

# close ad
Sys.sleep(8)
ad <- remDr$findElement(using = "class name", 'widget_fb_close')
ad$clickElement()

# columns to scrape
scrape_col <- c('name', 'county', 'date')
name_elmt   <- remDr$findElements(using = "class name", 'name')
county_elmt <- remDr$findElements(using = "class name", 'county')
date_elmt   <- remDr$findElements(using = "class name", 'date')

# extract text from each column's elements
name_scraped   <- unlist(lapply(name_elmt, function(x){x$getElementText()}))
county_scraped <- unlist(lapply(county_elmt, function(x){x$getElementText()}))
date_scraped   <- unlist(lapply(date_elmt, function(x){x$getElementText()}))

# next page
webElem <- remDr$findElement(using = "id", value = "page-selector-2")
webElem$clickElement()
# need to loop through the extraction steps above for each page; see the sketch below
```
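The last comment notes that the extraction has to be repeated for every page of results. A minimal sketch of that loop follows; the number of pages (`n_pages`) and the `page-selector-*` ids are assumptions based on the single id used above, and should be checked against the live page before use:

```r
# hypothetical pagination loop: re-scrape the table columns on each page, then advance
n_pages <- 3  # assumed number of result pages
pages <- list()

for (p in seq_len(n_pages)) {
  name_elmt   <- remDr$findElements(using = "class name", 'name')
  county_elmt <- remDr$findElements(using = "class name", 'county')
  date_elmt   <- remDr$findElements(using = "class name", 'date')

  pages[[p]] <- data.frame(
    name   = unlist(lapply(name_elmt,   function(x){x$getElementText()})),
    county = unlist(lapply(county_elmt, function(x){x$getElementText()})),
    date   = unlist(lapply(date_elmt,   function(x){x$getElementText()})),
    stringsAsFactors = FALSE
  )

  # click through to the next page, if there is one
  if (p < n_pages) {
    nxt <- remDr$findElement(using = "id", value = paste0("page-selector-", p + 1))
    nxt$clickElement()
    Sys.sleep(3)  # give the table time to re-render
  }
}

closures_scraped <- do.call(rbind, pages)
```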
```r
# Selenium way
library(RSelenium)
# .jar file here: http://selenium-release.storage.googleapis.com/index.html?path=3.9/

# URL with js-rendered content to be scraped
driver <- rsDriver(browser = c("firefox"))
remDr <- driver[["client"]]
remDr$open()
remDr$navigate("https://www.unacast.com/covid19/social-distancing-scoreboard")

# test
# remDr$maxWindowSize()
# remDr$screenshot(display = TRUE)

# example script: pull content out of an embedded iframe with rvest
library(rvest)
library(stringr)

# extract the iframe source from the scoreboard page
iframe_src <- html_session('https://covid19-scoreboard.unacast.com/') %>%
  html_node("iframe") %>%
  html_attr("src")

# build the full url to that iframe
# (the base url and selectors below come from an example scrapethissite.com script)
iframe_url <- str_c("https://scrapethissite.com", iframe_src)

# extract the text inside the iframe
turtle_names <- html_session(iframe_url) %>%
  html_nodes("div.root") %>%
  html_text()
print(turtle_names)
```
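Whichever page is being scraped, the Selenium session should be shut down once the extraction is done. A minimal sketch, assuming `driver` and `remDr` were created with `rsDriver()` as above:

```r
# close the browser session and stop the Selenium server started by rsDriver()
remDr$close()
driver$server$stop()
```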