how2scrape: A Self-Contained Web Scraping Lab

rm(list=ls(all=T))

# load bills df

load('/a/ha-nfs-2-ib/export/data/bswift-1/eajones3/df.RData')

# Libraries

  require(rvest)
  require(dplyr)
  require(stringr)

# Fix text urls

df_bills <- df_bills %>%
  mutate(text_link = gsub('(\\?)', '/text?format=txt&', text_link))

# Cosponsor extracting function

  co_extractor <- function(url) {
    # Libaries
    require(rvest)
    require(dplyr)

    # Read html
    html.cosponsors <- read_html(url)

    # Extract cosponsor name(s)
    cosponsors <- html.cosponsors %>%
      html_nodes('#main a') %>%
      html_text()

    # Extract date of cosponsorship
    date_cosponsor <- html.cosponsors %>%
      html_nodes('td.date') %>%
      html_text()

    # Paste both vectors together
    # Separate by hypen for easy string splits later
    m <- paste(cosponsors, date_cosponsor, sep = ' - ')

    # If N_cosponsors > 1, collapse by semi-colon for easy string split later
    if (length(m) > 1) m <- paste(m, sep = '', collapse = '; ')
    if (length(m) == 0) m <- "No cosponsors"

    # Put in a random crawl-delay between 2 and 6
    d <- runif(1,2,6)
    Sys.sleep(d)

    return(m)

  }

# Text extracting function

  text_extractor <- function(url) {

    # Libraries
    require(rvest)
    require(dplyr)
    require(stringr)

    # Read html
    html <- read_html(url)

    # # Create logical proxy for presence of text
    # proxy_text <- html %>%
    #   html_nodes('.selected a') %>%
    #   html_text() %>%
    #   grepl('\\(',.)

    # Create logical proxy for enrolled bill or not
    proxy_enrolled <- html %>%
      html_nodes('#textVersion') %>%
      html_text() %>%
      grepl('Enrolled',.) %>%
      {if (length(.)==0) . <- FALSE else .}

      if (proxy_enrolled) {

        text <- html %>%
          # extract text of bill
          html_nodes('#billTextContainer') %>%
          html_text() %>%
          # Second check to ensure ther
          # Split into meaningful parts
          str_split(., '\n{2}') %>%
          .[[1]] %>%
          # drop blank elements
          .[. != ''] %>%
          # Drop everything prior to start of main text
          .[-(1:grep('An Act', .)[1])] %>%
          # Remove excess white space
          str_replace_all('[\n\t]', '') %>%
          trimws()

      } else {

        # Extract text
        text <- html %>%
          html_nodes('#billTextContainer') %>%
          html_text() %>%
          # Check to ensure there is text
          { if (length(.) > 0) . <-
              # Split into meaningful parts
              str_split(., '\n{2}') %>%
              .[[1]] %>%
              # Drop blank elements
              .[. != ''] %>%
              # Drop everything prior to start of main text
              .[-(1:grep('BILL|ACT|AMENDMENTS', .)[1])] %>%
              # remove excess white space
              str_replace_all('[\n\t]', '') %>%
              trimws() else . <- "No text available"
              }
      }

    # Put in a random crawl-delay between 2 and 6
    d <- runif(1,2,6)
    Sys.sleep(d)

    return(text)

  }

# Create safe version of text_extractor with purrr::possibly

  try_text <- purrr::possibly(text_extractor, otherwise = NA)

# Parallel Processing Libraries

  library(foreach)
  library(parallel)
  library(doParallel)

# Make cluster of cores

  cl <-  makeCluster(4, type = "FORK")

# Register the cluster

  registerDoParallel(cl)

# Create class which holds multiple results for each loop iteration.
# Each loop iteration populates two properties: $result1 and $result2.
# For a great tutorial on S3 classes, see:
# http://www.cyclismo.org/tutorial/R/s3Classes.html#creating-an-s3-class

  multiResultClass <- function(result1=NULL, result2=NULL)
  {
    me <- list(
      list(result1),
      list(result2)
    )

    ## Set the name for the class
    class(me) <- append(class(me),"multiResultClass")
    return(me)
  }

# Parallel loop over the cosponsor_links and bill text to extract
# respective data

  bill_container <- foreach(i = 1:nrow(df_bills)) %dopar% {
    result <- multiResultClass()
    result[[1]] <- co_extractor(df_bills$cosponsors_link[i])
    result[[2]] <- try_text(df_bills$text_link[i])
    return(result)
  }

# Stop cluster

  stopCluster(cl)

# Save result

  save(bill_container,
       file = '/a/ha-nfs-2-ib/export/data/bswift-1/eajones3/result.RData')

EandrewJones/how2scrape documentation built on Oct. 21, 2020, 5:13 p.m.

rdrr.io home R language documentation Run R code online

CRAN packages Bioconductor packages R-Forge packages GitHub packages

Note that we can't provide technical support on individual packages. You should contact the package authors for that.

EandrewJones/how2scrape
A Self-Contained Web Scraping Lab

ext_scripts/hpc_wslab.R
In EandrewJones/how2scrape: A Self-Contained Web Scraping Lab

R Package Documentation

Browse R Packages

We want your feedback!

EandrewJones/how2scrape A Self-Contained Web Scraping Lab

ext_scripts/hpc_wslab.R In EandrewJones/how2scrape: A Self-Contained Web Scraping Lab

R Package Documentation

Browse R Packages

We want your feedback!

EandrewJones/how2scrape
A Self-Contained Web Scraping Lab

ext_scripts/hpc_wslab.R
In EandrewJones/how2scrape: A Self-Contained Web Scraping Lab