```r
knitr::opts_chunk$set( echo=TRUE, message=F, warning=F, fig.width=10 )
```

```{css, echo=F}
h1, h2 {
  margin-top: 100px;
  margin-bottom: 20px;
}

table {
  margin-left: auto;
  margin-right: auto;
  margin-top: 80px;
  margin-bottom: 800px;
}

img {
  max-width: 90%;
  display: block;
  margin-right: auto;
  margin-left: auto;
  margin-top: 30px;
  margin-bottom: 20px;
}
```

```r
library( dplyr )
library( pander )
library( xml2 )
library( RCurl ) # for url.exists
library( httr ) # for http_error
library( stringr ) # for str_extract
```

# Load Sample Websites

```r
# load sample of 20 URLs from 1023-EZ forms
sample.urls <- 
structure(list(EIN = c("01-0278788", "01-0468034", "01-0512631", 
"01-0531578", "01-0533351", "01-0604316", "01-0716774", "01-0764856", 
"01-0791209", "01-0868557", "01-0900532", "01-0963519", "02-0508950", 
"02-0538937", "02-0594382", "02-0653283", "02-0729067", "02-0760556", 
"02-0764719", "02-0785015"), ORGNAME = c("GOODWINS MILLS FIREFIGHTERS RELIEFASSOCIATION", 
"MAINE WELFARE DIRECTORS ASSOCIATION ", "MAINE MYCOLOGICAL ASSOCIATION INC ", 
"BUCKSPORT AREA CULTURAL ARTS SOCIETY- BACAS", "PORTLAND RUGBY FOOTBALL CLUB ", 
"KENNESAW MOUNTAIN HIGH SCHOOL TRACKFIELD BOOSTER CLUB INC", 
"WALNUT BEND ELEMENTARY SCHOOL PARENT-TEACHER ORGANIZATION", 
"FLORIDA CERT ASSOCIATION", "LADY EAGLE BASKETBALL BOOSTER CLUB", 
"GRAND FORKS DOG TRAINING CLUB", "YOUTH OF UNITY Y O U INC", 
"LAKEVILLE YOUTH FOOTBALL  CHEERLEADING", "GATE CITY HARLEY OWNERS GROUP", 
"FOREST PRESERVE USERS INC", "THE HERITAGE EDUCATION GROUP", 
"HABITAT FOR HUMANITY OF ELLIS COUNTY INC", "GO SOUNDS PAA INC", 
"NORTHEAST TEXAS EMMAUS INC", "CHIPPEWA FALLS CARDINAL HOOPS", 
"JOE UFF CANCER FOUNDATION INC "), ORGURL = c("HTTP://GMFD.ORG/GMFRA/GMFRAINDEX.HTM", 
"WWW.MAINEWELFAREDIRECTORS.ORG", "MAINELYMUSHROOMS.ORG", "WWW.BACASMAINE.ORG", 
"WWW.PORTLANDRFC.COM", "WWW.KMHSTRACKFIELD.COM", "WWW.WBE-PTO.ORG", 
"HTTP://WWW.FLACERTASSOCIATION.ORG", "WWW.LADYEAGLEBASKETBALL.NET", 
"GRANDFORKSDOGTRAININGCLUB.ORG", "WWW.UNITYOFYOUTH.ORG", "HTTP://WWW.LVYOUTH.COM", 
"GATECITYHOG.COM", "WWW.FORESTPRESERVEUSERSSNOWCLUB.COM", "WWW.HERITAGEEDUCATIONGROUP.ORG", 
"HTTPS://WWW.FACEBOOK.COM/HFHEC", "WWW.GOSOUNDSINC.COM", "WWW.NETEMMAUS.ORG", 
"WWW.CARDINALHOOPS.ORG", "WWW.UFFISTOUGH.COM")), class = "data.frame", row.names = c(NA, 
-20L))

head( sample.urls ) %>% pander()
```

# Cleaning Functions

```r
## To clean the URLs for the purpose of initial downloads
## Ensures that every URL starts with a consistent scheme prefix
cleanURLtoDownload <- function( name ){
  temp <- tolower( name )
  for( i in 1:length( temp ) ){
    if( grepl( "^http:/{2}", temp[ i ] ) ){
      temp[ i ] <- sub( "^http:/{2}", "", temp[ i ] ) # removes "http://"
      if( grepl( "^www\\.", temp[ i ] ) ){
        temp[ i ] <- paste0( "http://", temp[ i ] )
      } else {
        temp[ i ] <- paste0( "http://www.", temp[ i ] )
      }
    } else if( grepl( "^https:/{2}", temp[ i ] ) ){
      temp[ i ] <- sub( "^https:/{2}", "", temp[ i ] ) # removes "https://"
      if( grepl( "^www\\.", temp[ i ] ) ){
        temp[ i ] <- paste0( "https://", temp[ i ] )
      } else {
        temp[ i ] <- paste0( "https://www.", temp[ i ] )
      }
    } else { # no scheme at all: default to "http://"
      temp[ i ] <- paste0( "http://", temp[ i ] )
    }
  }
  temp <- gsub( "/$", "", temp ) # Remove trailing "/", important for initial URL generation
  return( temp )
}

## To clean the names of URLs for the purpose of saving mirrors
cleanName <- function( name ){
  temp <- sub( "^https?:/{2}", "", name ) # removes "http://" and "https://"
  temp <- sub( "^.*w{3}\\.", "", temp ) # removes "www."
  temp <- gsub( "/$", "", temp ) # removes any "/" at the end
  temp <- gsub( " ", "", temp ) # removes any " "
  temp <- gsub( "\\?|:", "", temp ) # removes any "\" or ":"
  temp <- gsub( "/", "_._", temp  ) # converts any "/" within the address to a representative string, "_._"
  return( temp )
}

## To clean the URLs for the purpose of comparing and removing duplicates
cleanURLtoCompare <- function( name ){
  temp <- gsub( " ", "", name ) # removes any " "
  temp <- gsub( "\\>/{2}", "/", temp ) # converts any instances of "//" to "/" after the http(s):// head text
  temp <- gsub( "/$", "", temp ) # removes any "/" at the end
  return( temp )
}
```
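A quick sanity check of the helpers, using inputs drawn from the sample file, shows how the transformations differ. The expected results (shown as comments) follow from the function definitions above:

```r
# Illustrative check of the cleaning helpers on sample-file inputs
demo.urls <- c( "HTTP://GMFD.ORG/GMFRA/GMFRAINDEX.HTM",
                "WWW.MAINEWELFAREDIRECTORS.ORG",
                "MAINELYMUSHROOMS.ORG" )

cleanURLtoDownload( demo.urls )
# [1] "http://www.gmfd.org/gmfra/gmfraindex.htm"
# [2] "http://www.mainewelfaredirectors.org"
# [3] "http://mainelymushrooms.org"

cleanName( cleanURLtoDownload( demo.urls ) )
# [1] "gmfd.org_._gmfra_._gmfraindex.htm"
# [2] "mainewelfaredirectors.org"
# [3] "mainelymushrooms.org"
```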

# Clean the Input File

```r
sample.urls$ORGURL_CLEANED <- cleanURLtoDownload( sample.urls$ORGURL )
sample.urls$ORGNAME <- gsub( " +$", "", sample.urls$ORGNAME ) # remove any trailing spaces

# Using the URLs column from the input file
URL.values <- sample.urls$ORGURL_CLEANED

compiled.URLs <- c() # collects every URL compiled throughout the process: the full list for each provided site
URL.redirect  <- c() # holds the redirected URL for the current site
input.URL     <- c() # holds the URL provided at the start of each iteration, before any cropping

head( URL.values )
```
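For a side-by-side look at what the cleaning changed (illustrative only):

```r
# Compare the raw URLs from the 1023-EZ forms with their cleaned versions
data.frame( raw     = sample.urls$ORGURL,
            cleaned = sample.urls$ORGURL_CLEANED ) %>%
  head() %>%
  pander()
```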

# Function for Checking URL Status

```r
checkURLstatus <- function( input.URL ){

  result <- data.frame(matrix(ncol=5,nrow=1,dimnames=list(NULL,
                        c("URL","Type","URL.Exists","HTTP.Status","Valid"))))

  result[["URL"]] <- input.URL #store the url being checked
  result[["Type"]] <- "raw"
  result[["URL.Exists"]] <- url.exists( result[["URL"]] ) 
  result[["HTTP.Status"]] <- tryCatch( http_status( GET( result[["URL"]] ) )[[1]] , 
              error = function( e ){ NA } )
  result[["Valid"]] <- result[["URL.Exists"]] && (result[["HTTP.Status"]] == "Success")

  # Crop the URL down to scheme + "www." + the first two alphabetic host labels
  URL.cropped <- str_extract( result[["URL"]], "^https?://w{3}\\.[[:alpha:]]*\\.[[:alpha:]]*" )

  # Retry with the cropped URL, but only if cropping matched and produced a new URL
  if( !is.na( URL.cropped ) && result[["Valid"]] == FALSE && URL.cropped != result[["URL"]] ){
          result_cropped <- data.frame(matrix(ncol=5,nrow=1,dimnames=list(NULL,
                        c("URL","Type","URL.Exists","HTTP.Status","Valid"))))

          result_cropped[["URL"]] <- URL.cropped #store the url being checked
          result_cropped[["Type"]] <- "cropped"
          result_cropped[["URL.Exists"]] <- if( url.exists( result_cropped[["URL"]] ) ) 
          result_cropped[["HTTP.Status"]] <- tryCatch( http_status( GET( result_cropped[["URL"]] ) )[[1]] , 
              error = function( e ){ NA } )
          result_cropped[["Valid"]] <- result_cropped[["URL.Exists"]] && (result_cropped[["HTTP.Status"]] == "Success")

          result <- rbind(result, result_cropped)
  }

  return( result )
}
```
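Note what the cropping pattern does and does not match: it keeps the scheme, the "www." prefix, and the first two purely alphabetic host labels, so paths are trimmed away but hyphenated or numeric hosts fail to match at all. A small illustration using two URLs from the sample:

```r
# Illustration of the cropping pattern used in checkURLstatus()
str_extract( c( "http://www.gmfd.org/gmfra/gmfraindex.htm",
                "http://www.wbe-pto.org" ),
             "^https?://w{3}\\.[[:alpha:]]*\\.[[:alpha:]]*" )
# [1] "http://www.gmfd.org" NA
```

The NA case is why the function guards the retry with `!is.na( URL.cropped )`.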

# Test the checkURLstatus Function

```r
input.URL <- URL.values[1]
print( checkURLstatus( input.URL ) )
```
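`checkURLstatus()` handles one URL at a time; a minimal sketch for checking the whole sample (it makes live network requests, so results vary by run and some sites may fail outright) is to loop and row-bind:

```r
# Sketch: check every sample URL and stack the per-URL result data frames
status.list <- lapply( URL.values, function( u ){
  tryCatch( checkURLstatus( u ), error = function( e ) NULL )
} )
status.df <- do.call( rbind, status.list )
head( status.df ) %>% pander()
```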

# Get the URL Embedded Within the Frame After Redirection

```r
getRedirectedFrameURL <- function( input.URL ){
        # Check whether the URL redirects; if so, the redirected URL is what
        # needs to be pasted to the front of any outgoing link
        URL.redirect <- cleanURLtoCompare( GET( input.URL )$url )

        html_page <- read_html( URL.redirect )
        # See if the website is contained within a frame. If so, extract the URL embedded within the frame 
        # Use that URL as the redirect value
        html_page %>%
        rvest::html_nodes( "frame") %>%
        rvest::html_attr( "src" ) -> frame.links

        # If the page declared any frames, use the URL embedded in the first frame
        if( length( frame.links ) != 0 ){
                frame.link <- frame.links[ !is.na( frame.links ) ][ 1 ]
                # Follow the frame source and return its (possibly redirected) URL
                return( cleanURLtoCompare( GET( frame.link )$url ) )
        } else {
                return( URL.redirect )
        }
}
```

# Test the getRedirectedFrameURL Function

```r
input.URL <- URL.values[1]
print( getRedirectedFrameURL( input.URL ) )
```
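`GET()` and `read_html()` both throw errors on unreachable hosts, so a batch run of this function benefits from a `tryCatch()` wrapper. This is a sketch, not part of the original pipeline:

```r
# Sketch: resolve the frame-embedded (or redirected) URL for each sample site,
# recording NA where the request fails
resolved.URLs <- vapply( URL.values, function( u ){
  tryCatch( getRedirectedFrameURL( u ), error = function( e ) NA_character_ )
}, character( 1 ) )

head( resolved.URLs )
```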

# Extract All Nodes

This section will show the code used to extract distinct nodes from HTML pages, with examples of what the nodes contain, their metadata, etc.
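Until that code is documented here, the sketch below shows one way to enumerate the distinct node types on a page with rvest; the selectors are illustrative assumptions, not the pipeline's actual extraction targets:

```r
# Sketch: parse the first cleaned sample URL and enumerate its node types
html_page <- read_html( URL.values[ 1 ] )

# Tally every element name that appears in the document
html_page %>%
  rvest::html_nodes( "*" ) %>%
  rvest::html_name() %>%
  table() %>%
  sort( decreasing = TRUE )

# Example of node metadata: link nodes and the URLs they point to
html_page %>%
  rvest::html_nodes( "a" ) %>%
  rvest::html_attr( "href" ) %>%
  head()
```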


