knitr::opts_chunk$set( echo=TRUE, message=F, warning=F, fig.width=10 )
```{css, echo=F} h1, h2{ margin-top:100px; margin-bottom:20px; }
table { margin-left: auto; margin-right: auto; margin-top:80px; margin-bottom:800px; }
img { max-width: 90%; display: block; margin-right: auto; margin-left: auto; margin-top:30px; margin-bottom:20px; }
```r library( dplyr ) library( pander ) library( xml2 ) library( RCurl ) # for url.exists library( httr ) # for http_error library( stringr ) # for str_extract
# load sample of 20 URLs from 1023-EZ forms sample.urls <- structure(list(EIN = c("01-0278788", "01-0468034", "01-0512631", "01-0531578", "01-0533351", "01-0604316", "01-0716774", "01-0764856", "01-0791209", "01-0868557", "01-0900532", "01-0963519", "02-0508950", "02-0538937", "02-0594382", "02-0653283", "02-0729067", "02-0760556", "02-0764719", "02-0785015"), ORGNAME = c("GOODWINS MILLS FIREFIGHTERS RELIEFASSOCIATION", "MAINE WELFARE DIRECTORS ASSOCIATION ", "MAINE MYCOLOGICAL ASSOCIATION INC ", "BUCKSPORT AREA CULTURAL ARTS SOCIETY- BACAS", "PORTLAND RUGBY FOOTBALL CLUB ", "KENNESAW MOUNTAIN HIGH SCHOOL TRACKFIELD BOOSTER CLUB INC", "WALNUT BEND ELEMENTARY SCHOOL PARENT-TEACHER ORGANIZATION", "FLORIDA CERT ASSOCIATION", "LADY EAGLE BASKETBALL BOOSTER CLUB", "GRAND FORKS DOG TRAINING CLUB", "YOUTH OF UNITY Y O U INC", "LAKEVILLE YOUTH FOOTBALL CHEERLEADING", "GATE CITY HARLEY OWNERS GROUP", "FOREST PRESERVE USERS INC", "THE HERITAGE EDUCATION GROUP", "HABITAT FOR HUMANITY OF ELLIS COUNTY INC", "GO SOUNDS PAA INC", "NORTHEAST TEXAS EMMAUS INC", "CHIPPEWA FALLS CARDINAL HOOPS", "JOE UFF CANCER FOUNDATION INC "), ORGURL = c("HTTP://GMFD.ORG/GMFRA/GMFRAINDEX.HTM", "WWW.MAINEWELFAREDIRECTORS.ORG", "MAINELYMUSHROOMS.ORG", "WWW.BACASMAINE.ORG", "WWW.PORTLANDRFC.COM", "WWW.KMHSTRACKFIELD.COM", "WWW.WBE-PTO.ORG", "HTTP://WWW.FLACERTASSOCIATION.ORG", "WWW.LADYEAGLEBASKETBALL.NET", "GRANDFORKSDOGTRAININGCLUB.ORG", "WWW.UNITYOFYOUTH.ORG", "HTTP://WWW.LVYOUTH.COM", "GATECITYHOG.COM", "WWW.FORESTPRESERVEUSERSSNOWCLUB.COM", "WWW.HERITAGEEDUCATIONGROUP.ORG", "HTTPS://WWW.FACEBOOK.COM/HFHEC", "WWW.GOSOUNDSINC.COM", "WWW.NETEMMAUS.ORG", "WWW.CARDINALHOOPS.ORG", "WWW.UFFISTOUGH.COM")), class = "data.frame", row.names = c(NA, -20L)) head( sample.urls ) %>% pander()
## To clean the URLs for the purpose of intial downloads ## Ensures that every url starts with the same kind of header cleanURLtoDownload <- function( name ){ temp <- tolower( name ) for( i in 1:length( temp ) ){ if( grepl( "^http:/{2}", temp [ i ] ) == T ){ temp[ i ] <- sub( "^http:/{2}", "", temp[ i ] ) # removes "http://" if( grepl( "^www\\.*", temp[ i ] ) == TRUE ){ temp[ i ] <- paste0( "http://", temp[ i ] ) } else { temp[ i ] <- paste0( "http://www.", temp[ i ] ) } } else if( grepl( "^https:/{2}", temp [ i ] ) == T ){ temp[ i ] <- sub( "^https:/{2}", "", temp[ i ] ) # removes "https://" if( grepl( "^www\\.*", temp[ i ] ) == TRUE ){ temp[ i ] <- paste0( "https://", temp[ i ] ) } else { temp[ i ] <- paste0( "https://www.", temp[ i ] ) } } else if( grepl( "^https?:/{2}", temp [ i ] ) != T ) { temp[ i ] <- paste0( "http://", temp[ i ] ) } } temp <- gsub( "/$", "", temp ) # Remove trailing "/", important for initial URL generation return( temp ) } ## To clean the names of URLs for the purpose of saving mirrors cleanName <- function( name ){ temp <- sub( "^https?:/{2}", "", name ) # removes "http://" and "https://" temp <- sub( "^.*w{3}\\.", "", temp ) # removes "www." temp <- gsub( "/$", "", temp ) # removes any "/" at the end temp <- gsub( " ", "", temp ) # removes any " " temp <- gsub( "\\?|:", "", temp ) # removes any "\" or ":" temp <- gsub( "/", "_._", temp ) # converts any "/" within the address to a representative string, "_._" return( temp ) } ## To clean the URLs for the purpose of comparing and removing duplicates cleanURLtoCompare <- function( name ){ temp <- gsub( " ", "", name ) # removes any " " temp <- gsub( "\\>/{2}", "/", temp ) # converts any instances of "//" to "/" after the http(s):// head text temp <- gsub( "/$", "", temp ) # removes any "/" at the end return( temp ) }
sample.urls$ORGURL_CLEANED <- cleanURLtoDownload( sample.urls$ORGURL ) sample.urls$ORGNAME <- gsub( " $", "", sample.urls$ORGNAME ) # remove any trailing spaces # Using the URLs column from the input file URL.values <- sample.urls$ORGURL_CLEANED compiled.URLs <- c() #object used to collect every URL compiled throughout the process, the full list for each provided site. URL.redirect <- c() input.URL <- c() #holds the URLs provided at the start of each iteration, before any cropping head( URL.values )
checkURLstatus <- function( input.URL ){ result <- data.frame(matrix(ncol=5,nrow=1,dimnames=list(NULL, c("URL","Type","URL.Exists","HTTP.Status","Valid")))) result[["URL"]] <- input.URL #store the url being checked result[["Type"]] <- "raw" result[["URL.Exists"]] <- url.exists( result[["URL"]] ) result[["HTTP.Status"]] <- tryCatch( http_status( GET( result[["URL"]] ) )[[1]] , error = function( e ){ NA } ) result[["Valid"]] <- result[["URL.Exists"]] && (result[["HTTP.Status"]] == "Success") URL.cropped <- str_extract( result[["URL"]], "^https?://w{3}\\.[[:alpha:]]*\\.[[:alpha:]]*" ) if( result [["Valid"]] == FALSE && URL.cropped != result[["URL"]] ){ result_cropped <- data.frame(matrix(ncol=5,nrow=1,dimnames=list(NULL, c("URL","Type","URL.Exists","HTTP.Status","Valid")))) result_cropped[["URL"]] <- URL.cropped #store the url being checked result_cropped[["Type"]] <- "cropped" result_cropped[["URL.Exists"]] <- if( url.exists( result_cropped[["URL"]] ) ) result_cropped[["HTTP.Status"]] <- tryCatch( http_status( GET( result_cropped[["URL"]] ) )[[1]] , error = function( e ){ NA } ) result_cropped[["Valid"]] <- result_cropped[["URL.Exists"]] && (result_cropped[["HTTP.Status"]] == "Success") result <- rbind(result, result_cropped) } return( result ) }
input.URL <- URL.values[1] print( checkURLstatus( input.URL ) )
getRedirectedFrameURL <- function( input.URL ){ # Add a check if the URL value redirects. If so, that redirected portion is what needs to be pasted to the front of the outgoing link URL.redirect <- cleanURLtoCompare( GET( input.URL )[ 1 ] ) html_page <- read_html( URL.redirect ) # See if the website is contained within a frame. If so, extract the URL embedded within the frame # Use that URL as the redirect value html_page %>% rvest::html_nodes( "frame") %>% rvest::html_attr( "src" ) -> frame.links # Include some sort of check; if frame.links != length( 0 ), then should use that URL if( length( frame.links ) != 0 ){ # Definitely have it overwrite the html_page, need to wrap in read_html() html_page <- read_html( frame.links[ !is.na( frame.links ) ] ) # Have this overwrite the redirected value return( cleanURLtoCompare( GET( frame.links[ !is.na( frame.links ) ] )[1] ) ) } else{ return( URL.redirect ) } }
input.URL <- URL.values[1] print( getRedirectedFrameURL( input.URL ) )
Show the code to extract distinct nodes from HTML pages. Show examples of what the nodes contain, their meta-data, etc.
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.