#' LinkExtractor
#'
#' Fetch and parse a document by URL, to extract page info, HTML source and links (internal/external).
#' The fetching process can be done by an HTTP GET request or through a webdriver (phantomjs) which simulates real browser rendering.
#'
#'
#' @param url character, the URL to fetch and parse.
#' @param id  numeric, an id to identify a specific web page in a website collection, auto-generated by the \code{\link{Rcrawler}} function.
#' @param lev numeric, the depth level of the web page, auto-generated by the \code{\link{Rcrawler}} function.
#' @param IndexErrPages character vector, the HTTP status codes that are allowed to be parsed. By default \code{IndexErrPages<-c(200)}, which means only successful page requests are parsed. E.g. to also parse 404 error pages, use \code{IndexErrPages<-c(200,404)}.
#' @param Useragent character, the User-Agent string sent with the request; defaults to a Firefox user-agent. We recommend using a regular browser user-agent to avoid being blocked by some servers.
#' @param use_proxy object created by the httr::use_proxy() function, if you want to use a proxy to retrieve the web page (does not work with the webdriver).
#' @param Timeout numeric, the request timeout in seconds, default to 6.
#' @param urlsZoneXpath XPath pattern(s) of the page section(s) from which links should be exclusively gathered/collected (see the zone example below).
#' @param URLlenlimit integer, maximum URL length to process, default to 255 characters (useful to avoid spider traps).
#' @param urlregexfilter character vector, one or more regular expressions; only extracted internal urls matching them are collected.
#' @param urlExtfilter character vector, the list of file extensions to exclude from parsing. Currently, only html pages are processed (parsed, scraped); to define your own list use \code{urlExtfilter<-c(ext1,ext2,ext3)}.
#' @param ExternalLInks boolean, default FALSE; if set to TRUE, external links are also returned.
#' @param urlbotfiler character vector, directories/files restricted by robots.txt.
#' @param encod character, web page character encoding.
#' @param removeparams character vector, list of url parameters to be removed from web page internal links.
#' @param removeAllparams boolean, if TRUE the list of scraped urls will have no parameters.
#' @param Browser the client object of a remote headless web driver (virtual browser), created by the \code{br<-run_browser()} function, or a logged-in browser session object created by \link{LoginSession}, after installing the web driver agent with \code{install_browser()}. See examples below.
#' @param RenderingDelay the time required by a web page to be fully rendered, in seconds.
#'
#' @return returns a list of three elements: the first is a list containing the web page details (url, encoding-type, content-type, content ... etc), the second is a character vector containing the retrieved internal urls and the third is a vector of external urls. When the page is fetched through a web driver, a fourth element \code{OtherLinksTags} is also returned, holding link elements that have no href attribute.
#' @author salim khalil
#'
#' @import webdriver
#' @importFrom  httr GET
#' @importFrom  httr user_agent
#' @importFrom  httr use_proxy
#' @importFrom  httr timeout
#' @importFrom  httr content
#' @importFrom  httr add_headers
#' @importFrom  data.table %chin% %like%
#' @importFrom  xml2  xml_find_all
#' @importFrom  xml2  xml_text
#' @importFrom  xml2  xml_find_first
#'
#' @examples
#'
#'\dontrun{
#'
#' ###### Fetch a URL using GET request :
#' ######################################################
#' ##
#' ## Very fast, but can't fetch javascript-rendered pages or sections
#'
#' # fetch the page with the default config, then return page info and internal links
#'
#' page<-LinkExtractor(url="http://www.glofile.com")
#'
#' # this will also return external links
#'
#' page<-LinkExtractor(url="http://www.glofile.com", ExternalLInks = TRUE)
#'
#' # Specify a Useragent to avoid being blocked by some websites' bot rules
#'
#' page<-LinkExtractor(url="http://www.glofile.com", ExternalLInks = TRUE,
#'        Useragent = "Mozilla/5.0 (Windows NT 6.3; Win64; x64)")
#'
#' # By default, only successful HTTP responses are parsed; therefore, to force
#' # parsing of error pages like 404 you need to specify IndexErrPages,
#'
#' page<-LinkExtractor(url="http://www.glofile.com/404notfoundpage",
#'       ExternalLInks = TRUE, IndexErrPages = c(200,404))
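#'
#' # Gather links only from given page zones and strip tracking parameters
#' # (an illustrative sketch: the XPath expressions, the regex and the
#' # parameter name below are hypothetical; adapt them to your target site)
#'
#' page<-LinkExtractor(url="http://www.glofile.com",
#'       urlsZoneXpath = c("//nav", "//main"),
#'       urlregexfilter = "/2017/",
#'       removeparams = "utm_source")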
#'
#'
#' #### Use GET request with a proxy
#' #
#' proxy<-httr::use_proxy("190.90.100.205",41000)
#' pageinfo<-LinkExtractor(url="http://glofile.com/index.php/2017/06/08/taux-nette-detente/",
#' use_proxy = proxy)
#'
#' # Note: the use_proxy argument cannot be used together with the webdriver.
#'
#' ###### Fetch a URL using a web driver (virtual browser)
#' ######################################################
#' ##
#' ## Slow, because a headless browser called phantomjs will simulate
#' ## a user session on a website. It's useful for web pages having important
#' ## javascript-rendered sections such as menus.
#' ## We recommend that you first try the normal request above; if the function
#' ## returns a forbidden 403 status code or an empty/incomplete source code body,
#' ## then try to set a normal useragent like
#' ## Useragent = "Mozilla/5.0 (Windows NT 6.3; Win64; x64)",
#' ## if you still have issues then you should try to set up a virtual browser.
#'
#' #1 Download and install phantomjs headless browser
#' install_browser()
#'
#' #2 start the browser process (usually takes about 30 seconds)
#' br <-run_browser()
#'
#' #3 call the function
#' page<-LinkExtractor(url="http://www.master-maroc.com", Browser = br,
#'       ExternalLInks = TRUE)
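#'
#' # If the page needs extra time to finish its javascript rendering, you can
#' # add a RenderingDelay in seconds (the 2-second value below is only an
#' # illustration):
#' page<-LinkExtractor(url="http://www.master-maroc.com", Browser = br,
#'       RenderingDelay = 2)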
#'
#' #4 don't forget to stop the browser at the end of all your work with it
#' stop_browser(br)
#'
#' ###### Fetch a web page that requires authentication
#' #########################################################
#' ## In some cases you may need to retrieve content from a web page which
#' ## requires authentication via a login page, like private forums or platforms.
#' ## In this case you need to run the \link{LoginSession} function to establish an
#' ## authenticated browser session, then use \link{LinkExtractor} to fetch
#' ## the URL using the authenticated session.
#' ## In the example below we will try to fetch a private blog post which
#' ## requires authentication.
#'
#' # If you retrieve the page using the regular LinkExtractor function or your browser
#' page<-LinkExtractor("http://glofile.com/index.php/2017/06/08/jcdecaux/")
#' # the post is not visible because it's private.
#' # Now we will try to log in to access this post using the following credentials:
#' # username : demo and password : rc@pass@r
#'
#' #1 Download and install phantomjs headless browser (skip if installed)
#' install_browser()
#'
#' #2 start browser process
#' br <-run_browser()
#'
#' #3 create an authenticated session
#' #  see \link{LoginSession} for more details
#'
#'  LS<-LoginSession(Browser = br, LoginURL = 'http://glofile.com/wp-login.php',
#'                 LoginCredentials = c('demo','rc@pass@r'),
#'                 cssLoginFields =c('#user_login', '#user_pass'),
#'                 cssLoginButton='#wp-submit' )
#'
#' # check if the login was successful
#' LS$session$getTitle()
#' #Or
#' LS$session$getUrl()
#' #Or
#' LS$session$takeScreenshot(file = 'sc.png')
#'
#' #4 Retrieve the target private page using the logged-in session
#' page<-LinkExtractor(url='http://glofile.com/index.php/2017/06/08/jcdecaux/',Browser = LS)
#'
#' #5 don't forget to stop the browser at the end of all your work with it
#' stop_browser(LS)
#'
#'
#' ################### Returned Values #####################
#' #########################################################
#'
#' # Returned 'page' variable should include :
#' # 1- list of page details,
#' # 2- Internal links
#' # 3- external links.
#'
#' #1 Vector of extracted internal links  (in-links)
#' page$InternalLinks
#'
#' #2 Vector of extracted external links  (out-links)
#' page$ExternalLinks
#'
#' #3 Page information list
#' page$Info
#'
#' # Requested Url
#' page$Info$Url
#'
#' # Total number of extracted links
#' page$Info$SumLinks
#'
#' # The status code of the HTTP response 200, 401, 300...
#' page$Info$Status_code
#'
#' # The MIME type of this content from HTTP response
#' page$Info$Content_type
#'
#' # Page text encoding UTF8, ISO-8859-1 , ..
#' page$Info$Encoding
#'
#' # Page source code
#' page$Info$Source_page
#'
#' # Page title
#' page$Info$Title
#'
#' # Other returned values page$Info$Id, page$Info$Crawl_level and
#' # page$Info$Crawl_status are only used by the Rcrawler function.
#'
#'
#' }
#'
#' @export

LinkExtractor <- function(url, id, lev, IndexErrPages, Useragent, Timeout=6, use_proxy=NULL,
                          URLlenlimit=255, urlExtfilter, urlregexfilter, encod, urlbotfiler, removeparams,
                          removeAllparams=FALSE, ExternalLInks=FALSE, urlsZoneXpath=NULL, Browser, RenderingDelay=0) {

  if(!missing(Browser) && !is.null(use_proxy) ) stop("Unfortunately, phantomjs can't be configured to use a proxy")
  nblinks<-0
  pageinfo<-list()
  links2<- vector()
  linkl<-list()
  links<-vector()
  Extlinks<- vector()
  if(!missing(Browser)){
    if(length(Browser)<2) stop("please set up a web driver using run_browser()")
  }
  base <- strsplit(gsub("http://|https://", "", url), "/")[[c(1, 1)]]
  if (missing(urlbotfiler)) urlbotfiler<-" "
  if (missing(id)) id<-sample(1:1000, 1)
  if (missing(lev)) lev<-1
  if (missing(IndexErrPages)) errstat<-c(200)
  else errstat<-c(200,IndexErrPages)
  if(missing(Useragent)) Useragent="Mozilla/5.0 (Windows NT 6.3; WOW64; rv:42.0) Firefox/42.0"
  if(missing(urlExtfilter)) urlExtfilter<-c("flv","mov","swf","txt","xml","js","css","zip","gz","rar","7z","tgz","tar","z","gzip","bzip","tar","mp3","mp4","aac","wav","au","wmv","avi","mpg","mpeg","pdf","doc","docx","xls","xlsx","ppt","pptx","jpg","jpeg","png","gif","psd","ico","bmp","odt","ods","odp","odb","odg","odf")
  if (missing(urlregexfilter)){ urlregexfilter<-".*" }
  else { urlregexfilter<-paste(urlregexfilter,collapse="|")}
  #page<-NULL
  if(!missing(Browser)){
    page<-tryCatch(Drv_fetchpage(url = url, browser = Browser), error=function(e) NULL)
    }else {
      if(is.null(use_proxy)){
        page<-tryCatch(httr::GET(url, httr::user_agent(Useragent), httr::timeout(Timeout),httr::add_headers(`Origin`=base)) , error=function(e) list(NULL,e))
      } else {
        page<-tryCatch(httr::GET(url, httr::user_agent(Useragent), use_proxy , httr::timeout(Timeout),httr::add_headers(`Origin`=base)) , error=function(e) list(NULL,e))
      }
    }

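    # If the GET request failed, tryCatch returned a list(NULL, error); when the
    # error was a timeout, retry once without a timeout limit and give up with a
    # warning if it still times out.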
    if(length(page)==2){
      if(grepl("Timeout was reached",page[[2]]$message)){
        page<-tryCatch(httr::GET(url, httr::user_agent(Useragent), httr::add_headers(`Origin`=base)) , error=function(e) list(NULL,e))
        if(length(page)==2){
          if(grepl("Timeout was reached",page[[2]]$message)){
              cat("Warning! Unable to fetch the website using a GET request; try the web driver method instead (see run_browser in the manual)")
          }
          page<-NULL
        }
      }
    }

  # 1 if the domain exists (host name could be resolved)
  if (!is.null(page)){
    # 2 if the page exists (not 404,301,302,500,503,403)
    if(page$status_code %in% errstat){
      # 3 if the page content is html
      if(grepl("html",page$headers$`content-type`,ignore.case = TRUE)){

        if(missing(Browser) ){

          if (missing(encod)){
          x<-as.character(httr::content(page, type = "htmlTreeParse", as="text", encoding = "UTF-8"))
          cont<-x
          } else {
          x<-as.character(httr::content(page, type = "htmlTreeParse", as="text", encoding = encod))
          cont<-x
          }
          if(is.na(cont)){
            x<-as.character(httr::content(page, type = "htmlTreeParse", as="text", encoding = "ISO-8859-1"))
            cont<-x
          }
          links<-vector()
        x<-xml2::read_html(x)
        if(!is.null(urlsZoneXpath)){
            for(h in 1:length(urlsZoneXpath)){
              zonex<-tryCatch(xml2::xml_find_all(x, urlsZoneXpath[[h]]) , error=function(e) NULL)
              if(!is.null(zonex)){
                li<-xml2::xml_find_all(zonex, ".//a/@href")
                li<-as.vector(paste(li))
                li<-gsub(" href=\"(.*)\"", "\\1", li)
                links<-c(links,li)
              }
            }
          } else {
              links<-xml2::xml_find_all(x, "//a/@href")
              links<-as.vector(paste(links))
              links<-gsub(" href=\"(.*)\"", "\\1", links)
          }
        }else {
          links<-vector()
          Sys.sleep(RenderingDelay)
          x<-page$PageSource
          cont<-x
          if(!is.null(urlsZoneXpath)){
            w<-1
            for(h in 1:length(urlsZoneXpath)){
              zonex<-tryCatch(Browser$session$findElement(xpath = urlsZoneXpath[[h]]), error=function(e) NULL)
              # , error=function(e) NULL)
              if(!is.null(zonex)){
                linksel<-tryCatch(zonex$findElements(xpath = ".//*/a" ), error=function(e) NULL)
                for(l in linksel){
                  if(length(l$getAttribute("href"))!=0) {
                    links<-c(links,l$getAttribute("href"))
                  } else{
                    linkl[[w]]<-l
                    w<-w+1
                  }
                }
              }
            }
          } else {
            linksel<-Browser$session$findElements(xpath = "//*/a")
            w<-1
            links<-vector()
            for(l in linksel){
              if(length(l$getAttribute("href"))!=0) {
                links<-c(links,l$getAttribute("href"))
              } else{
                linkl[[w]]<-l
                w<-w+1
              }
            }
          }

        }


        links<-unique(links)
        domain0<- strsplit(gsub("http://|https://|www\\.", "", url), "/")[[c(1, 1)]]
        domain<- paste(domain0, "/", sep="")
        # Link canonicalization
        links<-LinkNormalization(links,url)

        # Ignore some Url parameters or remove all parameters
        if (!missing(removeparams)){
          if(any(removeparams!="")){
          links<-sapply(links , function(x) Linkparamsfilter(x, removeparams), USE.NAMES = FALSE)
          }
        }
        if (removeAllparams){
          links<-sapply(links , function(x) Linkparamsfilter(x, removeAllparams = TRUE), USE.NAMES = FALSE)
        }

        links<-unique(links)
        # Link robots.txt filter
        if (!missing(urlbotfiler)){
       links<-links[!links %like% paste(urlbotfiler,collapse="|") ]
        }
        if(length(links)!=0) {
          for(s in 1:length(links)){
            if (!is.na(links[s])){
              #limit length URL to 255
              if( nchar(links[s])<=URLlenlimit) {
                ext<-tools::file_ext(sub("\\?.+", "", basename(links[s])))
                # 6 Filter: drop external links, the source link itself, links with a hash, links already in the list (avoid duplicates), filtered file types, very long links, and share-type links
                #&& !(url==links[s])
                if(grepl(domain,links[s]) && !(links[s] %in% links2) && !(ext %in% urlExtfilter) && grepl(pattern = urlregexfilter,x = links[s])){
                  links2<-c(links2,links[s])
                # count the number of collected links
                  nblinks<-nblinks+1
                }
                 if(ExternalLInks){
                   if ( !grepl(domain,links[s]) && !(links[s] %in% Extlinks) && !(ext %in% urlExtfilter)){
                      Extlinks<-c(Extlinks,links[s])
                      nblinks<-nblinks+1
                   }
                 }  else {
                   Extlinks <- vector()
                   }
              }
            }
          }
        } else {
          links2 <- vector()
          linkl<-list()
          Extlinks <- vector()
        }
      } else {links2 <- vector()
              cont<-"NULL"
              linkl<-list()
              Extlinks <- vector()
              }
    } else {
            links2 <- vector()
            cont<-"NULL"
            linkl<-list()
            Extlinks <- vector()
    }
      # build the page detail record
      if(cont=="NULL"){
        titre<-"NULL"
      } else {
        titre<-tryCatch(xml2::xml_text(xml2::xml_find_first(xml2::read_html(cont), "//*/title")) , error=function(e) NULL)
      }
      contenttype<-tryCatch(gsub("(.*)\\;.*", "\\1", page$headers$`content-type`), error=function(e) "NA")
      if(page$headers$`content-type`=="html"){
        contentencod<-GetEncodingHTML(cont)
      } else{
        contentencod<-tryCatch(gsub("(.*)=(.*)","\\2", gsub(".*\\;.", "\\1", page$headers$`content-type`)), error=function(e) "NA")
      }
      pageinfo<-list(Id=id,Url=url,Crawl_status="finished",Crawl_level=lev,SumLinks=nblinks,"", Status_code=page$status_code, Content_type=contenttype, Encoding=contentencod, Source_page=cont, Title=titre)
    }else {
      links2 <- vector()
      Extlinks <- vector()
      pageinfo<-list(Id=id,Url=url,Crawl_status="NULL",Crawl_level=lev,SumLinks="",Status_code="",Content_type="",Encoding="",Source_page="",Title="")
    }

  if(missing(Browser)){
    paquet<-list(Info=pageinfo,
               InternalLinks=links2,
               ExternalLinks=Extlinks)
  }else{
    paquet<-list(Info=pageinfo,
                 InternalLinks=links2,
                 ExternalLinks=Extlinks,
                 OtherLinksTags=linkl)
  }
  return(paquet)
}


#' Fetch page using web driver/Session
#'
#' @param url character, web page URL to retrieve
#' @param browser Object returned by \code{\link{run_browser}}
#'
#' @return returns a list of three elements mimicking an httr response object: the HTTP status code, the page source, and a headers list holding the content-type.
#' @author salim khalil
#'
#' @import  webdriver
#' @importFrom  jsonlite fromJSON
#' @importFrom  jsonlite validate
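#'
#' @examples
#'
#' \dontrun{
#'
#' # A minimal sketch, assuming the phantomjs headless browser has already
#' # been installed with install_browser():
#'
#' br<-run_browser()
#' page<-Drv_fetchpage(url = "http://www.glofile.com", browser = br)
#' page$status_code
#' page$headers$`content-type`
#' stop_browser(br)
#'
#' }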
#'
#' @export

Drv_fetchpage <- function(url, browser) {

  if (missing(browser)) stop("browser argument is missing! use run_browser() to build a browser object or LoginSession() for pages requiring authentication")
  if (missing(url)) stop("url argument is missing! you need to provide the url to be fetched")

  if (length(browser)<3){
      browser$session$initialize(port=browser$process$port)
  }
  browser$session$go(url)
  # one login  try
  if (length(browser)==3){
    if(grepl(browser$loginInfo$LoginURL,browser$session$getUrl())){
      LoginSession(Browser = browser, LoginURL = browser$loginInfo$LoginURL, LoginCredentials = browser$loginInfo$LoginCredentials,
                   cssLoginFields = browser$loginInfo$cssLoginFields, cssLoginButton = browser$loginInfo$cssLoginButton,cssRadioToCheck = browser$loginInfo$cssRadioToCheck,
                   XpathLoginFields = browser$loginInfo$XpathLoginFields, XpathLoginButton = browser$loginInfo$XpathLoginButton, XpathRadioToCheck = browser$loginInfo$XpathRadioToCheck
                    )
      browser$session$go(url)
    }
  }

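  # Grab the rendered page source, then read the phantomjs HAR network log to
  # recover the HTTP status code and content-type of the main request.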
  sc=as.character(browser$session$getSource())
  x<-browser$session$readLog( type = "har")
  xjson<-tryCatch(jsonlite::fromJSON(x$message,simplifyVector = FALSE), error=function(e) NULL)

  if(!is.null(xjson) && length(xjson)>0){
    if(length(xjson$log$entries)>1){
        xjson<-xjson$log$entries

        if(substring(url, nchar(url)) == "/"){
          url2<-substr(url, 1, nchar(url)-1)
        } else url2 <-paste0(url,"/")

        # get the position of the list entry which contains the desired http header

        for (i in 1:length(xjson)) {
           if (url2==xjson[[i]]$request$url
               || url==xjson[[i]]$request$url
               || grepl(paste0("^",gsub("([.|()\\^{}+$*?]|\\[|\\])", "\\\\\\1", url),"(\\/)?$"), xjson[[i]]$request$url)
           ) { p<-xjson[[i]]
           }

        }

        if(exists("p")){
          if("status" %in% names(p$response)){
          status_c<-p$response$status
          }else status_c<-200
          # keep the value of the content-type header if present;
          # otherwise infer the content type from the page source
          content_type<-NULL
          if(length(p$response$headers)>0){
            for(i in 1:length(p$response$headers)){
              if("name" %in% names(p$response$headers[[i]])){
                if (grepl("^content-type$",p$response$headers[[i]]$name,ignore.case = TRUE)){
                  content_type<-p$response$headers[[i]]$value
                }
              }
            }
          }
          if(is.null(content_type)) content_type<-get_contenttype(sc)
        }else {
          status_c<-200
          content_type<-get_contenttype(sc)
        }
    } else {
      status_c<-200
      content_type<-get_contenttype(sc)
    }
  } else {
    status_c<-200
    content_type<-"html"
  }
  #if(!exists(content_type)) content_type<-"text/html"
  page<-list(status_code=status_c,
             PageSource=sc,
             headers=list(`content-type`=content_type))

  #if (length(browser)<3){
  #  browser$session$delete()
  #}

  page
}
get_contenttype<- function (sc){
  content_type<-""
  if(grepl('^<\\?xml', trimws(sc))) content_type<-"xml"
  else if ( grepl("<!doctype.*",trimws(sc), perl=TRUE,ignore.case = TRUE)
            || grepl("/.*(br|basefont|hr|input|source|frame|param|area|meta|!--|col|link|option|base|img|wbr|!DOCTYPE).*?>|<(a|abbr|acronym|address|applet|article|aside|audio|b|bdi|bdo|big|blockquote|body|button|canvas|caption|center|cite|code|colgroup|command|datalist|dd|del|details|dfn|dialog|dir|div|dl|dt|em|embed|fieldset|figcaption|figure|font|footer|form|frameset|head|header|hgroup|h1|h2|h3|h4|h5|h6|html|i|iframe|ins|kbd|keygen|label|legend|li|map|mark|menu|meter|nav|noframes|noscript|object|ol|optgroup|output|p|pre|progress|q|rp|rt|ruby|s|samp|script|section|select|small|span|strike|strong|style|sub|summary|sup|table|tbody|td|textarea|tfoot|th|thead|time|title|tr|track|tt|u|ul|var|video).*?<\\/\\2>.*/i",trimws(sc), perl=TRUE))
    content_type<-"HTML"
  else if(jsonlite::validate(sc)) content_type<-"json"
  else content_type<-""

  return (content_type)
}
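
# Illustrative behaviour of the helper above (derived from its checks):
# get_contenttype("<!DOCTYPE html><html><body>hi</body></html>")   # "HTML"
# get_contenttype("<?xml version='1.0'?><root/>")                  # "xml"
# get_contenttype('{"name": "value"}')                             # "json"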