#' LinkExtractor
#'
#' Fetch and parse a document by URL, to extract page info, HTML source and links (internal/external).
#' Fetching can be done by an HTTP GET request or through a webdriver (phantomjs) which simulates a real browser rendering the page.
#'
#'
#' @param url character, url to fetch and parse.
#' @param id numeric, an id to identify a specific web page in a website collection, auto-generated by \code{\link{Rcrawler}} function.
#' @param lev numeric, the depth level of the web page, auto-generated by \code{\link{Rcrawler}} function.
#' @param IndexErrPages character vector, HTTP status codes that should still be processed; by default \code{IndexErrPages<-c(200)}, which means only successful page requests are parsed. E.g., to also parse 404 error pages use \code{IndexErrPages<-c(200,404)}.
#' @param Useragent character, the User-Agent string identifying the request sender; defaults to a common Firefox user-agent. We recommend using a regular browser user-agent to avoid being blocked by some servers.
#' @param use_proxy object created by the httr::use_proxy() function, if you want to use a proxy to retrieve the web page (does not work with webdriver).
#' @param Timeout numeric, request timeout in seconds, default to 6.
#' @param urlsZoneXpath xpath pattern(s) of the page section(s) from which links should be exclusively gathered/collected.
#' @param URLlenlimit integer, maximum URL length to process, default to 255 characters (useful to avoid spider traps).
#' @param urlregexfilter character vector, one or more regular expressions; only extracted internal urls matching at least one of them are kept.
#' @param urlExtfilter character vector, the list of file extensions to exclude from parsing; actually, only html pages are processed (parsed, scraped). To define your own list use \code{urlExtfilter<-c(ext1,ext2,ext3)}
#' @param ExternalLInks boolean, default FALSE; if set to TRUE, external links are also returned.
#' @param urlbotfiler character vector, directories/files restricted by robots.txt
#' @param encod character, web page character encoding
#' @param removeparams character vector, list of url parameters to be removed from web page internal links.
#' @param removeAllparams boolean, if TRUE the list of scraped urls will have no parameters.
#' @param Browser the client object of a remote headless web driver (virtual browser), created by the \code{br<-run_browser()} function, or a logged-in browser session object created by \link{LoginSession}, after installing the web driver agent with \code{install_browser()}. See examples below.
#' @param RenderingDelay the time required for a webpage to be fully rendered, in seconds.
#'
#' @return return a list of three elements: the first is a list containing the web page details (url, encoding-type, content-type, content, etc.), the second is a character vector containing the retrieved internal urls and the third is a vector of external urls.
#' @author salim khalil
#'
#' @import webdriver
#' @importFrom httr GET
#' @importFrom httr user_agent
#' @importFrom httr use_proxy
#' @importFrom httr timeout
#' @importFrom httr content
#' @importFrom httr add_headers
#' @importFrom data.table %chin% %like%
#' @importFrom xml2 xml_find_all
#' @importFrom xml2 xml_text
#' @importFrom xml2 xml_find_first
#'
#' @examples
#'
#'\dontrun{
#'
#' ###### Fetch a URL using a GET request:
#' ######################################################
#' ##
#' ## Very fast, but cannot fetch javascript-rendered pages or sections
#'
#' # Fetch the page with the default config, then return page info and internal links
#'
#' page<-LinkExtractor(url="http://www.glofile.com")
#'
#' # This also returns external links
#'
#' page<-LinkExtractor(url="http://www.glofile.com", ExternalLInks = TRUE)
#'
#' # Specify a Useragent to overcome bot-blocking rules on some websites
#'
#' page<-LinkExtractor(url="http://www.glofile.com", ExternalLInks = TRUE,
#' Useragent = "Mozilla/5.0 (Windows NT 6.3; Win64; x64)")
#'
#' # By default, only successful HTTP pages are parsed; therefore, to force
#' # parsing of error pages like 404 you need to specify IndexErrPages,
#'
#' page<-LinkExtractor(url="http://www.glofile.com/404notfoundpage",
#' ExternalLInks = TRUE, IndexErrPages = c(200,404))
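#'
#' #### Restrict link collection to a page zone or filter links by pattern
#' #
#' # Illustrative sketches: the XPath expression and the regular expression below
#' # are assumptions about the target site layout, adapt them to your own pages.
#'
#' # Collect links only from the navigation menu, assuming the site has a <nav> element
#' page<-LinkExtractor(url="http://www.glofile.com",
#'          urlsZoneXpath = "//nav")
#'
#' # Keep only internal links whose URL matches a given regular expression
#' page<-LinkExtractor(url="http://www.glofile.com",
#'          urlregexfilter = "/2017/")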
#'
#'
#' #### Use GET request with a proxy
#' #
#' proxy<-httr::use_proxy("190.90.100.205",41000)
#' pageinfo<-LinkExtractor(url="http://glofile.com/index.php/2017/06/08/taux-nette-detente/",
#' use_proxy = proxy)
#'
#' # Note : the use_proxy argument cannot be used together with webdriver
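#'
#' #### Control URL parameters in collected internal links
#' #
#' # Illustrative sketch: "replytocom" is just an example parameter name,
#' # use the parameter names that actually appear on your target website.
#'
#' # Remove a specific parameter from collected internal links
#' page<-LinkExtractor(url="http://www.glofile.com",
#'          removeparams = c("replytocom"))
#'
#' # Or strip all URL parameters from collected internal links
#' page<-LinkExtractor(url="http://www.glofile.com",
#'          removeAllparams = TRUE)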
#'
#' ###### Fetch a URL using a web driver (virtual browser)
#' ######################################################
#' ##
#' ## Slow, because a headless browser called phantomjs will simulate
#' ## a user session on a website. It's useful for web pages having important
#' ## javascript-rendered sections such as menus.
#' ## We recommend that you first try the normal GET request above; if the function
#' ## returns a forbidden 403 status code or an empty/incomplete source code body,
#' ## then try to set a regular useragent like
#' ## Useragent = "Mozilla/5.0 (Windows NT 6.3; Win64; x64)";
#' ## if you still have issues then you should try to set up a virtual browser.
#'
#' #1 Download and install phantomjs headless browser
#' install_browser()
#'
#' #2 start the browser process (usually takes about 30 seconds)
#' br <-run_browser()
#'
#' #3 call the function
#' page<-LinkExtractor(url="http://www.master-maroc.com", Browser = br,
#' ExternalLInks = TRUE)
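#'
#' # Optionally, wait for javascript content to finish rendering before parsing;
#' # the 2-second delay below is an arbitrary illustrative value.
#' page<-LinkExtractor(url="http://www.master-maroc.com", Browser = br,
#'          ExternalLInks = TRUE, RenderingDelay = 2)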
#'
#' #4 don't forget to stop the browser at the end of all your work with it
#' stop_browser(br)
#'
#' ###### Fetch a web page that requires authentication
#' #########################################################
#' ## In some cases you may need to retrieve content from a web page which
#' ## requires authentication via a login page, like private forums or platforms.
#' ## In this case you need to run the \link{LoginSession} function to establish an
#' ## authenticated browser session, then use \link{LinkExtractor} to fetch
#' ## the URL using the authenticated session.
#' ## In the example below we will try to fetch a private blog post which
#' ## requires authentication.
#'
#' # If you retrieve the page using the regular LinkExtractor function or your browser
#' page<-LinkExtractor("http://glofile.com/index.php/2017/06/08/jcdecaux/")
#' # the post is not visible because it's private.
#' # Now we will try to log in to access this post using the following credentials
#' # username : demo and password : rc@pass@r
#'
#' #1 Download and install phantomjs headless browser (skip if installed)
#' install_browser()
#'
#' #2 start browser process
#' br <-run_browser()
#'
#' #3 create an authenticated session
#' # see \link{LoginSession} for more details
#'
#' LS<-LoginSession(Browser = br, LoginURL = 'http://glofile.com/wp-login.php',
#' LoginCredentials = c('demo','rc@pass@r'),
#' cssLoginFields =c('#user_login', '#user_pass'),
#' cssLoginButton='#wp-submit' )
#'
#' #check if login successful
#' LS$session$getTitle()
#' #Or
#' LS$session$getUrl()
#' #Or
#' LS$session$takeScreenshot(file = 'sc.png')
#'
#' #4 Retrieve the target private page using the logged-in session
#' page<-LinkExtractor(url='http://glofile.com/index.php/2017/06/08/jcdecaux/',Browser = LS)
#'
#' #5 don't forget to stop the browser at the end of all your work with it
#' stop_browser(LS)
#'
#'
#' ################### Returned Values #####################
#' #########################################################
#'
#' # Returned 'page' variable should include :
#' # 1- list of page details,
#' # 2- Internal links
#' # 3- external links.
#'
#' #1 Vector of extracted internal links (in-links)
#' page$InternalLinks
#'
#' #2 Vector of extracted external links (out-links)
#' page$ExternalLinks
#'
#' #3 Page information list
#' page$Info
#'
#' # Requested Url
#' page$Info$Url
#'
#' # Sum of extracted links
#' page$Info$SumLinks
#'
#' # The status code of the HTTP response 200, 401, 300...
#' page$Info$Status_code
#'
#' # The MIME type of this content from HTTP response
#' page$Info$Content_type
#'
#' # Page text encoding UTF8, ISO-8859-1 , ..
#' page$Info$Encoding
#'
#' # Page source code
#' page$Info$Source_page
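#'
#' # The returned source can be parsed further with xml2 for custom scraping;
#' # for example, the sketch below extracts all second-level headings
#' # (adapt the XPath expression to your needs).
#' doc<-xml2::read_html(page$Info$Source_page)
#' xml2::xml_text(xml2::xml_find_all(doc, "//h2"))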
#'
#' # Page title
#' page$Info$Title
#'
#' # Other returned values page$Info$Id, page$Info$Crawl_level,
#' # page$Info$Crawl_status are only used by the Rcrawler function.
#'
#'
#' }
#'
#' @export
LinkExtractor <- function(url, id, lev, IndexErrPages, Useragent, Timeout=6, use_proxy=NULL,
URLlenlimit=255, urlExtfilter, urlregexfilter, encod, urlbotfiler, removeparams,
removeAllparams=FALSE, ExternalLInks=FALSE, urlsZoneXpath=NULL, Browser, RenderingDelay=0) {
if(!missing(Browser) && !is.null(use_proxy) ) stop("unfortunately, phantomjs can't be configured to use a proxy")
nblinks<-0
pageinfo<-list()
links2<- vector()
linkl<-list()
links<-vector()
Extlinks<- vector()
if(!missing(Browser)){
if(length(Browser)<2) stop("please setup a web driver using run_browser()")
}
base <- strsplit(gsub("http://|https://", "", url), "/")[[c(1, 1)]]
if (missing(urlbotfiler)) urlbotfiler<-" "
if (missing(id)) id<-sample(1:1000, 1)
if (missing(lev)) lev<-1
if (missing(IndexErrPages)) errstat<-c(200)
else errstat<-c(200,IndexErrPages)
if(missing(Useragent)) Useragent="Mozilla/5.0 (Windows NT 6.3; WOW64; rv:42.0) Firefox/42.0"
if(missing(urlExtfilter)) urlExtfilter<-c("flv","mov","swf","txt","xml","js","css","zip","gz","rar","7z","tgz","tar","z","gzip","bzip","tar","mp3","mp4","aac","wav","au","wmv","avi","mpg","mpeg","pdf","doc","docx","xls","xlsx","ppt","pptx","jpg","jpeg","png","gif","psd","ico","bmp","odt","ods","odp","odb","odg","odf")
if (missing(urlregexfilter)){ urlregexfilter<-".*" }
else { urlregexfilter<-paste(urlregexfilter,collapse="|")}
#page<-NULL
if(!missing(Browser)){
page<-tryCatch(Drv_fetchpage(url = url, browser = Browser), error=function(e) NULL)
}else {
if(is.null(use_proxy)){
page<-tryCatch(httr::GET(url, httr::user_agent(Useragent), httr::timeout(Timeout),httr::add_headers(`Origin`=base)) , error=function(e) list(NULL,e))
} else {
page<-tryCatch(httr::GET(url, httr::user_agent(Useragent), use_proxy , httr::timeout(Timeout),httr::add_headers(`Origin`=base)) , error=function(e) list(NULL,e))
}
}
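# if the request errored (page is a two-element list holding the error object) and the
# error was a timeout, retry once without the timeout limit before giving up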
if(length(page)==2){
if(grepl("Timeout was reached",page[[2]]$message)){
page<-tryCatch(httr::GET(url, httr::user_agent(Useragent), httr::add_headers(`Origin`=base)) , error=function(e) list(NULL,e))
if(length(page)==2){
if(grepl("Timeout was reached",page[[2]]$message)){
cat ("warning ! Unable to fetch the website using GET request , try to use web driver method (run_browser func see manual)")
}
page<-NULL
}
}
}
# 1 if domain exist (could resolve host name)
if (!is.null(page)){
# 2 if page exist (not 404,301,302,500,503,403)
if(page$status_code %in% errstat){
# 4 if page content is html
if(grepl("html",page$headers$`content-type`,ignore.case = TRUE)){
if(missing(Browser) ){
if (missing(encod)){
x<-as.character(httr::content(page, type = "htmlTreeParse", as="text", encoding = "UTF-8"))
cont<-x
} else {
x<-as.character(httr::content(page, type = "htmlTreeParse", as="text", encoding = encod))
cont<-x
}
if(is.na(cont)){
x<-as.character(httr::content(page, type = "htmlTreeParse", as="text", encoding = "ISO-8859-1"))
cont<-x
}
links<-vector()
x<-xml2::read_html(x)
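# extract href attributes, either from the given XPath zones only or from the whole page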
if(!is.null(urlsZoneXpath)){
for(h in 1:length(urlsZoneXpath)){
zonex<-tryCatch(xml2::xml_find_all(x, urlsZoneXpath[[h]]) , error=function(e) NULL)
if(!is.null(zonex)){
li<-xml2::xml_find_all(zonex, ".//a/@href")
li<-as.vector(paste(li))
li<-gsub(" href=\"(.*)\"", "\\1", li)
links<-c(links,li)
}
}
} else {
links<-xml2::xml_find_all(x, "//a/@href")
links<-as.vector(paste(links))
links<-gsub(" href=\"(.*)\"", "\\1", links)
}
}else {
links<-vector()
Sys.sleep(RenderingDelay)
x<-page$PageSource
cont<-x
if(!is.null(urlsZoneXpath)){
w<-1
for(h in 1:length(urlsZoneXpath)){
zonex<-tryCatch(Browser$session$findElement(xpath = urlsZoneXpath[[h]]), error=function(e) NULL)
# , error=function(e) NULL)
if(!is.null(zonex)){
linksel<-tryCatch(zonex$findElements(xpath = ".//*/a" ), error=function(e) NULL)
for(l in linksel){
if(length(l$getAttribute("href"))!=0) {
links<-c(links,l$getAttribute("href"))
} else{
linkl[[w]]<-l
w<-w+1
}
}
}
}
} else {
linksel<-Browser$session$findElements(xpath = "//*/a")
w<-1
links<-vector()
for(l in linksel){
if(length(l$getAttribute("href"))!=0) {
links<-c(links,l$getAttribute("href"))
} else{
linkl[[w]]<-l
w<-w+1
}
}
}
}
links<-unique(links)
domain0<- strsplit(gsub("http://|https://|www\\.", "", url), "/")[[c(1, 1)]]
domain<- paste(domain0, "/", sep="")
# Link canonicalization
links<-LinkNormalization(links,url)
# Ignore some Url parameters or remove all parameters
if (!missing(removeparams)){
if(removeparams!=""){
links<-sapply(links , function(x) Linkparamsfilter(x, removeparams), USE.NAMES = FALSE)
}
}
if (removeAllparams){
links<-sapply(links , function(x) Linkparamsfilter(x, removeAllparams = TRUE), USE.NAMES = FALSE)
}
links<-unique(links)
# Link robots.txt filter
if (!missing(urlbotfiler)){
links<-links[!links %like% paste(urlbotfiler,collapse="|") ]
}
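# classify each extracted link as internal or external, applying URL length, file
# extension, regex and duplicate filters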
if(length(links)!=0) {
for(s in 1:length(links)){
if (!is.na(links[s])){
#limit length URL to 255
if( nchar(links[s])<=URLlenlimit) {
ext<-tools::file_ext(sub("\\?.+", "", basename(links[s])))
# 6 Filter: exclude external links, the source link itself, links with a hash, links already in the list (avoid duplicates), filtered file types, very long links and share-type links
#&& !(url==links[s])
if(grepl(domain,links[s]) && !(links[s] %in% links2) && !(ext %in% urlExtfilter) && grepl(pattern = urlregexfilter,x = links[s])){
links2<-c(links2,links[s])
# count the number of links
nblinks<-nblinks+1
}
if(ExternalLInks){
if ( !grepl(domain,links[s]) && !(links[s] %in% Extlinks) && !(ext %in% urlExtfilter)){
Extlinks<-c(Extlinks,links[s])
nblinks<-nblinks+1
}
} else {
Extlinks <- vector()
}
}
}
}
} else {
links2 <- vector()
linkl<-list()
Extlinks <- vector()
}
} else {links2 <- vector()
cont<-"NULL"
linkl<-list()
Extlinks <- vector()
}
} else {
links2 <- vector()
cont<-"NULL"
linkl<-list()
Extlinks <- vector()
}
# build the page details record
if(cont=="NULL"){
titre<-"NULL"
} else {
titre<-tryCatch(xml2::xml_text(xml2::xml_find_first(xml2::read_html(cont), "//*/title")) , error=function(e) NULL)
}
contenttype<-tryCatch(gsub("(.*)\\;.*", "\\1", page$headers$`content-type`), error=function(e) "NA")
if(page$headers$`content-type`=="html"){
contentencod<-GetEncodingHTML(cont)
} else{
contentencod<-tryCatch(gsub("(.*)=(.*)","\\2", gsub(".*\\;.", "\\1", page$headers$`content-type`)), error=function(e) "NA")
}
pageinfo<-list(Id=id,Url=url,Crawl_status="finished",Crawl_level=lev,SumLinks=nblinks,"", Status_code=page$status_code, Content_type=contenttype, Encoding=contentencod, Source_page=cont, Title=titre)
}else {
links2 <- vector()
Extlinks <- vector()
pageinfo<-list(Id=id,Url=url,Crawl_status="NULL",Crawl_level=lev,SumLinks="",Status_code="",Content_type="",Encoding="",Source_page="",Title="")
}
if(missing(Browser)){
paquet<-list(Info=pageinfo,
InternalLinks=links2,
ExternalLinks=Extlinks)
}else{
paquet<-list(Info=pageinfo,
InternalLinks=links2,
ExternalLinks=Extlinks,
OtherLinksTags=linkl)
}
return(paquet)
}
#' Fetch page using web driver/Session
#'
#' @param url character, web page URL to retrieve
#' @param browser Object returned by \code{\link{run_browser}}
#'
#' @return return a list of three elements: the HTTP status code of the response, the page source, and a headers list containing the content-type.
#' @author salim khalil
#'
#' @import webdriver
#' @importFrom jsonlite fromJSON
#' @importFrom jsonlite validate
#' @export
Drv_fetchpage <- function(url, browser) {
if (missing(browser)) stop("browser argument is missing! use run_browser() to build a browser object or LoginSession() for pages requiring authentication")
if (missing(url)) stop("url argument is missing! you need to provide the url to be fetched")
if (length(browser)<3){
browser$session$initialize(port=browser$process$port)
}
browser$session$go(url)
# one login try
if (length(browser)==3){
if(grepl(browser$loginInfo$LoginURL,browser$session$getUrl())){
LoginSession(Browser = browser, LoginURL = browser$loginInfo$LoginURL, LoginCredentials = browser$loginInfo$LoginCredentials,
cssLoginFields = browser$loginInfo$cssLoginFields, cssLoginButton = browser$loginInfo$cssLoginButton,cssRadioToCheck = browser$loginInfo$cssRadioToCheck,
XpathLoginFields = browser$loginInfo$XpathLoginFields, XpathLoginButton = browser$loginInfo$XpathLoginButton, browser$loginInfo$XpathRadioToCheck
)
browser$session$go(url)
}
}
sc=as.character(browser$session$getSource())
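# read the phantomjs network log (HAR) to recover the HTTP status code and content-type
# of the requested URL; fall back to default values when the log cannot be parsed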
x<-browser$session$readLog( type = "har")
xjson<-tryCatch(jsonlite::fromJSON(x$message,simplifyVector = FALSE), error=function(e) NULL)
if(!is.null(xjson) && length(xjson)>0){
if(length(xjson$log$entries)>1){
xjson<-xjson$log$entries
if(substring(url, nchar(url)) == "/"){
url2<-substr(url, 1, nchar(url)-1)
} else url2 <-paste0(url,"/")
#get the position of the list element which contains the desired http header
for (i in 1:length(xjson)) {
if (url2==xjson[[i]]$request$url
|| url==xjson[[i]]$request$url
|| grepl(paste0("^",gsub("([.|()\\^{}+$*?]|\\[|\\])", "\\\\\\1", url),"(\\/)?$"), xjson[[i]]$request$url)
) { p<-xjson[[i]]
}
}
if(exists("p")){
if("status" %in% names(p$response)){
status_c<-p$response$status
}else status_c<-200
# default to sniffing the page source, then override with the content-type header if present
content_type<-get_contenttype(sc)
if(length(p$response$headers)>0){
for(i in 1:length(p$response$headers)){
if("name" %in% names(p$response$headers[[i]])){
if (grepl("^content-type$",p$response$headers[[i]]$name,ignore.case = TRUE)){
content_type<-p$response$headers[[i]]$value
break
}
}
}
}
}else {
status_c<-200
content_type<-get_contenttype(sc)
}
} else {
status_c<-200
content_type<-get_contenttype(sc)
}
} else {
status_c<-200
content_type<-"html"
}
#if(!exists(content_type)) content_type<-"text/html"
page<-list(status_code=status_c,
PageSource=sc,
headers=list(`content-type`=content_type))
#if (length(browser)<3){
# browser$session$delete()
#}
page
}
get_contenttype<- function (sc){
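# crude content-type sniffing from the page source: detects an XML declaration,
# HTML markup, or valid JSON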
content_type<-""
if(grepl('^<\\?xml', trimws(sc))) content_type<-"xml"
else if ( grepl("<!doctype.*",trimws(sc), perl=TRUE,ignore.case = TRUE)
|| grepl("/.*(br|basefont|hr|input|source|frame|param|area|meta|!--|col|link|option|base|img|wbr|!DOCTYPE).*?>|<(a|abbr|acronym|address|applet|article|aside|audio|b|bdi|bdo|big|blockquote|body|button|canvas|caption|center|cite|code|colgroup|command|datalist|dd|del|details|dfn|dialog|dir|div|dl|dt|em|embed|fieldset|figcaption|figure|font|footer|form|frameset|head|header|hgroup|h1|h2|h3|h4|h5|h6|html|i|iframe|ins|kbd|keygen|label|legend|li|map|mark|menu|meter|nav|noframes|noscript|object|ol|optgroup|output|p|pre|progress|q|rp|rt|ruby|s|samp|script|section|select|small|span|strike|strong|style|sub|summary|sup|table|tbody|td|textarea|tfoot|th|thead|time|title|tr|track|tt|u|ul|var|video).*?<\\/\\2>.*/i",trimws(sc), perl=TRUE))
content_type<-"HTML"
else if(jsonlite::validate(sc)) content_type<-"json"
else content_type<-""
return (content_type)
}