R/parsepage.r

Defines functions extract_links parseurlheader

Documented in extract_links parseurlheader

#' Parse an http header
#' 
#' Returns \code{NULL} unless the status line (first element) contains "OK";
#' otherwise splits each "Name: value" line into a named list entry.
#' @param header Header text, one element per line
#' @return Named list of header fields, or \code{NULL} if the response
#'   status was not OK
#' @author jefferis
#' @export
parseurlheader <- function(header) {
  # The status line (e.g. "HTTP/1.1 200 OK") must report success
  if (!grepl("OK", header[1])) return(NULL)
  # Keep only "Name: value" lines; the status line has no colon
  fields <- grep(":", header, fixed = TRUE, value = TRUE)
  # Strip the carriage return left by CRLF line endings
  fields <- sub("\r", "", fields, fixed = TRUE)
  keys <- sub("^([^:]+): .*", "\\1", fields)
  values <- sub("^[^:]+: (.*)", "\\1", fields)
  result <- list()
  result[keys] <- values
  result
}

#' Extract the links from text of a web page
#' 
#' @details The baseurl is normally just the original url (although a different
#' url is sometime explicitly specified in the html body).
#' @details absolute depends on getRelativeUrl from the \code{XML} package.
#' @param body Raw text of web page
#' @param linktype class of link to find (e.g. href,src)
#' @param regex Regular expression to filter links
#' @param fixed Whether regular expression is fixed
#' @param rooturl Base url for expansion of relative links
#' @param absolute Whether to convert relative urls to absolute
#' @param USE.NAMES Return relative links as names when absolute = FALSE
#' @return character vector of urls
#' @author jefferis
#' @export
#' @seealso \code{\link{grep},\link{getHTMLLinks}}
#' @importFrom XML getRelativeURL parseURL
extract_links <- function(body, linktype = "href", regex = NULL, fixed = FALSE,
    rooturl = attr(body, 'url'), absolute = TRUE, USE.NAMES = FALSE) {
  # Restrict to lines mentioning the attribute, then split compound lines
  # so that each html field can be examined on its own
  candidates <- grep(linktype, body, fixed = fixed, value = TRUE)
  fields <- unlist(strsplit(candidates, "><"))
  fields <- grep(linktype, fields, fixed = fixed, value = TRUE)
  # Pull the quoted attribute value out of e.g. href="..."
  attr_pattern <- paste(".*", linktype, "=\"([^\"]+).*", sep = "")
  links <- sub(attr_pattern, "\\1", fields)
  
  if (!is.null(regex)) {
    links <- grep(regex, links, fixed = fixed, value = TRUE)
  }
  
  if (absolute && !is.null(rooturl)) {
    # A bare host with an empty path needs a trailing slash before expansion
    if (parseURI(rooturl)$path == "") {
      rooturl <- paste(rooturl, '/', sep = '')
    }
    links <- getRelativeURL(links, rooturl)
    if (!USE.NAMES) {
      names(links) <- NULL
    }
  }
  links
}
jefferis/gscraper documentation built on Oct. 25, 2020, 12:08 a.m.