R/getURLContent.R

Defines functions getContentType trim stop.if.HTTP.error

getURLContent =
  #
  # Used to be
  #  header = basicTextGatherer()
  #  ans = getBinaryURL(url, headerfunction = header$update, curl = header$curl())
  #  processContent(ans, header$header(), .encoding)
  # but now we use the dynamic reader.
  #
function(url, ..., curl = getCurlHandle(.opts = .opts), .encoding = NA, binary = NA, .opts = list(...),
         header = dynCurlReader(curl, binary = binary, baseURL = url, isHTTP = isHTTP, encoding = .encoding),
          isHTTP = length(grep('^[[:space:]]*http', url)) > 0)
{
  url = as(url, "character")

  if(!missing(curl))
     curlSetOpt(.opts = .opts, curl = curl)

  if(is.logical(header)) {
     returnHeader = header
     header = dynCurlReader(curl, binary = binary, baseURL = url, isHTTP = isHTTP, encoding = .encoding)
  } else
     returnHeader = FALSE

  if(!('headerfunction' %in% names(.opts))) {
    # .opts$headerfunction = header$update
     protect = missing(header)
     curlSetOpt(curl = curl, .isProtected = protect,
                 headerfunction = header$update)
   }

  if(!isHTTP && !('writefunction' %in% names(.opts))) {
      # If for example this is scp where there is no header
      # or headerfunction will never get called. So we have to
      # set the writefunction as well.
    # .opts$headerfunction = header$update
     protect = missing(header)
     curlSetOpt(curl = curl, .isProtected = protect,
                 writefunction = header$update)
   }

  curlPerform(url = url, curl = curl, .opts = .opts)

  if(isHTTP && length(header$header())) {
     http.header = parseHTTPHeader(header$header())
     stop.if.HTTP.error(http.header)
  }

  if(returnHeader)
    list(header = if(is(returnHeader, "AsIs"))
                      header$header()
                  else
                      parseHTTPHeader(header$header()),
         body = header$value())
  else
    header$value()
}

stop.if.HTTP.error =
function(http.header)
{

  if(length(http.header) == 0)
    return(NA) # or TRUE

  if( floor(as.integer(http.header[["status"]])/100) >= 4) {
     klass = getHTTPErrorClass(http.header[["status"]])
     err = simpleError(http.header[["statusMessage"]])
     err$httpHeader = http.header
     class(err) = c(klass, class(err))
     #signalCondition(err)
     stop(err)
  }

  TRUE
}

processContent =
#
# Figure out how to interpret the contents based on the HTTP response's header
# i.e. look at its Content-Type.
#
function(ans, header, .encoding = NA)
{
  headerText = if(is.character(header)) header else header$value()
  http.header = parseHTTPHeader(headerText)

  stop.if.HTTP.error(http.header)

  content.type = getContentType(http.header)
  binary = isBinaryContent(http.header, content.type)
  if(!(is.na(binary) || binary)) {
     ans = rawToChar(ans)
     if(length(.encoding)  == 0 || is.na(.encoding)) {
        charset = grep("charset", content.type, value = TRUE)
        if(length(charset))
          .encoding = strsplit(charset, "=")[[1]][2]
     }
     if(length(.encoding) && !is.na(.encoding))
       Encoding(ans) = .encoding
  } else {
     attr(ans, "Content-Type") = getContentType(http.header)
     ans
  }
  ans
}

trim =
function(x)
{
    gsub("(^[[:space:]]+|[[:space:]]+$)", "", x, perl = TRUE)
}

getContentType =
function(header, full = FALSE)
{
   i = match("content-type", tolower(names(header)))
   if( is.na( i ) )
      return(character())

   tmp = trim(strsplit(header[i[1]], "; *")[[1]])
   if(!full)
       return(tmp)

   vals = strsplit(tmp, "=")
   structure(gsub(";$", "", sapply(vals, function(x) x[length(x)])),
             names = sapply(vals, function(x) if(length(x) > 1) x[1] else ""))
}

# See http://www.iana.org/assignments/media-types/
textContentTypes = c("html", "text", "xhtml", "plain", "xml", "x-latex", "css", "latex",
                     "sgml", "postscript", "texinfo", "ecmascript", "javascript",
                     "atom+xml", "json")

isBinaryContent =
  #
  #  type can be given as a list intended to be separate  header elements
  #  e.g. Content-Type, Content-Encoding, etc.
  #  Each can be a vector.
  #
function(header, type = getContentType(header)[1],
          textTypes = getOption("text.content.types"))
{
   if(is.list(type) && length(type) > 1) {
     last <- TRUE
     for(i in type) {
        if(length(i) && !any(is.na(i)) && (last <- isBinaryContent(header, i, textTypes)))
          return(TRUE)
     }
     return(last)
   }

   if(length(type) == 0)
     return(NA)

   if(is.null(textTypes))
     textTypes = textContentTypes
   type.els = strsplit(type, "/")[[1]]
   if(type.els[1] == "text")
     return(FALSE)

   if(any(type.els %in% textContentTypes))
      return(FALSE)

   if(length(grep("\\+xml$", type.els)))
      return(FALSE)

   TRUE
}

Try the RCurl package in your browser

Any scripts or data that you put into this service are public.

RCurl documentation built on Nov. 3, 2023, 1:09 a.m.