R/getURLContent.R

Defines functions isBinaryFile getContentType trim stop.if.HTTP.error

Documented in isBinaryFile

getURLContent =
  #
  # Used to be
  #  header = basicTextGatherer()  
  #  ans = getBinaryURL(url, headerfunction = header$update, curl = header$curl())
  #  processContent(ans, header$header(), .encoding)
  # but now we use the dynamic reader.
  #
function(url, ..., curl = getCurlHandle(.opts = .opts), .encoding = NA, binary = NA, .opts = list(...),
         header = dynCurlReader(curl, binary = binary, baseURL = url, isHTTP = isHTTP, encoding = .encoding),
          isHTTP = length(grep('^[[:space:]]*http', url)) > 0)
{
  url = as(url, "character")
  
  if(!missing(curl))
     curlSetOpt(.opts = .opts, curl = curl)

  if(is.logical(header)) {
     returnHeader = header
     header = dynCurlReader(curl, binary = binary, baseURL = url, isHTTP = isHTTP, encoding = .encoding, header = header)
  } else
     returnHeader = FALSE
  
  if(!('headerfunction' %in% names(.opts))) {
    # .opts$headerfunction = header$update
     protect = missing(header)
     curlSetOpt(curl = curl, .isProtected = protect,
                 headerfunction = header$update)
   }
  
  if(!isHTTP && !('writefunction' %in% names(.opts))) {
      # If for example this is scp where there is no header
      # or headerfunction will never get called. So we have to
      # set the writefunction as well.
    # .opts$headerfunction = header$update
     protect = missing(header)
     curlSetOpt(curl = curl, .isProtected = protect,
                 writefunction = header$update)
   }  
  
  curlPerform(url = url, curl = curl, .opts = .opts)
  
  if(isHTTP && length(header$header())) {
     http.header = parseHTTPHeader(header$header())
     stop.if.HTTP.error(http.header, header$value())
  }

  if(returnHeader)
    list(header = if(is(returnHeader, "AsIs"))
                      header$header()
                  else
                      parseHTTPHeader(header$header()),
         body = header$value())
  else
    header$value()
}

stop.if.HTTP.error = 
function(http.header, body = NA)
{

  if(length(http.header) == 0)
    return(NA) # or TRUE
  
  if( floor(as.integer(http.header[["status"]])/100) >= 4) {
     klass =  getHTTPErrorClass(http.header[["status"]])
     err = simpleError(http.header[["statusMessage"]])
     err$httpHeader = http.header
     err$body =  body
     class(err) = c(klass, class(err))
     
     #signalCondition(err)
     stop(err)
  }

  TRUE
}

processContent = 
#
# Figure out how to interpret the contents based on the HTTP response's header
# i.e. look at its Content-Type.
#
function(ans, header, .encoding = NA)
{
  headerText = if(is.character(header)) header else header$value()
  http.header = parseHTTPHeader(headerText)

  stop.if.HTTP.error(http.header, ans)  

  content.type = getContentType(http.header)
  binary = isBinaryContent(http.header, content.type)
  if(!(is.na(binary) || binary)) {
     ans = rawToChar(ans)
     if(length(.encoding)  == 0 || is.na(.encoding)) {
        charset = grep("charset", content.type, value = TRUE)
        if(length(charset))
          .encoding = strsplit(charset, "=")[[1]][2]
     }
     if(length(.encoding) && !is.na(.encoding))
       Encoding(ans) = .encoding
  } else {
     attr(ans, "Content-Type") = getContentType(http.header)
     ans
  }
  ans
}

trim =
function(x) 
{
    gsub("(^[[:space:]]+|[[:space:]]+$)", "", x, perl = TRUE)
}

getContentType = 
function(header, full = FALSE)
{
   i = match("content-type", tolower(names(header)))
   if( is.na( i ) )
      return(character())

   tmp = trim(strsplit(header[i[1]], "; *")[[1]])
   if(!full)
       return(tmp)

   vals = strsplit(tmp, "=")
   structure(gsub(";$", "", sapply(vals, function(x) x[length(x)])), 
             names = sapply(vals, function(x) if(length(x) > 1) x[1] else ""))
}

# See http://www.iana.org/assignments/media-types/
textContentTypes = c("html", "text", "xhtml", "plain", "xml", "tex", "x-latex", "css", "latex", "x-texinfo",
                     "sgml", "postscript", "texinfo", "ecmascript", "javascript",
                     "atom+xml", "json", "x-bsh", "x-sh", "x-shar","x-rtf", "base64", "x-tcl", "x-lisp", "x-java-source",
                     "x-fortran", "x-script.csv", "x-csh", "x-c")

if(FALSE) {
   # see mime.R
guessMIMEType =
function(filename)
{
    ext = getExtension(filename)
    data("mimeTypeExtensions", package = "RCurl", envir = environment())
    mimeTypeExtensions[ext]
}
}


isBinaryFile =
function(name, mimeType = NA)
{
#      type = "application/binary"

  if(is.na(mimeType)) 
    mimeType  = guessMIMEType()

  isBinaryContent( , mimeType)  
}


isBinaryContent =
  #
  #  type can be given as a list intended to be separate  header elements
  #  e.g. Content-Type, Content-Encoding, etc.
  #  Each can be a vector.
  #
function(header, type = getContentType(header)[1],
          textTypes = getOption("text.content.types", textContentTypes))
{
   if(is.list(type) && length(type) > 1) {
     last <- TRUE
     for(i in type) {
        if(length(i) && !is.na(i) && (last <- isBinaryContent(header, i, textTypes)))
          return(TRUE)
     }
     return(last)
   }
  
   if(length(type) == 0)
     return(NA)
  
   if(is.null(textTypes))
     textTypes = textContentTypes
   type.els = strsplit(type, "/")[[1]]
   if(type.els[1] == "text")
     return(FALSE)
   
   if(any(type.els %in% textContentTypes))
      return(FALSE)

   if(length(grep("\\+xml$", type.els)))
      return(FALSE)

   TRUE
}
omegahat/RCurl documentation built on June 10, 2022, 12:34 p.m.