
# An interface to data hosted online in Socrata data repositories
# 
# Author: Hugh J. Devlin, Ph. D. 2013-08-28
###############################################################################

# library('httr')       # for access to the HTTP header
# library('jsonlite')   # for parsing data types from Socrata
# library('mime')       # for guessing mime type
# library('plyr')       # for parsing JSON files

#' Time-stamped message
#'
#' Issue a time-stamped, origin-stamped log message. 
#' @param s - a string
#' @return None (invisible NULL) as per cat
#' @author Hugh J. Devlin \email{Hugh.Devlin@@cityofchicago.org}
#' @noRd
logMsg <- function(s) {
  cat(format(Sys.time(), "%Y-%m-%d %H:%M:%OS3 "), as.character(sys.call(-1))[1], ": ", s, '\n', sep='')
}
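# e.g., when called from inside read.socrata, logMsg("no rows returned") prints a line like
# "2013-08-28 10:15:42.123 read.socrata: no rows returned" (illustrative timestamp and caller).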

#' Compiles the information to be used in HTTP headers
#' 
#' Grabs the headers (RSocrata version, OS, and R version) to be used while
#' making HTTP requests with Socrata. This enables Socrata's team to track
#' the usage of RSocrata.
#' @return a string
#' @importFrom utils packageVersion
#' @author Tom Schenk Jr \email{tom.schenk@@cityofchicago.org}
#' @noRd
fetch_user_agent <- function() {
  rSocrataVersion <- packageVersion("RSocrata")
  operatingSystem <- Sys.info()[["sysname"]]
  operatingSystemVersion <- paste(Sys.info()[["release"]], Sys.info()[["version"]])
  rVersion <- paste0(R.version$major, 
                     ".", 
                     R.version$minor, 
                     ifelse( # Checks if version has status, e.g., "rev"
                       R.version$status == "", 
                       "", 
                       paste0("-",R.version$status))
  )
  
  header <- paste0( "RSocrata/",
                    rSocrataVersion, " (",
                    operatingSystem, "/",
                    operatingSystemVersion, "; ",
                    "R/", rVersion,
                    ")"
  )
  return(header)
}
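# The returned string has the form "RSocrata/<package version> (<OS>/<OS release and version>; R/<R version>)",
# e.g., "RSocrata/1.7.11 (Linux/5.15.0 #72-Ubuntu; R/4.2.1)" (illustrative values).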

#' Checks the validity of the syntax for a potential Socrata dataset Unique Identifier, also known as a 4x4.
#'
#' Will check the validity of a potential dataset unique identifier
#' supported by Socrata. It returns FALSE if the syntax does not
#' match that of a Socrata unique identifier. It only checks the
#' validity of the syntax; it does not check whether the identifier actually exists.
#' @param fourByFour - a string; character vector of length one
#' @return TRUE if is valid Socrata unique identifier, FALSE otherwise
#' @author Tom Schenk Jr \email{tom.schenk@@cityofchicago.org}
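#' @examples
#' # The check is purely syntactic; these identifiers need not exist:
#' isFourByFour("4334-bgaj") # TRUE
#' isFourByFour("4334bgaj")  # FALSE: no hyphen, wrong length
#' isFourByFour("433-bgaj")  # FALSE: wrong length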
#' @export
isFourByFour <- function(fourByFour) {
  fourByFour <- as.character(fourByFour)
  if(nchar(fourByFour) != 9)
    return(FALSE)
  if(regexpr("[[:alnum:]]{4}-[[:alnum:]]{4}", fourByFour) == -1)
    return(FALSE)
  TRUE	
}

#' Convert, if necessary, URL to valid REST API URL supported by Socrata.
#'
#' Will convert a human-readable URL to a valid REST API call
#' supported by Socrata. It will accept a valid API URL if provided
#' by users and will also convert a human-readable URL to a valid API
#' URL. Will accept queries with optional API token as a separate
#' argument or will also accept API token in the URL query. Will
#' resolve conflicting API token by deferring to original URL.
#' @param url - a string; character vector of length one
#' @param app_token - a string; SODA API token used to query the data 
#' portal \url{https://dev.socrata.com/consumers/getting-started.html}
#' @return a valid URL
#' @importFrom httr parse_url build_url
#' @author Tom Schenk Jr \email{tom.schenk@@cityofchicago.org}
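#' @examples
#' \dontrun{
#' # A human-friendly dataset URL is rewritten to its SODA resource (CSV) form;
#' # the 4x4 below is the soda.demo earthquake dataset used in the read.socrata examples.
#' validateUrl("https://soda.demo.socrata.com/dataset/USGS-Earthquakes-for-2012-11-01-API/4334-bgaj",
#'             app_token = NULL)
#' # "https://soda.demo.socrata.com/resource/4334-bgaj.csv"
#' }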
#' @export
validateUrl <- function(url, app_token) {
  url <- as.character(url)
  parsedUrl <- httr::parse_url(url)
  if(is.null(parsedUrl$scheme) | is.null(parsedUrl$hostname) | is.null(parsedUrl$path))
    stop(url, " does not appear to be a valid URL.")
  if(!is.null(app_token)) { # Handles the addition of API token and resolves invalid uses
    if(is.null(parsedUrl$query[["$$app_token"]])) {
      token_inclusion <- "valid_use"
    } else {
      token_inclusion <- "already_included" }
    switch(token_inclusion,
           "already_included"={ # Token already included in url argument
             warning(url, " already contains an API token in url. Ignoring token supplied in the `app_token=` argument.")
           },
           "valid_use"={ # app_token argument is used, not duplicative.
            parsedUrl$query$`$$app_token` <- as.character(app_token)
           }
          )
  } 
  if(substr(parsedUrl$path, 1, 9) == 'resource/') {
    return(httr::build_url(parsedUrl)) # resource url already
  } else if(basename(parsedUrl$path) == "rows.json" | basename(parsedUrl$path) == "rows.csv") { # See issue #124
    parsedUrl$path <- substr(parsedUrl$path, start = 11, stop = 19)
    parsedUrl$query <- NULL
  }
  
  # if /data appended to URL, remove it
  pathLength <- nchar(parsedUrl$path)
  if(substr(parsedUrl$path, pathLength - 4, pathLength) == '/data') {
     parsedUrl$path <- substr(parsedUrl$path, 1, pathLength - 5)
  }
  if(substr(parsedUrl$path, pathLength - 5, pathLength) == '/data/')  {
    parsedUrl$path <- substr(parsedUrl$path, 1, pathLength - 6)
  }

  
  fourByFour <- basename(parsedUrl$path)
  if(!isFourByFour(fourByFour))
    stop(fourByFour, " is not a valid Socrata dataset unique identifier.")
  else {
    parsedUrl$path <- paste('resource/', fourByFour, '.csv', sep="")
    httr::build_url(parsedUrl)
  }
}

#' Convert Socrata human-readable column name to field name
#' 
#' Convert Socrata human-readable column name,
#' as it might appear in the first row of data,
#' to field name as it might appear in the HTTP header;
#' that is, lower case, periods replaced with underscores.
#' @param humanName - a Socrata human-readable column name
#' @return Socrata field name 
#' @export
#' @author Hugh J. Devlin, Ph. D. \email{Hugh.Devlin@@cityofchicago.org}
#' @examples
#' fieldName("Number.of.Stations") # number_of_stations
fieldName <- function(humanName) {
  tolower(gsub('\\.', '_', as.character(humanName)))	
}

#' Convert Socrata calendar_date string to POSIX
#'
#' @param x - character vector in one of the Socrata calendar_date formats (CSV short or long date, or ISO 8601)
#' @return a POSIX date
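#' @examples
#' # The two Socrata CSV date flavors and the ISO 8601 form from the JSON endpoints
#' # (illustrative values) all parse to POSIXct:
#' posixify("09/14/2012 10:38:01 PM")
#' posixify("09/14/2012")
#' posixify("2012-09-14T22:38:01")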
#' @export
#' @author Hugh J. Devlin, Ph. D. \email{Hugh.Devlin@@cityofchicago.org}
posixify <- function(x) {
  x <- as.character(x)
  if (length(x)==0) return(x)
  
  ## Define regex patterns for short and long date formats (CSV) and ISO 8601 (JSON),  
  ## which are the three formats that are supplied by Socrata. 
  patternShortCsv <- paste0("^[[:digit:]]{1,2}/[[:digit:]]{1,2}/[[:digit:]]{4}$") # MM/DD/YYYY
  patternLongCsv <- paste("^[[:digit:]]{1,2}/[[:digit:]]{1,2}/[[:digit:]]{4}", 
                           "[[:digit:]]{1,2}:[[:digit:]]{1,2}:[[:digit:]]{1,2}",
                           ".M$") # MM/DD/YYYY hh:mm:ss AM/PM
  patternJsonDecimal <- paste0("^[[:digit:]]{4}-[[:digit:]]{2}-[[:digit:]]{2}T",
                               "[[:digit:]]{2}:[[:digit:]]{2}:[[:digit:]]{2}.[[:digit:]]{3}","$") # YYYY-MM-DDThh:mm:ss.sss
  patternJsonNoDecimal <- paste0("^[[:digit:]]{4}-[[:digit:]]{2}-[[:digit:]]{2}T",
                                 "[[:digit:]]{2}:[[:digit:]]{2}:[[:digit:]]{2}","$") # YYYY-MM-DDThh:mm:ss
  ## Flag which elements match each pattern with grepl
  nMatchesShortCsv <- grepl(pattern = patternShortCsv, x)
  nMatchesLongCsv <- grepl(pattern = patternLongCsv, x)
  nMatchesJsonDecimal <- grepl(pattern = patternJsonDecimal, x)
  nMatchesJsonNoDecimal <- grepl(pattern = patternJsonNoDecimal, x)
  ## Parse as the most likely calendar date format; if both long and short CSV formats appear, the long format wins
  if( any(nMatchesLongCsv == TRUE) ){
    return(as.POSIXct(strptime(x, format="%m/%d/%Y %I:%M:%S %p"))) # long date-time format
  }	else if ( any(nMatchesShortCsv == TRUE) ){
    return(as.POSIXct(strptime(x, format="%m/%d/%Y"))) # short date format
  } 
  if( any(nMatchesJsonDecimal == TRUE) | any(nMatchesJsonNoDecimal == TRUE) ){
    return(as.POSIXct(x, format = "%Y-%m-%dT%H:%M:%S")) # JSON format
  } else {
    warning("Unable to properly format date field; formatted as character string.")
    return(x)
  }
}

#' Convert Socrata money fields to numeric
#' 
#' @param x - a factor of Money fields
#' @return a number
#' @author Tom Schenk Jr \email{tom.schenk@@cityofchicago.org}
#' @noRd
no_deniro <- function(x) {
  x <- sub("\\$", "", x)
  x <- as.numeric(x)
}
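# e.g., no_deniro("$15.75") returns 15.75; note that only the dollar sign is stripped,
# so values containing comma thousands separators would become NA.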

#' Wrap httr GET in some diagnostics
#' 
#' In case of failure, report error details from Socrata
#' 
#' @param url - Socrata Open Data Application Program Interface (SODA) query
#' @param email - Optional. The email to the Socrata account with read access to the dataset.
#' @param password - Optional. The password associated with the email to the Socrata account
#' @return httr response object
#' @importFrom httr http_status GET content stop_for_status user_agent authenticate
#' @author Hugh J. Devlin, Ph. D. \email{Hugh.Devlin@@cityofchicago.org}
#' @noRd
getResponse <- function(url, email = NULL, password = NULL) {
  
  if(is.null(email) && is.null(password)){
    response <- httr::GET(url, httr::user_agent(fetch_user_agent()))
  } else { # at least one of email/password was supplied
    response <- httr::GET(url, httr::authenticate(email, password), httr::user_agent(fetch_user_agent()))
  }
  
  # status <- httr::http_status(response)
  if(response$status_code != 200) {
    msg <- paste("Error in httr GET:", response$status_code, response$headers$statusmessage, url)
    if(!is.null(response$headers$`content-length`) && (response$headers$`content-length` > 0)) {
      details <- httr::content(response)
      msg <- paste(msg, details$code[1], details$message[1])	
    }
    logMsg(msg)
  }
  httr::stop_for_status(response)
  return(response)
}

#' Content parsers
#'
#' Return a data frame for CSV or JSON content
#'
#' @author Hugh J. Devlin \email{Hugh.Devlin@@cityofchicago.org}
#' @importFrom httr content
#' @importFrom jsonlite fromJSON
#' @importFrom utils read.csv
#' @param response - an httr response object
#' @return data frame, possibly empty
#' @noRd
getContentAsDataFrame <- function(response) {
  mimeType <- response$headers[['content-type']]
  # skip optional parameters
  sep <- regexpr(';', mimeType)[1]
  if(sep != -1) mimeType <- substr(mimeType, 0, sep[1] - 1)
  switch(mimeType,
         'text/csv' = 
           read.csv(textConnection(httr::content(response, 
                                                 as = "text", 
                                                 type = "text/csv", 
                                                 encoding = "utf-8")), 
                    stringsAsFactors = FALSE), # automatic parsing
         'application/json' = 
           if(length(httr::content(response)) == 0) # empty json?
             data.frame() # empty data frame
         else
           as.data.frame.list(fromJSON(httr::content(response,
                                                     as = "text",
                                                     type = "application/json",
                                                     encoding = "utf-8")),
                              stringsAsFactors=FALSE)
  ) # end switch
}

#' Get the SoDA 2 data types
#'
#' Get the Socrata Open Data Application Program Interface data types from the http response header
#' @author Hugh J. Devlin, Ph. D. \email{Hugh.Devlin@@cityofchicago.org}
#' @param response - an httr response object
#' @return a named vector mapping field names to data types
#' @importFrom jsonlite fromJSON
#' @noRd
getSodaTypes <- function(response) {
  dataTypes <- response$headers[['x-soda2-types']]
  if (is.null(dataTypes)) {
    return(NULL)
  }
  else {
    result <- jsonlite::fromJSON(response$headers[['x-soda2-types']])
    names(result) <- jsonlite::fromJSON(response$headers[['x-soda2-fields']])
    return(result)
  }
}
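# e.g., for a response whose headers include (illustrative field names)
#   x-soda2-types:  ["text","calendar_date","number"]
#   x-soda2-fields: ["name","date","count"]
# getSodaTypes(response) returns c(name = "text", date = "calendar_date", count = "number").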

#' Get a full Socrata data set as an R data frame
#'
#' Manages throttling and POSIX date-time conversions
#'
#' @param url - A Socrata resource URL, 
#' or a Socrata "human-friendly" URL, 
#' or a Socrata Open Data Application Program Interface (SODA) query 
#' requesting a comma-separated download format (.csv suffix). 
#' May include SoQL parameters, 
#' but is assumed not to include a SODA offset parameter
#' @param app_token - a string; SODA API token used to query the data 
#' portal \url{https://dev.socrata.com/consumers/getting-started.html}
#' @param email - Optional. The email to the Socrata account with read access to the dataset
#' @param password - Optional. The password associated with the email to the Socrata account
#' @param stringsAsFactors - Optional. Should character columns be converted to factor (TRUE or FALSE)?
#' @return an R data frame with POSIX dates
#' @author Hugh J. Devlin, Ph. D. \email{Hugh.Devlin@@cityofchicago.org}
#' @examples
#' \dontrun{
#' # Human-readable URL:
#' url <- "https://soda.demo.socrata.com/dataset/USGS-Earthquakes-for-2012-11-01-API/4334-bgaj"
#' df <- read.socrata(url)
#' # SoDA URL:
#' df <- read.socrata("https://soda.demo.socrata.com/resource/4334-bgaj.csv")
#' # Download private dataset
#' socrataEmail <- Sys.getenv("SOCRATA_EMAIL", "mark.silverberg+soda.demo@@socrata.com")
#' socrataPassword <- Sys.getenv("SOCRATA_PASSWORD", "7vFDsGFDUG")
#' privateResourceToReadCsvUrl <- "https://soda.demo.socrata.com/resource/a9g2-feh2.csv" # dataset
#' read.socrata(url = privateResourceToReadCsvUrl, email = socrataEmail, password = socrataPassword)
#' # Using an API key to read datasets (reduces throttling)
#' token <- "ew2rEMuESuzWPqMkyPfOSGJgE"
#' df <- read.socrata("https://soda.demo.socrata.com/resource/4334-bgaj.csv", 
#'                    app_token = token)
#' nrow(df)
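#' # SoQL query parameters are passed through, e.g. an illustrative filter on the same dataset:
#' df_filtered <- read.socrata("https://soda.demo.socrata.com/resource/4334-bgaj.csv?$where=magnitude > 3.0")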
#' closeAllConnections()
#' }
#' @importFrom httr parse_url build_url
#' @importFrom mime guess_type
#' @importFrom plyr rbind.fill
#' @export
read.socrata <- function(url, app_token = NULL, email = NULL, password = NULL,
                         stringsAsFactors = FALSE) {
  validUrl <- validateUrl(url, app_token) # check url syntax, allow human-readable Socrata url
  parsedUrl <- httr::parse_url(validUrl)
  mimeType <- mime::guess_type(parsedUrl$path)
  if (!is.null(names(parsedUrl$query))) { # check if URL has any queries 
    ## if there is a query, check for specific queries and handle them
    orderTest <- any(names(parsedUrl$query) == "$order")
    queries <- unlist(parsedUrl$query)
    countTest <- any(startsWith(queries, "count"))
    if(!orderTest & !countTest) # sort by Socrata unique identifier
      validUrl <- paste(validUrl, if(is.null(parsedUrl$query)) {'?'} else {"&"}, '$order=:id', sep='')
  }
  else {
    validUrl <- paste0(validUrl, '?$order=:id')
    parsedUrl <- httr::parse_url(validUrl) # reparse because URL now has a query
  }
  if(!(mimeType %in% c('text/csv','application/json')))
    stop("Error in read.socrata: ", mimeType, " not a supported data format.")
  response <- getResponse(validUrl, email, password)
  page <- getContentAsDataFrame(response)
  result <- page
  dataTypes <- getSodaTypes(response)
  # parse any $limit out of the URL
  if(is.null(parsedUrl$query$`$limit`) & is.null(parsedUrl$query$`$LIMIT`))
    limitProvided <- FALSE
  else { 
    limitProvided <- TRUE
  }
  # PAGE through data and combine
  # if a user-supplied $limit is provided, do not page
  # if no $limit is provided, loop until all data is paged
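  # e.g., each paged request has the form "<validUrl>&$limit=50000&$offset=<rows fetched so far>"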
  while (nrow(page) > 0 & !limitProvided) { 
    query <- paste(validUrl, if(is.null(parsedUrl$query)) {'?'} else {"&"}, 
                   '$limit=50000&$offset=', nrow(result), sep='')
    response <- getResponse(query, email, password)
    page <- getContentAsDataFrame(response)
    result <- rbind.fill(result, page) # accumulate
  }	
  if (is.null(dataTypes)) {
    warning("Dates and currency fields will be converted to character")
  } else {
    # convert Socrata calendar dates to posix format
    for(columnName in colnames(result)[!is.na(dataTypes[fieldName(colnames(result))])
                                       & (dataTypes[fieldName(colnames(result))] == 'calendar_date'
                                          | dataTypes[fieldName(colnames(result))] == 'floating_timestamp')]) {
      result[[columnName]] <- posixify(result[[columnName]])
    }
    for(columnName in colnames(result)[!is.na(dataTypes[fieldName(colnames(result))]) & dataTypes[fieldName(colnames(result))] == 'money']) {
      result[[columnName]] <- no_deniro(result[[columnName]])
    }
  }
  # convert logical fields to character
  for(columnName in colnames(result)) {
    if(typeof(result[,columnName]) == "logical")
      result[,columnName] <- as.character(result[,columnName])
  }
  if(stringsAsFactors){
    result <- data.frame(unclass(result), stringsAsFactors = stringsAsFactors)
  }
  return(result)
}

#' List datasets available from a Socrata domain
#'
#' @param url - A Socrata URL. This simply points to the site root. 
#' @return an R data frame containing a listing of datasets along with
#' various metadata.
#' @author Peter Schmiedeskamp \email{pschmied@@uw.edu}
#' @examples
#' \dontrun{
#' # Download list of data sets
#' df <- ls.socrata("https://soda.demo.socrata.com")
#' # Check schema definition for metadata
#' attributes(df)
#' }
#' @importFrom jsonlite fromJSON
#' @importFrom httr GET build_url parse_url content user_agent
#' @export
ls.socrata <- function(url) {
  url <- as.character(url)
  parsedUrl <- httr::parse_url(url)
  if(is.null(parsedUrl$scheme) | is.null(parsedUrl$hostname))
    stop(url, " does not appear to be a valid URL.")
  parsedUrl$path <- "data.json"
  #Download data
  response <- httr::GET(httr::build_url(parsedUrl), httr::user_agent(fetch_user_agent()))
  data_dot_json <- jsonlite::fromJSON(content(response, "text"))
  
  data_df <- as.data.frame(data_dot_json$dataset)
  # Assign Catalog Fields as attributes
  attr(data_df, "@context") <- data_dot_json$`@context`
  attr(data_df, "@id") <- data_dot_json$`@id`
  attr(data_df, "@type") <- data_dot_json$`@type`
  attr(data_df, "conformsTo") <- data_dot_json$conformsTo
  attr(data_df, "describedBy") <- data_dot_json$describedBy
  # Convert dates (strings) to POSIX-formatted dates
  data_df$issued <- as.POSIXct(data_df$issued)
  data_df$modified <- as.POSIXct(data_df$modified)
  data_df$theme <- as.character(data_df$theme)
  return(data_df)
}

#' Wrap httr PUT/POST in some diagnostics
#' 
#' In case of failure, report error details from Socrata.
#' 
#' @param url - Socrata Open Data Application Program Interface (SODA) endpoint (JSON only for now)
#' @param json_data_to_upload - JSON encoded data to update your SODA endpoint with
#' @param http_verb - PUT or POST depending on update mode
#' @param email - email associated with Socrata account (will need write access to dataset)
#' @param password - password associated with Socrata account (will need write access to dataset)
#' @param app_token - optional app_token associated with Socrata account
#' @return an httr response object
#' @importFrom httr GET POST PUT authenticate user_agent add_headers
#' 
#' @noRd
checkUpdateResponse <- function(json_data_to_upload, url, http_verb, email, password, app_token = NULL) {
  if(http_verb == "POST"){
    response <- httr::POST(url,
                           body = json_data_to_upload,
                           httr::authenticate(email, password),
                           httr::user_agent(fetch_user_agent()),
                           httr::add_headers("X-App-Token" = app_token,
                                             "Content-Type" = "application/json")) #, verbose())
  } else if(http_verb == "PUT"){
    response <- httr::PUT(url,
                          body = json_data_to_upload,
                          httr::authenticate(email, password),
                          httr::user_agent(fetch_user_agent()),
                          httr::add_headers("X-App-Token" = app_token,
                                            "Content-Type" = "application/json")) # , verbose())
  }
  
  return(response)
}

#' Write to a Socrata dataset (full replace or upsert)
#'
#' @description Method for updating Socrata datasets 
#'
#' @param dataframe - dataframe to upload to Socrata
#' @param dataset_json_endpoint - Socrata Open Data Application Program Interface (SODA) endpoint (JSON only for now)
#' @param update_mode - "UPSERT" or "REPLACE"; consult https://dev.socrata.com/publishers/getting-started.html 
#' @param email - The email for the Socrata account with write access to the dataset
#' @param password - The password associated with the email for the Socrata account
#' @param app_token - a (non-required) string; SODA API token can be used to query the data 
#' portal \url{https://dev.socrata.com/consumers/getting-started.html}
#' @author Mark Silverberg \email{mark.silverberg@@socrata.com}
#' @importFrom httr parse_url build_url
#' @examples
#' \dontrun{
#' # Store user email and password
#' socrataEmail <- Sys.getenv("SOCRATA_EMAIL", "mark.silverberg+soda.demo@@socrata.com")
#' socrataPassword <- Sys.getenv("SOCRATA_PASSWORD", "7vFDsGFDUG")
#' 
#' datasetToAddToUrl <- "https://soda.demo.socrata.com/resource/xh6g-yugi.json" # dataset
#' 
#' # Generate some data
#' x <- sample(-1000:1000, 1)
#' y <- sample(-1000:1000, 1)
#' df_in <- data.frame(x,y)
#' 
#' # Upload to Socrata
#' write.socrata(df_in,datasetToAddToUrl,"UPSERT",socrataEmail,socrataPassword)
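#' 
#' # Full replace instead of upsert uses the same call with a different update_mode:
#' write.socrata(df_in,datasetToAddToUrl,"REPLACE",socrataEmail,socrataPassword)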
#' }
#' @export
write.socrata <- function(dataframe, dataset_json_endpoint, update_mode, email, password, app_token = NULL) {
  
  # translate update_mode to http_verbs
  if(update_mode == "UPSERT"){
    http_verb <- "POST"
  } else if(update_mode == "REPLACE") {
    http_verb <- "PUT"
  } else {
    stop("update_mode must be UPSERT or REPLACE")
  }
  
  # convert dataframe to JSON
  dataframe_as_json_string <- jsonlite::toJSON(dataframe)
  
  # do the actual upload
  response <- checkUpdateResponse(dataframe_as_json_string, dataset_json_endpoint, http_verb, email, password, app_token)
  
  return(response)
  
}
