R/get_resource_sql.R

Defines functions get_resource_sql

Documented in get_resource_sql

#' Get PHS Open Data using SQL
#'
#' @description Similar to [get_resource()]
#' but with more options for server-side querying of data.
#' However, this function has a lower maximum row number
#' (32,000 vs 99,999) for returned results.
#'
#' @param sql (character) a single PostgreSQL SELECT query.
#'
#' Must include a resource ID, which must be double-quoted
#' e.g.,
#'
#' ```SELECT * from "58527343-a930-4058-bf9e-3c6e5cb04010"```
#'
#'
#' Column names must be double-quoted,
#' while character values in filters must be single-quoted.
#' e.g.,
#'
#' ```"Age" = '34'```
#'
#'
#' You may need to escape quote marks with `\` to implement this. e.g.,
#'
#'
#' ```sql = "SELECT * FROM \"<res_id>\" WHERE \"Age\" = '34'"```.
#'
#' @seealso [get_resource()] for downloading a resource without using a
#' SQL query.
#'
#' @return a [tibble][tibble::tibble-package] with the query results.
#' Only 32,000 rows can be returned from a single SQL query.
#' @export
#'
#' @examples
#' sql <- "
#'    SELECT
#'      \"TotalCancelled\",\"TotalOperations\",\"Hospital\",\"Month\"
#'    FROM
#'      \"bcc860a4-49f4-4232-a76b-f559cf6eb885\"
#'    WHERE
#'      \"Hospital\" = 'D102H'
#' "
#' df <- get_resource_sql(sql)
#'
#' # This is equivalent to:
#' cols <- c(
#'   "TotalCancelled", "TotalOperations",
#'   "Hospital", "Month"
#' )
#' row_filter <- c(Hospital = "D102H")
#'
#' df2 <- get_resource(
#'   "bcc860a4-49f4-4232-a76b-f559cf6eb885",
#'   col_select = cols,
#'   row_filters = row_filter
#' )
get_resource_sql <- function(sql) {
  if (length(sql) != 1) {
    cli::cli_abort(c(
      x = "SQL validation error.",
      i = "{.var sql} must be length 1 not {length(sql)}."
    ))
  }

  if (!inherits(sql, "character")) {
    cli::cli_abort(c(
      x = "SQL validation error.",
      i = "{.var sql} must be of class {.cls character} not {.cls {class(sql)}}."
    ))
  }

  # check query is a SELECT statement
  if (!grepl("^\\s*?SELECT", sql)) {
    cli::cli_abort(c(
      x = "SQL validation error.",
      i = "{.var sql} must start with {.val SELECT}"
    ))
  }

  # Add the SQL statement to the query
  query <- list("sql" = sql)

  # attempt get request
  content <- phs_GET("datastore_search_sql", query)

  if (!is.null(content[["result"]][["records_truncated"]])) {
    cli::cli_warn(
      "The data was truncated because your query matched more than the
      maximum number of rows."
    )
  }


  # extract the records (rows) from content
  data <- purrr::map_dfr(
    content$result$records,
    ~ {
      # replace NULL with "" so tibble works
      is_null <- purrr::map_lgl(.x, is.null)
      .x[is_null] <- ""

      tibble::as_tibble(.x)
    }
  )

  # If the query returned no rows, exit now.
  if (nrow(data) == 0L) {
    return(data)
  }

  # get correct order of columns
  order <- purrr::map_chr(
    content$result$fields,
    ~ .x$id
  )
  order <- order[!order %in% c("_id", "_full_text")]

  # select and reorder columns to reflect
  cleaner <- dplyr::select(data, dplyr::all_of(order))

  # warn if limit may have been surpassed
  if (nrow(cleaner) == 32000L) {
    cli::cli_warn(c(
      "Row number limit",
      i = "SQL queries are limitted to returning 32,000 results.
      This may have affected the results of your query."
    ))
  }

  return(cleaner)
}
Public-Health-Scotland/phsopendata documentation built on April 14, 2025, 11:45 a.m.