R/split_query.R

Defines functions split_comma_list split_clause split_limit split_order_by split_having split_group_by split_where split_from split_select clause_starts_here split_query

Documented in split_query

# Copyright 2023 Cloudera Inc.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

#' Split a SQL query
#'
#' @description Splits a SQL \code{SELECT} statement into clauses, and splits
#'   comma-separated column lists within the clauses.
#'
#' @param query a character string containing a SQL \code{SELECT} statement
#' @param tidyverse for queryparser internal use only
#' @return A list object with named elements representing the clauses of the
#'   query
#' @examples
#' my_query <- "SELECT origin, dest,
#'     COUNT(flight) AS num_flts,
#'     round(AVG(distance)) AS dist,
#'     round(AVG(arr_delay)) AS avg_delay
#'   FROM flights
#'   WHERE distance BETWEEN 200 AND 300
#'     AND air_time IS NOT NULL
#'   GROUP BY origin, dest
#'   HAVING num_flts > 3000
#'   ORDER BY num_flts DESC, avg_delay DESC
#'   LIMIT 100;"
#'
#' split_query(my_query)
#' @seealso \code{\link{parse_query}}
#' @export
split_query <- function(query, tidyverse) {
  if (!identical(typeof(query), "character") || !identical(length(query), 1L)) {
    stop("Unexpected input to split_query()", call. = FALSE)
  }

  original_encoding <- Encoding(query)
  if (original_encoding == "unknown") {
    original_encoding <- "UTF-8"
  }

  query <- trimws(query)
  query <- squish_sql(query)
  query <- sub(";$", "", query)

  rc <- rawConnection(raw(0L), "r+")
  on.exit(close(rc))
  writeChar(query, rc)
  len <- seek(rc, 0L) - 1L

  if (!clause_starts_here(rc, "select")) {
    stop("Query must begin with the SELECT keyword", call. = FALSE)
  }

  seek(rc, 7L)
  select_distinct <- FALSE
  if (clause_starts_here(rc, "all")) {
    seek(rc, 10L)
  } else if (clause_starts_here(rc, "distinct")) {
    select_distinct <- TRUE
    seek(rc, 15L)
  } else {
    seek(rc, 6L)
  }

  pos_from <- NULL
  pos_where <- NULL
  pos_group_by <- NULL
  pos_having <- NULL
  pos_order_by <- NULL
  pos_limit <- NULL

  in_quotes <- FALSE
  in_parens <- 0
  escaped <- FALSE

  while((pos <- seek(rc, NA)) <= len) {

    # identify when inside strings and parentheses
    char <- readChar(rc, 1L, useBytes = TRUE)
    if (char %in% quote_chars) {
      if (!in_quotes) {
        in_quotes <- TRUE
        escaped <- FALSE
        quo_char <- char
      } else if (char == quo_char) {
        if (escaped) {
          escaped <- FALSE
        } else {
          esc_quo <- c(quo_char, "\\")
          if (!readChar(rc, 1L, useBytes = TRUE) %in% esc_quo) {
            in_quotes <- FALSE
            escaped <- FALSE
            rm(quo_char)
          } else {
            escaped <- TRUE
          }
          seek(rc, -1L, "current")
        }
      }
      in_word <- FALSE
    } else if (!in_quotes && char == "(") {
      escaped <- FALSE
      in_parens <- in_parens + 1
      in_word <- FALSE
    } else if (!in_quotes && char == ")") {
      escaped <- FALSE
      in_parens <- in_parens - 1
      in_word <- FALSE
    } else if (is_word_character(char, useBytes = TRUE)) {
      escaped <- FALSE
      in_word <- TRUE
    } else {
      escaped <- FALSE
      in_word <- FALSE
    }

    if (!in_quotes && !in_word) {

      # identify unsupported syntax
      if (clause_starts_here(rc, "over")) {
        stop("OVER clauses are not supported", call. = FALSE)
      }
      if (clause_starts_here(rc, "select")) {
        if (in_parens > 0) {
          stop("Subqueries are not supported", call. = FALSE)
        } else {
          stop("The SELECT keyword is used two or more times", call. = FALSE)
        }
      }
    }

    if (!in_quotes && in_parens <= 0 && !in_word) {

      # identify unsupported syntax
      if (clause_starts_here(rc, "union")) {
        stop("The UNION operator is not supported", call. = FALSE)
      }
      if (clause_starts_here(rc, "intersect")) {
        stop("The INTERSECT operator is not supported", call. = FALSE)
      }
      if (clause_starts_here(rc, "except")) {
        stop("The EXCEPT operator is not supported", call. = FALSE)
      }

      # identify beginnings of clauses
      if (clause_starts_here(rc, "from")) {
        # don't split on the "from" is "is [not] distinct from"
        if (!preceded_by_keyword(rc, "distinct", useBytes = TRUE)) {
          pos_from <- append(pos_from, pos + 1L)
        }
      } else if (clause_starts_here(rc, "where")) {
        pos_where <- append(pos_where, pos + 1L)
      } else if (clause_starts_here(rc, "group by")) {
        pos_group_by <- append(pos_group_by, pos + 1L)
      } else if (clause_starts_here(rc, "having")) {
        pos_having <- append(pos_having, pos + 1L)
      } else if (clause_starts_here(rc, "order by")) {
        pos_order_by <- append(pos_order_by, pos + 1L)
      } else if (clause_starts_here(rc, "limit")) {
        pos_limit <- append(pos_limit, pos + 1L)
      }

    }

    seek(rc, pos + 1)
  }

  if (in_quotes) {
    stop("Query contains unmatched quotation marks", call. = FALSE)
  }
  if (in_parens > 0) {
    stop("Query contains unmatched parentheses", call. = FALSE)
  }

  start_pos <- list(
    "select" = 0,
    "from" = pos_from,
    "where" = pos_where,
    "group_by" = pos_group_by,
    "having" = pos_having,
    "order_by" = pos_order_by,
    "limit" = pos_limit
  )
  if (any(lapply(start_pos, length) > 1)) {
    stop("One or more clauses is used two or more times", call. = FALSE)
  }
  start_pos <- unlist(start_pos) + 1
  if (any(diff(start_pos) < 0)) {
    stop("Clauses are in an incorrect order", call. = FALSE)
  }
  stop_pos <- c(start_pos[-1] - 1, len)
  names(stop_pos) <- names(start_pos)
  clauses <- mapply(
    function(x, y) list(start = x, stop = y),
    start_pos,
    stop_pos,
    SIMPLIFY = FALSE
  )
  Encoding(query) <- "bytes"
  clauses <- lapply(
    clauses, function(x) {
      substr(query, x$start, x$stop)
    }
  )

  clauses$select <- split_select(clauses$select)
  clauses$from <- split_from(clauses$from)
  clauses$where <- split_where(clauses$where)
  clauses$group_by <- split_group_by(clauses$group_by)
  clauses$having <- split_having(clauses$having)
  clauses$order_by <- split_order_by(clauses$order_by)
  clauses$limit <- split_limit(clauses$limit)

  clauses <- lapply(clauses, function(clause) {Encoding(clause) <- original_encoding; clause})

  if (select_distinct) {
    attr(clauses$select, "distinct") <- TRUE
  }

  clauses
}

clause_starts_here <- function(rc, keyword) {
  keyword_starts_here(rc, keyword, useBytes = TRUE, look_back = FALSE)
}

split_select <- function(clause) {
  split_comma_list(split_clause(clause, "select( all)?( distinct)?"))
}

split_from <- function(clause) {
  split_clause(clause, "from")
}

split_where <- function(clause) {
  split_clause(clause, "where")
}

split_group_by <- function(clause) {
  split_comma_list(split_clause(clause, "group by"))
}

split_having <- function(clause) {
  split_clause(clause, "having")
}

split_order_by <- function(clause) {
  split_comma_list(split_clause(clause, "order by"))
}

split_limit <- function(clause) {
  split_clause(clause, "limit")
}

split_clause <- function(clause, keyword) {
  if (is.null(clause)) return(NULL)
  clause <- trimws(clause)
  keyword_regex <- paste0("^", keyword, ws_regex, "*")
  clause <- sub(keyword_regex, "", clause, ignore.case = TRUE, useBytes = TRUE)
  clause
}

split_comma_list <- function(comma_list) {
  if (is.null(comma_list)) return(NULL)

  rc <- rawConnection(raw(0L), "r+")
  on.exit(close(rc))
  writeChar(comma_list, rc)
  len <- seek(rc, 0L) - 1L

  pos_comma <- NULL

  in_quotes <- FALSE
  in_parens <- 0
  escaped <- FALSE

  while((pos <- seek(rc, NA)) <= len) {
    char <- readChar(rc, 1L, useBytes = TRUE)
    if (char %in% quote_chars) {
      if (!in_quotes) {
        in_quotes <- TRUE
        escaped <- FALSE
        quo_char <- char
      } else if (char == quo_char) {
        if (escaped) {
          escaped <- FALSE
        } else {
          esc_quo <- c(quo_char, "\\")
          if (!readChar(rc, 1L, useBytes = TRUE) %in% esc_quo) {
            in_quotes <- FALSE
            escaped <- FALSE
            rm(quo_char)
          } else {
            escaped <- TRUE
          }
          seek(rc, -1L, "current")
        }
      } else {
        escaped <- FALSE
      }
    } else if (!in_quotes && char == "(") {
      escaped <- FALSE
      in_parens <- in_parens + 1
    } else if (!in_quotes && char == ")") {
      escaped <- FALSE
      in_parens <- in_parens - 1
    } else if (!in_quotes && in_parens <= 0) {
      escaped <- FALSE
      if (char == ",") {
        pos_comma <- append(pos_comma, pos)
      }
    } else {
      escaped <- FALSE
    }
  }

  pos_comma <- pos_comma + 1

  if (is.null(pos_comma)) {
    trimws(comma_list)
  } else {
    original_encoding <- Encoding(comma_list)
    if (original_encoding == "unknown") {
      original_encoding <- "UTF-8"
    }
    Encoding(comma_list) <- "bytes"
    out <- trimws(
      substring(comma_list, c(1, pos_comma + 1), c(pos_comma - 1, len))
    )
    Encoding(out) <- original_encoding
    out
  }
}

Try the queryparser package in your browser

Any scripts or data that you put into this service are public.

queryparser documentation built on Jan. 10, 2023, 1:08 a.m.