R/extract_alias.R

Defines functions extract_alias

Documented in extract_alias

# Copyright 2023 Cloudera Inc.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

#' Extract the column alias from a SQL expression
#'
#' @description Extracts the column alias assignment from an expression used in
#'   the \code{SELECT} list of a SQL query
#'
#' @param expr a character string containing a SQL expression which might have a
#'   column alias assignment at the end
#' @return a character string containing the inputted SQL expression with the
#'   column alias assignment removed (if it existed) and with the assigned alias
#'   as its name
#' @examples
#' expr <- "round(AVG(arr_delay)) AS avg_delay"
#' extract_alias(expr)
#' @details The expression must not contain any unquoted whitespace characters
#'   except spaces, and there must be no unquoted runs or two or more spaces.
#'   Use \code{\link{squish_sql}} to satisfy this whitespace requirement.
#'
#'   queryparser also uses this function internally to extract table aliases
#'   used in the \code{FROM} clause.
#' @export
extract_alias <- function(expr) {
  if (!identical(typeof(expr), "character") || !identical(length(expr), 1L)) {
    stop("Unexpected input to extract_alias()", call. = FALSE)
  }

  expr <- trimws(expr)

  bytes_in_chars <- nchar(strsplit(expr, "")[[1]], type = "bytes")

  rc <- rawConnection(raw(0L), "r+")
  on.exit(close(rc))
  writeChar(expr, rc)
  len <- seek(rc, 0L) - 1L

  column_alias <- NULL
  expr_without_alias <- NULL

  found_as_before_alias <- FALSE
  look_for_char_before_alias <- FALSE
  look_for_as_keyword <- FALSE
  quoted_string_at_end <- FALSE
  quote_at_end <- FALSE
  possible_word_at_end <- FALSE
  was_in_quotes <- FALSE
  in_quotes <- FALSE

  # go backwards, from end toward beginning
  step_number <- 0
  step_positions <- c(rev(cumsum(bytes_in_chars)),0)[-1]
  advance_positions <- 1
  char_is_quote_escape <- FALSE
  while(TRUE) {
    if (advance_positions > 0) {
      if (step_number + advance_positions > length(step_positions)) {
        break;
      }
      step_number <- step_number + advance_positions
      pos <- step_positions[step_number]
    }
    advance_positions <- 1
    seek(rc, pos)
    char <- readChar(rc, 1L)

    if (char %in% quote_chars) {
      if (pos == step_positions[1]) {
        quote_at_end <- TRUE
      }
      if (!in_quotes) {
        in_quotes <- TRUE
        quo_char <- char
      } else if (char == quo_char && !char_is_quote_escape) {
        if (pos == 0) {
          in_quotes <- FALSE
        } else {
          seek(rc, -2L, "current")
          esc_quo <- c(quo_char, "\\")
          if (readChar(rc, 1L, useBytes = TRUE) %in% esc_quo) {
            char_is_quote_escape <- TRUE
          } else {
            char_is_quote_escape <- FALSE
            in_quotes <- FALSE
            rm(quo_char)
          }
          seek(rc, 1L, "current")
        }
      }
    }

    if (look_for_char_before_alias) {
      if (quoted_string_at_end && !found_as_before_alias) {
        seek(rc, 0)
        expr_without_alias <- trimws(readChar(rc, pos + 1, useBytes = TRUE))
      } else if (is_non_word_character(char, useBytes = TRUE)) {
        seek(rc, 0)
        expr_without_alias <- trimws(readChar(rc, pos + 1, useBytes = TRUE))
      } else if (found_as_before_alias) {
        seek(rc, 0)
        expr_without_alias <- trimws(readChar(rc, pos + 3, useBytes = TRUE))
      }
      break;
    }

    if (
        !in_quotes && !char %in% quote_chars &&
        !possible_word_at_end &&
        !is_whitespace_character(char) &&
        !is_word_character(char)) {
      break;
    }

    if (look_for_as_keyword) {
      advance_positions <- 0
      prev_char <- char
      if (identical(prev_char, " ")) {
        seek(rc, -2L, "current")
        prev_char <- readChar(rc, 1L, useBytes = TRUE)
        advance_positions <- 1
      }
      if (prev_char %in% c("s","S")) {
        seek(rc, -2L, "current")
        prev_char <- readChar(rc, 1L, useBytes = TRUE)
        if (prev_char %in% c("a","A")) {
          found_as_before_alias <- TRUE
          advance_positions <- advance_positions + 2
        } else {
          advance_positions <- 0
        }
      } else {
        advance_positions <- 0
      }
      look_for_as_keyword <- FALSE
      if (!identical(char, ".")) {
        look_for_char_before_alias <- TRUE
      }
    }

    if (quote_at_end && was_in_quotes && !in_quotes) {
      quoted_string_at_end <- TRUE
      look_for_as_keyword <- TRUE
      column_alias <- readChar(rc, len - pos - 2, useBytes = TRUE)
    }

    if (possible_word_at_end && is_non_word_character(char, useBytes = TRUE)) {
      next_char <- readChar(rc, 1L)
      if (is_word_start_character(next_char) && !identical(next_char, ".")) {
        seek(rc, -1 * nchar(next_char, type = "bytes"), "current")
        column_alias <- readChar(rc, len - pos - 1L, useBytes = TRUE)
        if (char == " ") {
          look_for_as_keyword <- TRUE
        } else {
          look_for_char_before_alias <- TRUE
        }
      }
      advance_positions <- 0
      possible_word_at_end <- FALSE
    }

    if (pos == step_positions[1] && is_word_character(char)) {
      possible_word_at_end <- TRUE
    }

    was_in_quotes <- in_quotes
  }

  if (is.null(column_alias) || is.null(expr_without_alias)) {
    expr
  } else if (ends_with_operator_expecting_right_operand(expr_without_alias)) {
    expr
  } else if (tolower(column_alias) %in% sql_reserved_words && !quoted_string_at_end) {
    expr
  } else {
    names(expr_without_alias) <- column_alias
    expr_without_alias
  }
}

Try the queryparser package in your browser

Any scripts or data that you put into this service are public.

queryparser documentation built on Jan. 10, 2023, 1:08 a.m.