R/tags_mode.R
In LLMR: Interface for Large Language Model APIs in R

Documented in llm_mutate_tags llm_parse_tags llm_parse_tags_col

## tags_mode.R
## Soft structured output via XML-like tags
## --------------------------------------------------------------------------

.validate_tags <- function(tags) {
  tags <- as.character(tags)
  bad <- !length(tags) || anyNA(tags) || any(!nzchar(tags)) ||
    any(!grepl("^[A-Za-z][A-Za-z0-9_.-]*$", tags))
  if (isTRUE(bad)) {
    stop("`.tags` must be a non-empty character vector of simple tag names.")
  }
  unique(tags)
}

.escape_regex <- function(x) {
  gsub("([][{}()+*^$|\\\\?.])", "\\\\\\1", x, perl = TRUE)
}

.tag_prompt <- function(tags) {
  tag_lines <- paste0("<", tags, ">...</", tags, ">")
  paste(
    c(
      "Return only XML-like tags with these tag names.",
      tag_lines,
      "Do not include prose, Markdown, or code fences outside the tags."
    ),
    collapse = "\n"
  )
}

.add_tag_prompt <- function(.messages, .system_prompt, tags) {
  instruction <- .tag_prompt(tags)

  if (!is.null(.messages)) {
    if (is.null(names(.messages))) {
      names(.messages) <- rep("user", length(.messages))
    }
    sys <- which(names(.messages) == "system")
    if (length(sys)) {
      .messages[[sys[[1]]]] <- paste(.messages[[sys[[1]]]], instruction, sep = "\n\n")
    } else {
      .messages <- c(system = instruction, .messages)
    }
    return(list(.messages = .messages, .system_prompt = .system_prompt))
  }

  list(
    .messages = NULL,
    .system_prompt = paste(c(.system_prompt, instruction), collapse = "\n\n")
  )
}

.decode_tag_entities <- function(x) {
  x <- gsub("&lt;", "<", x, fixed = TRUE)
  x <- gsub("&gt;", ">", x, fixed = TRUE)
  x <- gsub("&quot;", "\"", x, fixed = TRUE)
  x <- gsub("&apos;", "'", x, fixed = TRUE)
  gsub("&amp;", "&", x, fixed = TRUE)
}

.coerce_tag_col <- function(vals, n) {
  nonnull <- vals[!vapply(vals, is.null, logical(1))]
  if (!length(nonnull)) return(rep(NA_character_, n))

  scalar <- vapply(nonnull, function(x) is.atomic(x) && length(x) == 1L, logical(1))
  if (!all(scalar)) return(vals)

  chr <- vapply(vals, function(x) {
    if (is.null(x)) NA_character_ else as.character(x)
  }, character(1))

  ok_num <- !is.na(chr) & nzchar(trimws(chr))
  if (any(ok_num) && all(!ok_num | !is.na(suppressWarnings(as.numeric(chr))))) {
    return(suppressWarnings(as.numeric(chr)))
  }

  low <- tolower(chr)
  ok_log <- !is.na(low) & low %in% c("true", "false")
  if (any(ok_log) && all(is.na(low) | ok_log)) {
    out <- rep(NA, n)
    out[ok_log] <- low[ok_log] == "true"
    return(out)
  }

  chr
}

#' Parse XML-like tags emitted by an LLM
#'
#' Extracts simple XML-like tags from a character scalar or [llmr_response], such
#' as `<age>21</age>` and `<job>student</job>`. This is intended for soft
#' structured output, not full XML validation.
#'
#' @param x Character scalar or [llmr_response].
#' @param tags Character vector of tag names to extract.
#' @return A named list of extracted tag values, or `NULL` when no requested tag
#'   is found.
#'
#' @examples
#' llm_parse_tags("<age>21</age><job>student</job>", tags = c("age", "job"))
#'
#' @seealso [llm_parse_tags_col()], [llm_mutate_tags()]
#' @export
llm_parse_tags <- function(x, tags) {
  tags <- .validate_tags(tags)
  if (inherits(x, "llmr_response")) x <- x$text %||% ""
  if (!is.character(x) || length(x) == 0) return(NULL)

  s <- as.character(x[[1]])
  if (is.na(s) || !nzchar(s)) return(NULL)

  s <- .strip_code_fences(s)
  out <- list()
  for (tag in tags) {
    tag_pat <- .escape_regex(tag)
    pat <- paste0("(?is)<\\s*", tag_pat, "(?:\\s+[^>]*)?>\\s*(.*?)\\s*</\\s*", tag_pat, "\\s*>")
    hit <- regmatches(s, regexec(pat, s, perl = TRUE))[[1]]
    if (length(hit) >= 2L) {
      out[[tag]] <- .decode_tag_entities(trimws(hit[[2]]))
    }
  }

  if (!length(out)) NULL else out
}

#' Parse XML-like tag fields from a column
#'
#' Appends `tags_ok`, `tags_data`, and one column per requested tag or field.
#'
#' @param .data data.frame/tibble.
#' @param tags Character vector of tag names to parse.
#' @param tags_col Column name to parse from. Default `"response_text"`.
#' @param fields `NULL` to extract all tags, a character vector of tags, a named
#'   vector such as `c(person_age = "age")`, or `FALSE` to skip field extraction.
#' @param prefix Optional prefix for extracted columns.
#' @return `.data` with tag diagnostics and extracted columns.
#'
#' @examples
#' df <- data.frame(response_text = "<age>21</age><job>student</job>")
#' llm_parse_tags_col(df, tags = c("age", "job"))
#' llm_parse_tags_col(df, tags = c("age", "job"), fields = c(person_age = "age"))
#'
#' @seealso [llm_parse_tags()], [llm_mutate_tags()], [llm_parse_structured_col()]
#' @export
llm_parse_tags_col <- function(.data, tags, tags_col = "response_text", fields = NULL, prefix = "") {
  tags <- .validate_tags(tags)
  if (!is.data.frame(.data)) {
    .data <- as.data.frame(.data, stringsAsFactors = FALSE)
  }
  n <- nrow(.data)
  out <- .data
  fields <- if (is.null(fields)) tags else fields

  if (!tags_col %in% names(.data)) {
    out$tags_ok <- rep(FALSE, n)
    out$tags_data <- replicate(n, NULL, simplify = FALSE)
    if (!identical(fields, FALSE) && length(fields)) {
      dest <- if (is.null(names(fields))) fields else names(fields)
      for (f in dest) out[[paste0(prefix, f)]] <- rep(NA_character_, n)
    }
    return(tibble::as_tibble(out))
  }

  src <- .data[[tags_col]]
  parsed <- vector("list", n)
  ok <- logical(n)
  for (i in seq_len(n)) {
    p <- llm_parse_tags(src[[i]], tags = tags)
    parsed[i] <- list(p)
    ok[[i]] <- !is.null(p) && all(tags %in% names(p))
  }
  out$tags_ok <- ok
  out$tags_data <- parsed

  if (!identical(fields, FALSE) && length(fields)) {
    src_tags <- unname(if (is.null(names(fields))) fields else fields)
    dest_names <- if (is.null(names(fields))) fields else names(fields)
    for (k in seq_along(src_tags)) {
      vals <- lapply(parsed, function(x) {
        if (is.null(x)) NULL else x[[src_tags[[k]]]]
      })
      out[[paste0(prefix, dest_names[[k]])]] <- .coerce_tag_col(vals, n)
    }
  }

  tibble::as_tibble(out)
}

#' Data-frame mutate with XML-like tag output
#'
#' Soft structured variant of [llm_mutate()]. It asks the model to return simple
#' XML-like tags, then parses those tags into columns.
#'
#' @inheritParams llm_mutate
#' @param .tags Character vector of tag names to request and parse.
#' @param .fields `NULL` to extract all tags, a character vector of tags, a named
#'   vector such as `c(person_age = "age")`, or `FALSE` to keep only `tags_data`.
#'
#' @details
#' Returns the mutated data frame plus:
#' \describe{
#'   \item{`tags_ok`}{`TRUE` when all requested tags were found.}
#'   \item{`tags_data`}{A list-column of parsed tag lists.}
#'   \item{tag columns}{One column per requested tag or field. Scalar columns are
#'   coerced to numeric or logical when all non-missing values allow it.}
#' }
#'
#' @section Shorthand syntax:
#' \preformatted{
#' df |> llm_mutate_tags(result = "{text}", .tags = c("age", "job"), .config = cfg)
#' }
#'
#' @examples
#' \dontrun{
#' df <- tibble::tibble(city = c("Cairo", "Lima"))
#' cfg <- llm_config("openai", "gpt-4.1-nano", temperature = 0)
#'
#' df |>
#'   llm_mutate_tags(
#'     geo = "Where is {city}? Give country and continent in their own tags.",
#'     .config = cfg,
#'     .system_prompt = paste(
#'       "Use XML tags for different parts of the answer, but do not nest tags.",
#'       "Return <country>...</country> and <continent>...</continent>."
#'     ),
#'     .tags = c("country", "continent")
#'   )
#' }
#'
#' @seealso [llm_mutate()], [llm_parse_tags()], [llm_parse_tags_col()],
#'   [llm_mutate_structured()], [llm_parse_structured_col()]
#' @export
llm_mutate_tags <- function(.data,
                            output,
                            prompt = NULL,
                            .messages = NULL,
                            .config,
                            .system_prompt = NULL,
                            .before = NULL,
                            .after = NULL,
                            .tags,
                            .fields = NULL,
                            ...) {
  tags <- .validate_tags(.tags)
  output_missing <- missing(output)
  before_missing <- missing(.before)
  after_missing <- missing(.after)
  dots <- rlang::dots_list(...)
  prompted <- .add_tag_prompt(.messages, .system_prompt, tags)

  args <- list(
    .data = .data,
    prompt = prompt,
    .messages = prompted$.messages,
    .config = .config,
    .system_prompt = prompted$.system_prompt,
    .return = "columns"
  )
  if (!before_missing) args$.before <- .before
  if (!after_missing) args$.after <- .after

  if (output_missing) {
    out <- do.call(llm_mutate, c(args, dots))
    new_cols <- setdiff(names(out), names(.data))
    if (!length(new_cols)) {
      stop("Could not determine output column name from shorthand syntax")
    }
    output_name <- new_cols[[1]]
  } else {
    output_sym <- rlang::ensym(output)
    args$output <- output_sym
    out <- do.call(llm_mutate, c(args, dots))
    output_name <- rlang::as_name(output_sym)
  }

  llm_parse_tags_col(out, tags = tags, tags_col = output_name, fields = .fields)
}