R/answer_using_r.R
In tidyprompt: Prompt Large Language Models and Enhance Their Functionality

Documented in answer_using_r

#' Enable LLM to draft and execute R code
#'
#' This function adds a prompt wrap to a [tidyprompt()] that instructs the
#' LLM to answer the prompt with R code. There are various options to customize
#' the behavior of this prompt wrap, concerning the evaluation of the R code,
#' the packages that may be used, the objects that already exist in the R
#' session, and if the console output that should be sent back to the LLM.
#'
#' @details For the evaluation of the R code, the 'callr' package is required.
#' Please note: automatic evaluation of generated R code may be dangerous to your
#' system; you must use this function with caution.
#'
#' @param prompt A single string or a [tidyprompt()] object
#'
#' @param add_text Single string which will be added to the prompt text,
#' informing the LLM that they must code in R to answer the prompt
#'
#' @param pkgs_to_use A character vector of package names that may be used
#' in the R code that the LLM will generate. If evaluating the R code, these
#' packages will be pre-loaded in the R session
#'
#' @param objects_to_use A named list of objects that may be used in the R code
#' that the LLM will generate. If evaluating the R code, these objects will be pre-loaded
#' in the R session. The names of the list will be used as the object names in the
#' R session
#'
#' @param list_packages Logical indicating whether the LLM should be informed
#' about the packages that may be used in their R code (if TRUE, a list of the
#' loaded packages will be shown in the initial prompt)
#'
#' @param list_objects Logical indicating whether the LLM should be informed
#' about the existence of 'objects_to_use' (if TRUE, a list of the objects
#' plus their types will be shown in the initial prompt)
#'
#' @param skim_dataframes Logical indicating whether the LLM should be informed
#' about the structure of dataframes present in 'objects_to_use' (if TRUE,
#' a skim summary of each `data.frame` type object will be shown in the initial prompt).
#' This uses the function [skim_with_labels_and_levels()]
#'
#' @param evaluate_code Logical indicating whether the R code should be
#' evaluated. If TRUE, the R code will be evaluated in a separate R session
#' (using 'callr' to create an isolated R session via \link[callr]{r_session}).
#' Note that setting this to 'TRUE' means that code generated by the LLM will
#' run on your system; use this setting with caution
#'
#' @param r_session_options A list of options to pass to the \link[callr]{r_session}.
#' This can be used to customize the R session. See \link[callr]{r_session_options}
#' for the available options. If no options are provided, the default options
#' will be used but with 'system_profile' and 'user_profile' set to FALSE
#'
#' @param output_as_tool Logical indicating whether the console output of the
#' evaluated R code should be sent back to the LLM, meaning the LLM will use
#' R code as a tool to formulate a final answer to the prompt. If TRUE, the LLM
#' can decide if they can answer the prompt with the output, or if they need to modify
#' their R code. Once the LLM does not provide new R code (i.e., the prompt is being answered)
#' this prompt wrap will end (it will continue for as long as the LLM provides R code).
#' When this option is enabled, the resulting [prompt_wrap()] will be of type 'tool'.
#' If TRUE, the return mode will also always be set to 'llm_answer'
#'
#' @param return_mode Single string indicating the return mode. One of:
#' \itemize{
#'  \item 'full': Return a list with the final LLM answer, the extracted R code,
#'  and (if argument 'evaluate_code' is TRUE) the output of the R code
#'  \item 'code': Return only the extracted R code
#'  \item 'console': Return only the console output of the evaluated R code
#'  \item 'object': Return only the object produced by the evaluated R code
#'  \item 'formatted_output': Return a formatted string with the extracted R code
#'  and its console output, and a print of the last object (this is identical to how it would
#'  be presented to the LLM if 'output_as_tool' is TRUE)
#'  \item 'llm_answer': Return only the final LLM answer
#'  }
#'  When choosing 'console' or 'object', an additional instruction will be added to
#'  the prompt text to inform the LLM about the expected output of the R code.
#'  If 'output_as_tool' is TRUE, the return mode will always be set to 'llm_answer'
#'  (as the LLM will be using the R code as a tool to answer the prompt)
#'
#' @return A [tidyprompt()] object with the [prompt_wrap()] added to it, which
#' will handle R code generation and possibly evaluation
#'
#' @export
#'
#' @example inst/examples/answer_using_r.R
#'
#' @seealso [answer_using_tools()]
#'
#' @family pre_built_prompt_wraps
#' @family answer_using_prompt_wraps
answer_using_r <- function(
    prompt,
    add_text = "You must code in the programming language 'R' to answer this prompt.",
    pkgs_to_use = c(),
    objects_to_use = list(),
    list_packages = TRUE,
    list_objects = TRUE,
    skim_dataframes = TRUE,
    evaluate_code = FALSE,
    r_session_options = list(),
    output_as_tool = FALSE,
    return_mode = c(
      "full",
      "code",
      "console",
      "object",
      "formatted_output",
      "llm_answer"
    )
) {
  ## Validate settings

  return_mode <- match.arg(return_mode)

  stopifnot(
    is.character(add_text), length(add_text) == 1,
    length(pkgs_to_use) == 0 || is.vector(pkgs_to_use) & all(sapply(pkgs_to_use, is.character)),
    is.list(objects_to_use),
    length(objects_to_use) == 0 || !is.null(names(objects_to_use)),
    is.logical(list_packages),
    is.logical(list_objects),
    is.logical(skim_dataframes),
    is.logical(evaluate_code),
    is.logical(output_as_tool),
    is.list(r_session_options)
  )

  if (evaluate_code & !requireNamespace("callr", quietly = TRUE))
    stop("The 'callr' package is required to evaluate R code.")
  if (!evaluate_code & output_as_tool)
    output_as_tool <- FALSE
  if (output_as_tool)
    return_mode <- "llm_answer"
  if (!evaluate_code & return_mode %in% c("console", "object", "formatted_output"))
    stop("The return mode must be 'full', 'code', or 'llm_answer' if 'evaluate_code' is FALSE")


  ## Validate evaluation_session & load packages

  if (evaluate_code) {
    if (length(r_session_options) == 0) {
      r_session_options <- callr::r_session_options()
      r_session_options$system_profile <- FALSE
      r_session_options$user_profile <- FALSE
    }
    evaluation_session <- callr::r_session$new(options = r_session_options)

    # Check if packages are installed using requireNamespace
    installed_pkgs <- evaluation_session$run(function(pkgs_to_use) {
      # Check if each package is installed using requireNamespace and return as a named list
      sapply(pkgs_to_use, function(pkg) {
        requireNamespace(pkg, quietly = TRUE)
      }, simplify = TRUE, USE.NAMES = TRUE)
    }, args = list(pkgs_to_use = pkgs_to_use))


    if (any(installed_pkgs == FALSE)) {
      stop(paste0(
        "The following packages are not installed: ",
        names(installed_pkgs)[installed_pkgs == FALSE]
      ))
    }

    # Load the packages
    loaded_pkgs <- evaluation_session$run(function(pkgs_to_use) {
      for (pkg_name in pkgs_to_use) {
        library(pkg_name, character.only = TRUE)
      }
      session_info <- utils::sessionInfo()
      loaded_pkgs <- names(session_info$otherPkgs)
      loaded_pkgs
    }, args = list(pkgs_to_use = pkgs_to_use))

    # Load the objects
    loaded_objects <- evaluation_session$run(function(objects_to_use) {
      for (i in seq_along(objects_to_use)) {
        obj <- objects_to_use[[i]]
        obj_name <- names(objects_to_use)[i]

        # Assign to the global environment of the r_session
        assign(obj_name, obj, envir = parent.env(environment()))
      }

      # List objects in the global environment
      ls(envir = .GlobalEnv)
    }, args = list(objects_to_use = objects_to_use))
    loaded_objects

    if (!all(loaded_objects %in% names(objects_to_use))) {
      stop(paste0(
        "The following objects could not be loaded: ",
        names(objects_to_use)[!(names(objects_to_use) %in% names(loaded_objects))]
      ))
    }

  } else {
    loaded_pkgs <- pkgs_to_use
    loaded_objects <- names(objects_to_use)
  }


  ## Define modify_fn which will add information about the setting
  ##   in which R code can be generated

  modify_fn <- function(original_prompt_text) {
    new_text <- glue::glue(
      "{original_prompt_text}\n\n",
      "{add_text}"
    )

    if (list_packages & length(loaded_pkgs) > 0) {
      new_text <- glue::glue(
        "{new_text}\n",
        "You can use functions from these packages: ",
        "{loaded_pkgs |> paste(collapse = ', ')}."
      )
    }

    new_text <- glue::glue(
      "{new_text}\n",
      "You may not install or load any additional packages."
    )

    if (list_objects) {
      object_types <- sapply(objects_to_use, function(obj) class(obj))
      objects_df <- data.frame(Object_name = names(objects_to_use), Type = object_types)

      if (nrow(objects_df) > 0) {
        new_text <- glue::glue(
          "{new_text}\n",
          "These objects already exist in the R session:\n\n",
          "{objects_df |> df_to_string()}.\n\n",
          "Do not define these objects in your R code."
        )

        if (skim_dataframes) {
          dataframes <- objects_df$Object_name[objects_df$Type == "data.frame"]
          if (length(dataframes) > 0) {
            if (!requireNamespace("skimr", quietly = TRUE)) {
              warning(paste0(
                "The 'skimr' package is required to skim dataframes.",
                " Skim summary of dataframes currently not shown in prompt"
              ))
            } else {
              for (df_name in dataframes) {
                df <- objects_to_use[[df_name]]
                new_text <- glue::glue(
                  "{new_text}\n\n",
                  "Summary of the dataframe '{df_name}':\n",
                  "{df |> skim_with_labels_and_levels() |> df_to_string()}\n\n"
                )
              }
            }
          }
        }

        if (output_as_tool) {
          new_text <- glue::glue(
            "{new_text}\n",
            "If you need more information about these objects,",
            " you can call R functions to describe them."
          )
        }
      }
    }

    if (evaluate_code & return_mode == "console") {
      new_text <- glue::glue(
        "{new_text}\n",
        "The R code should produce console output that answers the prompt."
      )
    }
    if (evaluate_code & return_mode == "object") {
      new_text <- glue::glue(
        "{new_text}\n",
        "The R code should produce an object that answers the prompt."
      )
    }

    if (output_as_tool) {
      new_text <- glue::glue(
        "{new_text}\n",
        "The console output of your R code will be sent back to you.",
        " Use print() on all objects or values that you need to see.",
        " You can not view plots, all output must be text-based.",
        " After you get console output from me, decide if you can answer the prompt or if",
        " you need to modify your R code. When you can formulate your final answer,",
        " do not provide any R code in it."
      )
    }

    return(new_text)
  }


  ## Define extraction_fn which will extract R code from the response
  ##   and handle it according to the settings of this function

  extraction_fn <- function(x) {
    return_list <- list()
    return_list$llm_answer <- x

    extracted_code <- answer_using_r_extract_r_code(x)

    if (length(extracted_code) == 0) {
      if (output_as_tool) {
        return(x)
      }

      return(llm_feedback(paste0(
        "No R code detected. You must provide R code",
        " between ```r and ```."
      )))
    }

    # Check if the R code is valid
    parsed_code <- tryCatch(parse(text = extracted_code), error = function(e) e)
    if (inherits(parsed_code, "error")) {
      return(llm_feedback(glue::glue(
        "Invalid R code detected:\n",
        "    {parsed_code$message}\n",
        "Please provide syntactically correct R code."
      )))
    }
    return_list$code <- parsed_code

    if (!evaluate_code) {
      if (return_mode == "code") return(parsed_code)
      if (return_mode == "llm_answer") return(x)
      return(return_list)
    }

    clone_session <- evaluation_session$clone() # Reset the session every time
    output <- clone_session$run_with_output(function(r_code) {
      eval(parse(text = r_code))
    }, args = list(parsed_code))

    # Check if errors occurred during execution
    if (!is.null(output$error)) {
      return(llm_feedback(glue::glue(
        "An error occurred while executing the R code:\n",
        "    {output$error}"
      )))
    }

    # Check if the code produced any relevant output
    if (output$stdout == "" & return_mode == "console") {
      return(llm_feedback(glue::glue(
        "The R code did not produce any console output.",
        " Please provide R code that produces console output."
      )))
    }
    if (is.null(output$result) & return_mode == "object") {
      return(llm_feedback(glue::glue(
        "The R code did not produce an object.",
        " Please provide R code that produces an object."
      )))
    }
    if (is.null(output$stdout) & is.null(output$result)) {
      return(llm_feedback(glue::glue(
        "The R code did not produce any output.",
        " Please provide R code that produces output."
      )))
    }

    return_list$output <- output
    return_list$formatted_output <- glue::glue(
      "--- R code: ---\n",
      "{extracted_code |> paste(collapse = \"\\n\")}\n\n",
      "--- Console output: ---\n",
      "{
          if (is.null(output$stdout) || output$stdout == \"\") {
            \"No console output produced.\"
          } else {
            output$stdout |>
            paste(collapse = \"\\n\") |>
            stringr::str_trunc(1000) |>
            print()
          }
        }\n\n",
      "--- Last object: ---\n",
      "{
          if (is.null(output$result)) {
            \"No object produced.\"
          } else {
            output$result |>
            paste(collapse = \"\\n\") |>
            stringr::str_trunc(100) |>
            print()
          }
        }"
    )

    if (output_as_tool) {
      return(llm_feedback(return_list$formatted_output, tool_result = TRUE))
    }

    if (return_mode == "full")
      return(return_list)
    if (return_mode == "code")
      return(return_list$code)
    if (return_mode == "console")
      return(return_list$output$stdout)
    if (return_mode == "object")
      return(return_list$output$result)
    if (return_mode == "formatted_output")
      return(return_list$formatted_output)
    if (return_mode == "llm_answer")
      return(x)

    return(output$stdout)
  }


  ## If we are sending back output, we can consider this wrapper a tool

  type <- "unspecified"
  if (output_as_tool) {
    type <- "tool"
  }


  ## Finally, wrap the prompt with the new prompt wrap

  prompt_wrap(
    prompt, modify_fn, extraction_fn,
    type = type, name = "answer_using_r"
  )
}



#' Helper function to extract R code from a string
#'
#' This function extracts R code from a string by matching all content between
#' '```r' and '```'.
#'
#' @param input_string A string containing R code, typically a response
#' from an LLM
#'
#' @return A character vector containing the extracted R code
#'
#' @noRd
answer_using_r_extract_r_code <- function(input_string) {
  # Use regular expression to match all content between ```r and ```, with case-insensitive matching
  matches <- gregexpr("(?s)```[rR]\\s*(.*?)\\s*```", input_string, perl = TRUE)
  extracted_code <- regmatches(input_string, matches)

  # Remove the ```r and ``` wrappers (works for both lowercase and uppercase R)
  extracted_code <- lapply(extracted_code, function(x) {
    sub("(?s)```[rR]\\s*(.*?)\\s*```", "\\1", x, perl = TRUE)
  })

  return(unlist(extracted_code))
}