R/distance.R
In NaileR: Interpreting Latent Variables with AI

Documented in dist_mat_llm dist_ref_llm

#' LLM text similarity
#'
#' Compute a similarity score, on a scale ranging from 0 (totally different) to 100 (the exact same), between two character strings.
#'
#' @param textA,textB two character strings.
#'
#' @return A numeric similarity score between 0 and 100.
#'
#' @details The similarity score is generated by an LLM. Therefore, the result might vary if the function is run several times.
#'
#' @export
#'
#' @examples
#'\dontrun{
#' # Processing time is often longer than ten seconds
#' # because the function uses a large language model.
#'
#' textA <- "Participant A was described as a nice, outgoing man, with a friendly attitude."
#' textB <- "Participant A was an extroverted and caring individual."
#'
#' sim_llm(textA, textB)
#' }


sim_llm <- function(textA, textB) {
  # The prompt wording is part of the function's behaviour and is kept verbatim;
  # `{textA}` and `{textB}` are interpolated by glue from the arguments.
  ppt <- glue::glue("Two experts have each provided a short report. Please identify only the similarities in meaning between the two reports and give a score for the similarity between the two reports: 0, the two reports are totally different, 100 the two reports are identical.\n\n            ",
                    "# The first report is:\n            {textA}",
                    "\n\n            ",
                    "# The second report is:\n            {textB}",
                    "\n\n            At the very very end of your answer, write the score of similarity accordingly to this exact format:",
                    "\n            The similarity between the two reports is: ...")

  score <- numeric(0)

  # The model does not always follow the requested format, so it is queried
  # again until a usable score can be read from its answer.
  while (length(score) == 0) {
    res_comparison <- ollamar::generate("llama3", ppt, output = "df")
    score <- .extract_similarity_score(res_comparison$response)
  }

  score
}

#' Read the similarity score out of an LLM answer
#'
#' Internal helper for `sim_llm()`. The number following the last occurrence of
#' the requested closing sentence ("The similarity between the two reports
#' is: ...") is preferred, so that a trailing full stop, a decimal score or a
#' "/100" suffix cannot corrupt the result. If that sentence is absent, the
#' last number of the last sentence is used instead.
#'
#' @param response the text of the LLM answer.
#'
#' @return A numeric vector of length 1 holding a score between 0 and 100, or
#' of length 0 when no such score could be found.
#'
#' @noRd
.extract_similarity_score <- function(response) {
  number_re <- "[0-9]+(\\.[0-9]+)?"

  # Allow a few non-digit characters (colon, spaces, markdown asterisks, ...)
  # between the closing sentence and the score itself.
  anchor_re <- paste0("similarity between the two reports is[^0-9]{0,20}",
                      number_re)
  anchored <- regmatches(response,
                         gregexpr(anchor_re, response, ignore.case = TRUE))[[1]]

  if (length(anchored) > 0) {
    # The anchor sentence contains no digit, so the first number in the match
    # is the score.
    closing <- anchored[length(anchored)]
    found <- regmatches(closing, regexpr(number_re, closing))
  } else {
    # Drop trailing punctuation first: otherwise an answer ending in "85."
    # leaves an empty last sentence. Requiring whitespace after the full stop
    # keeps a decimal such as "85.5" in one piece.
    trimmed <- sub("[[:space:][:punct:]]+$", "", response)
    last_sentence <- sub(".*\\.\\s+", "", trimmed)
    numbers <- regmatches(last_sentence,
                          gregexpr(number_re, last_sentence))[[1]]
    found <- numbers[length(numbers)]
  }

  score <- as.numeric(found)

  # Anything outside the requested 0-100 scale is not a valid score.
  score[!is.na(score) & score >= 0 & score <= 100]
}


#' LLM distance matrix
#'
#' Compute a distance matrix between randomly-generated responses to an LLM prompt.
#'
#' @param ppt an LLM prompt.
#' @param n the number of responses to be generated.
#' @param per_miss the proportion of missing values in the final matrix (between 0 and 1; 0 by default).
#'
#' @return A list containing:
#' * a list of the LLM results for each iteration;
#' * a distance matrix.
#'
#' @details The final proportion of missing values might differ from the per_miss parameter value: rather than exactly that proportion of values being set to NA, each pair of responses independently has a probability per_miss of having its distance left as NA.
#'
#' @export
#'
#' @examples
#'\dontrun{
#' # Processing time is often longer than ten seconds
#' # because the function uses a large language model.
#'
#' data(iris)
#'
#' intro_iris <- "A study measured various parts of iris flowers
#' from 3 different species: setosa, versicolor and virginica.
#' I will give you the results from this study.
#' You will have to identify what sets these flowers apart."
#' intro_iris <- gsub('\n', ' ', intro_iris) |>
#' stringr::str_squish()
#'
#' req_iris <- "Please explain what makes each species distinct.
#' Also, tell me which species has the biggest flowers,
#' and which species has the smallest."
#' req_iris <- gsub('\n', ' ', req_iris) |>
#' stringr::str_squish()
#'
#' res_iris <- nail_catdes(iris, num.var = 5,
#' introduction = intro_iris, request = req_iris)
#'
#' dist_mat_llm(res_iris$prompt, n = 5, per_miss = 0)
#' }


dist_mat_llm <- function(ppt, n, per_miss = 0) {
  # Reject an unusable `n` before any (slow) LLM call is made.
  if (!is.numeric(n) || length(n) != 1L || !is.finite(n) || n < 0) {
    stop("`n` must be a single non-negative number.", call. = FALSE)
  }
  n <- as.integer(n)

  # Generate the n responses to the same prompt.
  res_llm <- vector("list", n)
  for (i in seq_len(n)) {
    res_llm[[i]] <- ollamar::generate("llama3", ppt, output = "df")
  }

  # Similarities are on a 0-100 scale; a response is identical to itself.
  sim_matrix <- matrix(NA_real_, n, n)
  diag(sim_matrix) <- 100

  # Visit each unordered pair once (lower triangle) and mirror the result.
  # `seq_len(n)[-1L]` is 2..n and is empty when n < 2, unlike `2:n`, which
  # would count backwards and index out of bounds.
  for (i in seq_len(n)[-1L]) {
    for (j in seq_len(i - 1L)) {
      # Each pair is skipped (left as NA) with probability `per_miss`.
      if (stats::runif(1) > per_miss) {
        sim_matrix[i, j] <- sim_llm(res_llm[[i]]$response, res_llm[[j]]$response)
        sim_matrix[j, i] <- sim_matrix[i, j]
      }
    }
  }

  # Turn similarities into distances; NA entries stay NA.
  list(boot_llm = res_llm, dist_llm = 100 - sim_matrix)
}


#' LLM response consistency
#'
#' Compute distances between an LLM response of interest and some other responses to the same prompt.
#'
#' @param ppt an LLM prompt.
#' @param ref the reference response.
#' @param n the number of new responses to be generated.
#'
#' @return A list containing:
#' * a list with the newly-generated responses;
#' * a one-column matrix of distances to the reference response.
#'
#' @export
#'
#' @examples
#'\dontrun{
#' # Processing time is often longer than ten seconds
#' # because the function uses a large language model.
#'
#' data(iris)
#'
#' intro_iris <- "A study measured various parts of iris flowers
#' from 3 different species: setosa, versicolor and virginica.
#' I will give you the results from this study.
#' You will have to identify what sets these flowers apart."
#' intro_iris <- gsub('\n', ' ', intro_iris) |>
#' stringr::str_squish()
#'
#' req_iris <- "Please explain what makes each species distinct.
#' Also, tell me which species has the biggest flowers,
#' and which species has the smallest."
#' req_iris <- gsub('\n', ' ', req_iris) |>
#' stringr::str_squish()
#'
#' res_iris <- nail_catdes(iris, num.var = 5,
#' introduction = intro_iris, request = req_iris)
#'
#' dist_ref_llm(res_iris$prompt, res_iris$response, n = 5)
#' }

dist_ref_llm <- function(ppt, ref, n) {
  # Reject an unusable `n` before any (slow) LLM call is made.
  if (!is.numeric(n) || length(n) != 1L || !is.finite(n) || n < 0) {
    stop("`n` must be a single non-negative number.", call. = FALSE)
  }
  n <- as.integer(n)

  # Generate the n new responses to the same prompt.
  # `seq_len(n)` is empty for n = 0, whereas `1:n` would iterate over c(1, 0).
  res_llm <- vector("list", n)
  for (i in seq_len(n)) {
    res_llm[[i]] <- ollamar::generate("llama3", ppt, output = "df")
  }

  # One row per new response: distance = 100 - similarity to the reference.
  distance_matrix <- matrix(0, n, 1)
  for (i in seq_len(n)) {
    distance_matrix[i, 1] <- 100 - sim_llm(ref, res_llm[[i]]$response)
  }

  list(boot_llm = res_llm, dist_llm = distance_matrix)
}