# R/sentence_similarity.R


#' Sentence Similarity Scores
#'
#' @description Uses sentence similarity models from \href{https://huggingface.co}{huggingface}
#' to compute cosine similarities between the texts in \code{text} and \code{comparison_text}
#'
#' @param text Character vector or list.
#' Text in a vector or list data format
#'
#' @param comparison_text Character vector or list.
#' Text in a vector or list data format to compare against \code{text}
#'
#' @param transformer Character.
#' Specific sentence similarity transformer
#' to be used.
#' Defaults to \code{"all_minilm_l6"} (see \href{https://huggingface.co/sentence-transformers/all-MiniLM-L6-v2}{huggingface})
#'
#' Also allows any sentence similarity model with a pipeline
#' from \href{https://huggingface.co/models?pipeline_tag=sentence-similarity}{huggingface}
#' to be used by specifying its name (e.g., \code{"sentence-transformers/all-mpnet-base-v2"}; see Examples)
#'
#' @param device Character.
#' Whether to use CPU or GPU for inference.
#' Defaults to \code{"auto"}, which will use
#' GPU over CPU (if a CUDA-capable GPU is set up).
#' Set to \code{"cpu"} to perform inference on the CPU
#'
#' @param preprocess Boolean.
#' Should basic preprocessing be applied?
#' Includes making lowercase, keeping only alphanumeric characters,
#' removing escape characters, removing repeated characters,
#' and removing white space.
#' Defaults to \code{FALSE}.
#' Transformers are generally fine without preprocessing and handle
#' many of these steps internally, so setting this to \code{TRUE}
#' will not change performance much
#'
#' @param keep_in_env Boolean.
#' Whether the classifier should be kept in your global environment.
#' Defaults to \code{TRUE}.
#' By keeping the classifier in your environment, you can skip
#' re-loading the classifier every time you run this function.
#' \code{TRUE} is recommended
#'
#' @param envir Numeric.
#' Position on the search list of the environment in which the classifier
#' is saved for repeated use.
#' Defaults to \code{1}, the global environment
#'
#' @return Returns an \emph{n} x \emph{m} similarity matrix where \emph{n} is the length of \code{text} and \emph{m} is the length of \code{comparison_text}
#'
#' @author Alexander P. Christensen <alexpaulchristensen@gmail.com>
#'
#' @examples
#' # Load data
#' data(neo_ipip_extraversion)
#'
#' # Example text
#' text <- neo_ipip_extraversion$friendliness[1:5]
#'
#' \dontrun{
#' # Example with defaults
#' sentence_similarity(
#'  text = text, comparison_text = text
#' )
#'
#' # Example with model from 'sentence-transformers'
#' sentence_similarity(
#'  text = text, comparison_text = text,
#'  transformer = "sentence-transformers/all-mpnet-base-v2"
#' )
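#'
#' # Example running inference on the CPU
#' # (uses the 'device' argument documented above)
#' sentence_similarity(
#'  text = text, comparison_text = text,
#'  device = "cpu"
#' )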
#'
#'}
#'
#' @export
#'
# Sentence Similarity
# Updated 02.08.2024
sentence_similarity <- function(
    text, comparison_text,
    transformer = c("all_minilm_l6"),
    device = c("auto", "cpu", "cuda"),
    preprocess = FALSE, keep_in_env = TRUE, envir = 1
)
{

  # Check that input of 'text' argument is in the
  # appropriate format for the analysis
  non_text_warning(text) # see utils-transforEmotion.R for function

  # Check for comparison text
  if(missing(comparison_text)){
    stop(
      "Comparison text to compute similarity must be specified using the 'comparison_text' argument (e.g., `comparison_text = c(\"a similar sentence\", \a random sentence\")`)\n",
      call. = FALSE
    )
  }

  # Check for transformer
  if(missing(transformer)){
    transformer <- "all_minilm_l6"
  }

  # Check for multiple transformers
  if(length(transformer) > 1){
    stop("Only one transformer model can be used at a time.\n\nSelect one of the default models or select a model from huggingface: <https://huggingface.co/models?pipeline_tag=sentence-similarity>\n")
  }

  # Set device
  if(missing(device)){
    device <- "auto"
  }else{
    device <- match.arg(tolower(device), c("auto", "cpu", "cuda"))
  }

  # Check for classifiers in environment
  if(exists(transformer, envir = as.environment(envir))){
    classifier <- get(transformer, envir = as.environment(envir))
  }else{

    # Run setup for modules
    setup_modules()

    # Import 'sentence-transformers' module
    message("Importing sentence-transformers module...")
    sentence_transformers <- reticulate::import("sentence_transformers")

    # Check for custom transformer
    if(transformer %in% c("all_minilm_l6")){

      # Load pipeline ('device' is an argument to SentenceTransformer,
      # not a case inside the 'switch')
      classifier <- sentence_transformers$SentenceTransformer(
        switch(
          transformer,
          "all_minilm_l6" = "sentence-transformers/all-MiniLM-L6-v2"
        ),
        device = device
      )

    }else{

      # Custom pipeline from huggingface
      # Try to catch non-existing pipelines
      pipeline_catch <- try(
        classifier <- sentence_transformers$SentenceTransformer(
          transformer, device = device
        ), silent = TRUE
      )

      # Errors
      if(is(pipeline_catch, "try-error")){

        # Model exists but no pipeline
        if(isTRUE(grepl("Tokenizer class", pipeline_catch))){

          stop(
            paste(
              "Transformer model '",
              transformer,
              "' exists but does not have a working pipeline yet.\n\nTry a default model or select a model from huggingface: <https://huggingface.co/models?pipeline_tag=zero-shot-classification>\n",
              sep = ""
            ), call. = FALSE
          )

        }else if(isTRUE(grepl("device_map", pipeline_catch))){

          # Try again without device
          pipeline_catch <- try(
            classifier <- sentence_transformers$SentenceTransformer(transformer), silent = TRUE
          )

        }else{
          stop(pipeline_catch, call. = FALSE)
        }

      }

    }

  }

  # Load into environment
  if(isTRUE(keep_in_env)){

    # Keep transformer module in environment (only when it was imported
    # in this call rather than retrieved from the environment)
    if(exists("sentence_transformers", envir = environment(), inherits = FALSE)){
      assign(
        x = "sentence_transformers",
        value = sentence_transformers,
        envir = as.environment(envir)
      )
    }

    # Keep classifier in environment
    assign(
      x = transformer,
      value = classifier,
      envir = as.environment(envir)
    )
  }

  # Basic preprocessing (applied to both inputs so all
  # embeddings are computed on comparably cleaned text)
  if(isTRUE(preprocess)){
    text <- preprocess_text( # Internal function. See `utils-transforEmotion`
      text,
      remove_stop = FALSE # Transformers will remove stop words
    )
    comparison_text <- preprocess_text(
      comparison_text,
      remove_stop = FALSE
    )
  }

  # Message
  message("Obtaining similarities...")

  # Combine sentences
  sentences <- c(text, comparison_text)

  # Get embeddings
  embeddings <- classifier$encode(sentences)
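
  # 'encode' returns one embedding per input sentence; via reticulate this
  # arrives as a numeric matrix with one row per element of 'sentences'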

  # Loop over text comparisons
  text_length <- length(text)
  comparison_length <- length(comparison_text)

  # Set up matrix
  similarity_matrix <- matrix(
    0, nrow = text_length, ncol = comparison_length,
    dimnames = list(text, comparison_text)
  )

  # Populate similarity matrix
  for(i in seq_len(text_length)){
    for(j in seq_len(comparison_length)){

      # Compute cosine between text embedding i and comparison embedding j
      similarity_matrix[i, j] <- cosine(
        embed1 = embeddings[i, ],
        embed2 = embeddings[j + text_length, ]
      )

    }
  }

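  # The loop above can be vectorized: after L2-normalizing the embedding
  # rows, all pairwise cosines reduce to a single matrix product. A minimal
  # sketch (same objects as above; kept as a comment so the loop remains
  # the single source of truth):
  #
  # normed <- embeddings / sqrt(rowSums(embeddings^2))
  # similarity_matrix[] <- tcrossprod(
  #   normed[seq_len(text_length), , drop = FALSE],
  #   normed[text_length + seq_len(comparison_length), , drop = FALSE]
  # )
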
  # Return similarities
  return(similarity_matrix)

}

#' @noRd
# Cosine function ----
# Updated 02.08.2024
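# Computes the cosine similarity between two embedding vectors:
# their dot product divided by the product of their Euclidean norms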
cosine <- function(embed1, embed2)
{
  return(crossprod(embed1, embed2) / sqrt(crossprod(embed1) * crossprod(embed2)))
}
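
# Quick sanity checks for 'cosine' (illustrative values, kept as comments):
# cosine(c(1, 0), c(1, 0))  # 1: identical direction
# cosine(c(1, 0), c(0, 1))  # 0: orthogonal
# cosine(c(1, 1), c(1, 0))  # ~0.707: 45-degree angle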
