#' Sentence Similarity Scores
#'
#' @description Uses sentence similarity models from \href{https://huggingface.co}{huggingface}
#' to compute the cosine similarity between the embeddings of each pair of texts
#'
#' @param text Character vector or list.
#' Text in a vector or list data format
#'
#' @param comparison_text Character vector or list.
#' Text in a vector or list data format to compare against \code{text}
#'
#' @param transformer Character.
#' Specific sentence similarity transformer
#' to be used.
#' Defaults to \code{"all_minilm_l6"} (see \href{https://huggingface.co/sentence-transformers/all-MiniLM-L6-v2}{huggingface})
#'
#' Also allows any sentence similarity models with a pipeline
#' from \href{https://huggingface.co/models?pipeline_tag=sentence-similarity}{huggingface}
#' to be used by specifying its name (e.g., \code{"sentence-transformers/all-mpnet-base-v2"}; see Examples)
#'
#' @param device Character.
#' Whether to use CPU or GPU for inference.
#' Defaults to \code{"auto"}, which will use
#' a GPU over the CPU (if a CUDA-capable GPU is set up).
#' Set to \code{"cpu"} to perform inference on the CPU
#'
#' @param preprocess Boolean.
#' Should basic preprocessing be applied?
#' Includes making lowercase, keeping only alphanumeric characters,
#' removing escape characters, removing repeated characters,
#' and removing white space.
#' Defaults to \code{FALSE}.
#' Transformers generally are OK without preprocessing and handle
#' many of these steps internally, so setting to \code{TRUE}
#' will not change performance much
#'
#' @param keep_in_env Boolean.
#' Whether the classifier should be kept in your global environment.
#' Defaults to \code{TRUE}.
#' By keeping the classifier in your environment, you can skip
#' re-loading the classifier every time you run this function.
#' \code{TRUE} is recommended
#'
#' @param envir Numeric.
#' Position of the environment where the classifier should be saved
#' for repeated use.
#' Defaults to \code{1}, the global environment
#'
#' @return Returns an \emph{n} x \emph{m} similarity matrix, where \emph{n} is the length of \code{text} and \emph{m} is the length of \code{comparison_text}
#'
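#' @details Each cell of the returned matrix is the cosine similarity
#' between the sentence embeddings \eqn{u} and \eqn{v} of a pair of texts
#' (computed by the internal \code{cosine} function):
#'
#' \deqn{\mathrm{cosine}(u, v) = \frac{u \cdot v}{\|u\| \|v\|}}
#'
#' Values range from -1 (opposite embeddings) to 1 (identical embeddings),
#' with 0 indicating unrelated (orthogonal) embeddings
#'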
#' @author Alexander P. Christensen <alexpaulchristensen@gmail.com>
#'
#' @examples
#' # Load data
#' data(neo_ipip_extraversion)
#'
#' # Example text
#' text <- neo_ipip_extraversion$friendliness[1:5]
#'
#' \dontrun{
#' # Example with defaults
#' sentence_similarity(
#'   text = text, comparison_text = text
#' )
#'
#' # Example with model from 'sentence-transformers'
#' sentence_similarity(
#'   text = text, comparison_text = text,
#'   transformer = "sentence-transformers/all-mpnet-base-v2"
#' )
#' }
#'
#' @export
#'
# Sentence Similarity
# Updated 02.08.2024
sentence_similarity <- function(
text, comparison_text,
transformer = c("all_minilm_l6"),
device = c("auto", "cpu", "cuda"),
preprocess = FALSE, keep_in_env = TRUE, envir = 1
)
{
# Check that input of 'text' argument is in the
# appropriate format for the analysis
non_text_warning(text) # see utils-transforEmotion.R for function
# Check for comparison text
if(missing(comparison_text)){
stop(
"Comparison text to compute similarity must be specified using the 'comparison_text' argument (e.g., `comparison_text = c(\"a similar sentence\", \a random sentence\")`)\n",
call. = FALSE
)
}
# Check for transformer
if(missing(transformer)){
transformer <- "all_minilm_l6"
}
# Check for multiple transformers
if(length(transformer) > 1){
stop("Only one transformer model can be used at a time.\n\nSelect one of the default models or select a model from huggingface: <https://huggingface.co/models?pipeline_tag=sentence-similarity>\n")
}
# Set device
if(missing(device)){
device <- "auto"
}else{device <- tolower(match.arg(device))}
# Check for classifiers in environment
if(exists(transformer, envir = as.environment(envir))){
classifier <- get(transformer, envir = as.environment(envir))
}else{
# Run setup for modules
setup_modules()
# Import 'sentence-transformers' module
message("Importing sentence-transformers module...")
sentence_transformers <- reticulate::import("sentence_transformers")
# Check for custom transformer
if(transformer %in% c("all_minilm_l6")){
# Load pipeline
classifier <- sentence_transformers$SentenceTransformer(
switch(
transformer,
"all_minilm_l6" = "sentence-transformers/all-MiniLM-L6-v2"
),
device = device
)
}else{
# Custom pipeline from huggingface
# Try to catch non-existing pipelines
pipeline_catch <- try(
classifier <- sentence_transformers$SentenceTransformer(
transformer, device = device
), silent = TRUE
)
# Errors
if(is(pipeline_catch, "try-error")){
# Model exists but no pipeline
if(isTRUE(grepl("Tokenizer class", pipeline_catch))){
stop(
paste(
"Transformer model '",
transformer,
"' exists but does not have a working pipeline yet.\n\nTry a default model or select a model from huggingface: <https://huggingface.co/models?pipeline_tag=zero-shot-classification>\n",
sep = ""
), call. = FALSE
)
}else if(isTRUE(grepl("device_map", pipeline_catch))){
# Try again without device
pipeline_catch <- try(
classifier <- sentence_transformers$SentenceTransformer(transformer), silent = TRUE
)
}else{
stop(pipeline_catch, call. = FALSE)
}
}
}
}
# Load into environment
if(isTRUE(keep_in_env)){
# Keep transformer module in environment (only if it was imported in this call)
if(exists("sentence_transformers", inherits = FALSE)){
assign(
x = "sentence_transformers",
value = sentence_transformers,
envir = as.environment(envir)
)
}
# Keep classifier in environment
assign(
x = transformer,
value = classifier,
envir = as.environment(envir)
)
}
# Basic preprocessing
if(isTRUE(preprocess)){
text <- preprocess_text( # Internal function. See `utils-transforEmotion`
text,
remove_stop = FALSE # Transformers will remove stop words
)
}
# Message
message("Obtaining similarities...")
# Combine sentences
sentences <- c(text, comparison_text)
# Get embeddings
embeddings <- classifier$encode(sentences)
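# Rows 1 to length(text) of 'embeddings' hold the 'text' embeddings;
# the remaining rows hold the 'comparison_text' embeddings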
# Loop over text comparisons
text_length <- length(text)
comparison_length <- length(comparison_text)
# Set up matrix
similarity_matrix <- matrix(
0, nrow = text_length, ncol = comparison_length,
dimnames = list(text, comparison_text)
)
# Populate similarity matrix
for(i in seq_len(text_length)){
for(j in seq_len(comparison_length)){
# Compute cosine
similarity_matrix[i,j] <- cosine(
embed1 = embeddings[i,],
embed2 = embeddings[j + text_length,]
)
}
}
# Return similarities
return(similarity_matrix)
}
#' @noRd
# Cosine function ----
# Updated 02.08.2024
cosine <- function(embed1, embed2)
{
return(crossprod(embed1, embed2) / sqrt(crossprod(embed1) * crossprod(embed2)))
}
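# A quick sanity check for the cosine function, using hypothetical 2-D
# vectors rather than actual model embeddings:
# cosine(c(1, 0), c(1, 0))  # 1 -> identical direction
# cosine(c(1, 0), c(0, 1))  # 0 -> orthogonal (unrelated)
# cosine(c(1, 0), c(-1, 0)) # -1 -> opposite direction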