text: Analyses of Text using Transformers Models from HuggingFace, Natural Language Processing and Machine Learning

Documented in textDistance textDistanceMatrix textDistanceNorm textSimilarity textSimilarityMatrix textSimilarityNorm

#' Compute cosine
#'
#' @param x A word embedding.
#' @param y A word embedding.
#' @return Cosine between x and y.
#' @noRd
# Computing the cosine between two word embeddings.
cosines <- function(x, y) {
  rowSums(x * y, na.rm = TRUE) / (sqrt(rowSums(x * x, na.rm = TRUE)) * sqrt(rowSums(y * y, na.rm = TRUE)))
}

#' Semantic Similarity
#'
#' textSimilarity() Computes the semantic similarity between two text variables.
#' @param x Word embeddings from textEmbed().
#' @param y Word embeddings from textEmbed().
#' @param method (character) Character string describing type of measure to be computed. Default is "cosine" (see also
#' "spearmen", "pearson" as well as measures from textDistance() (which here is computed as 1 - textDistance)
#' including "euclidean", "maximum", "manhattan", "canberra", "binary" and "minkowski").
#' @param center (boolean; from base::scale) If center is TRUE then centering is done by subtracting the column means
#' (omitting NAs) of x from their corresponding columns, and if center is FALSE, no centering is done.
#' @param scale (boolean; from base::scale) If scale is TRUE then scaling is done by dividing the (centered)
#' columns of x by their standard deviations if center is TRUE, and the root mean square otherwise.
#' @return A vector comprising semantic similarity scores. The closer the value is to 1 when using the default
#'  method, "cosine", the higher the semantic similarity.
#' @examples
#' # Compute the semantic similarity between the embeddings from "harmonytext" and "satisfactiontext".
#' \dontrun{
#' similarity_scores <- textSimilarity(
#'   x = word_embeddings_4$texts$harmonytext,
#'   y = word_embeddings_4$texts$satisfactiontext
#' )
#'
#' # Show information about how similarity_scores were constructed.
#' comment(similarity_scores)
#' }
#' @seealso See \code{\link{textDistance}} and \code{\link{textSimilarityNorm}}.
#' @export
textSimilarity <- function(x,
                           y,
                           method = "cosine",
                           center = TRUE,
                           scale = FALSE) {
  # Select necessary columns
  x1 <- dplyr::select(x, dplyr::starts_with("Dim"))
  y1 <- dplyr::select(y, dplyr::starts_with("Dim"))

  # Normalize embeddings help(scale)
  x1 <- tibble::as_tibble(t(scale(t(x1), center = center, scale = scale)))
  y1 <- tibble::as_tibble(t(scale(t(y1), center = center, scale = scale)))

  # Compute cosines
  if (method == "cosine") {
    ss <- cosines(x1, y1)
  }

  # Compute correlation
  if (method == "spearman" || method == "pearson") {
    ss <- list()

    for (i in seq_len(nrow(x1))) {
      ss[[i]] <- cor(as_vector(x1[i, ]),
        as_vector(y1[i, ]),
        method = method
      )
    }
    ss <- unlist(ss)
    ss
  }

  if (method %in% c(
    "euclidean", "maximum", "manhattan",
    "canberra", "binary", "minkowski"
  )) {
    ss <- 1 - textDistance(x1, y1, method = method)
  }

  # Add information about the used embeddings
  embedding_descriptions_x <- comment(x)
  embedding_descriptions_y <- comment(y)
  comment(ss) <- paste(
    "x embedding = ", embedding_descriptions_x,
    "y embedding = ", embedding_descriptions_y,
    "method = ", method,
    "center = ", center,
    "scale = ", scale,
    sep = ".", collapse = " "
  )
  ss
}


#' Semantic distance
#'
#' textDistance() computes the semantic distance between two text variables.
#'
#' @param x Word embeddings (from textEmbed()).
#' @param y Word embeddings (from textEmbed()).
#' @param method (character) Character string describing type of measure to be computed; default is
#' "euclidean" (see also measures from stats:dist() including "maximum", "manhattan", "canberra",
#' "binary" and "minkowski". It is also possible to use "cosine", which computes the cosine distance
#' (i.e., 1 - cosine(x, y)).
#' @param center (boolean; from base::scale) If center is TRUE then centering is done by subtracting
#' the embedding mean
#' (omitting NAs) of x from each of its dimension, and if center is FALSE, no centering is done.
#' @param scale (boolean; from base::scale) If scale is TRUE then scaling is done by dividing the
#' (centered) embedding dimensions by the standard deviation of the embedding if center is TRUE,
#'  and the root mean square otherwise.
#' @return A vector comprising semantic distance scores.
#' @examples
#' # Compute the semantic distance score between the embeddings
#' # from "harmonytext" and "satisfactiontext".
#'
#' \dontrun{
#' distance_scores <- textDistance(
#'   x = word_embeddings_4$texts$harmonytext,
#'   y = word_embeddings_4$texts$satisfactiontext
#' )
#'
#' # Show information about how distance_scores were constructed.
#'
#' comment(distance_scores)
#' }
#' @seealso See  \code{\link{textSimilarity}} and \code{\link{textSimilarityNorm}}.
#' @export
textDistance <- function(x,
                         y,
                         method = "euclidean",
                         center = FALSE,
                         scale = FALSE) {
  x1 <- textDimName(x, dim_names = FALSE)
  y1 <- textDimName(y, dim_names = FALSE)

  # Select necessary columns
  x1 <- dplyr::select(x1, dplyr::starts_with("Dim"))
  y1 <- dplyr::select(y1, dplyr::starts_with("Dim"))

  # normalize
  x1 <- as_tibble(t(scale(t(x1), center = center, scale = scale)))
  y1 <- as_tibble(t(scale(t(y1), center = center, scale = scale)))

  if (method == "cosine") {
    # Compute cosine distance
    ss <- 1 - cosines(x1, y1)
  } else {
    # Compute distance method = "euclidean"
    ss1 <- list()

    for (i in seq_len(nrow(x1))) {
      dist_df <- dplyr::bind_rows(x1[i, ], y1[i, ])
      ss1[i] <- stats::dist(dist_df, method = method)[1]
    }
    ss <- unlist(ss1)
  }

  # Add information about the used embeddings
  embedding_descriptions_x <- comment(x)
  embedding_descriptions_y <- comment(y)
  comment(ss) <- paste(
    "x embedding = ", embedding_descriptions_x,
    "y embedding = ", embedding_descriptions_y,
    "method = ", method,
    "center = ", center,
    "scale = ", scale,
    sep = ".", collapse = " "
  )
  ss
}


#' Semantic similarity across multiple word embeddings
#'
#' textSimilarityMatrix computes semantic similarity scores between all combinations in a word embedding
#' @inheritParams textSimilarity
#' @return A matrix of semantic similarity scores
#' @examples
#' similarity_scores <- textSimilarityMatrix(word_embeddings_4$texts$harmonytext[1:3, ])
#' round(similarity_scores, 3)
#' @seealso see \code{\link{textSimilarityNorm}}
#' @export
textSimilarityMatrix <- function(
    x,
    method = "cosine",
    center = TRUE,
    scale = FALSE) {
  # Extract a numeric matrix
  X <- as.matrix(dplyr::select(x, dplyr::starts_with("Dim")))

  # Normalise if needed
  if (scale || center) {
    # `apply()` returns it transposed (and that's by design)
    X <- t(
      apply(
        X,
        1,
        function(row) scale(row, center = center, scale = scale),
        simplify = TRUE
      )
    )
  }

  # Correlations
  if (method == "spearman" || method == "pearson") {
    S <- as.matrix(cor(t(X), method = method))
    # For unification: let dimension names be missing regardless of the method used
    dimnames(S) <- NULL

    # Cosine similarity
  } else if (method == "cosine") {
    # A vector of row norms `||r||`
    norms <- sqrt(rowSums(X^2, na.rm = TRUE))
    S <- (X %*% t(X)) / (norms %*% t(norms))
    dimnames(S) <- NULL

    # Traditional built-in distance metrics
  } else if (method %in% c(
    "euclidean", "maximum", "manhattan",
    "canberra", "binary", "minkowski"
  )) {
    S <- 1 - as.matrix(dist(X, method = method))
    dimnames(S) <- NULL

    # Throw an error if the method didn't match
  } else {
    stop(sprintf("Unknown method: %s", method))
  }

  # Add embedding metadata
  embedding_descriptions <- comment(x)
  comment(S) <- paste(
    "Text collection embedding = ", embedding_descriptions,
    "method = ", method,
    "center = ", center,
    "scale = ", scale,
    sep = ".", collapse = " "
  )
  return(S)
}

#' Semantic distance across multiple word embeddings
#'
#' textDistanceMatrix() computes semantic distance scores between all combinations in a word embedding
#' @inheritParams textDistance
#' @return A matrix of semantic distance scores
#' @examples
#' distance_scores <- textDistanceMatrix(word_embeddings_4$texts$harmonytext[1:3, ])
#' round(distance_scores, 3)
#' @seealso see \code{\link{textDistanceNorm}}
#' @export
textDistanceMatrix <- function(
    x,
    method = "euclidean",
    center = FALSE,
    scale = FALSE) {
  # Extract a numeric matrix
  X <- as.matrix(dplyr::select(x, dplyr::starts_with("Dim")))

  # Normalise if needed
  if (scale || center) {
    # `apply()` returns it transposed (and that's by design)
    X <- t(
      apply(
        X,
        1,
        function(row) scale(row, center = center, scale = scale),
        simplify = TRUE
      )
    )
  }

  # Traditional built-in distance metrics
  if (method %in% c(
    "euclidean", "maximum", "manhattan",
    "canberra", "binary", "minkowski"
  )) {
    S <- as.matrix(dist(X, method = method))
    # For unification: let dimension names be missing regardless of the method used
    dimnames(S) <- NULL

    # Cosine distance
  } else if (method == "cosine") {
    # A vector of row norms `||r||`
    norms <- sqrt(rowSums(X^2, na.rm = TRUE))
    S <- 1 - (X %*% t(X)) / (norms %*% t(norms))
    dimnames(S) <- NULL

    # Throw an error if the method didn't match
  } else {
    stop(sprintf("Unknown method: %s", method))
  }

  # Add embedding metadata
  embedding_descriptions <- comment(x)
  comment(S) <- paste(
    "Text collection embedding = ", embedding_descriptions,
    "method = ", method,
    "center = ", center,
    "scale = ", scale,
    sep = ".", collapse = " "
  )
  return(S)
}


#'  Semantic similarity between a text variable and a word norm
#'
#' textSimilarityNorm() computes the semantic similarity between a text variable and a word norm
#' (i.e., a text represented by one word embedding that represent a construct).
#' @param y Word embedding from textEmbed (from only one text).
#' @inheritParams textSimilarity
#' @return A vector comprising semantic similarity scores.
#' @examples
#' \dontrun{
#' library(dplyr)
#' library(tibble)
#' harmonynorm <- c("harmony peace ")
#' satisfactionnorm <- c("satisfaction achievement")
#'
#' norms <- tibble::tibble(harmonynorm, satisfactionnorm)
#' word_embeddings <- word_embeddings_4$texts
#' word_embeddings_wordnorm <- textEmbed(norms)
#' similarity_scores <- textSimilarityNorm(
#'   word_embeddings$harmonytext,
#'   word_embeddings_wordnorm$harmonynorm
#' )
#' }
#' @seealso see \code{\link{textSimilarity}}
#' @importFrom dplyr row_number slice select starts_with
#' @export
textSimilarityNorm <- function(x,
                               y,
                               method = "cosine",
                               center = TRUE,
                               scale = FALSE) {
  # Select Dimensions
  x1 <- dplyr::select(x, dplyr::starts_with("Dim"))
  y1 <- dplyr::select(as_tibble(as.list(y)), dplyr::starts_with("Dim"))

  y2 <- y1 %>%
    dplyr::slice(rep(dplyr::row_number(), nrow(x1)))

  # Compute similarity
  ss <- textSimilarity(x1, y2,
    method = method,
    center = center,
    scale = scale
  )

  # Add information about the used embeddings
  embedding_descriptions_x <- comment(x)
  embedding_descriptions_y <- comment(y)
  comment(ss) <- paste("x embedding = ", embedding_descriptions_x,
    "y embedding = ", embedding_descriptions_y,
    method,
    sep = ".", collapse = " "
  )
  ss
}


#' Semantic distance between a text variable and a word norm
#'
#' textDistanceNorm() computes the semantic distance between a text variable and a word norm
#' (i.e., a text represented by one word embedding that represent a construct/concept).
#' @param y Word embedding from textEmbed (from only one text).
#' @inheritParams textDistance
#' @return A vector comprising semantic distance scores.
#' @examples
#' \dontrun{
#' library(dplyr)
#' library(tibble)
#' harmonynorm <- c("harmony peace ")
#' satisfactionnorm <- c("satisfaction achievement")
#'
#' norms <- tibble::tibble(harmonynorm, satisfactionnorm)
#' word_embeddings <- word_embeddings_4$texts
#' word_embeddings_wordnorm <- textEmbed(norms)
#' similarity_scores <- textDistanceNorm(
#'   word_embeddings$harmonytext,
#'   word_embeddings_wordnorm$harmonynorm
#' )
#' }
#' @seealso see \code{\link{textDistance}}
#' @importFrom dplyr row_number slice select starts_with
#' @export
textDistanceNorm <- function(x,
                             y,
                             method = "euclidean",
                             center = FALSE,
                             scale = FALSE) {
  # Select Dimensions
  x1 <- dplyr::select(x, dplyr::starts_with("Dim"))
  y1 <- dplyr::select(as_tibble(as.list(y)), dplyr::starts_with("Dim"))

  y2 <- y1 %>%
    dplyr::slice(rep(dplyr::row_number(), nrow(x1)))

  # Compute similarity
  ss <- textDistance(x1, y2,
    method = method,
    center = center,
    scale = scale
  )

  # Add information about the used embeddings
  embedding_descriptions_x <- comment(x)
  embedding_descriptions_y <- comment(y)
  comment(ss) <- paste("x embedding = ", embedding_descriptions_x,
    "y embedding = ", embedding_descriptions_y,
    method,
    sep = ".", collapse = " "
  )
  ss
}

OscarKjell/text documentation built on April 3, 2025, 3:07 p.m.

rdrr.io home R language documentation Run R code online

CRAN packages Bioconductor packages R-Forge packages GitHub packages

Note that we can't provide technical support on individual packages. You should contact the package authors for that.

OscarKjell/text
Analyses of Text using Transformers Models from HuggingFace, Natural Language Processing and Machine Learning

R/3_1_textSimilarity.R
In OscarKjell/text: Analyses of Text using Transformers Models from HuggingFace, Natural Language Processing and Machine Learning

Defines functions textDistanceNorm textSimilarityNorm textDistanceMatrix textSimilarityMatrix textDistance textSimilarity cosines

Documented in textDistance textDistanceMatrix textDistanceNorm textSimilarity textSimilarityMatrix textSimilarityNorm

R Package Documentation

Browse R Packages

We want your feedback!

OscarKjell/text Analyses of Text using Transformers Models from HuggingFace, Natural Language Processing and Machine Learning

R/3_1_textSimilarity.R In OscarKjell/text: Analyses of Text using Transformers Models from HuggingFace, Natural Language Processing and Machine Learning

Defines functions textDistanceNorm textSimilarityNorm textDistanceMatrix textSimilarityMatrix textDistance textSimilarity cosines

Documented in textDistance textDistanceMatrix textDistanceNorm textSimilarity textSimilarityMatrix textSimilarityNorm

R Package Documentation

Browse R Packages

We want your feedback!

OscarKjell/text
Analyses of Text using Transformers Models from HuggingFace, Natural Language Processing and Machine Learning

R/3_1_textSimilarity.R
In OscarKjell/text: Analyses of Text using Transformers Models from HuggingFace, Natural Language Processing and Machine Learning