# R/utils.R
#
# Defines functions: read_word2vec_model, word2vec_available,
# tokenize_text_by_language, normalize_language_tag, format_time,
# get_data_path, default_data_path, check_python_module, mmr_rerank,
# rrf_fusion, euclidean_distance, cosine_similarity, normalize_vectors,
# generate_id.
#
# Documented in: check_python_module, cosine_similarity, euclidean_distance,
# format_time, generate_id, mmr_rerank, normalize_language_tag,
# normalize_vectors, read_word2vec_model, rrf_fusion,
# tokenize_text_by_language, word2vec_available.

#' VectrixDB Utility Functions
#'
#' Internal helpers shared across the package: deterministic ID hashing,
#' vector math (normalization, cosine similarity, euclidean distance),
#' rank fusion (RRF, MMR), data-directory management, time formatting,
#' and language-aware tokenization.
#'
#' @name utils
#' @keywords internal
NULL

#' Generate deterministic ID from text
#'
#' Hashes the input with MD5 and keeps a short, stable prefix so the same
#' text always maps to the same ID.
#'
#' @param text Character string to hash
#' @return Character string (12 character MD5 hash prefix)
#' @keywords internal
generate_id <- function(text) {
  full_hash <- digest::digest(text, algo = "md5")
  substring(full_hash, 1L, 12L)
}

#' Normalize vectors for cosine similarity
#'
#' Scales each row to unit L2 norm. Zero rows are left untouched (their
#' norm is substituted with 1 to avoid division by zero).
#'
#' @param vectors Matrix of vectors (rows are vectors)
#' @return Normalized matrix
#' @keywords internal
normalize_vectors <- function(vectors) {
  # Promote a bare numeric vector to a one-row matrix.
  if (is.vector(vectors)) {
    vectors <- matrix(vectors, nrow = 1)
  }

  row_norms <- sqrt(rowSums(vectors^2))
  row_norms[row_norms == 0] <- 1  # zero rows pass through unchanged

  # Divide row i by row_norms[i].
  sweep(vectors, 1, row_norms, "/")
}

#' Compute cosine similarity
#'
#' Similarity of a query vector against each row of a document matrix.
#' A zero query vector or zero document row yields similarity 0 (the
#' original code returned NaN for zero document rows via 0/0, which could
#' corrupt downstream ranking such as `which.max()` in MMR).
#'
#' @param query_vector Numeric vector (query)
#' @param doc_vectors Matrix of document vectors
#' @return Numeric vector of similarities
#' @keywords internal
cosine_similarity <- function(query_vector, doc_vectors) {
  if (is.vector(doc_vectors)) {
    doc_vectors <- matrix(doc_vectors, nrow = 1)
  }
  query_norm <- sqrt(sum(query_vector^2))

  if (query_norm == 0) return(rep(0, nrow(doc_vectors)))

  doc_norms <- sqrt(rowSums(doc_vectors^2))
  # Zero-norm rows have a zero numerator too; substituting 1 in the
  # denominator turns the would-be 0/0 = NaN into a clean 0.
  doc_norms[doc_norms == 0] <- 1

  sims <- (doc_vectors %*% query_vector) / (doc_norms * query_norm)
  as.vector(sims)
}

#' Compute euclidean distance
#'
#' L2 distance from the query vector to each row of the document matrix.
#'
#' @param query_vector Numeric vector (query)
#' @param doc_vectors Matrix of document vectors
#' @return Numeric vector of distances
#' @keywords internal
euclidean_distance <- function(query_vector, doc_vectors) {
  if (is.vector(doc_vectors)) {
    doc_vectors <- matrix(doc_vectors, nrow = 1)
  }
  # t(doc_vectors) is d x n; column-major recycling subtracts the query
  # from every column (i.e. from every document).
  deltas <- t(doc_vectors) - query_vector
  sqrt(colSums(deltas^2))
}

#' Reciprocal Rank Fusion (RRF)
#'
#' Fuses several rankings into one score per document:
#' score(d) = sum over rankings of weight / (k + rank(d)).
#'
#' IDs are coerced to character before being used as names: the original
#' implementation used raw IDs as list subscripts, so numeric IDs were
#' silently treated as positional indices (or errored out of bounds).
#' Accumulation is now vectorized instead of growing a list element by
#' element.
#'
#' @param rankings List of ranked ID vectors
#' @param k RRF constant (default 60)
#' @param weights Optional weights for each ranking
#' @return Named vector of fused scores (first-seen ID order);
#'   NULL when there are no ranked IDs, matching the previous behavior.
#' @keywords internal
rrf_fusion <- function(rankings, k = 60, weights = NULL) {
  if (is.null(weights)) {
    weights <- rep(1 / length(rankings), length(rankings))
  }

  # Collect per-ranking IDs and contributions, then aggregate once.
  id_parts <- vector("list", length(rankings))
  score_parts <- vector("list", length(rankings))

  for (i in seq_along(rankings)) {
    ids <- as.character(rankings[[i]])
    id_parts[[i]] <- ids
    score_parts[[i]] <- weights[i] / (k + seq_along(ids))
  }

  all_ids <- unlist(id_parts, use.names = FALSE)
  all_scores <- unlist(score_parts, use.names = FALSE)

  if (length(all_ids) == 0) {
    return(NULL)  # unlist(list()) in the original also yielded NULL
  }

  # Sum contributions per ID, preserving first-seen order.
  uniq <- unique(all_ids)
  vapply(uniq, function(id) sum(all_scores[all_ids == id]), numeric(1))
}

#' Maximal Marginal Relevance (MMR) reranking
#'
#' Greedily picks documents that balance relevance to the query against
#' similarity to documents already selected:
#' mmr = lambda * relevance - (1 - lambda) * max similarity to picks.
#'
#' @param query_vector Query embedding
#' @param doc_vectors Matrix of document embeddings
#' @param doc_ids Vector of document IDs
#' @param scores Initial relevance scores
#' @param lambda Diversity parameter (0-1); 1 = pure relevance
#' @param limit Number of results to return
#' @return Data frame with reranked results (normalized scores)
#' @keywords internal
mmr_rerank <- function(query_vector, doc_vectors, doc_ids, scores,
                       lambda = 0.7, limit = 10) {
  if (is.vector(doc_vectors)) {
    doc_vectors <- matrix(doc_vectors, nrow = 1)
  }

  n_docs <- length(doc_ids)
  limit <- min(limit, n_docs)

  # Scale relevance into [0, 1] so lambda trades off on a comparable scale.
  top_score <- max(scores)
  if (top_score > 0) {
    scores <- scores / top_score
  }

  picked <- integer(0)
  picked_vecs <- matrix(nrow = 0, ncol = ncol(doc_vectors))
  candidates <- seq_len(n_docs)

  for (step in seq_len(limit)) {
    if (length(candidates) == 0) break

    mmr_values <- vapply(candidates, function(idx) {
      relevance <- scores[idx]
      # Redundancy penalty: similarity to the closest already-picked doc.
      redundancy <- if (nrow(picked_vecs) == 0) {
        0
      } else {
        max(cosine_similarity(doc_vectors[idx, ], picked_vecs))
      }
      lambda * relevance - (1 - lambda) * redundancy
    }, numeric(1))

    winner <- candidates[which.max(mmr_values)]
    picked <- c(picked, winner)
    picked_vecs <- rbind(picked_vecs, doc_vectors[winner, ])
    candidates <- setdiff(candidates, winner)
  }

  data.frame(
    id = doc_ids[picked],
    score = scores[picked],
    stringsAsFactors = FALSE
  )
}

#' Check if Python module is available
#'
#' Requires both the reticulate R package and the named Python module;
#' returns FALSE rather than erroring when reticulate is absent.
#'
#' @param module Module name
#' @return Logical
#' @keywords internal
check_python_module <- function(module) {
  has_reticulate <- requireNamespace("reticulate", quietly = TRUE)
  if (!has_reticulate) {
    return(FALSE)
  }

  reticulate::py_module_available(module)
}

#' Default VectrixDB data directory
#'
#' Session-scoped scratch location under `tempdir()`; removed when the
#' R session ends.
#'
#' @return Path to the default data directory (not created here)
#' @keywords internal
#' @noRd
default_data_path <- function() {
  file.path(tempdir(), "vectrixdb_data")
}

#' Get or create VectrixDB data directory
#'
#' Resolves the data directory (custom `path` or the session default),
#' creates it if missing, and returns the normalized path. The original
#' duplicated the create/normalize logic in both branches.
#'
#' @param path Optional custom path
#' @return Normalized path to the data directory
#' @keywords internal
#' @noRd
get_data_path <- function(path = NULL) {
  # Fall back to the session-local default when no path is supplied.
  target <- if (is.null(path)) default_data_path() else path

  if (!dir.exists(target)) {
    dir.create(target, recursive = TRUE)
  }

  # mustWork = FALSE: tolerate platforms where normalization is partial.
  normalizePath(target, mustWork = FALSE)
}

#' Format time duration
#'
#' Renders a millisecond count in the most natural unit:
#' ms below one second, seconds below one minute, minutes otherwise.
#'
#' @param ms Time in milliseconds (scalar)
#' @return Formatted string
#' @keywords internal
format_time <- function(ms) {
  if (ms >= 60000) {
    return(sprintf("%.1fm", ms / 60000))
  }
  if (ms >= 1000) {
    return(sprintf("%.2fs", ms / 1000))
  }
  sprintf("%.1fms", ms)
}

#' Normalize language tag to supported values
#'
#' Maps free-form language labels onto the two supported tags. Missing or
#' blank input falls back to `default` (itself sanitized to "en"/"ml");
#' unrecognized labels are treated as multilingual.
#'
#' @param language Language label ("en", "english", "ml", "multi", etc.)
#' @param default Default language when input is missing/invalid
#' @return "en" or "ml"
#' @keywords internal
normalize_language_tag <- function(language = NULL, default = "en") {
  # Sanitize the fallback first so it is always a supported tag.
  fallback <- if (is.null(default)) "en" else default
  fallback <- tolower(trimws(as.character(fallback)))
  if (!(fallback %in% c("en", "ml"))) {
    fallback <- "en"
  }

  if (is.null(language)) {
    return(fallback)
  }

  lang <- tolower(trimws(as.character(language)))
  if (!nzchar(lang)) {
    return(fallback)
  }

  switch(lang,
    en = ,
    english = "en",
    ml = ,
    multi = ,
    multilingual = "ml",
    # Unknown labels fall through to multilingual instead of silently
    # forcing English.
    "ml"
  )
}

#' Language-aware tokenizer used across embedders and keyword search
#'
#' Lowercases, strips non-word characters, splits on whitespace, and
#' filters short tokens (>= 2 chars for English, >= 1 otherwise).
#' English optionally drops stopwords, with a small built-in fallback
#' list when the stopwords package is unavailable.
#'
#' @param text Input text
#' @param language "en" or "ml"
#' @param remove_stopwords Remove English stopwords when language is "en"
#' @return Character vector of tokens
#' @keywords internal
tokenize_text_by_language <- function(text, language = "en", remove_stopwords = FALSE) {
  lang <- normalize_language_tag(language, default = "en")
  raw <- if (is.null(text)) "" else text
  value <- tolower(as.character(raw))
  if (!nzchar(value)) {
    return(character(0))
  }

  # EN keeps the legacy ASCII filter; ML keeps any Unicode letter or
  # digit (\p{L} / \p{N}).
  pattern <- if (identical(lang, "en")) "[^a-z0-9 ]" else "[^\\p{L}\\p{N}]+"
  value <- gsub(pattern, " ", value, perl = TRUE)

  tokens <- unlist(strsplit(value, "\\s+", perl = TRUE), use.names = FALSE)
  min_chars <- if (identical(lang, "en")) 2L else 1L
  keep <- nchar(tokens, type = "chars") >= min_chars & nzchar(tokens)
  tokens <- tokens[keep]

  if (isTRUE(remove_stopwords) && identical(lang, "en") && length(tokens) > 0) {
    # Fallback list covers the case where the stopwords package is absent.
    stopword_set <- tryCatch(
      stopwords::stopwords("en"),
      error = function(e) c(
        "the", "a", "an", "is", "are", "was", "were", "be", "been", "being",
        "have", "has", "had", "do", "does", "did", "will", "would", "could",
        "should", "may", "might", "must", "shall", "can", "of", "to", "in",
        "for", "on", "with", "at", "by", "from", "as", "into", "through"
      )
    )
    tokens <- tokens[!(tokens %in% stopword_set)]
  }

  tokens
}

#' Check whether word2vec package is installed
#'
#' Uses find.package() so the namespace is not loaded as a side effect.
#'
#' @return Logical
#' @keywords internal
word2vec_available <- function() {
  # find.package(quiet = TRUE) returns character(0) when absent.
  hits <- find.package("word2vec", quiet = TRUE)
  length(hits) > 0
}

#' Load word2vec model via optional runtime namespace lookup
#'
#' Looks the reader up at runtime so word2vec can remain an optional
#' (Suggests-level) dependency; errors with install guidance when absent.
#'
#' @param path Path to .bin model
#' @param normalize Whether to normalize embeddings (forwarded when supported)
#' @return word2vec model object
#' @keywords internal
read_word2vec_model <- function(path, normalize = TRUE) {
  if (!word2vec_available()) {
    stop("word2vec package required for .bin files. Install with: install.packages('word2vec')")
  }

  reader <- utils::getFromNamespace("read.word2vec", "word2vec")
  reader(path, normalize = normalize)
}

# Try the VectrixDB package in your browser.
#
# Any scripts or data that you put into this service are public.
#
# VectrixDB documentation built on Feb. 20, 2026, 5:09 p.m.