# R/utils.R
#
# Defines functions: read_word2vec_model, word2vec_available,
# tokenize_text_by_language, normalize_language_tag, format_time,
# get_data_path, default_data_path, check_python_module, mmr_rerank,
# rrf_fusion, euclidean_distance, cosine_similarity, normalize_vectors,
# generate_id.
#
# Documented in: check_python_module, cosine_similarity, euclidean_distance,
# format_time, generate_id, mmr_rerank, normalize_language_tag,
# normalize_vectors, read_word2vec_model, rrf_fusion,
# tokenize_text_by_language, word2vec_available.

#' VectrixDB Utility Functions
#'
#' Internal helpers shared across the package: deterministic ID hashing,
#' vector math (normalization, cosine similarity, euclidean distance),
#' rank fusion (RRF, MMR), data-directory management, time formatting,
#' and language-aware tokenization.
#'
#' @name utils
#' @keywords internal
NULL

#' Generate deterministic ID from text
#'
#' Hashes the input with MD5 and keeps a short, stable prefix so the same
#' text always maps to the same ID.
#'
#' @param text Character string to hash
#' @return Character string (12 character MD5 hash prefix)
#' @keywords internal
generate_id <- function(text) {
  full_hash <- digest::digest(text, algo = "md5")
  substring(full_hash, 1L, 12L)
}

#' Normalize vectors for cosine similarity
#'
#' Scales each row to unit L2 norm. Zero rows are left untouched (their
#' norm is substituted with 1 to avoid division by zero).
#'
#' @param vectors Matrix of vectors (rows are vectors)
#' @return Normalized matrix
#' @keywords internal
normalize_vectors <- function(vectors) {
  # Promote a bare numeric vector to a one-row matrix.
  if (is.vector(vectors)) {
    vectors <- matrix(vectors, nrow = 1)
  }

  row_norms <- sqrt(rowSums(vectors^2))
  row_norms[row_norms == 0] <- 1  # zero rows pass through unchanged

  # Divide row i by row_norms[i].
  sweep(vectors, 1, row_norms, "/")
}

#' Compute cosine similarity
#'
#' Similarity of a query vector against each row of a document matrix.
#' A zero query vector or zero document row yields similarity 0 (the
#' original code returned NaN for zero document rows via 0/0, which could
#' corrupt downstream ranking such as `which.max()` in MMR).
#'
#' @param query_vector Numeric vector (query)
#' @param doc_vectors Matrix of document vectors
#' @return Numeric vector of similarities
#' @keywords internal
cosine_similarity <- function(query_vector, doc_vectors) {
  if (is.vector(doc_vectors)) {
    doc_vectors <- matrix(doc_vectors, nrow = 1)
  }
  query_norm <- sqrt(sum(query_vector^2))

  if (query_norm == 0) return(rep(0, nrow(doc_vectors)))

  doc_norms <- sqrt(rowSums(doc_vectors^2))
  # Zero-norm rows have a zero numerator too; substituting 1 in the
  # denominator turns the would-be 0/0 = NaN into a clean 0.
  doc_norms[doc_norms == 0] <- 1

  sims <- (doc_vectors %*% query_vector) / (doc_norms * query_norm)
  as.vector(sims)
}

#' Compute euclidean distance
#'
#' L2 distance from the query vector to each row of the document matrix.
#'
#' @param query_vector Numeric vector (query)
#' @param doc_vectors Matrix of document vectors
#' @return Numeric vector of distances
#' @keywords internal
euclidean_distance <- function(query_vector, doc_vectors) {
  if (is.vector(doc_vectors)) {
    doc_vectors <- matrix(doc_vectors, nrow = 1)
  }
  # t(doc_vectors) is d x n; column-major recycling subtracts the query
  # from every column (i.e. from every document).
  deltas <- t(doc_vectors) - query_vector
  sqrt(colSums(deltas^2))
}

#' Reciprocal Rank Fusion (RRF)
#'
#' Fuses several rankings into one score per document:
#' score(d) = sum over rankings of weight / (k + rank(d)).
#'
#' IDs are coerced to character before being used as names: the original
#' implementation used raw IDs as list subscripts, so numeric IDs were
#' silently treated as positional indices (or errored out of bounds).
#' Accumulation is now vectorized instead of growing a list element by
#' element.
#'
#' @param rankings List of ranked ID vectors
#' @param k RRF constant (default 60)
#' @param weights Optional weights for each ranking
#' @return Named vector of fused scores (first-seen ID order);
#'   NULL when there are no ranked IDs, matching the previous behavior.
#' @keywords internal
rrf_fusion <- function(rankings, k = 60, weights = NULL) {
  if (is.null(weights)) {
    weights <- rep(1 / length(rankings), length(rankings))
  }

  # Collect per-ranking IDs and contributions, then aggregate once.
  id_parts <- vector("list", length(rankings))
  score_parts <- vector("list", length(rankings))

  for (i in seq_along(rankings)) {
    ids <- as.character(rankings[[i]])
    id_parts[[i]] <- ids
    score_parts[[i]] <- weights[i] / (k + seq_along(ids))
  }

  all_ids <- unlist(id_parts, use.names = FALSE)
  all_scores <- unlist(score_parts, use.names = FALSE)

  if (length(all_ids) == 0) {
    return(NULL)  # unlist(list()) in the original also yielded NULL
  }

  # Sum contributions per ID, preserving first-seen order.
  uniq <- unique(all_ids)
  vapply(uniq, function(id) sum(all_scores[all_ids == id]), numeric(1))
}

#' Maximal Marginal Relevance (MMR) reranking
#'
#' Greedily picks documents that balance relevance to the query against
#' similarity to documents already selected:
#' mmr = lambda * relevance - (1 - lambda) * max similarity to picks.
#'
#' @param query_vector Query embedding
#' @param doc_vectors Matrix of document embeddings
#' @param doc_ids Vector of document IDs
#' @param scores Initial relevance scores
#' @param lambda Diversity parameter (0-1); 1 = pure relevance
#' @param limit Number of results to return
#' @return Data frame with reranked results (normalized scores)
#' @keywords internal
mmr_rerank <- function(query_vector, doc_vectors, doc_ids, scores,
                       lambda = 0.7, limit = 10) {
  if (is.vector(doc_vectors)) {
    doc_vectors <- matrix(doc_vectors, nrow = 1)
  }

  n_docs <- length(doc_ids)
  limit <- min(limit, n_docs)

  # Scale relevance into [0, 1] so lambda trades off on a comparable scale.
  top_score <- max(scores)
  if (top_score > 0) {
    scores <- scores / top_score
  }

  picked <- integer(0)
  picked_vecs <- matrix(nrow = 0, ncol = ncol(doc_vectors))
  candidates <- seq_len(n_docs)

  for (step in seq_len(limit)) {
    if (length(candidates) == 0) break

    mmr_values <- vapply(candidates, function(idx) {
      relevance <- scores[idx]
      # Redundancy penalty: similarity to the closest already-picked doc.
      redundancy <- if (nrow(picked_vecs) == 0) {
        0
      } else {
        max(cosine_similarity(doc_vectors[idx, ], picked_vecs))
      }
      lambda * relevance - (1 - lambda) * redundancy
    }, numeric(1))

    winner <- candidates[which.max(mmr_values)]
    picked <- c(picked, winner)
    picked_vecs <- rbind(picked_vecs, doc_vectors[winner, ])
    candidates <- setdiff(candidates, winner)
  }

  data.frame(
    id = doc_ids[picked],
    score = scores[picked],
    stringsAsFactors = FALSE
  )
}

#' Check if Python module is available
#'
#' Requires both the reticulate R package and the named Python module;
#' returns FALSE rather than erroring when reticulate is absent.
#'
#' @param module Module name
#' @return Logical
#' @keywords internal
check_python_module <- function(module) {
  has_reticulate <- requireNamespace("reticulate", quietly = TRUE)
  if (!has_reticulate) {
    return(FALSE)
  }

  reticulate::py_module_available(module)
}

#' Default VectrixDB data directory
#'
#' Session-scoped scratch location under `tempdir()`; removed when the
#' R session ends.
#'
#' @return Path to the default data directory (not created here)
#' @keywords internal
#' @noRd
default_data_path <- function() {
  file.path(tempdir(), "vectrixdb_data")
}

#' Get or create VectrixDB data directory
#'
#' Resolves the data directory (custom `path` or the session default),
#' creates it if missing, and returns the normalized path. The original
#' duplicated the create/normalize logic in both branches.
#'
#' @param path Optional custom path
#' @return Normalized path to the data directory
#' @keywords internal
#' @noRd
get_data_path <- function(path = NULL) {
  # Fall back to the session-local default when no path is supplied.
  target <- if (is.null(path)) default_data_path() else path

  if (!dir.exists(target)) {
    dir.create(target, recursive = TRUE)
  }

  # mustWork = FALSE: tolerate platforms where normalization is partial.
  normalizePath(target, mustWork = FALSE)
}

#' Format time duration
#'
#' Renders a millisecond count in the most natural unit:
#' ms below one second, seconds below one minute, minutes otherwise.
#'
#' @param ms Time in milliseconds (scalar)
#' @return Formatted string
#' @keywords internal
format_time <- function(ms) {
  if (ms >= 60000) {
    return(sprintf("%.1fm", ms / 60000))
  }
  if (ms >= 1000) {
    return(sprintf("%.2fs", ms / 1000))
  }
  sprintf("%.1fms", ms)
}

#' Normalize language tag to supported values
#'
#' Maps free-form language labels onto the two supported tags. Missing or
#' blank input falls back to `default` (itself sanitized to "en"/"ml");
#' unrecognized labels are treated as multilingual.
#'
#' @param language Language label ("en", "english", "ml", "multi", etc.)
#' @param default Default language when input is missing/invalid
#' @return "en" or "ml"
#' @keywords internal
normalize_language_tag <- function(language = NULL, default = "en") {
  # Sanitize the fallback first so it is always a supported tag.
  fallback <- if (is.null(default)) "en" else default
  fallback <- tolower(trimws(as.character(fallback)))
  if (!(fallback %in% c("en", "ml"))) {
    fallback <- "en"
  }

  if (is.null(language)) {
    return(fallback)
  }

  lang <- tolower(trimws(as.character(language)))
  if (!nzchar(lang)) {
    return(fallback)
  }

  switch(lang,
    en = ,
    english = "en",
    ml = ,
    multi = ,
    multilingual = "ml",
    # Unknown labels fall through to multilingual instead of silently
    # forcing English.
    "ml"
  )
}

#' Language-aware tokenizer used across embedders and keyword search
#'
#' Lowercases, strips non-word characters, splits on whitespace, and
#' filters short tokens (>= 2 chars for English, >= 1 otherwise).
#' English optionally drops stopwords, with a small built-in fallback
#' list when the stopwords package is unavailable.
#'
#' @param text Input text
#' @param language "en" or "ml"
#' @param remove_stopwords Remove English stopwords when language is "en"
#' @return Character vector of tokens
#' @keywords internal
tokenize_text_by_language <- function(text, language = "en", remove_stopwords = FALSE) {
  lang <- normalize_language_tag(language, default = "en")
  raw <- if (is.null(text)) "" else text
  value <- tolower(as.character(raw))
  if (!nzchar(value)) {
    return(character(0))
  }

  # EN keeps the legacy ASCII filter; ML keeps any Unicode letter or
  # digit (\p{L} / \p{N}).
  pattern <- if (identical(lang, "en")) "[^a-z0-9 ]" else "[^\\p{L}\\p{N}]+"
  value <- gsub(pattern, " ", value, perl = TRUE)

  tokens <- unlist(strsplit(value, "\\s+", perl = TRUE), use.names = FALSE)
  min_chars <- if (identical(lang, "en")) 2L else 1L
  keep <- nchar(tokens, type = "chars") >= min_chars & nzchar(tokens)
  tokens <- tokens[keep]

  if (isTRUE(remove_stopwords) && identical(lang, "en") && length(tokens) > 0) {
    # Fallback list covers the case where the stopwords package is absent.
    stopword_set <- tryCatch(
      stopwords::stopwords("en"),
      error = function(e) c(
        "the", "a", "an", "is", "are", "was", "were", "be", "been", "being",
        "have", "has", "had", "do", "does", "did", "will", "would", "could",
        "should", "may", "might", "must", "shall", "can", "of", "to", "in",
        "for", "on", "with", "at", "by", "from", "as", "into", "through"
      )
    )
    tokens <- tokens[!(tokens %in% stopword_set)]
  }

  tokens
}

#' Check whether word2vec package is installed
#'
#' Uses find.package() so the namespace is not loaded as a side effect.
#'
#' @return Logical
#' @keywords internal
word2vec_available <- function() {
  # find.package(quiet = TRUE) returns character(0) when absent.
  hits <- find.package("word2vec", quiet = TRUE)
  length(hits) > 0
}

#' Load word2vec model via optional runtime namespace lookup
#'
#' Looks the reader up at runtime so word2vec can remain an optional
#' (Suggests-level) dependency; errors with install guidance when absent.
#'
#' @param path Path to .bin model
#' @param normalize Whether to normalize embeddings (forwarded when supported)
#' @return word2vec model object
#' @keywords internal
read_word2vec_model <- function(path, normalize = TRUE) {
  if (!word2vec_available()) {
    stop("word2vec package required for .bin files. Install with: install.packages('word2vec')")
  }

  reader <- utils::getFromNamespace("read.word2vec", "word2vec")
  reader(path, normalize = normalize)
}

# Try the VectrixDB package in your browser.
#
# Any scripts or data that you put into this service are public.
#
# VectrixDB documentation built on Feb. 20, 2026, 5:09 p.m.