R/advanced_search.R

#' VectrixDB Advanced Search Features
#'
#' @description Enterprise-grade search capabilities:
#' - Faceted search with aggregations
#' - ACL/Security filtering
#' - Text analyzers (stemming, synonyms, stopwords)
#'
#' @name advanced_search
NULL

# =============================================================================
# Faceted Search
# =============================================================================

#' Facet Configuration
#'
#' @description Configuration for a facet field
#' @export
FacetConfig <- R6::R6Class(
  "FacetConfig",
  public = list(
    #' @field field Field name to facet on
    field = NULL,
    #' @field limit Max values to return
    limit = 10,
    #' @field min_count Minimum count to include
    min_count = 1,
    #' @field sort_by Sort by "count" or "value"
    sort_by = "count",
    #' @field include_zero Include zero-count values
    include_zero = FALSE,

    #' @description Create a new FacetConfig
    #' @param field Field name
    #' @param limit Max values (default: 10)
    #' @param min_count Min count (default: 1)
    #' @param sort_by Sort method (default: "count")
    #' @param include_zero Include zeros (default: FALSE)
    initialize = function(field,
                          limit = 10,
                          min_count = 1,
                          sort_by = "count",
                          include_zero = FALSE) {
      self$field <- field
      self$limit <- limit
      self$min_count <- min_count
      self$sort_by <- sort_by
      self$include_zero <- include_zero
    }
  )
)

#' Facet Value
#'
#' @description A single facet value with count
#' @export
FacetValue <- R6::R6Class(
  "FacetValue",
  public = list(
    #' @field value The facet value
    value = NULL,
    #' @field count Number of occurrences
    count = 0,

    #' @description Create a new FacetValue
    #' @param value The value
    #' @param count The count
    initialize = function(value, count) {
      self$value <- value
      self$count <- count
    }
  )
)

#' Facet Result
#'
#' @description Result of facet aggregation
#' @export
FacetResult <- R6::R6Class(
  "FacetResult",
  public = list(
    #' @field field Field name
    field = NULL,
    #' @field values List of FacetValue objects
    values = NULL,
    #' @field total_count Total count
    total_count = 0,
    #' @field other_count Count of values not in top-N
    other_count = 0,

    #' @description Create a new FacetResult
    #' @param field Field name
    #' @param values List of FacetValue objects
    #' @param total_count Total count
    #' @param other_count Other count
    initialize = function(field, values, total_count, other_count = 0) {
      self$field <- field
      self$values <- values
      self$total_count <- total_count
      self$other_count <- other_count
    },

    #' @description Convert to list
    to_list = function() {
      list(
        field = self$field,
        values = lapply(self$values, function(v) list(value = v$value, count = v$count)),
        total_count = self$total_count,
        other_count = self$other_count
      )
    }
  )
)

#' Facet Aggregator
#'
#' @description Computes facet value counts (aggregations) over document metadata fields
#'
#' @examples
#' \dontrun{
#' aggregator <- FacetAggregator$new()
#' facets <- aggregator$aggregate(
#'   documents = list(
#'     list(category = "tech", author = "Alice"),
#'     list(category = "science", author = "Bob")
#'   ),
#'   facet_configs = c("category", "author")
#' )
#' }
#'
#' @export
FacetAggregator <- R6::R6Class(
  "FacetAggregator",
  public = list(
    #' @description Aggregate facet values from documents
    #' @param documents List of documents with metadata
    #' @param facet_configs List of field names or FacetConfig objects
    #' @return Named list mapping field names to FacetResult
    aggregate = function(documents, facet_configs) {
      # Normalize configs
      configs <- lapply(facet_configs, function(fc) {
        if (is.character(fc)) {
          FacetConfig$new(field = fc)
        } else {
          fc
        }
      })

      results <- list()
      for (config in configs) {
        results[[config$field]] <- private$aggregate_field(documents, config)
      }

      results
    },

    #' @description Convert facet results to list format
    #' @param facet_results Named list of FacetResult objects
    #' @return List format suitable for JSON
    to_list = function(facet_results) {
      lapply(facet_results, function(result) {
        values_map <- list()
        for (fv in result$values) {
          values_map[[as.character(fv$value)]] <- fv$count
        }
        list(
          values = values_map,
          total_count = result$total_count,
          other_count = result$other_count
        )
      })
    }
  ),

  private = list(
    aggregate_field = function(documents, config) {
      counter <- list()

      for (doc in documents) {
        value <- private$get_nested_value(doc, config$field)
        if (is.null(value)) next

        # Handle arrays
        if (is.list(value) || (is.vector(value) && length(value) > 1)) {
          for (v in value) {
            key <- as.character(v)
            counter[[key]] <- (counter[[key]] %||% 0) + 1
          }
        } else {
          key <- as.character(value)
          counter[[key]] <- (counter[[key]] %||% 0) + 1
        }
      }

      # Apply the min_count threshold. include_zero is moot in this in-memory
      # path: values absent from the documents never enter the counter, so
      # zero counts cannot arise.
      counter <- counter[vapply(counter, function(x) x >= config$min_count, logical(1))]

      # Sort
      if (config$sort_by == "count") {
        sorted_names <- names(counter)[order(unlist(counter), decreasing = TRUE)]
      } else {
        sorted_names <- sort(names(counter))
      }

      # Limit to the top-N values; the remainder rolls up into other_count
      top_names <- head(sorted_names, config$limit)
      other_names <- setdiff(sorted_names, top_names)
      other_count <- sum(unlist(counter[other_names]))

      facet_values <- lapply(top_names, function(name) {
        FacetValue$new(value = name, count = counter[[name]])
      })

      FacetResult$new(
        field = config$field,
        values = facet_values,
        total_count = sum(unlist(counter)),
        other_count = other_count
      )
    },

    get_nested_value = function(doc, field) {
      keys <- strsplit(field, "\\.")[[1]]
      value <- doc

      for (key in keys) {
        if (is.list(value) && key %in% names(value)) {
          value <- value[[key]]
        } else {
          return(NULL)
        }
      }

      value
    }
  )
)
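
# Usage sketch (illustrative; the guard keeps it from running when this file
# is sourced). The documents and field names below are invented for demonstration.
if (FALSE) {
  docs <- list(
    list(category = "tech",    author = "Alice", tags = c("r", "search")),
    list(category = "tech",    author = "Bob",   tags = c("r")),
    list(category = "science", author = "Alice", tags = c("ml"))
  )
  aggregator <- FacetAggregator$new()
  facets <- aggregator$aggregate(
    documents = docs,
    facet_configs = list("tags", FacetConfig$new("category", sort_by = "value"))
  )
  # facets$tags counts every element of multi-valued fields: r = 2, search = 1, ml = 1
  # facets$category$values, sorted alphabetically: science = 1, tech = 2
}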

# =============================================================================
# ACL / Security Filtering
# =============================================================================

#' ACL Operator Types
#'
#' @description ACL matching operators
#' @export
ACLOperator <- list(
  USER = "user",
  GROUP = "group",
  ROLE = "role",
  EVERYONE = "everyone",
  DENY = "deny"
)

#' ACL Principal
#'
#' @description An ACL principal (user, group, or role)
#' @export
ACLPrincipal <- R6::R6Class(
  "ACLPrincipal",
  public = list(
    #' @field type Principal type
    type = NULL,
    #' @field value Principal value
    value = NULL,

    #' @description Create a new ACLPrincipal
    #' @param type Principal type (user, group, role)
    #' @param value Principal value
    initialize = function(type, value) {
      self$type <- type
      self$value <- value
    },

    #' @description Check if this principal matches another
    #' @param other Another ACLPrincipal
    #' @return Logical
    matches = function(other) {
      if (self$type != other$type) return(FALSE)
      if (self$value == "*" || other$value == "*") return(TRUE)
      self$value == other$value
    },

    #' @description Convert to string
    to_string = function() {
      sprintf("%s:%s", self$type, self$value)
    }
  )
)
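
# Matching sketch (illustrative): types must agree exactly, and a "*" value
# on either side acts as a wildcard.
if (FALSE) {
  alice <- ACLPrincipal$new("user", "alice")
  alice$matches(ACLPrincipal$new("user", "alice"))   # TRUE
  alice$matches(ACLPrincipal$new("group", "alice"))  # FALSE: type mismatch
  ACLPrincipal$new("user", "*")$matches(alice)       # TRUE: wildcard value
}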

#' Parse ACL String
#'
#' @description Parse ACL string like 'user:alice' or 'group:engineering'
#' @param acl_string ACL string
#' @return ACLPrincipal object
#' @export
parse_acl <- function(acl_string) {
  if (grepl(":", acl_string)) {
    parts <- strsplit(acl_string, ":", fixed = TRUE)[[1]]
    type_str <- tolower(parts[1])
    value <- paste(parts[-1], collapse = ":")

    valid_types <- c("user", "group", "role", "everyone", "deny")
    if (type_str %in% valid_types) {
      acl_type <- type_str
    } else {
      acl_type <- "user"
    }
  } else {
    acl_type <- "user"
    value <- acl_string
  }

  ACLPrincipal$new(type = acl_type, value = value)
}
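
# Parsing sketch (illustrative values): the string splits on the first ":",
# and bare names or unknown prefixes fall back to type "user".
if (FALSE) {
  parse_acl("user:alice")$to_string()      # "user:alice"
  parse_acl("group:engineering")$type      # "group"
  parse_acl("alice")$to_string()           # "user:alice" (no prefix)
  parse_acl("ldap:cn=alice")$to_string()   # "user:cn=alice" (unknown prefix)
}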

#' ACL Configuration
#'
#' @description ACL configuration for a document or collection
#' @export
ACLConfig <- R6::R6Class(
  "ACLConfig",
  public = list(
    #' @field read_principals Who can read
    read_principals = NULL,
    #' @field deny_principals Who cannot read (takes precedence)
    deny_principals = NULL,
    #' @field is_public Is public access allowed
    is_public = FALSE,

    #' @description Create a new ACLConfig
    #' @param read_principals List of ACLPrincipal objects
    #' @param deny_principals List of ACLPrincipal objects
    #' @param is_public Logical
    initialize = function(read_principals = list(),
                          deny_principals = list(),
                          is_public = FALSE) {
      self$read_principals <- read_principals
      self$deny_principals <- deny_principals
      self$is_public <- is_public
    }
  )
)

#' Create ACL Config from List
#'
#' @description Create ACLConfig from list of ACL strings
#' @param acl_list Character vector of ACL strings
#' @return ACLConfig object
#' @export
acl_config_from_list <- function(acl_list) {
  read <- list()
  deny <- list()
  is_public <- FALSE

  for (acl_str in acl_list) {
    if (tolower(acl_str) %in% c("everyone", "public", "*")) {
      is_public <- TRUE
    } else if (startsWith(acl_str, "deny:")) {
      deny <- c(deny, list(parse_acl(substring(acl_str, 6))))
    } else {
      read <- c(read, list(parse_acl(acl_str)))
    }
  }

  ACLConfig$new(read_principals = read, deny_principals = deny, is_public = is_public)
}
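
# Construction sketch (illustrative values): "deny:" entries go to the deny
# list, "everyone"/"public"/"*" set is_public, everything else grants read.
if (FALSE) {
  cfg <- acl_config_from_list(c("user:alice", "group:engineering", "deny:user:mallory"))
  length(cfg$read_principals)                   # 2
  cfg$deny_principals[[1]]$to_string()          # "user:mallory"
  acl_config_from_list(c("public"))$is_public   # TRUE
}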

#' ACL Filter
#'
#' @description Access Control List filter for security-aware search
#'
#' @examples
#' \dontrun{
#' acl_filter <- ACLFilter$new()
#' filtered <- acl_filter$filter(
#'   documents = results,
#'   user_principals = c("user:alice", "group:engineering")
#' )
#' }
#'
#' @export
ACLFilter <- R6::R6Class(
  "ACLFilter",
  public = list(
    #' @field acl_field Metadata field containing ACLs
    acl_field = "_acl",

    #' @description Create a new ACLFilter
    #' @param acl_field Field name for ACLs (default: "_acl")
    initialize = function(acl_field = "_acl") {
      self$acl_field <- acl_field
    },

    #' @description Filter documents based on user's ACL principals
    #' @param documents List of documents with metadata
    #' @param user_principals Character vector or list of ACLPrincipal
    #' @param default_allow Allow if no ACL defined (default: FALSE)
    #' @return Filtered documents
    filter = function(documents, user_principals, default_allow = FALSE) {
      # Parse user principals
      principals <- lapply(user_principals, function(p) {
        if (is.character(p)) {
          parse_acl(p)
        } else {
          p
        }
      })

      filtered <- list()
      for (doc in documents) {
        if (private$can_access(doc, principals, default_allow)) {
          filtered <- c(filtered, list(doc))
        }
      }

      filtered
    },

    #' @description Add ACL to document metadata
    #' @param metadata Document metadata
    #' @param principals Character vector of principal strings
    #' @return Updated metadata
    add_acl = function(metadata, principals) {
      metadata[[self$acl_field]] <- principals
      metadata
    },

    #' @description Create ACL filter condition for query
    #' @param user_principals Character vector of principals
    #' @return Filter condition list
    create_filter_condition = function(user_principals) {
      # Public markers always match, regardless of the user's principals
      conditions <- list(
        list(field = self$acl_field, op = "$contains", value = "everyone"),
        list(field = self$acl_field, op = "$contains", value = "public")
      )

      for (p in user_principals) {
        conditions <- c(conditions, list(
          list(field = self$acl_field, op = "$contains", value = p)
        ))
      }

      list("$or" = conditions)
    }
  ),

  private = list(
    can_access = function(doc, user_principals, default_allow) {
      metadata <- if (!is.null(doc$metadata)) doc$metadata else doc
      acl_data <- metadata[[self$acl_field]]

      if (is.null(acl_data)) {
        return(default_allow)
      }

      # Parse ACL config
      if (is.character(acl_data)) {
        acl_config <- acl_config_from_list(acl_data)
      } else if (is.list(acl_data)) {
        if (!is.null(acl_data$read) || !is.null(acl_data$deny)) {
          read_principals <- lapply(acl_data$read %||% list(), parse_acl)
          deny_principals <- lapply(acl_data$deny %||% list(), parse_acl)
          acl_config <- ACLConfig$new(
            read_principals = read_principals,
            deny_principals = deny_principals,
            is_public = acl_data$public %||% FALSE
          )
        } else {
          acl_config <- acl_config_from_list(as.character(acl_data))
        }
      } else {
        return(default_allow)
      }

      # Public access
      if (acl_config$is_public) {
        return(TRUE)
      }

      # Check deny list first
      for (deny in acl_config$deny_principals) {
        for (user_p in user_principals) {
          if (deny$matches(user_p)) {
            return(FALSE)
          }
        }
      }

      # Check allow list
      for (allow in acl_config$read_principals) {
        for (user_p in user_principals) {
          if (allow$matches(user_p)) {
            return(TRUE)
          }
        }
      }

      FALSE
    }
  )
)
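
# End-to-end sketch: documents carry ACL strings in their "_acl" metadata
# field, and deny entries override read grants. The documents and principals
# here are invented.
if (FALSE) {
  acl_filter <- ACLFilter$new()
  docs <- list(
    list(id = 1, metadata = list(`_acl` = c("group:engineering"))),
    list(id = 2, metadata = list(`_acl` = c("public"))),
    list(id = 3, metadata = list(`_acl` = c("user:bob")))
  )
  visible <- acl_filter$filter(docs, c("user:alice", "group:engineering"))
  sapply(visible, function(d) d$id)   # 1 2 (doc 3 is readable only by bob)
}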

# =============================================================================
# Text Analyzers
# =============================================================================

#' English Stopwords
#'
#' @description Common English stopwords
#' @export
ENGLISH_STOPWORDS <- c(
  "a", "an", "and", "are", "as", "at", "be", "because", "but", "by",
  "for", "how", "if", "in", "into", "is", "it", "no", "not", "of",
  "on", "or", "such", "that", "the", "their", "then", "there", "these",
  "they", "this", "to", "was", "what", "when", "where", "which", "who",
  "will", "with"
)

#' Simple Stemmer
#'
#' @description Simple suffix-stripping stemmer (no external dependencies)
#' @export
SimpleStemmer <- R6::R6Class(
  "SimpleStemmer",
  public = list(
    #' @field suffixes Character vector of suffixes to strip, tried in order
    suffixes = c(
      "ization", "ational", "fulness", "ousness", "iveness",
      "ation", "eness", "ment", "ness", "ible", "able", "ity",
      "ing", "ies", "ive", "ion", "ous", "ful", "ism", "ist",
      "ly", "ed", "er", "es", "al", "s"
    ),

    #' @description Stem a word
    #' @param word Word to stem
    #' @return Stemmed word
    stem = function(word) {
      word <- tolower(word)

      for (suffix in self$suffixes) {
        if (endsWith(word, suffix) && nchar(word) - nchar(suffix) >= 3) {
          return(substring(word, 1, nchar(word) - nchar(suffix)))
        }
      }

      word
    },

    #' @description Stem multiple words
    #' @param words Character vector
    #' @return Stemmed words
    stem_words = function(words) {
      # vapply keeps the result a character vector even for empty input
      vapply(words, self$stem, character(1), USE.NAMES = FALSE)
    }
  )
)
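
# Stemming sketch: suffixes are tried in the order listed, and a suffix is
# stripped only if at least three characters would remain.
if (FALSE) {
  stemmer <- SimpleStemmer$new()
  stemmer$stem_words(c("jumping", "foxes", "nationalization", "cat"))
  # "jump" "fox" "national" "cat"
}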

#' Text Analyzer
#'
#' @description Text analyzer for search indexing
#'
#' Provides text processing pipelines:
#' - Tokenization
#' - Lowercasing
#' - Stopword removal
#' - Stemming
#' - Synonym expansion
#'
#' @examples
#' \dontrun{
#' analyzer <- TextAnalyzer$english()
#' tokens <- analyzer$analyze("The quick brown foxes are jumping")
#' # c("quick", "brown", "fox", "jump")
#' }
#'
#' @export
TextAnalyzer <- R6::R6Class(
  "TextAnalyzer",
  public = list(
    #' @field lowercase Convert to lowercase
    lowercase = TRUE,
    #' @field remove_stopwords Remove stopwords
    remove_stopwords = FALSE,
    #' @field stopwords Set of stopwords
    stopwords = NULL,
    #' @field stemmer Stemmer object
    stemmer = NULL,
    #' @field synonyms Synonym dictionary
    synonyms = NULL,
    #' @field min_token_length Minimum token length
    min_token_length = 1,
    #' @field max_token_length Maximum token length
    max_token_length = 100,
    #' @field token_pattern Regex pattern for tokens
    token_pattern = "[a-zA-Z0-9]+",

    #' @description Create a new TextAnalyzer
    #' @param lowercase Lowercase text (default: TRUE)
    #' @param remove_stopwords Remove stopwords (default: FALSE)
    #' @param stopwords Custom stopwords (default: ENGLISH_STOPWORDS)
    #' @param use_stemmer Use stemming (default: FALSE)
    #' @param synonyms Named list of synonyms
    #' @param min_token_length Min length (default: 1)
    #' @param max_token_length Max length (default: 100)
    #' @param token_pattern Regex pattern
    initialize = function(lowercase = TRUE,
                          remove_stopwords = FALSE,
                          stopwords = NULL,
                          use_stemmer = FALSE,
                          synonyms = NULL,
                          min_token_length = 1,
                          max_token_length = 100,
                          token_pattern = "[a-zA-Z0-9]+") {
      self$lowercase <- lowercase
      self$remove_stopwords <- remove_stopwords
      self$stopwords <- stopwords %||% ENGLISH_STOPWORDS
      self$synonyms <- synonyms %||% list()
      self$min_token_length <- min_token_length
      self$max_token_length <- max_token_length
      self$token_pattern <- token_pattern

      if (use_stemmer) {
        self$stemmer <- SimpleStemmer$new()
      }
    },

    #' @description Analyze text and return tokens
    #' @param text Input text
    #' @return Character vector of tokens
    analyze = function(text) {
      # Guard against NULL, zero-length, NA, and empty-string input
      if (is.null(text) || length(text) == 0 || is.na(text) || !nzchar(text)) {
        return(character(0))
      }

      # Lowercase
      if (self$lowercase) {
        text <- tolower(text)
      }

      # Tokenize
      tokens <- unlist(regmatches(text, gregexpr(self$token_pattern, text)))

      # Filter by length
      tokens <- tokens[nchar(tokens) >= self$min_token_length &
                       nchar(tokens) <= self$max_token_length]

      # Remove stopwords
      if (self$remove_stopwords) {
        tokens <- tokens[!tolower(tokens) %in% self$stopwords]
      }

      # Stem
      if (!is.null(self$stemmer)) {
        tokens <- self$stemmer$stem_words(tokens)
      }

      # Expand synonyms: each token keeps itself and appends any mapped synonyms
      if (length(self$synonyms) > 0 && length(tokens) > 0) {
        tokens <- unlist(lapply(tokens, function(t) c(t, self$synonyms[[t]])))
      }

      tokens
    },

    #' @description Analyze a query string
    #' @param query Query text
    #' @return Character vector of tokens
    analyze_query = function(query) {
      self$analyze(query)
    }
  )
)
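
# Pipeline sketch: stopword removal, stemming, and synonym expansion run in
# that order. The synonym map below is a toy example.
if (FALSE) {
  analyzer <- TextAnalyzer$new(
    remove_stopwords = TRUE,
    use_stemmer = TRUE,
    synonyms = list(quick = "fast")
  )
  analyzer$analyze("The quick brown foxes are jumping")
  # "quick" "fast" "brown" "fox" "jump"
}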

#' Create Standard Text Analyzer
#'
#' @description Lowercase + basic tokenization
#' @return TextAnalyzer object
#' @export
text_analyzer_standard <- function() {
  TextAnalyzer$new(lowercase = TRUE, remove_stopwords = FALSE)
}

#' Create Simple Text Analyzer
#'
#' @description Lowercase + letter-only tokenization
#' @return TextAnalyzer object
#' @export
text_analyzer_simple <- function() {
  TextAnalyzer$new(lowercase = TRUE, token_pattern = "[a-zA-Z]+")
}

#' Create English Text Analyzer
#'
#' @description English analyzer with stemming and stopwords
#' @return TextAnalyzer object
#' @export
text_analyzer_english <- function() {
  TextAnalyzer$new(
    lowercase = TRUE,
    remove_stopwords = TRUE,
    use_stemmer = TRUE,
    min_token_length = 2
  )
}

#' Create Keyword Text Analyzer
#'
#' @description No tokenization; the entire input is treated as a single token
#' @return TextAnalyzer object
#' @export
text_analyzer_keyword <- function() {
  KeywordAnalyzer$new()
}

#' Keyword Analyzer
#'
#' @description Treats entire input as single token
#' @export
KeywordAnalyzer <- R6::R6Class(
  "KeywordAnalyzer",
  inherit = TextAnalyzer,
  public = list(
    #' @description Analyze text as single keyword
    #' @param text Input text
    #' @return Single-element character vector
    analyze = function(text) {
      if (is.null(text) || length(text) == 0 || is.na(text) || !nzchar(text)) {
        return(character(0))
      }
      trimws(tolower(text))
    }
  )
)
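
# Keyword sketch: the whole string becomes one lowercased, trimmed token,
# which suits exact-match fields such as IDs or status values.
if (FALSE) {
  text_analyzer_keyword()$analyze("  Active User  ")   # "active user"
}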

#' Analyzer Chain
#'
#' @description Chain multiple analyzers together
#' @export
AnalyzerChain <- R6::R6Class(
  "AnalyzerChain",
  public = list(
    #' @field analyzers List of TextAnalyzer objects
    analyzers = NULL,

    #' @description Create a new AnalyzerChain
    #' @param analyzers List of TextAnalyzer objects
    initialize = function(analyzers) {
      self$analyzers <- analyzers
    },

    #' @description Run text through all analyzers
    #' @param text Input text
    #' @return Character vector of tokens
    analyze = function(text) {
      tokens <- as.character(text)

      # Each stage re-analyzes every token produced by the previous stage
      for (analyzer in self$analyzers) {
        tokens <- unlist(lapply(tokens, analyzer$analyze))
      }

      tokens %||% character(0)
    }
  )
)
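
# Chaining sketch: each stage re-analyzes the tokens from the previous one.
# Here a keyword pass normalizes the raw string before a standard pass splits it.
if (FALSE) {
  chain <- AnalyzerChain$new(list(KeywordAnalyzer$new(), text_analyzer_standard()))
  chain$analyze("Hello, World!")   # "hello" "world"
}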

# =============================================================================
# Enhanced Search Results
# =============================================================================

#' Enhanced Search Results
#'
#' @description Search results with enterprise features
#' @export
EnhancedSearchResults <- R6::R6Class(
  "EnhancedSearchResults",
  public = list(
    #' @field results List of result items
    results = NULL,
    #' @field facets Named list of FacetResult objects
    facets = NULL,
    #' @field total_count Total results before filtering
    total_count = 0,
    #' @field filtered_count Results after ACL filtering
    filtered_count = 0,
    #' @field query_time_ms Query time in milliseconds
    query_time_ms = 0,
    #' @field rerank_time_ms Rerank time in milliseconds
    rerank_time_ms = 0,
    #' @field facet_time_ms Facet time in milliseconds
    facet_time_ms = 0,

    #' @description Create new EnhancedSearchResults
    #' @param results List of results
    #' @param facets Named list of FacetResult
    #' @param total_count Total count
    #' @param filtered_count Filtered count
    #' @param query_time_ms Query time
    #' @param rerank_time_ms Rerank time
    #' @param facet_time_ms Facet time
    initialize = function(results,
                          facets = list(),
                          total_count = 0,
                          filtered_count = 0,
                          query_time_ms = 0,
                          rerank_time_ms = 0,
                          facet_time_ms = 0) {
      self$results <- results
      self$facets <- facets
      self$total_count <- total_count
      self$filtered_count <- filtered_count
      self$query_time_ms <- query_time_ms
      self$rerank_time_ms <- rerank_time_ms
      self$facet_time_ms <- facet_time_ms
    },

    #' @description Convert to list
    #' @return List representation
    to_list = function() {
      facets_list <- list()
      for (field in names(self$facets)) {
        result <- self$facets[[field]]
        values_map <- list()
        for (fv in result$values) {
          values_map[[as.character(fv$value)]] <- fv$count
        }
        facets_list[[field]] <- list(
          values = values_map,
          total = result$total_count,
          other = result$other_count
        )
      }

      list(
        results = self$results,
        facets = facets_list,
        total_count = self$total_count,
        filtered_count = self$filtered_count,
        query_time_ms = self$query_time_ms,
        rerank_time_ms = self$rerank_time_ms,
        facet_time_ms = self$facet_time_ms
      )
    }
  )
)
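
# Assembly sketch: wrapping hits plus facet aggregations into a serializable
# result object. The hits, counts, and timing below are invented.
if (FALSE) {
  hits <- list(list(id = 1, score = 0.92), list(id = 2, score = 0.87))
  facets <- FacetAggregator$new()$aggregate(
    list(list(category = "tech"), list(category = "science")),
    list("category")
  )
  res <- EnhancedSearchResults$new(
    results = hits,
    facets = facets,
    total_count = 10,
    filtered_count = 2,
    query_time_ms = 4.2
  )
  str(res$to_list(), max.level = 2)
}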

#' Null-coalescing operator: returns `b` when `a` is `NULL`, else `a`
#' @keywords internal
#' @noRd
`%||%` <- function(a, b) if (is.null(a)) b else a
