# R/find-matches.R

# Defines functions .recog_match

.recog_match <- function(recog_db, source, protocols = ".*", matches = NULL) {

  if (
    identical(
      attr(recog_db[[1]]$fingerpints[[1]]$compiled_pattern, ".compiled"),
      new("externalptr")
    )) {
    stop(
      "The external pointers for the compiled patterns are not valid. ",
      "Please re-load the data you are supplying to the `recog_db` parameter."
    )
  }

  protocol_matchers <- if (is.null(protocols)) {
    numeric()
  } else {
    which(grepl(protocols, map_chr(recog_db, "protocol")))
  }

  matches_matchers <- if (is.null(matches)) {
    numeric()
  } else {
    which(grepl(matches, map_chr(recog_db, "matches")))
  }

  matchers <- unique(c(protocol_matchers, matches_matchers))

  if (length(matchers) == 0) return(list())

  lapply(recog_db[matchers], function(.x) {

    preference <- .x$preference_value

    lapply(.x$fingerpints, function(.x) {

      res <- ore::ore_search(.x$compiled_pattern, source, simplify=TRUE)

      if (!is.null(res)) {

        grps <- as.vector(ore::groups(res))

        lapply(.x$params, function(.x) {
          value <- if (.x$position == 0) .x$value else grps[.x$position]
          as.list(set_names(value, .x$name))
        }) %>% unlist(recursive = FALSE) -> mat_out

        mat_out$preference <- preference
        mat_out$description <- .x$description
        mat_out$pattern <- .x$pattern
        mat_out$orig <- source

        mat_out

      }
    }) %>%
      discard(is.null) %>%
      discard(~length(.x) == 0)

  }) %>%
    discard(is.null) %>%
    discard(~length(.x) == 0) %>%
    unlist(recursive = FALSE) %>%
    bind_rows() -> out

  class(out) <- c("tbl_df", "tbl", "data.frame")

  out

}

#' Find fingerprint matches for a given source
#'
#' This is an exhaustive lookup for the fingerprint in all the
#' `protocol`/`matches` categories. As a result, it's not very fast
#' on its own. However, the function has been [memoise::memoise()]'d.
#' As such, if you are performing a number of recogs in a single
#' R session and working from a typical data source (i.e. a large file
#' with many common strings, such as a collection of HTTP `Server`
#' header strings), you will see performance gains after each distinct
#' match input.
#'
#' @md
#' @param recog_db a structure created with [load_fingerprints()] or
#'        [use_builtin_fingerprints()]
#' @param source the 1-element character vector to compare against
#' @param protocols,matches regexes to limit what you're comparing against.
#'        These are boolean **OR'd** together; pass `NULL` to skip either one.
#'        Because the default `protocols = ".*"` matches any protocol string,
#'        `matches` only narrows the search when `protocols` is `NULL` or a
#'        more specific pattern.
#' @return An empty `list()` when no database entry is selected by
#'         `protocols`/`matches`; otherwise a data frame (`tbl_df`) built
#'         from one record per fingerprint whose pattern matched `source`,
#'         holding the fingerprint's extracted parameters plus `preference`,
#'         `description`, `pattern` and `orig` (the `source` value).
#' @export
#' @examples
#' recog_db <- use_builtin_fingerprints()
#' recog_match(recog_db, "VShell_Special_Edition_2_5_0_204 VShell", "ssh")
recog_match <- memoise::memoise(.recog_match)
# hrbrmstr/rrecog documentation built on May 5, 2019, 6:55 p.m.