R/search-controller.R

# Class Definition --------------------------------------------------------

# Class functions generally redirect to internal functions beginning sc_*

#' Search Controller
#'
#' An R6 class to control the gathering, sorting and paging of search results. This
#' class controls a number of \code{SearchSource} instances by instantiating them,
#' and triggering new searches. Results are merged together and stored in the
#' controller, with pages returned as needed.
#'
#' @param sources a named list of sources. Each source must be a list containing
#' at least \code{solr.url}. The first source must be called \code{main_source}.
#' To access old solr instances specify \code{solr.api.version}.
#'
#' @section Details:
#' \code{$new(sources = NULL)} starts a new search controller
#'
#' \code{$set_sources(sources)}
#'  If sources is provided they are concatenated they are
#'  retrieved from \code{rcloud.config} (\code{main_source})
#'  and \code{.session} (\code{gist_sources}).
#'
#' \code{$search(all_sources, start, pagesize, sortby, orderby, ...)}
#'   Main search interface. Arguments match \code{\link{rcloud.search}}. If start is
#'   not zero it will read from internal cache, otherwise it will call \code{new_search}.
#'
#' \code{$new_search(all_sources, sortby, orderby, ...)}
#'   This triggers a search on the sources and merges/sorts the results back together into
#'   the \code{results} internal object.
#'
#' \code{$build_response(start, pagesize)}
#'   Build the response to send back to RCloud. This takes the start point and page size and
#'   returns the right page from the results.
#'
#' \code{$get_sources()} return the config
#' \code{$get_raw_results()} return the results as returned by sources
#' \code{$get_results()} return the merged results
#'
#'
#' @importFrom rcloud.support rcloud.config
#'
#' @name SearchController
#' @examples
#' \dontrun{
#' SC <-
#' SearchController$new(sources = list(
#'  main_source = list(solr.url = "http://solr:8983/solr/rcloudnotebooks")
#' ))
#' }
#'
NULL



#' @export

SearchController <- R6::R6Class("SearchController",

  public = list(

    initialize = function(sources = NULL)
      sc_initialize(self, private, sources),

    set_sources = function(sources = NULL)
      sc_set_sources(self, private, sources),

    search = function(all_sources, start, pagesize, sortby, orderby, ...)
      sc_search(self, private, all_sources, start, pagesize, sortby, orderby, ...),

    new_search = function(all_sources, sortby, orderby, ...)
      sc_new_search(self, private, all_sources, sortby, orderby, ...),

    build_response = function(start, pagesize)
      sc_build_response(self, private, start, pagesize),

    get_sources = function() private$sources,
    get_raw_results = function() private$raw_results,
    get_results = function() private$results
  ),

  private = list(
    sources = NULL,
    last_search = NULL,
    raw_results = list(),
    results = list(),
    n_results = 0
  )
)


# Methods -----------------------------------------------------------------

sc_initialize <- function(self, private, sources) {

  # This is called onload so account for situations where rcloud.support
  # doesn't have a config
  if(!is.null(rcloud.config("solr.url"))) {
    self$set_sources(sources)
  }

  invisible(self)
}

sc_set_sources <- function(self, private, sources) {
  # Get the main config from rcloud.config
  if (is.null(sources)) {
    sources <- try(sc_get_rcloud_sources())
    if("try-error" %in% class(sources)) {
      ulog::ulog("ERROR: SOLR source failed to initialise:", gsub("\n", "\\", as.character(sources)))
      return(NULL)
    }
  }

  # Check the sources
  if (names(sources)[1] != "main_source")
    stop("First source must be called \"main_source\"")

  has_url <- vapply(sources, function(x) "solr.url" %in% names(x), logical(1))
  sources <- sources[has_url]

  # Pull the names into the list
  source_named <- mapply(sources, names(sources),
                         FUN = function(x,y) c(source=y, x),
                         SIMPLIFY = FALSE)


  # Create new instances of the SearchSource class
  private$sources <- lapply(source_named, sc_create_source)

  # Remove bad sources
  private$sources <- private$sources[!vapply(private$sources, is.null, logical(1))]
}

#' Retrieve Solr Sources From RCloud Config
#'
sc_get_rcloud_sources <- function() {

  # TODO, can we grab all configs that begin with solr?
  main_source <- list(
    solr.url = rcloud.config("solr.url"),
    solr.auth.user = rcloud.config("solr.auth.user"),
    solr.auth.pwd = rcloud.config("solr.auth.pwd"),
    solr.api.version = rcloud.config("solr.api.version")
  )

  gist_sources <-
    lapply(rcloud.support:::.session$gist.sources.conf, as.list)

  # Combine and make sure that main goes first
  sources <- c(list(main_source = main_source), gist_sources)

  sources
}

#' Main search interface
#'
#' Called by rcloud.search
#'
#' @inheritParams rcloud.search
#' @param self pointer to this object
#' @param private private members
#' @param ... arguments to pass to \code{ss_search}
sc_search <- function(self, private, all_sources, start, pagesize, sortby, orderby, ...) {

  # If things went wrong try not to fall over
  if(length(private$sources)<1) {
    return(list(error = list (msg = "No valid sources")))
  }

  if(!all_sources && !exists("main_source", private$sources)) {
    return(list(error = list (msg = "Main source not valid")))
  }

  if (start == 0) {
    # Update cached results from all sources
    self$new_search(
      all_sources = all_sources,
      start = start,
      sortby = sortby,
      orderby = orderby,
      ...
    )

    # What if new_search went wrong?
  }


  self$build_response(start, pagesize)

}

sc_new_search <- function(self, private, all_sources, sortby, orderby, ...) {

  sources <- if(all_sources) private$sources else private$sources[1]

  # This can be parallelised
  private$raw_results <- lapply(sources, function(src) {
    src$search(sortby = sortby, orderby = orderby, ...)
  })

  # If there are any problems from solr then this will throw an error
  sc_check_errors(private$raw_results)

  private$results <- sc_merge_results(private$raw_results, sortby = sortby, orderby = orderby)

  invisible(private$results)

}

sc_build_response <- function(self, private, start, pagesize) {

  nn <- private$results$header$n_notebooks

  header <- private$results$header
  header$start <- start
  header$pagesize <- pagesize

  notebooks <- NULL

  if(nn > start + pagesize) {
    notebooks <- private$results$notebooks[(start + 1):(start + pagesize)]
  } else if (nn > start) {
    notebooks <- private$results$notebooks[(start + 1):nn]
  }

  response <- c(header, list(notebooks = notebooks))

  response

}


# Internal Functions ------------------------------------------------------

sc_create_source <- function(source) {

  api_major <- "2"

  if(exists("solr.api.version", source) && !is.null(source$solr.api.version)) {
    api_version_split <- strsplit(source$solr.api.version, split = "\\.")[[1]]
    api_major <- api_version_split[1]
  }

  switch(api_major,
         "1" = SearchSourceV1$new(source),
         "2" = SearchSource$new(source),
         SearchSource$new(source) #default
  )
}

#' Merge Raw Results
#'
#' Take a list of raw results and merge into one big list of results
#'
#' @param raw_results A list of results from solr returned from the SearchSource
#'   objects.
#' @inheritParams rcloud.search
#'
#' @return A single result set, merged and sorted
sc_merge_results <- function(raw_results, sortby, orderby) {

  header <- sc_merge_header(raw_results)

  notebooks <- sc_merge_notebooks(raw_results, sortby, orderby)

  list(header = header, notebooks = notebooks)

}

sc_merge_header <- function(raw_results) {

  # drop_items and keep_items are in utils.R
  summary_fields <- c("QTime", "status", "matches", "n_notebooks")
  drop_fields <- c("notebooks", "source")
  # Combine variables that we summarise into a data.frame
  headers_summary <- lapply(raw_results, keep_items, summary_fields)
  headers_summary <- lapply(headers_summary, as.data.frame, stringsAsFactors = FALSE)
  headers_df <- do.call(rbind, headers_summary)
  # All other variables should be the same, take from the first source
  headers_pass <- drop_items(raw_results[[1]], c(summary_fields, drop_fields))

  # Combine summary variables and passthrough variables
  header <- c(list(
    QTime = max(headers_df$QTime),
    status = max(headers_df$status),
    matches = sum(headers_df$matches),
    n_notebooks = sum(headers_df$n_notebooks)
  ),
  headers_pass)

  header

}

sc_merge_notebooks <- function(raw_results, sortby, orderby) {

  raw_notebooks <- lapply(raw_results, `[[`, "notebooks")

  all_notebooks <- do.call(c, unname(raw_notebooks))

  if(!is.null(all_notebooks) && length(all_notebooks) > 0) {

  order_col <- switch(sortby,
                      "updated_at" = sc_order_timestamp(all_notebooks, sortby, orderby),
                      sc_order_default(all_notebooks, sortby, orderby))

  all_notebooks <- all_notebooks[order_col]

  }

  all_notebooks

}

sc_order_default <- function(all_notebooks, sortby, orderby) {

  sort_col <- sapply(all_notebooks, `[[`, sortby)
  # should be a vector. If not try to correct it
  if(is.list(sort_col)) {
    sort_col <- sc_infer_sort_col(sort_col)
  }

  decreasing <- isTRUE(orderby == "desc")

  order(sort_col, decreasing = decreasing)

}

sc_order_timestamp <- function(all_notebooks, sortby, orderby) {

  sort_col_raw <- vapply(all_notebooks, `[[`, sortby, FUN.VALUE = character(1))

  sort_col <- as.POSIXct(sort_col_raw, format = "%Y-%m-%dT%H:%M:%S")

  decreasing <- isTRUE(orderby == "desc")

  order(sort_col, decreasing = decreasing)

}

# Try to convert to a vector of sortable results
sc_infer_sort_col <- function(sort_col) {
  sort_lengths <- vapply(sort_col, length, numeric(1))

  if(any(sort_lengths > 1)) stop("Sort column returned with multiple values")

  #Replace NULLS with NA
  sort_nulls <- vapply(sort_col, is.null, logical(1))
  sort_col[sort_nulls] <- NA

  # Convert to char and us conversion (similar to read.table)
  sort_char <- vapply(sort_col, as.character, character(1))
  utils::type.convert(sort_char, as.is = TRUE)
}

#' Get the schema version
#'
#' @param path The GET route
#' @inheritParams .solr.post
#'
#' @return The version as a string
#'
sc_schema_version <- function(path = "schema/version",
                              solr.url, solr.auth.user, solr.auth.pwd) {

  resp <- .solr.get.generic(query = NULL, path = path,
                            solr.url = solr.url,
                            solr.auth.user = solr.auth.user,
                            solr.auth.pwd = solr.auth.pwd)

  if(exists("version", resp)) {
    version <- tryCatch({
      maj.version <- gsub("(^[0-9]+).*", "\\1", as.character(resp$version))
      as.numeric(maj.version)
    },
    error = function(e) {
      list(error = e)
    })
  } else {
    version <- resp
  }
  version
}

#' Stop and return errors
#'
#' @param raw_results Results from solr sources
#'
sc_check_errors <- function(raw_results) {

  is_error <- vapply(raw_results,
                     function(x)
                       is.character(x[[1]]) && x[[1]] == "error",
                     logical(1))

  if(any(is_error)) {
    error_results <- raw_results[is_error]

    # Crunch down to a single error message
    error_msgs <- rjson::toJSON(error_results)

    ulog::ulog("ERROR: Search failed: ", error_msgs)

    stop(error_msgs)
  }

}
MangoTheCat/rcloud.solr documentation built on May 8, 2019, 3:25 p.m.