R/pxweb_split_query.R

Defines functions assert_query_can_be_split_to_batches permutations generate_permutations split_dimensions_left_right pxweb_query_dim_splittable pxweb_split_query

Documented in assert_query_can_be_split_to_batches generate_permutations permutations pxweb_query_dim_splittable pxweb_split_query split_dimensions_left_right

#' Split query in optimal sub-queries
#'
#' @details
#' Computes (brute-force) the optimal split of a query to
#' match the api maximum value limit. It also take into account
#' that time variables and content variables should not be split.
#' Also variables with filter "top" should not be split, since
#' the top filter does not supply the individual levels, just a
#' number. This can probably be improved further.
#'
#' @param pxq a \code{pxweb_query} object.
#' @param px a \code{pxweb} object.
#' @param pxmd a \code{pxweb_metadata} object.
#'
#' @return a list with \code{pxweb_query} objects.
#'
#' @keywords internal
pxweb_split_query <- function(pxq, px, pxmd) {
  checkmate::assert_class(pxq, "pxweb_query")
  checkmate::assert_class(px, "pxweb")
  checkmate::assert_class(pxmd, "pxweb_metadata")

  pxqd <- pxweb_query_dim(pxq)
  # Get variables that can be split
  pxqds <- pxweb_query_dim_splittable(pxq, pxmd)

  mxv <- px$config$max_values_to_download

  # If able to download in one batch
  if (prod(pxqd) <= mxv) {
    return(list(pxq))
  }

  # Search through optimal combination
  assert_query_can_be_split_to_batches(pxq, pxmd, mxv)
  comb <- generate_permutations(which(pxqds))
  no_comb <- matrix(which(!pxqds), nrow = nrow(comb), ncol = sum(!pxqds), byrow = TRUE)
  comb <- cbind(comb, no_comb)
  batches <- numeric(nrow(comb))
  for (i in 1:nrow(comb)) {
    batches[i] <- split_dimensions_left_right(x = pxqd[comb[i, ]], bool = pxqds[comb[i, ]], max_size = mxv)$total_batches
  }
  min_comb <- which.min(batches)
  batch_structure <- split_dimensions_left_right(x = pxqd[comb[min_comb, ]], bool = pxqds[comb[min_comb, ]], max_size = mxv)


  # Create grid of possible variable permutations
  pxq_vals <- pxweb_query_values(pxq)
  value_idx_list <- list()
  value_labels_list <- list()
  for (i in seq_along(pxqd)) {
    if (batch_structure$no_of_splits[i] > 1) {
      var_name <- names(batch_structure$no_of_splits)[i]
      value_idx_list[[var_name]] <- 1:batch_structure$no_of_splits[i]

      value_labels_list[[var_name]] <- list()
      for (j in 1:batch_structure$no_of_splits[i]) {
        bs <- batch_structure$max_batch_size[i]
        value_labels_list[[var_name]][[j]] <- pxq_vals[[var_name]][((j - 1) * bs + 1):min(j * bs, length(pxq_vals[[var_name]]))]
      }
    }
  }

  # Create (and check) batches
  batch_idx <- expand.grid(value_idx_list)
  pxq_list <- list()
  pxq_names <- names(pxq_vals)
  for (i in 1:nrow(batch_idx)) {
    pxq_tmp <- pxq
    for (j in 1:ncol(batch_idx)) {
      query_no <- which(pxq_names %in% colnames(batch_idx)[j])
      query_nm <- pxq_names[query_no]
      pxq_tmp$query[[query_no]]$selection$values <- value_labels_list[[query_nm]][[batch_idx[i, j]]]
    }
    pxq_list[[i]] <- pxq_tmp
    assert_pxweb_query(pxq_list[[i]])
  }
  pxq_list
}




#' Get vector indicating splittable variables
#'
#' @details
#' Splitable variables are variables that can be split. Content variables cannot be split,
#' nor variables with filter == "top".
#'
#' Currently, we can only be sure that time variables and eliminated variables can be split.
#' Hopefully the next API makes this more clear.
#'
#' @param pxq a \code{pxweb_query} object.
#'
#' @return a named logical vector.
#'
#' @keywords internal
pxweb_query_dim_splittable <- function(pxq, pxmd) {
  checkmate::assert_class(pxmd, "pxweb_metadata")
  checkmate::assert_class(pxq, "pxweb_query")

  can_be_eliminated <- pxweb_metadata_elimination(pxmd)
  is_time_variable <- pxweb_metadata_time(pxmd)
  can_be_eliminated[is_time_variable] <- TRUE

  filter <- pxweb_query_filter(pxq)
  # can_be_eliminated <- can_be_eliminated[sample(1:length(can_be_eliminated))]
  spltable <- can_be_eliminated[names(filter)]
  spltable[tolower(filter) == "top"] <- FALSE
  spltable
}



#' Split variables into chunks
#'
#' @details
#' Splitable variables are variables that can be split. Content variables cannot be split,
#' not variables with filter == "top"
#'
#' @return a \code{pxweb_split_dimensions}
#'
#' @keywords internal
split_dimensions_left_right <- function(x, bool, max_size) {
  checkmate::assert_integerish(x, lower = 1)
  checkmate::assert_named(x)
  checkmate::assert_logical(bool)
  checkmate::assert_names(names(bool), identical.to = names(x))
  checkmate::assert_int(max_size, lower = 1)

  call_dims <- c(prod(x[!bool]), x[bool])
  for (i in seq_along(call_dims)) {
    batch_size <- prod(call_dims[1:i])
    prod_value <- batch_size / max_size
    if (prod_value > 1) {
      if (i == 1) {
        stop("\nToo large query. \nVariable(s) '", paste(names(x[!bool]), collapse = "', '"), "' cannot be split into batches (eliminate is set to FALSE by the API). \nThe smallest batch size is ", batch_size, " and the maximum number of values that can be downloaded through the API is ", max_size, ". \nFor details and workarounds, see:\nhttps://github.com/rOpenGov/pxweb/blob/master/TROUBLESHOOTING.md", call. = FALSE)
      }
      for (j in 1:call_dims[i]) {
        if (prod(call_dims[1:(i - 1)]) * j > max_size) break
      }
      call_dims[i] <- j - 1
    }
  }

  max_batch_size <- x
  max_batch_size[names(call_dims[-1])] <- call_dims[-1]
  res <- list(
    total_dim = x,
    max_batch_size = max_batch_size,
    no_of_splits = ceiling(x / max_batch_size)
  )
  res$total_batches <- prod(res$no_of_splits)
  class(res) <- c("pxweb_split_dimensions", "list")
  res
}



#' Generate batch permutations
#'
#' @details
#' Generates permutations of dim. If more than 6 dim (highly unlikely) a sample of 1000 combinations
#' is drawn. Otherwise all possible permutations are returned.
#'
#' @param x a vector with elements to permute
#'
#' @keywords internal
generate_permutations <- function(x) {
  checkmate::assert_integerish(x, lower = 1)
  n <- length(x)
  if (n < 7) {
    res <- permutations(n = n, v = x, r = n)
  } else {
    res <- matrix(0, ncol = n, nrow = 1000)
    for (i in 1:1000) {
      res[i, ] <- sample(x, n)
    }
  }
  res
}



#' Generate permutations of dimensions to find optimal no of batches
#'
#' @details
#' Taken from gtools to minimize dependencies. See permutations
#' of the gtools packages for details
#'
#' @param n See \code{gtools::permutations}.
#' @param r See \code{gtools::permutations}.
#' @param v See \code{gtools::permutations}.
#' @param set See \code{gtools::permutations}.
#' @param repeats.allowed See \code{gtools::permutations}.
#'
#' @keywords internal
permutations <- function(n, r, v = 1:n, set = TRUE, repeats.allowed = FALSE) {
  if (mode(n) != "numeric" || length(n) != 1 || n < 1 || (n %% 1) !=
    0) {
    stop("bad value of n")
  }
  if (mode(r) != "numeric" || length(r) != 1 || r < 1 || (r %% 1) !=
    0) {
    stop("bad value of r")
  }
  if (!is.atomic(v) || length(v) < n) {
    stop("v is either non-atomic or too short")
  }
  if ((r > n) & repeats.allowed == FALSE) {
    stop("r > n and repeats.allowed=FALSE")
  }
  if (set) {
    v <- unique(sort(v))
    if (length(v) < n) {
      stop("too few different elements")
    }
  }
  v0 <- vector(mode(v), 0)
  if (repeats.allowed) {
    sub <- function(n, r, v) {
      if (r == 1) {
        matrix(v, n, 1)
      } else if (n == 1) {
        matrix(v, 1, r)
      } else {
        inner <- Recall(n, r - 1, v)
        cbind(rep(v, rep(nrow(inner), n)), matrix(t(inner),
          ncol = ncol(inner), nrow = nrow(inner) * n,
          byrow = TRUE
        ))
      }
    }
  } else {
    sub <- function(n, r, v) {
      if (r == 1) {
        matrix(v, n, 1)
      } else if (n == 1) {
        matrix(v, 1, r)
      } else {
        X <- NULL
        for (i in 1:n) {
          X <- rbind(X, cbind(v[i], Recall(n -
            1, r - 1, v[-i])))
        }
        X
      }
    }
  }
  sub(n, r, v[1:n])
}


#' Assert that a given pxweb query can be split
#'
#' @param pxq a [pxweb_query] object
#' @param pxmd a [pxweb_metadata] object
#' @param mxv maximum batch size
assert_query_can_be_split_to_batches <- function(pxq, pxmd, mxv) {
  pxqd <- pxweb_query_dim(pxq)
  pxqds <- pxweb_query_dim_splittable(pxq, pxmd)
  if (all(!pxqds)) stop("\nToo large query. \nNo Variable(s) can be split into batches (eliminate is set to FALSE by the API). \nThe smallest batch size is ", prod(pxqd), " and the maximum number of values that can be downloaded through the API is ", mxv, ". \nFor details and workarounds, see:\nhttps://github.com/rOpenGov/pxweb/blob/master/TROUBLESHOOTING.md", call. = FALSE)
}
rOpenGov/pxweb documentation built on Feb. 18, 2024, 7:44 a.m.