R/popmod.R
In edpclient: Empirical Data Platform Client

Documented in build_popmod col_assoc is.popmod joint_probability popmod predict.edp_population_model schema simulate.edp_population_model

# popmod.R contains functions for creating and operating on
# edp_population_model objects.

print.edp_population_model <- function(x, ...) {
  cat(sprintf("popmod(id=%s, name=%s, #cols=%d, parent_id=%s, %s)\n", x$id,
              x$name, length(x$schema$columns), x$parent_id, x$build_status))
}

# Exported. Returns a list of the columns of edp_population_model `x`.
names.edp_population_model <- function(x, ...) {
  vapply(x$schema$columns, function(c) c$name, "")
}

as.character.edp_population_model <- function(x, ...) {
  x$id
}

# Returns a character vector with the names of all modeled columns of `pm`.
modeled_names <- function(pm) {
  s <- Filter(function(e) e$stat_type != "void", pm$schema$columns)
  return(vapply(s, function(e) e$name, ""))
}

# Returns the schema JSON for population `pid`.
schema_json <- function(sess, pid) {
  stopifnot(is.edp_session(sess))
  stopifnot(is.character(pid) && startsWith(pid, "p-"))
  op <- paste("rpc/population", utils::URLencode(pid), "schema",
              sep = "/")
  schema <- try(httr::content(edp_get(sess, op)), silent = TRUE)
  if (class(schema) == "try-error") {
    stop(sprintf("Could not load schema for population '%s'. [%s]", pid,
                 toString(schema, width = 60)))
  }
  return(schema)
}

# Exported; see ?edpclient::popmod.
popmod <- function(sess, pmid) {
  # `pm_resp` is a #/definitions/population_model in edp.schema.json.
  op <- paste("rpc/population_model", utils::URLencode(pmid), sep = "/")
  pm_resp <- try(httr::content(edp_get(sess, op)), silent = TRUE)
  if (class(pm_resp) == "try-error") {
    stop(sprintf("Could not load population model for '%s'.", pmid))
  }
  pm <- list(sess = sess, id = pmid, name = pm_resp$name,
             parent_id = pm_resp$parent_id,
             schema = schema_json(sess, pm_resp$parent_id),
             build_status = pm_resp$build_progress$status)
  class(pm) <- "edp_population_model"
  return(pm)
}

is.popmod <- function(x) {
  "edp_population_model" %in% class(x)
}

schema_data_frame_from_json <- function(j) {
  extract_schema_row <- function(e) {
    c(name = e$name,
      display_name = ifelse(is.null(e$display_name), as.character(NA),
                            e$display_name),
      stat_type = e$stat_type,
      nvalues = length(e$values))
  }
  s <- plyr::ldply(j$columns, extract_schema_row)
  s$nvalues <- as.integer(s$nvalues)
  is.na(s$nvalues) <- (s$nvalues == 0)
  return(s)
}

schema <- function(x) {
  UseMethod("schema", x)
}

schema.edp_population <- function(x) {
  # This makes a network call every time it's called, but the population model
  # one doesn't. This is a little weird.
  schema_data_frame_from_json(schema_json(x$sess, x$pid))
}

schema.edp_population_model <- function(x) {
  schema_data_frame_from_json(x$schema)
}

# Converts a list, which may contain NULLs, to an atomic vector with the same
# elements, replacing NULLs with NAs.  `mode` should be "numeric",
# "character", or "logical". `v` is a list. This function is useful for
# converting from JSON arrays parsed by jsonlite.
unlist_with_na <- function(v, mode) {
  this_NA <- as.vector(NA, mode = mode)
  v_is_null <- vapply(v, is.null, NA)
  v_with_na <- ifelse(v_is_null, this_NA, v)
  return(as.vector(v_with_na, mode = mode))
}

# Given a list structured as result_set.schema.json, returns a data frame with
# the columns of the result set. `target` is the column names for the data
# frame in the correct order (which can be mangled by JSON iteration order).
# `schema` should be the schema for the population model `rs` came from,
# defined in population_schema.schema.json.
result_set_to_data_frame <- function(rs, target, schema) {
  stopifnot(isTRUE(all.equal(sort(names(rs$columns)), sort(target))))
  # If there are rowids (from a select), use them. Otherwise, use the usual
  # 1-based row names. We specify the default row names explicitly to force the
  # initially-empty data.frame to have the right number of rows.
  rowids <- if (is.null(rs$rowids)) seq_along(rs$columns[[1]]) else rs$rowids
  table <- data.frame(stringsAsFactors = FALSE, row.names = rowids)
  # Iterate through columns of the result set, adding them to `table`.
  for (col_name in target) {
    # Find the schema entry for the column named `col_name`.
    col_schema <- Filter(function(e) e$name == col_name, schema$columns)[[1]]
    st <- col_schema$stat_type
    stopifnot(length(st) == 1)
    # Convert the list in the result set to a suitable atomic vector.
    if (st %in% c("realAdditive", "realMultiplicative")) {
      table[[col_name]] <- unlist_with_na(rs$columns[[col_name]], "numeric")
    } else if (st %in% c("categorical", "orderedCategorical")) {
      v <- unlist_with_na(rs$columns[[col_name]], "character")
      levels <- vapply(col_schema$values, function(e) e$value, "")
      table[[col_name]] <- factor(v, levels = levels,
                                  ordered = (st == "orderedCategorical"))
    } else if (st == "date") {
      days <- unlist_with_na(rs$columns[[col_name]], "integer")
      table[[col_name]] <- as.Date(days, origin = "1970-01-01", tz = "UTC")
    } else if (st == "time") {
      secs <- unlist_with_na(rs$columns[[col_name]], "integer")
      table[[col_name]] <- as.POSIXct(secs, origin = "1970-01-01",
                                      tz = "UTC")
    } else if (st == "void") {
      table[[col_name]] <- unlist_with_na(rs$columns[[col_name]], "character")
    } else {
      stop(sprintf("unsupported stat type '%s'", st))
    }
    # Pull identifying-column-ness and display name from schema.
    if (col_name %in% schema$identifying_columns) {
      identifying(table[[col_name]]) <- TRUE
    }
    if (!is.null(col_schema$display_name)) {
      display_name(table[[col_name]]) <- col_schema$display_name
    }
    stat_type(table[[col_name]]) <- col_schema$stat_type
  }
  return(table)
}

select.edp_population_model <- function(x, target = NULL, where = NULL,
                                        rowids = NULL) {
  select(population(x$sess, x$parent_id),
         target = target, where = where, rowids = rowids)
}

make_simulate_req <- function(pm, nsim, seed, target, given) {
  stopifnot(is.null(seed) || is.numeric(seed))
  if (is.null(target)) {
    target <- modeled_names(pm)
  }
  for (g in names(given)) {
    if (!(g %in% modeled_names(pm))) {
      stop(sprintf("Given column '%s' is not modeled.", g))
    }
  }
  if (!is.null(given)) {
    if (!is.data.frame(given)) {
      stop("`given` must be NULL or a data frame.")
    }
    if (nrow(given) != 1) {
      stop("`given`, if set, must have one row.")
    }
  }
  req <- list(target = as.list(target), n = nsim)
  if (!is.null(seed)) {
    req$random_seed <- as.integer(seed)  # seed could be a non-integer number
  }
  if (!is.null(given)) {
    req$given <- given
  }
  return(req)
}

make_simulate_by_model_result <- function(schema, target, resp) {
  # simulate-by-model returns one result set per model. Turn them into data
  # frames with a `model` attribute and stack them.
  model_data_frames <- list()
  for (i in 1:length(resp)) {
    stopifnot(length(resp[[i]]) == 1)  # zero or one given row
    d <- result_set_to_data_frame(resp[[i]][[1]], target, schema)
    d$model <- i
    model_data_frames[[i]] <- d
  }
  all_data <- do.call(rbind, model_data_frames)
  all_data$model <- as.factor(all_data$model)  # not ordered
  return(all_data)
}

# Exported; see ?edpclient::simulate.edp_population_model.
simulate.edp_population_model <- function(object, nsim = 1, seed = NULL, ...,
                                          target = NULL, columns = NULL,
                                          given = NULL, by_model = FALSE) {
  pm <- object  # Give the standard argument a better name.
  # `columns` is the R-idiomatic name. `target` is the EDP-idiomatic name.
  if (is.null(target) && !is.null(columns)) {
    target <- columns
  }
  req <- make_simulate_req(pm, nsim, seed, target, given)
  target <- as.character(req$target)  # i.e. not a `list`
  op <- paste("rpc/population_model", utils::URLencode(pm$id),
              ifelse(by_model, "simulate_by_model", "simulate"), sep = "/")
  resp <- httr::content(edp_post(pm$sess, op, req))
  if (by_model) {
    return(make_simulate_by_model_result(pm$schema, target, resp))
  } else {
    # simulate not-by-model returns one result set per given, and we passed at
    # most one given.
    stopifnot(length(resp) == 1)
    return(result_set_to_data_frame(resp[[1]], target, pm$schema))
  }
}

# Exported; see ?edpclient::predict.edp_population_model.
predict.edp_population_model <- function(object, ...,
                                         target = NULL,
                                         rowids = NULL, infer_present = FALSE,
                                         seed = NULL) {
  stopifnot(is.null(seed) || is.numeric(seed))
  stopifnot(is.null(rowids) || is.numeric(rowids))
  pm <- object  # Give the standard argument a better name.
  if (is.null(target)) {
    target <- modeled_names(pm)
  }
  # If we don't specify rowids, use the magic "all" token
  if (is.null(rowids)) {
    rowids <- "all"
  }
  op <- paste("rpc/population_model", utils::URLencode(pm$id), "infer_observed",
              sep = "/")
  req <- list(target = as.list(target), rowids = rowids,
              infer_present = infer_present)
  if (!is.null(seed)) {
    req$seed <- as.integer(seed)  # seed could be a non-integer number
  }
  resp <- httr::content(edp_post(pm$sess, op, req))
  # TODO(asilvers): This is ignoring the inferred confidence.
  return(result_set_to_data_frame(resp, target, pm$schema))
}

joint_probability <- function(pm, target, given = NULL, log = FALSE) {
  stopifnot(is.popmod(pm))
  stopifnot(is.data.frame(target))
  op <- paste("rpc/population_model", utils::URLencode(pm$id), "logpdf_rows",
              sep = "/")
  req <- list(targets = target)
  if (!is.null(given)) {
    stopifnot(is.data.frame(given))
    if (!all(names(given) %in% modeled_names(pm))) {
      stop("All names in `given` must be in the population model.")
    }
    req$givens <- given
  }
  resp <- edp_post(pm$sess, op, req)
  lp <- unlist_with_na(httr::content(resp), "numeric")
  lp[is.na(lp)] <- -Inf
  if (log) {
    return(lp)
  } else {
    return(exp(lp))
  }
}

# Returns element (i, j) of the lower-triangular matrix defined by the
# row-major vector `elems`. This layout is specified by
# association_matrix.schema.json. `i` and `j` may be vectors of compatible
# length; in that case, a vector of elements at those index-pairs is returned.
get_lower_tri_elem <- function(elems, i, j) {
  # Suppose the matrix is 4x4. `elems` fills out the Xs of this triangle:
  #
  #   X...
  #   XX..
  #   XXX.
  #   XXXX
  #
  # Then, suppose we want the index of element (i, j) = (3, 2), marked C:
  #
  #   A...
  #   AA..
  #   .C..
  #   ....
  #
  # i * (i - 1) / 2 = 3 * (3 - 1) / 2 = 3 is the number of elements in the
  # triangle above row 3. So, the index into `elems` where you find element
  # (3, 2) is i * (i - 1) / 2 + j.
  return(elems[i * (i - 1) / 2 + j])
}

# Exported; see ?edpclient::col_assoc.
col_assoc <- function(
    pm, target = NULL, given = NULL,
    statistic = c("mutual information", "R squared", "classic dep prob"),
    seed = NULL) {
  statistic <- match.arg(statistic)
  stopifnot(is.null(seed) || is.numeric(seed))
  op <- paste("rpc/population_model", utils::URLencode(pm$id),
              "relation", sep = "/")
  if (is.null(target)) {
    target <- modeled_names(pm)
    if (!is.null(given)) {
      target <- target[!(target %in% names(given))]
    }
  }
  if (!all(names(given) %in% names(pm))) {
    stop("Not all given columns are in population model.")
  }
  if (any(names(given) %in% target)) {
    stop("Can't compute association of givens.")
  }
  req <- list(response_columns = as.list(target),
              predictor_columns = as.list(target),
              given = given, statistic = statistic)
  if (!is.null(seed)) {
    req$random_seed <- as.integer(seed)
  }
  resp <- edp_post(pm$sess, op, req)
  # `elems` is a row-major lower-triangular matrix, as specified in
  # association_matrix.schema.json. Use it to fill in the regular association
  # matrix `a`.
  elems <- unlist_with_na(httr::content(resp)$elements, "numeric")
  a <- matrix(nrow = length(target), ncol = length(target),
              dimnames = list(target, target))
  for (i in seq_along(target)) {
    j <- seq(1, i)
    v <- get_lower_tri_elem(elems, i, j)  # row `i` of `elems`.
    a[i, j] <- v
    a[j, i] <- v
  }
  return(a)
}

build_popmod <- function(pop, models = 16, seed = NULL) {
  stopifnot(is.population(pop))
  stopifnot(is.null(seed) || (length(seed) == 1 && round(seed) == seed))
  url <- paste("rpc/population", utils::URLencode(pop$pid), "build", sep = "/")
  req <- list(name = paste(pop$name, "(model)"),
              build_def = list(num_models = models, builder = "lazy",
                               duration = list(iterations = 1)))
  if (!is.null(seed)) {
    req$build_def$random_seed <- seed
  }
  resp <- edp_post(pop$sess, url, req)
  pmid <- id_from_url(httr::headers(resp)$location, "pm-")
  return(popmod(pop$sess, pmid))
}