#' R6 class to manage personalized pagerank calculations
#'
#' @importFrom R6 R6Class
#'
Tracker <- R6Class("Tracker", list(
#' @field seeds A character vector of the seed nodes.
seeds = character(0),
#' @field path A character vector of nodes whose neighborhoods we
#' examined.
path = character(0),
#' @field stats A [tibble::tibble()] with one row for each visited
#' node and the following columns:
#'
#' - `name`: Name of a node (character).
#' - `r`: Current estimate of residual per out-degree for a node.
#' - `p`: Current estimate of the pagerank for a node.
#' - `in_degree`: Number of incoming edges to a node.
#' - `out_degree`: Number of outcoming edges from a node.
#'
stats = NULL,
#' @field failed A character vector of nodes that could not be visited.
failed = character(0),
#' @field graph An abstract graph object.
graph = NULL,
#' @field alpha Teleportation constant from Algorithm 3.
alpha = numeric(0),
#' @field alpha_prime Transformed teleportation constant from Algorithm 3.
alpha_prime = numeric(0),
#' @field epsilon Error tolerance.
epsilon = numeric(0),
#' @field max_visits Maximum number of nodes to visit before terminating.
max_visits = integer(0),
#' @field tau Regularization parameter used in Algorithm 4.
tau = numeric(0),
#' @description
#'
#' Create a new Tracker object.
#'
#' @param graph See [appr()].
#' @param alpha See [appr()].
#' @param epsilon See [appr()].
#' @param tau See [appr()].
#' @param max_visits See [appr()].
#'
#' @return A new `Tracker` object.
#'
#' @importFrom tibble tibble
#'
initialize = function(graph, alpha, epsilon, tau, max_visits) {
self$graph <- graph
self$alpha <- alpha
self$alpha_prime <- alpha / (2 - alpha)
self$epsilon <- epsilon
self$tau <- tau
self$max_visits <- max_visits
self$stats <- tibble::tibble(
name = character(0),
regularized = numeric(0),
p = numeric(0),
in_degree = numeric(0),
out_degree = numeric(0),
degree_adjusted = numeric(0),
r = numeric(0)
)
},
#' @description
#'
#' Print the tibble containing the current state of the pagerank
#' calculation.
#'
print = function() {
cat("Personalized PageRank Approximator\n")
cat("----------------------------------\n\n")
cat(glue(" - number of seeds: {length(self$seeds)}\n", .trim = FALSE))
cat(glue(" - unique nodes visited so far: {length(unique(self$path))} out of maximum of {self$max_visits}\n", .trim = FALSE))
cat(glue(" - total visits so far: {length(self$path)}\n", .trim = FALSE))
cat(glue(" - bad nodes so far: {length(self$failed)}\n\n", .trim = FALSE))
cat(glue(" - teleportation constant (alpha): {self$alpha}\n", .trim = FALSE))
cat(glue(" - desired approximation error (epsilon): {self$epsilon}\n", .trim = FALSE))
cat(glue(" - achieved bound on approximation error: {self$current_approximation_error()}\n", .trim = FALSE))
cat(glue(" - length of to visit list: {length(self$remaining())}\n\n", .trim = FALSE))
cat(glue("PPR table (see $stats field):\n\n"))
print(self$stats)
invisible(self)
},
#' @description
#'
#' Determine nodes that need to be visited. Note that,
#' if there is a node with zero out degree, you will never
#' leave from that node. So it is important to make sure
#' we never add nodes with zero out degree into the tracker.
#'
#' @return A character vector of node names with current residuals
#' greater than `epsilon`.
#'
remaining = function() {
# when we initialize, we need to initialize to the seeds
# here we check for initialization by consider the path
# of nodes we've visited so far. it's very important that
# we do not populate `path` when adding the seeds
if (length(self$path) < 1)
return(self$seeds)
self$stats[self$stats$r > self$epsilon * self$stats$out_degree, ]$name
},
#' @description
#'
#' Determine current quality of approximation.
#'
#' @return A numeric vector of length one with the current worst
#' error bound.
#'
current_approximation_error = function() {
nodewise_approx_error <- self$stats$r / self$stats$out_degree
max(nodewise_approx_error)
},
#' @description
#'
#' Check if there is already a row for a particular node
#'
#' @param nodes Character name of node(s) in the graph.
#'
#' @return `TRUE` if there is a row for `node`, `FALSE` if there
#' is not a row for `node`.
#'
in_tracker = function(nodes) {
nodes %in% self$stats$name
},
#' @description
#'
#' Check if we previously failed to visit a node
#'
#' @param node Character name of a node in the graph.
#'
#' @return `TRUE` if we failed to visit `node`, `FALSE` otherwise.
#' Note that this function will return `FALSE` if `node` is new
#' and we haven't seen it before.
#'
in_failed = function(node) {
node %in% self$failed
},
#' @description
#'
#' Create an entry for `node` in the tracker. Assumes that
#' `node` is not in the tracker yet, and does not check if
#' this is the case.
#'
#' @param seeds The name of the node in the graph as a length 1
#' character vector.
#'
#' @param preference TODO: recall what on earth this is.
#'
add_seed = function(seeds, preference) {
self$seeds <- c(self$seeds, seeds)
self$add_nodes(nodes = seeds, preference = preference)
},
#' @description
#'
#' TODO
#'
#' @param node The name of the node in the graph as a length 1
#' character vector.
#'
add_to_path = function(node) {
self$path <- c(self$path, node)
},
#' @description
#'
#' Create an entry for `node` in the tracker. Assumes that
#' `node` is not in the tracker yet, and does not check if
#' this is the case.
#'
#' @param nodes The name(s) of node(s) in the graph as a character vector.
#'
#' @param preference TODO: recall what on earth this is.
#'
add_nodes = function(nodes, preference = 0) {
log_trace(glue("Adding node(s) to tracker: {nodes}"))
degree <- node_degrees(self$graph, nodes)
self$stats <- tibble::add_row(
self$stats,
name = nodes,
regularized = NA_real_,
p = 0,
in_degree = degree$in_degree,
out_degree = degree$out_degree,
degree_adjusted = NA_real_,
r = preference
)
},
#' @description
#'
#' Add `node` to the list of nodes we failed to visit.
#' Assumes that `node` is not in the failed list yet, and
#' does not check if this is the case.
#'
#' @param nodes The name of the node in the graph as a length 1
#' character vector.
#'
add_failed = function(nodes) {
self$failed <- c(self$failed, nodes)
},
#' @description
#'
#' Update the estimate of the personalized pagerank for a given node
#'
#' @param node Character name of a node in the graph.
#'
update_p = function(node) {
node_index <- which(self$stats$name == node)
self$stats[[node_index, "p"]] <- self$stats[[node_index, "p"]] +
self$alpha_prime * self$stats[[node_index, "r"]]
},
#' @description
#'
#' Update the residual of a *good* node in the neighborhood of
#' the current node, adding it to the tracker if necessary
#'
#' @param u Character name of the node we are currently visiting.
#' @param v Names of neighbors of `u` as a character vector. Can
#' contain multiple elements. Can also contain zero elements.
#'
update_r_neighbor = function(u, v) {
log_trace(glue("update_r_neighbor({u}, {v})"))
stopifnot(length(u) == 1)
if (length(v) < 1)
return(invisible(NULL))
new_nodes <- v[!self$in_tracker(v)]
if (length(new_nodes) > 0)
self$add_nodes(new_nodes)
u_index <- which(self$stats$name == u)
v_index <- match(v, self$stats$name)
self$stats[v_index, "r"] <- self$stats[v_index, "r"] +
(1 - self$alpha_prime) * self$stats[[u_index, "r"]] /
(2 * self$stats[[u_index, "out_degree"]])
},
#' @description
#'
#' Update the residual of current node
#'
#' @param node Character name of the node we are currently visiting.
#'
update_r_self = function(node) {
node_index <- which(self$stats$name == node)
self$stats[[node_index, "r"]] <- (1 - self$alpha_prime) *
self$stats[[node_index, "r"]] / 2
},
#' @description
#'
#' Compute the degree-adjusted and regularized variants of personalized
#' PageRank as in Algorithm 4, based on the outputs of Algorithm 3.
#'
#' @param node Character name of the node we are currently visiting.
#'
regularize = function() {
if (is.null(self$tau)) {
tau <- mean(self$stats$in_degree)
}
# might divide by 0 here
self$stats$degree_adjusted <- self$stats$p / self$stats$in_degree
self$stats$regularized <- self$stats$p / (self$stats$in_degree + tau)
},
#' @description
#'
#' Main driver function to perform the computations outlined in
#' Algorithm 3.
#'
#' @param node Character name of the node we are currently visiting.
#'
calculate_ppr = function() {
log_info("Approximating PPR ...")
remaining <- self$remaining()
unique_visits_so_far <- length(unique(self$path))
log_info(glue(
"Visits: {length(self$path)} total / ",
"{unique_visits_so_far} unique (max {self$max_visits}) / ",
"{length(remaining)} to visit / ",
"current epsilon: {self$current_approximation_error()}.",
.trim = FALSE
))
while (length(remaining) > 0) {
if (unique_visits_so_far >= self$max_visits) {
warning("Maximum visits reached. Finishing aPPR calculation early.", call. = FALSE)
break
}
u <- if (length(remaining) == 1) remaining else sample(remaining, size = 1)
log_trace(glue("Visting {u}"))
self$update_p(u)
# here we come into contact with reality and must depart from the
# warm embrace of algorithm 3
# this is where we learn about new nodes. there are two kinds of new
# nodes: "good" nodes that we can visit, and "bad" nodes that we can't
# visit, such as protected Twitter accounts or nodes that the API fails
# to get for some reason. we want to:
#
# - update the good nodes are we typically would
# - pretend the bad nodes don't exist
#
# also note that we only want to *check* each node once
neighbors <- memo_neighborhood(self$graph, u)
self$add_to_path(u)
# first deal with the good neighbors we've already seen all
# at once
known_good <- neighbors[self$in_tracker(neighbors)]
known_bad <- neighbors[self$in_failed(neighbors)]
unknown <- setdiff(neighbors, c(known_good, known_bad))
new_good <- check(self$graph, unknown)
new_bad <- setdiff(unknown, new_good)
log_debug(
glue(
"{length(known_good)} known good / ",
"{length(known_bad)} known bad / ",
"{length(new_good)} new good / ",
"{length(new_bad)} new bad",
sep = " "
)
)
log_trace(glue("known good: {known_good}"))
log_trace(glue("known bad: {known_bad}"))
log_trace(glue("new good: {new_good}"))
log_trace(glue("new bad: {new_bad}"))
self$add_failed(new_bad)
self$update_r_neighbor(u, known_good)
self$update_r_neighbor(u, new_good)
self$update_r_self(u)
remaining <- self$remaining()
unique_visits_so_far <- length(unique(self$path))
log_info(glue(
"Visits: {length(self$path)} total / ",
"{unique_visits_so_far} unique (max {self$max_visits}) / ",
"{length(remaining)} to visit / ",
"current epsilon: {self$current_approximation_error()}.",
.trim = FALSE
))
}
log_info("Approximating PPR ... done")
}
))
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.