#' @title Tibble of data used in the manuscript
#'
#' @description Datasets are Karate, C.elegans,
#' Political blogs, Protein interactions, and arXiv
#' citations.
#'
#' @return A tibble with 3 columns: \code{name}, the variable name of
#' the included dataset; \code{n}, the number of nodes; and
#' \code{formal_name}, a display name used for plotting.
#' The name columns are factors ordered by number of nodes.
#' @export
gmmle_data <- function() {
  tibble::tribble(
    ~name,      ~n,    ~formal_name,
    "karate",   34,    "Karate",
    "celegans", 297,   "C.elegans",
    "polblogs", 1222,  "Pol. blogs",
    "yeast",    2617,  "Protein inter.",
    "arxiv",    16726, "arXiv citation"
  ) %>%
    dplyr::mutate_at(dplyr::vars(name, formal_name),
                     forcats::as_factor)
}
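# Illustrative usage (kept as a comment so nothing runs at package load):
# the tibble above lists the included datasets, smallest to largest.
# as.character(gmmle_data()$name)
# #> "karate" "celegans" "polblogs" "yeast" "arxiv"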
#' Get the igraph object for a dataset by name
#'
#' @seealso \code{\link{gmmle_data}}
#'
#' @param gname Name of the graph, as a character string (one of the
#'   names in \code{\link{gmmle_data}}).
#' @return The corresponding igraph object.
#' @examples
#' graph_from_name("karate")
#' @export
graph_from_name <- function(gname){
  tryCatch({
    graph <- eval(parse(text = gname))
  }, error = function(e) {
    stop(paste0(
      "This package only includes the datasets: ",
      paste0(gmmle_data()$name, collapse = ", "), ". ",
      "graph should be one of these as a character ",
      "string or an igraph object."))
  })
}
# Logarithmic grid of shuffle sizes K: powers of two up to the number of
# nodes by default, or l roughly log-spaced values when l is supplied.
make_grid <- function(graph, l = NULL){
  if (is.null(l)) {
    2^seq_len(floor(log2(gorder(graph))))
  } else {
    ceiling(2^seq(1, log2(gorder(graph)), length.out = l))
  }
}
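# Illustrative sketch of the grids this helper produces (commented out so it
# is not evaluated at package load). For the 34-node karate graph,
# floor(log2(34)) = 5, so the default grid is 2, 4, 8, 16, 32; with l = 10
# it returns 10 roughly log-spaced sizes between 2 and 34.
# make_grid(graph_from_name("karate"))
# make_grid(graph_from_name("karate"), l = 10)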
#' Disagreement simulation
#'
#' @description Simulations that assess how difficult it is to match
#' a graph after randomly permuting a subset of its nodes.
#' \code{get_disag_stat} runs the simulation on one graph;
#' \code{par_run_all_disag} runs the simulation from the manuscript,
#' on every included graph, in parallel.
#'
#' @param graph An igraph object, or a string naming a dataset
#'   included in the package.
#' @param cl Cluster to run the shuffling on; if NULL, run sequentially.
#' @param m Number of shuffles performed to compute the statistics.
#'
#' @returns A tibble with 7 columns:
#' \describe{
#'   \item{K}{number of nodes permuted, on a logarithmic grid}
#'   \item{avedis}{average distance between the original and permuted graph}
#'   \item{sd}{standard deviation of the distance}
#'   \item{min}{minimum distance}
#'   \item{pthresh}{estimated bound on the error tolerance for graph matching}
#'   \item{data}{name of the dataset if \code{graph} was given as a string}
#'   \item{n}{number of nodes in the provided graph}
#' }
#' @examples
#' get_disag_stat("karate", m = 10)
#' @export
get_disag_stat <- function(graph, cl = NULL, m = 500) {
  l <- NULL
  graph_name <- NA
  if (is.character(graph) || is.factor(graph)) {
    graph_name <- as.character(graph)
    graph <- graph_from_name(graph_name)
  }
  if (!is.na(graph_name) && graph_name == "karate") {
    l <- 10
  }
  grid <- make_grid(graph, l)
  if (is.null(cl)) {
    disag <- purrr::map(grid,
                        kshuffle_mean_sd, G = graph, m = m)
  } else {
    parallel::clusterExport(cl = cl,
      varlist = c("kshuffle_mean_sd", "tibble", "graph", "m"),
      envir = environment())
    disag <- parallel::parLapply(cl, grid, function(k) {
      kshuffle_mean_sd(k = k, G = graph, m = m)
    })
  }
  disag %>%
    dplyr::bind_rows() %>%
    dplyr::mutate(data = graph_name, n = gorder(graph)) %>%
    dplyr::rename(avedis = mean, pthresh = pmax)
}
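# Illustrative parallel usage (a sketch, commented out so it does not run at
# package load; the worker setup mirrors par_run_all_disag below):
# cl <- parallel::makeCluster(2)
# parallel::clusterEvalQ(cl, library(igraph))
# parallel::clusterEvalQ(cl, library(iGraphMatch))
# parallel::clusterEvalQ(cl, library(Matrix))
# celegans_disag <- get_disag_stat("celegans", cl = cl, m = 50)
# parallel::stopCluster(cl)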
#' @describeIn get_disag_stat Runs the disagreement simulation from the
#'   manuscript on every included graph in parallel.
#'
#' @param nclust Number of parallel workers to create via
#'   \code{parallel::makeCluster}.
#' @param ask Whether to warn about long run times.
#' @export
par_run_all_disag <- function(nclust = 7, ask = TRUE){
  if (!are_you_sure(ask)) {
    return()
  }
  cl <- parallel::makeCluster(nclust)
  parallel::clusterEvalQ(cl, library(igraph))
  parallel::clusterEvalQ(cl, library(iGraphMatch))
  parallel::clusterEvalQ(cl, library(Matrix))
  disag_data <- purrr::map_df(gmmle_data()$name,
                              get_disag_stat, cl = cl, m = 500)
  parallel::stopCluster(cl)
  disag_data
}
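# Illustrative end-to-end sketch (commented out; this run is long):
# disag_data <- par_run_all_disag(nclust = 4, ask = FALSE)
# plot_prob_bound_norm_disag(disag_data)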
#' Shuffle graph matching simulation for one graph, optionally in parallel
#'
#' @description For a given graph and probability p, this
#' procedure randomly flips edges with probability p and
#' then matches the noisy graph to the original, starting
#' at the "true" correspondence.
#' The procedure is run nmc times for each p in p_grid;
#' see \code{local_gm_errs}. \code{par_run_all_gm} runs this once for
#' each graph in the package, corresponding to the simulation
#' in the manuscript.
#'
#' @param graph Graph to run on
#' @param cl Cluster generated by parallel::makeCluster.
#' If NULL then not run in parallel.
#' @param p_grid Grid of error probabilities
#' @param nmc number of Monte Carlo replicates
#' @param ask Whether to warn about long run times.
#'
#'
#' @returns A list of length nmc; each element is a list of graph
#' matching results, one for each p in p_grid.
#' @examples
#' shuffle_gm_sim(karate, p_grid = c(0.01, 0.1),
#' nmc = 2, ask = FALSE)
#'
#' @export
shuffle_gm_sim <- function(graph, cl = NULL,
                           p_grid = 10^(-seq(0.5, 4, length.out = 8)),
                           nmc = 30, ask = TRUE) {
  if (!are_you_sure(ask)) {
    return()
  }
  graph_name <- NULL
  if (is.character(graph) || is.factor(graph)) {
    graph_name <- as.character(graph)
    graph <- graph_from_name(graph_name)
  }
  graph <- get.adjacency(graph)
  if (is.null(cl)) {
    1:nmc %>% purrr::map(function(s) {
      lapply(p_grid, function(p) {
        local_gm_errs(graph, p, seed = s)
      })
    })
  } else {
    parallel::clusterExport(cl = cl,
      varlist = c("graph", "local_gm_errs", "corrupting_channel"),
      envir = environment())
    parallel::parLapply(cl, 1:nmc,
      function(s) {
        lapply(p_grid, function(p) {
          local_gm_errs(graph, p, seed = s)
        })
      })
  }
}
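# Illustrative parallel usage (a sketch, commented out; the worker setup
# mirrors par_run_all_gm below):
# cl <- parallel::makeCluster(2)
# parallel::clusterEvalQ(cl, library(igraph))
# parallel::clusterEvalQ(cl, library(iGraphMatch))
# parallel::clusterEvalQ(cl, library(Matrix))
# res <- shuffle_gm_sim("karate", cl = cl,
#                       p_grid = c(0.01, 0.1), nmc = 2, ask = FALSE)
# parallel::stopCluster(cl)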
#' @describeIn shuffle_gm_sim Runs the simulation from the manuscript:
#'   one call of \code{shuffle_gm_sim} per included graph, in parallel.
#'
#' @param nclust Number of parallel workers to create via
#'   \code{parallel::makeCluster}.
#' @param data_df A tibble that is a subset of \code{gmmle_data()}.
#' @export
par_run_all_gm <- function(nclust = 7, data_df = NULL,
                           nmc = 30,
                           p_grid = 10^(-seq(0.5, 4, length.out = 8)),
                           ask = TRUE) {
  if (!are_you_sure(ask)) {
    return()
  }
  if (is.null(data_df)) {
    data_df <- gmmle_data()
  } else if (!is.data.frame(data_df)) {
    data_df <- tibble::tibble(name = data_df)
  }
  cl <- parallel::makeCluster(nclust)
  parallel::clusterEvalQ(cl, library(igraph))
  parallel::clusterEvalQ(cl, library(iGraphMatch))
  parallel::clusterEvalQ(cl, library(Matrix))
  res_df <- data_df %>%
    dplyr::mutate(results = purrr::map(name, shuffle_gm_sim,
                                       cl = cl, nmc = nmc,
                                       p_grid = p_grid, ask = FALSE))
  parallel::stopCluster(cl)
  res_df
}
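# Illustrative sketch (commented out; a reduced run on the two smallest
# graphs, followed by the summary plot defined below):
# res_df <- par_run_all_gm(nclust = 2, data_df = gmmle_data()[1:2, ],
#                          nmc = 5, ask = FALSE)
# plot_matching_errors(res_df)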
#' Plot probability bounds and normalized mean disag
#'
#' @description Plot showing the predicted probability bounds
#' and the normalized mean disagreement as a function of the
#' fraction of shuffled vertices.
#'
#' @param data Either NULL, to use the data from the manuscript,
#' or output in the format produced by \code{\link{par_run_all_disag}}.
#'
#' @examples
#' plot_prob_bound_norm_disag()
#'
#' @export
plot_prob_bound_norm_disag <- function(data = NULL){
  if (!is.null(data)) {
    disag_data <- data
  }
  if (ncol(disag_data) == 4) {
    disag_data <- disag_data %>%
      tidyr::pivot_wider(names_from = type,
                         values_from = val) %>%
      dplyr::mutate(data = ifelse(data == "collab", "arxiv",
                                  as.character(data))) %>%
      dplyr::mutate(n = dplyr::case_when(
        data == "karate" ~ 34,
        data == "celegans" ~ 297,
        data %in% c("polblog", "polblogs") ~ 1222,
        data == "yeast" ~ 2617,
        data == "arxiv" ~ 16726
      ))
  }
  disag_df <- disag_data %>%
    dplyr::select(data, K, n, avedis, pthresh) %>%
    dplyr::mutate(avedis = avedis / K * log(n)) %>%
    tidyr::pivot_longer(c(pthresh, avedis),
                        names_to = "type", values_to = "val_normalized") %>%
    dplyr::mutate(type = forcats::as_factor(type)) %>%
    dplyr::mutate(K.n = K / n) %>%
    dplyr::mutate(Network = dplyr::case_when(
      data == "karate" ~ "Karate",
      data == "celegans" ~ "C.elegans",
      data %in% c("polblog", "polblogs") ~ "Pol. blogs",
      data == "yeast" ~ "Protein inter.",
      data == "arxiv" ~ "arXiv citation")) %>%
    dplyr::mutate(Network = forcats::as_factor(
      stringr::str_c(Network, " (n=", n, ")")))
  flabel <- c("pthresh" = "Probability bound",
              "avedis" = "Normalized mean disag.")
  p <- ggplot(disag_df, aes(x = K.n,
                            y = val_normalized,
                            color = Network, shape = Network,
                            linetype = Network)) +
    scale_x_log10() +
    geom_point() +
    geom_line() +
    theme_bw() +
    xlab("Fraction of shuffled vertices") +
    facet_grid(type ~ ., scales = "free",
               labeller = as_labeller(flabel)) + ylab("") +
    scale_color_manual(name = "Network",
                       values = scales::hue_pal()(5))
  # pdf("Realdata-p-threshold.pdf", width = 6, height = 4)
  p
  # dev.off()
}
#' Convert shuffle_gm_sim results to a tibble of errors
#'
#' @param errors_list Output of \code{\link{shuffle_gm_sim}}.
#' @param p_grid Grid of probabilities used to generate \code{errors_list}.
#' @param data Optional name of the dataset; if supplied, it is added
#'   as a column.
#'
#' @return A tibble with columns p.channel and errors, plus data if supplied.
#' @export
convert_gmresults_to_df <- function(errors_list, p_grid, data = NULL) {
  errors_table <- sapply(errors_list,
                         function(errs) sapply(errs, function(e) e$errors))
  rdf <- tibble::tibble(p.channel = rep(p_grid, length(errors_list)),
                        errors = c(errors_table))
  if (!is.null(data)) {
    rdf <- rdf %>% dplyr::mutate(data = data)
  }
  rdf
}
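# Illustrative usage (commented out), matching the shuffle_gm_sim example:
# p_grid <- c(0.01, 0.1)
# res <- shuffle_gm_sim(karate, p_grid = p_grid, nmc = 2, ask = FALSE)
# convert_gmresults_to_df(res, p_grid, data = "karate")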
#' Plot graph matching errors as a function of the error rate
#'
#' @param data_df A tibble with columns name, n, and
#' formal_name as in \code{\link{gmmle_data}} and a column
#' results which has output from \code{\link{shuffle_gm_sim}}
#' for each graph.
#' @param p_grid Grid of error probabilities used to generate results
#'
#' @examples
#' plot_matching_errors()
#'
#' @export
plot_matching_errors <- function(
    data_df = NULL,
    p_grid = 10^(-seq(0.5, 4, length.out = 8))) {
  if (is.null(data_df)) {
    data_df <- realnet_error_df
  }
  binded_res <- data_df %>%
    dplyr::mutate(err = results %>%
                    purrr::map(convert_gmresults_to_df, p_grid)) %>%
    dplyr::select(-results) %>% tidyr::unnest(err) %>%
    dplyr::rename(Network = formal_name) %>%
    dplyr::group_by(Network, p.channel) %>%
    dplyr::summarize(
      sd = stats::sd(errors),
      se = sd / sqrt(dplyr::n()),
      errors = mean(errors))
  p <- ggplot(binded_res, aes(x = p.channel, y = errors)) +
    geom_line(aes(p.channel, errors, color = Network, linetype = Network)) +
    geom_point(aes(p.channel, errors, color = Network, shape = Network)) +
    scale_x_log10() +
    ylab("Matching accuracy (fraction of vertices)") +
    geom_errorbar(aes(ymin = errors - se, ymax = errors + se,
                      color = Network), width = .1) +
    theme_bw() +
    xlab("Channel probability (p)")
  # pdf("Matching-accuracy.pdf", width = 6, height = 4)
  p
  # dev.off()
}
#' Plot cumulative accuracy by normalized degree rank
#'
#'
#' @param data_df A tibble with columns name, n, and
#' formal_name as in \code{\link{gmmle_data}} and a column
#' results which has output from \code{\link{shuffle_gm_sim}}
#' for each graph.
#' @param p_grid Grid of error probabilities used to generate results
#'
#' @examples
#' \dontrun{
#' # This takes a few minutes
#' plot_cum_acc_by_norm_deg()
#' }
#' @export
plot_cum_acc_by_norm_deg <- function(
    data_df = NULL,
    p_grid = 10^(-seq(0.5, 4, length.out = 8))
){
  if (is.null(data_df)) {
    data_df <- realnet_error_df
  }
  message("Computing cumulative errors for each match. This can take a few minutes.")
  hddf <- data_df %>%
    dplyr::mutate(g =
                    purrr::map(as.character(name), graph_from_name) %>%
                    purrr::map(~ .[])
    ) %>%
    dplyr::mutate(hd = purrr::map2(g, results,
                                   highdegree_errors, p_grid)) %>%
    dplyr::select(-results, -g) %>%
    tidyr::unnest(hd)
  hddf_sum <- hddf %>%
    dplyr::mutate(p.channel = forcats::as_factor(p.channel),
                  Data = forcats::as_factor(formal_name)) %>%
    dplyr::group_by(Data, n, degree_rank, p.channel) %>%
    dplyr::summarize(m = mean(errors.by.degree),
                     sd = stats::sd(errors.by.degree),
                     nmc = dplyr::n()) %>%
    dplyr::mutate(se = sd / sqrt(nmc)) %>%
    dplyr::ungroup()
  labels <- rev(c(0.32, 0.1, 0.032, 0.01, 0.0032, 0.001))
  p <- hddf_sum %>%
    ggplot(aes(y = m, x = degree_rank,
               color = p.channel, linetype = p.channel)) +
    geom_line() +
    ylab("Matching accuracy (fraction of vertices)") +
    xlab("Normalized degree rank") +
    scale_color_manual(
      values = scales::hue_pal(h = c(180, 270))(6),
      labels = labels, name = "p") +
    scale_linetype_manual(values = 6:1, labels = labels, name = "p") +
    facet_grid(Data ~ .) +
    theme_bw()
  # pdf("Highdegree.pdf", width = 6, height = 6)
  p
  # dev.off()
}