R/Real-networks.R

Defines functions plot_cum_acc_by_norm_deg plot_matching_errors convert_gmresults_to_df plot_prob_bound_norm_disag par_run_all_gm shuffle_gm_sim par_run_all_disag get_disag_stat make_grid graph_from_name gmmle_data

Documented in convert_gmresults_to_df get_disag_stat gmmle_data graph_from_name par_run_all_disag par_run_all_gm plot_cum_acc_by_norm_deg plot_matching_errors plot_prob_bound_norm_disag shuffle_gm_sim

#' @title Tibble of data used in the manuscript
#' 
#' @description Datasets are Karate, C.elegans, 
#'   Politcal blogs, Protein interactions, and arXiv
#'   citations.
#' 
#' @return A tibble with 3 columns, one for the name
#'  which corresponds to the variable name for the
#'  included data, one for number of nodes, and one
#'  for a formal name used for plotting.
#'  Names are factors ordered by number of nodes.
#' @export
gmmle_data <- function() {
  tibble::tribble(
    ~name,       ~n,     ~formal_name,
    "karate",    34,     "Karate",
    "celegans",  297,    "C.elegans",
    "polblogs",  1222,   "Pol. blogs",
    "yeast",     2617,   "Protein inter.",
    "arxiv",     16726,  "arXiv citation"
  ) %>%
    dplyr::mutate_at(dplyr::vars(name, formal_name), 
      forcats::as_factor)
}








#' Gets the igraph object for data from name
#' 
#' @seealso \code{\link{gmmle_data}}
#'
#' @param gname Name of graph.
#' @return Corresponding igraph object
#' @examples
#'  graph_from_name("karate")
#' @export
graph_from_name <- function(gname){
  tryCatch({
    graph <- eval(parse(text = gname))
  }, error = function(e) {
    stop(paste0(
        "This packages only includes the datasets: ",
        paste0(gmmle_data()$name, collapse = ", "), ". ",
        "graph should be one of those as a character ",
        "vector or an igraph object."))
  })
}


make_grid <- function(graph, l = NULL){
  if(is.null(l)){
    2^seq(1:floor(log2(gorder(graph))))
  } else {
    ceiling(2^seq(1, log2(gorder(graph)), length.out = l))
  }
}

#' Disagreements simulation
#' 
#' @description Simulations to test for the difficulty to match
#'  a graph by randomly permuting nodes. get_disag_stat
#'  runs on one graph and par_run_all_disag runs the
#'  simulation in the manuscript in parallel.
#' 
#' @param graph iGraph object or string corresponding to graph data in package
#' @param cl Cluster to run the shuffling on.
#' @param m Number of shuffles to perform to compute statistics
#'
#' @returns Tibble with 7 columns
#'  K: number nodes permuted, on a logarithmic grid
#'  avedis: average distance between original and permuted graph
#'  sd: standard deviation of distance,
#'  min: minimum distance
#'  pthresh: estimated bound on error tolerance for graph matching
#'  data: name of data if graph is a string
#'  n: number of nodes in provided graph
#' @examples
#' get_disag_stat("karate", m = 10)
#' @export
get_disag_stat <- function(graph, cl = NULL, m = 500) {
  l <- NULL
  graph_name <- NA
  if ( is.character(graph) || is.factor(graph)) {
    graph_name <- as.character(graph)
    graph <- graph_from_name(graph_name)
  }
  if(!is.na(graph_name) && graph_name == "karate"){
    l <- 10
  }

  grid <- make_grid(graph, l)

  if (is.null(cl)){
    disag <- purrr::map(grid,
      kshuffle_mean_sd, G = graph, m = m)
  } else {
    parallel::clusterExport(cl = cl, varlist =
      list("kshuffle_mean_sd", "graph", "tibble", "m"))
    disag <- parallel::parLapply(cl, grid, function(k) {
        kshuffle_mean_sd(k = k, G = graph, m = m)
      })
  }
  disag %>%
    dplyr::bind_rows() %>%
    dplyr::mutate(data = graph_name, n = gorder(graph)) %>%
    dplyr::rename(avedis = mean, pthresh = pmax)

}



#' @describeIn get_disag_stat
#' 
#'
#' @param nclust Number of parallel clusters to create via
#'  parallel::makeCluster
#' @param ask Whether to warn about long run times.
#' @export
par_run_all_disag <- function(nclust = 7, ask = TRUE){
  if (!are_you_sure(ask)) {
    return();
  }

  cl <- parallel::makeCluster(nclust)
  parallel::clusterEvalQ(cl, library(igraph))
  parallel::clusterEvalQ(cl, library(iGraphMatch))
  parallel::clusterEvalQ(cl, library(Matrix))
  disag_data <- purrr::map_df(gmmle_data()$name, 
    get_disag_stat, cl, m=500)
  parallel::stopCluster(cl)
  disag_data
}

#' Shuffle graph matching simulation for one graph run in parallel
#'
#' @description For a given graph and probability p, this
#'  procedure randomly flips edges with probability p and
#'  then matches the noisy graph to the original starting
#'  at the "true" correspondence.
#'  This procedure is run nmc times for each p in p_grid.
#'  See local_gm_errors. par_run_all_gm runs this once for
#'  each graph in the package, corresponding to the simulation
#'  in the manuscript.
#'
#' @param graph Graph to run on
#' @param cl Cluster generated by parallel::makeCluster. 
#'    If NULL then not run in parallel.
#' @param p_grid Grid of error probabilities
#' @param nmc number of Monte Carlo replicates
#' @param ask Whether to warn about long run times.
#' 
#'
#' @returns A list of lists of graph mathching results for each p in p_grid
#' @examples
#' shuffle_gm_sim(karate, p_grid = c(0.01, 0.1),
#'   nmc = 2, ask = FALSE)
#' 
#' @export
shuffle_gm_sim <- function(graph, cl = NULL,
  p_grid = 10^(-seq(0.5, 4, length.out = 8)),
  nmc = 30, ask = TRUE) {

  if (!are_you_sure(ask)) {
    return();
  }


  graph_name <- NULL
  if ( is.character(graph) || is.factor(graph)){
    graph_name <- as.character(graph)
    graph <- graph_from_name(graph_name)
  }
  graph <- get.adjacency(graph)
  if (is.null(cl)){
    1:nmc %>% purrr::map(function(s){
      lapply(p_grid, function(p){
        local_gm_errs(graph, p, seed = s)
      })
    })
  } else {
    parallel::clusterExport(cl = cl,
      varlist = list("graph",
        "local_gm_errs", "corrupting_channel"),
                            envir=environment())
    parallel::parLapply(cl, 1:nmc,
      function(s){
        lapply(p_grid, function(p){
         local_gm_errs(graph, p, seed = s)
        })
      })
  }
}



#' @describeIn shuffle_gm_sim
#' 
#' @param data_df A tibble that is a subset of \code{gmmle_data}.
#' @export
par_run_all_gm <- function(nclust = 7, data_df = NULL,
    nmc = 30,
    p_grid = 10^(-seq(0.5, 4, length.out = 8)),
    ask = TRUE) {
  
  if (!are_you_sure(ask)) {
    return();
  }


  if (is.null(data_df)){
    data_df <- gmmle_data()
  } else if( !is.data.frame(data_df)) {
    data_df <- tibble::tibble(data = data_df)
  }

  cl <- parallel::makeCluster(nclust)
  parallel::clusterEvalQ(cl, library(igraph))
  parallel::clusterEvalQ(cl, library(iGraphMatch))
  parallel::clusterEvalQ(cl, library(Matrix))

  res_df <- data_df %>%
    dplyr::mutate(results = purrr::map(name, shuffle_gm_sim,
      cl = cl, nmc = nmc, p_grid = p_grid, ask = FALSE))
  
  parallel::stopCluster(cl)
  res_df
}


#' Plot probability bounds and normalized mean disag
#' 
#' @description Plot showing the predicted probability bounds
#' and the normalized mean degree as a function of the 
#' fraction of shuffled vertices.
#' 
#' @param data Should be either NULL for data used in the manuscript
#'  or something like the output of \code{\link{par_run_all_disag}}
#' 
#' @examples
#'  plot_prob_bound_norm_disag()
#' 
#' @export
plot_prob_bound_norm_disag <- function(data = NULL){
  if (!is.null(data)) {
    disag_data <- data
  }


  if(ncol(disag_data) == 4){
    disag_data <- disag_data %>%
      tidyr::pivot_wider(names_from = type,
        values_from = val) %>%
      dplyr::mutate(data = ifelse(data == "collab", "arxiv",
        as.character(data))) %>%
      dplyr::mutate(n = dplyr::case_when(
        data == "karate" ~ 34,
        data == "celegans" ~ 297,
        data == "polblog" ~ 1222,
        data == "yeast" ~ 2617,
        data == "arxiv" ~ 16726
      ))
  }

  disag_df <- disag_data %>%
    dplyr::select(data, K, n, avedis, pthresh) %>%
    dplyr::mutate(avedis = avedis / K * log(n)) %>%
    tidyr::pivot_longer(c(pthresh, avedis),
      names_to = "type", values_to = "val_normalized") %>%
    dplyr::mutate(type = forcats::as_factor(type)) %>%
    dplyr::mutate(K.n = K / n) %>%
    dplyr::mutate(Network = dplyr::case_when(
        data == "karate" ~ "Karate",
        data == "celegans" ~ "C.elegans",
        data == "polblog" ~ "Pol. blogs",
        data == "yeast" ~ "Protein inter.",
        data == "arxiv" ~ "arXiv citation")) %>%
    dplyr::mutate(Network = forcats::as_factor(stringr::str_c(Network, " (n=", n, ")")))

  flabel <- c("pthresh" = "Probability bound",
      "avedis"= "Normalized mean disag.")


  p <-  ggplot(disag_df, aes(x = K.n,
      y = val_normalized,
      color = Network, shape = Network, linetype = Network)) +
    scale_x_log10() +
    geom_point() +
    geom_line() +
    theme_bw() +
    xlab("Fraction of shuffled vertices") +
    facet_grid(type ~ ., scales = "free",
               labeller = as_labeller(flabel)) + ylab("") +
    scale_color_manual(name = "Network",
                       values = scales::hue_pal()(5))


  #pdf("Realdata-p-threshold.pdf", width = 6, height = 4)
  p
  #dev.off()
}

#' Compute statistics from shuffle_gm_sim results
#'
#' 
#' @param errors_list Generated from \code{\link{shuffle_gm_sim}}
#' @param p_grid Grid of probabilities used for above.
#' @param data name of the data
#' 
#' @export
convert_gmresults_to_df <- function(errors_list, p_grid, data = NULL) {
  errors_table <- sapply(errors_list,
                         function(errs) sapply(errs, function(e) e$errors))
  rdf <- tibble::tibble(p.channel = rep(p_grid, length(errors_list)),
        errors = c(errors_table))
  if (!is.null(data)) {
    rdf <- rdf %>% dplyr::mutate(data = data)
  }
  rdf
}


#' Plot graph matching errors as a function or error rate
#' 
#' @param data_df A tibble with columns name, n, and
#' formal_name as in \code{\link{gmmle_data}} and a column
#' results which has output from \code{\link{shuffle_gm_sim}}
#' for each graph.
#' @param p_grid Grid of error probabilities used to generate results
#' 
#' @examples
#' plot_matching_errors()
#' 
#' @export
plot_matching_errors <- function(
  data_df = NULL,
  p_grid = 10^(-seq(0.5, 4, length.out = 8))) {

  if (is.null(data_df)){
    data_df <- realnet_error_df
  }


  binded_res <- data_df %>%
    dplyr::mutate(err = results %>%
      purrr::map(convert_gmresults_to_df, p_grid)) %>%
    dplyr::select(-results) %>% tidyr::unnest(err) %>%
    dplyr::rename(Network = formal_name) %>%
    dplyr::group_by(Network, p.channel) %>%
    dplyr::summarize(
      sd = stats::sd(errors),
      se = sd/sqrt(dplyr::n()),
      errors = mean(errors))

  p <- ggplot(binded_res, aes(x=p.channel, y=errors)) +
    geom_line(aes(p.channel, errors, color=Network, linetype=Network)) +
    geom_point(aes(p.channel, errors, color=Network,  shape=Network))+
    scale_x_log10() +
    ylab("Matching accuracy (fraction of vertices)") +
    geom_errorbar(aes(ymin=errors-se, ymax=errors+se,
                      color = Network), width=.1) +
    theme_bw() +
    xlab("Channel probability (p)")

  # #pdf("Matching-accuracy.pdf", width = 6, height = 4)
  p
  # #dev.off()
}


#' Plot cumulative accurcy by normalized degree rank
#' 
#' 
#' @param data_df A tibble with columns name, n, and
#' formal_name as in \code{\link{gmmle_data}} and a column
#' results which has output from \code{\link{shuffle_gm_sim}}
#' for each graph.
#' @param p_grid Grid of error probabilities used to generate results
#' 
#' @examples
#' \dontrun{
#' # This takes a few minutes
#' plot_cum_acc_by_norm_deg()
#' }
#' @export
plot_cum_acc_by_norm_deg <- function(
    data_df = NULL,
    p_grid = 10^(-seq(0.5, 4, length.out = 8))
  ){
  if (is.null(data_df)){
    data_df <- realnet_error_df
  }
  message("Computing cumaltive errors for each match. This can take a few minutes")
  hddf <- data_df %>%
    dplyr::mutate(g = 
      purrr::map(as.character(name),graph_from_name) %>%
      purrr::map(~.[])
    ) %>%
    dplyr::mutate(hd = purrr::map2(g, results,
      highdegree_errors, p_grid)) %>%
    dplyr::select(-results, -g) %>%
    tidyr::unnest(hd)

  hddf_sum <- hddf %>%
    dplyr::mutate(p.channel = forcats::as_factor(p.channel),
      Data = forcats::as_factor(formal_name)) %>%
    dplyr::group_by(Data, n, degree_rank, p.channel) %>%
    dplyr::summarize(m = mean(errors.by.degree),
      sd = stats::sd(errors.by.degree),
      nmc = dplyr::n()) %>%
    dplyr::mutate(se = sd / sqrt(nmc)) %>%
    dplyr::ungroup()

  labels <- rev(c(0.32, 0.1, 0.032, 0.01, 0.0032, 0.001))

  p <- hddf_sum %>%
    ggplot(aes(y = m, x = degree_rank,
       color = p.channel, linetype = p.channel)) +
    geom_line() +
    ylab("Matching accuracy (fraction of vertices)") +
    xlab("Normalized degree rank") +
    scale_color_manual(
      values = scales::hue_pal(h = c(180, 270))(6),
      labels = labels, name = "p") +
    scale_linetype_manual(values = 6:1, labels = labels,  name = "p") +
    facet_grid(Data ~ .) +
    theme_bw()

  # pdf("Highdegree.pdf", width = 6, height = 6)
  p
  # dev.off()
}
dpmcsuss/gmmle documentation built on July 2, 2020, 6:24 p.m.