graphstats: Graph Statistics

#' Vertex Nomination via Seeded Graph Matching
#'
#' Finds the seeds in an \eqn{h}-hop induced nbd of \eqn{G_1} around the
#' VOI, x,that is, finds induced subgraph generated by vertices that are
#' within a path of length \eqn{h} the VOI, and then finds an \eqn{ell}-hop
#' induced nbd of \eqn{G_1} around the seeds within the \eqn{h}-hop nbd of
#' x, and an \eqn{ell}-hop induced nbd of \eqn{G_2} around the corresponding
#' seeds. Then, matches these induced subgraphs via \code{multiStart}.
#'
#' @param x vector of indices for vertices of interest (voi) in \eqn{G_1}
#' @param seeds a numeric matrix, the \eqn{m \times 2}{m*2} matching vertex table,
#' where m is the number of seeds.
#' @param g1 \eqn{G_1} in \code{igraph} object where voi is known
#' @param g2 \eqn{G_2} in \code{igraph}
#' @param h \eqn{h}-hop for distance from voi to other vertices to create
#' \eqn{h}-hop induced subgraph of \eqn{G_1}
#' @param ell \eqn{ell}-hop for distance from seeds to other vertices
#' to create \eqn{ell}-hop induced subgraph of \eqn{G_1}
#' @param R number of restarts for \code{multiStart}
#' @param gamma to be used with \code{multiStart}, max tolerance
#' for alpha, how far away from the barycenter user is willing to go for
#' the initialization of \code{sgm} on any given iteration
#' @param maxiter the number of maxiters for the Frank-Wolfe algorithm.
#' @param pad a scalar value for padding for sgm (cases where two graphs
#' have different number of vertices); defaulted to 0
#' @param verbose logical verbose outputs
#' @param plotF boolean to plot the probability matrix
#'
#' @return \code{VOI}  Vertex of Interest
#' @return \code{seeds} \code{s} seeds
#' @return \code{Sx1} \eqn{S_x} the seeds within an h-path, i.e. in the
#' h-neighborhood, of VOI x in \eqn{G_1}
#' @return \code{Sx2} \eqn{S'_x} the corresponding seeds of \eqn{S_x} in \eqn{G_2}
#' @return \code{candidates_for_matching} labels for the candidates for
#' matching VOIs in \eqn{G_2}
#' @return \code{G1_vertices} vertices within ell-neighborhood of S_x in \eqn{G_1}
#' (vertices used in vertex nomination for VOIs)
#' @return \code{G2_vertices} vertices within ell-neighborhood of S'_x in \eqn{G_2}
#' (including candidates x' for matching VOIs x in \eqn{G_1})
#' @return \code{P} matrix \eqn{P(i,j)} is the proportion of times that vertex \eqn{j} in
#' the induced subgraph of \eqn{G_2} was mapped to vertex \eqn{i} in the induced subgraph of \eqn{G_1}.
#' Then the \eqn{i-th} and \eqn{j-th} elements of the labels vector tells you which vertices these actually were
#' in \eqn{G_1} and \eqn{G_2}, respectively.
#'
#' @import igraph
#' @import plyr
#' @author Youngser Park <youngser@jhu.edu>, Kemeng Zhang <kzhang@jhu.edu>
#'
#' @references Patsolic, Heather G.; Park, Youngser; Lyzinski, Vince; Priebe, Carey E. (2017).
#' Vertex Nomination Via Local Neighborhood Matching
#' Online: \url{https://arxiv.org/abs/1705.00674}
#'
#' Fishkind, D. E., Adali, S., Priebe, C. E. (2012). Seeded Graph Matching
#' Online: \url{http://arxiv.org/abs/1209.0367}

#' @export

# Matches Graphs given a seeding of vertex correspondences
vnsgm <- function(x,seeds,g1,g2,h,ell,R,gamma,maxiter=20,pad=0,verbose=FALSE,plotF=FALSE) {

  vn.validateInput(x,seeds,g1,g2,h,ell,R,gamma,maxiter,pad,verbose,plotF)

  A <- as.matrix(get.adjacency(g1))
  B <- as.matrix(get.adjacency(g2))
  nv1<-nrow(A)
  nv2<-nrow(B)
  nv<-max(nv1,nv2)

  nsx1 <- setdiff(1:nv1,c(seeds[,1],x))
  vec <- c(seeds[,1],x,nsx1)

  AA <- A[vec,vec]
  ga <- graph_from_adjacency_matrix(AA,mode="undirected")

  ns2 <- setdiff(1:nv2,seeds[,2])
  vec2 <- c(seeds[,2],ns2)

  BB <- B[vec2,vec2]
  gb <- graph_from_adjacency_matrix(BB,mode="undirected")

  s <- nrow(seeds)
  voi <- (s+1):(s+length(x))

  vn <- vnsgm.ordered(voi,s,ga,gb,h,ell,R,gamma,maxiter,pad,verbose,plotF=FALSE)
  Sx1 = vec[vn$Sx1]
  Sx2 = vec2[vn$Sx2]
  Cx2 = vec2[vn$candidates_for_matching]
  ind1 = vec[sort(vn$G1_vertices)]
  ind2 = vec2[sort(vn$G2_vertices)]
  P_matrix = vn$P
  row.names(P_matrix) = ind1
  colnames(P_matrix) = ind2
  matched.seed = intersect(seeds[,1],ind1)
  P_matrix = P_matrix[order(ind1),order(ind2)]
  if (plotF == TRUE) {
    gg = plotP(P_matrix,matched.seed,ind1,ind2,x)
    print(gg)
  }
  return(list(VOI=x, seeds=seeds, Sx1=Sx1, Sx2=Sx2, candidates_for_matching=Cx2, G1_vertices=ind1, G2_vertices=ind2, P=P_matrix))
}

#' Vertex Nomination via Seeded Graph Matching
#'
#' Finds the seeds in an \eqn{h}-hop induced nbd of \eqn{G_1} around the
#' VOI, x,that is, finds induced subgraph generated by vertices that are
#' within a path of length \eqn{h} the VOI, and then finds an \eqn{ell}-hop
#' induced nbd of \eqn{G_1} around the seeds within the \eqn{h}-hop nbd of
#' x, and an \eqn{ell}-hop induced nbd of \eqn{G_2} around the corresponding
#' seeds. Then, matches these induced subgraphs via \code{multiStart}.
#' Assume first \eqn{s} vertices in two graphs are matched seeds.
#'
#' @param x vector of indices for vertices of interest (voi) in \eqn{G_1}
#' @param s the number of seeds.
#' @param g1 \eqn{G_1} in \code{igraph} object where voi is known
#' @param g2 \eqn{G_2} in \code{igraph}
#' @param h \eqn{h}-hop for distance from voi to other vertices to create
#' \eqn{h}-hop induced subgraph of \eqn{G_1}
#' @param ell \eqn{ell}-hop for distance from seeds to other vertices
#' to create \eqn{ell}-hop induced subgraph of \eqn{G_1}
#' @param R number of restarts for \code{multiStart}
#' @param gamma to be used with \code{multiStart}, max tolerance
#' for alpha, how far away from the barycenter user is willing to go for
#' the initialization of \code{sgm} on any given iteration
#' @param maxiter the number of maxiters for the Frank-Wolfe algorithm.
#' @param pad a scalar value for padding for sgm (cases where two graphs
#' have different number of vertices); defaulted to 0
#' @param verbose logical verbose outputs
#' @param plotF boolean to plot the probability matrix
#'
#' @return \code{VOI}  Vertex of Interest
#' @return \code{seeds} \code{s} seeds
#' @return \code{Sx1} \eqn{S_x} the seeds within an h-path, i.e. in the
#' h-neighborhood, of VOI x in \eqn{G_1}
#' @return \code{Sx2} \eqn{S'_x} the corresponding seeds of \eqn{S_x} in \eqn{G_2}
#' @return \code{candidates_for_matching} labels for the candidates for
#' matching VOIs in \eqn{G_2}
#' @return \code{G1_vertices} vertices within ell-neighborhood of S_x in \eqn{G_1}
#' (vertices used in vertex nomination for VOIs)
#' @return \code{G2_vertices} vertices within ell-neighborhood of S'_x in \eqn{G_2}
#' (including candidates x' for matching VOIs x in \eqn{G_1})
#' @return \code{P} matrix \eqn{P(i,j)} is the proportion of times that vertex \eqn{j} in
#' the induced subgraph of \eqn{G_2} was mapped to vertex \eqn{i} in the induced subgraph of \eqn{G_1}.
#' Then the \eqn{i-th} and \eqn{j-th} elements of the labels vector tells you which vertices these actually were
#' in \eqn{G_1} and \eqn{G_2}, respectively.
#'
#' @author Youngser Park <youngser@jhu.edu>, Kemeng Zhang <kzhang@jhu.edu>
#'
#' @references Patsolic, Heather G.; Park, Youngser; Lyzinski, Vince; Priebe, Carey E. (2017).
#' Vertex Nomination Via Local Neighborhood Matching
#' Online: \url{https://arxiv.org/abs/1705.00674}
#'
#' Fishkind, D. E., Adali, S., Priebe, C. E. (2012). Seeded Graph Matching
#' Online: \url{http://arxiv.org/abs/1209.0367}

#' @export

vnsgm.ordered <- function(x,s,g1,g2,h,ell,R,gamma,maxiter=20,pad=0,verbose=FALSE,plotF=FALSE) {

  vn.validateInput.ordered(x,s,g1,g2,h,ell,R,gamma,maxiter,pad,verbose,plotF)
  # Note, V = {x,S,W,J}
  # sanity check
  #    W <- intersect(V(g1),V(g2))
  #    J1 <- setdiff(V(g1),W); m1 <- length(J1)
  #    J2 <- setdiff(V(g2),W); m2 <- length(J2)
  #    W <- setdiff(W,x) # exclude x from W
  #    W <- setdiff(W,S)
  #    stopifnot(1+s+length(W)+m1 == vcount(g1))
  #    stopifnot(1+s+length(W)+m2 == vcount(g2))
  # end of sanity check
  S = 1:s
  if (length(x) == 1) {
    min.hop = min(distances(g1,x)[,S])
  } else {
    min.hop = min(apply(distances(g1,x)[,S],1,min))
  }

  possible = TRUE
  if (!is.finite(min.hop)) {
    possible = FALSE
    warning("The seeds is not within reach from vertex of interest. vnsgm failed to nominate matches for x.")
  }

  if (h < min.hop) {
    h = min.hop
    warning("The seeds is not within h neighborhood from voi. Please choose a value of h no smaller than ", min.hop)
  }

  if (possible) {
    # make h-hop nbhd for x in A
    Nh <- unlist(ego(g1,h,nodes=x,mindist=1)) # mindist=0: close, 1: open
    Sx1 <- Sx2 <- NULL
    Sx1 <- sort(intersect(Nh,S)); sx1 <- length(Sx1)
    Sx2 <- sort(intersect(V(g2),Sx1)); sx2 <- length(Sx2)

    # Find induced subgraph which is ell-neighbors of Sx1 & Sx2
    Gx1 <- sort(unique(unlist(ego(g1,ell,nodes=Sx1,mindist=0)))) # mindist=0: close, 1: open
    Gx2 <- sort(unique(unlist(ego(g2,ell,nodes=Sx2,mindist=0)))) # mindist=0: close, 1: open
    # Candidates for matching VOI in the second graph
    Cx2 <- setdiff(Gx2,S)

    #Vertices in both graphs for nominating VOIs
    ind1 <- c(Sx1,x,setdiff(Gx1,c(Sx1,x)))
    ind2 <- c(Sx2,setdiff(Gx2,Sx2))

    if (verbose) {
      cat("seed = ", Sx1, ", matching ", ind1, " and ", ind2, "\n")
    }

    A <- as.matrix(g1[][ind1,ind1])
    B <- as.matrix(g2[][ind2,ind2])
    P <- multiStart(A,B,R,length(Sx1),gamma,pad=pad,iter=maxiter)
    matched.seed = intersect(S,ind1)
    num.matched.seed = length(matched.seed)
    m = length(ind1)
    n = length(ind2)
    P_trunc = P[1:m,1:n]
    P.df = as.data.frame(P_trunc)
    row.names(P.df) = ind1
    colnames(P.df) = ind2
    P_matrix = P.df[order(ind1),order(ind2)]
    P_matrix[1:num.matched.seed,] = 0
    P_matrix[,1:num.matched.seed] = 0
    P_matrix[1:num.matched.seed,1:num.matched.seed] = diag(num.matched.seed)

    if (plotF) {
      gg = plotP(P_matrix,matched.seed,ind1,ind2,x)
      print(gg)
    }
  } else {
    ind1 <- ind2 <- P <- Cx2 <- NULL
  }

  return(list(VOI=x, seeds=S, Sx1=Sx1, Sx2=Sx2, candidates_for_matching=Cx2, G1_vertices=ind1, G2_vertices=ind2, P=P_matrix))
}

plotP <- function(P,Seed,G1_vertices,G2_vertices,x)
{
  P$vertex = sort(G1_vertices)
  P$vertex = as.factor(sort(G1_vertices))
  x.f = as.factor(x)
  P.m <- suppressMessages(melt(P))
  P.s <- plyr::ddply(P.m, .(variable), transform,
                 rescale = scale(value))
  P.s$Category <- P.s$vertex
  seed.set = as.factor(Seed)
  set1 = setdiff(G1_vertices,x.f)
  set2 = setdiff(set1,seed.set)
  set3 = setdiff(set1,set2)
  set4 = setdiff(G1_vertices,set1)
  levels(P.s$Category) <- list("VOI" = set4,
                               "Seed" = set3,
                               "Other" = set2)
  P.s$rescale = (P.s$rescale-min(P.s$rescale))/(max(P.s$rescale)-min(P.s$rescale))

  gg = ggplot(P.s, aes(variable, vertex)) +
    geom_tile(aes(alpha = rescale,fill = Category), colour = "white") +
    scale_alpha(range=c(0,1)) +
    xlab("Vertices of Graph 2") + ylab("Vertices of Graph 1") +
    labs(title = "Proportion of times Vertices in Graph 1 Matched to Vertices in Graph 2",
         alpha = "Proportion") +
    scale_x_discrete(expand = c(0, 0)) +
    scale_y_discrete(expand = c(0, 0)) +
    theme_classic(base_size = 9) +
    theme(axis.ticks = element_blank(),
    axis.text.x = element_text(angle = 0, hjust = 0))
    return(gg)
}

# Unordered function validation.
vn.validateInput <- function(x,seeds,g1,g2,h,ell,R,gamma,maxiter,pad,verbose,plotF) {

  # VOIs x
  if (!is.vector(x)) { stop("Error: Input 'x' must be a numeric vector.") }
  if (class(x) != "numeric") { stop("Error: Input 'x' must be a numeric vector.") }

  # Graphs
  if (class(g1) == "dgCMatrix") { g1 = igraph::graph_from_adjacency_matrix(g1) }
  if (class(g1) == "matrix") { g1 = igraph::graph_from_adjacency_matrix(g1) }
  if (class(g1) != 'igraph') { stop("Input object 'g1' is not an igraph object.") }
  if (class(g2) == "dgCMatrix") { g2 = igraph::graph_from_adjacency_matrix(g2) }
  if (class(g2) == "matrix") { g2 = igraph::graph_from_adjacency_matrix(g2) }
  if (class(g2) != 'igraph') { stop("Input object 'g2' is not an igraph object.") }

  # Seeds
  if (class(seeds) != "matrix" || ncol(seeds) != 2 || !all(seeds%%1 == 0)) {
    stop("Error: Input 'seeds' must be an m by 2 matrix of corresponding seed indices.")
  }
  if (nrow(seeds) > vcount(g1)) { stop("Error: Number of seeds is greater than number of vertices in g1.") }
  if (nrow(seeds) > vcount(g2)) { stop("Error: Number of seeds is greater than number of vertices in g2.") }

  # VOIs x and Seeds non-overlap
  if (sum(is.element(x, seeds[,1])) != 0) { stop("Error: Input 'x' contains elements from 'seeds'.")}

  # h
  if (!is.numeric(h)) { stop("Error: Input 'h' must be a positive number.")}
  if (length(h) != 1) { stop("Error: Input 'h' must be a positive number.")}
  if (h <= 0) { stop("Error: Input 'h' must be a positive number.")}

  # ell
  if (!is.numeric(ell)) { stop("Error: Input 'ell' must be a positive number.")}
  if (length(ell) != 1) { stop("Error: Input 'ell' must be a positive number.")}
  if (ell <= 0) { stop("Error: Input 'ell' must be a positive number.")}

  # R
  if (!is.numeric(R)) { stop("Error: Input 'R' must be a number.")}
  if (length(R) != 1) { stop("Error: Input 'R' must be a number.")}
  if (R <= 0) { stop("Error: Input 'R' must be a positive number.")}

  # gamma
  if (!is.numeric(gamma)) { stop("Error: Input 'gamma' must be a number.")}
  if (length(gamma) != 1) { stop("Error: Input 'gamma' must be a number.")}
  if (gamma < 0 || gamma > 1) { stop("Error: Input 'gamma' must be between 0 and 1.")}

  # maxiter
  if (!is.numeric(maxiter)) { stop("Error: Input 'maxiter' must be a positive number.")}
  if (length(maxiter) != 1) { stop("Error: Input 'maxiter' must be a positive number.")}
  if (maxiter <= 0) { stop("Error: Input 'maxiter' must be a positive number.")}

  # Padding
  if (!is.numeric(pad)) { stop("Error: Input 'pad' must be a number.")}

  # Verbosity
  if (!is.logical(verbose)) { stop("Error: Input 'verbose' must be a logical.")}

  # plotF
  if (!is.logical(plotF)) { stop("Error: Input 'plotF' must be a logical.")}

}

# Ordered function validation.
vn.validateInput.ordered <- function(x,s,g1,g2,h,ell,R,gamma,maxiter,pad,verbose,plotF) {

  # VOIs x
  if (!is.vector(x)) { stop("Error: Input 'x' must be a numeric vector.") }
  if (!(class(x) == "numeric" || class(x) == "integer")) { stop("Error: Input 'x' must be a numeric vector.") }

  # s
  if (!is.numeric(s)) { stop("Error: Input 's' must be a positive number.")}
  if (length(s) != 1) { stop("Error: Input 's' must be a positive number.")}
  if (s <= 0) { stop("Error: Input 's' must be a positive number.")}

  # VOIs x and Seeds non-overlap
  if (sum(x < s) != 0) { stop("Error: Input 'x' contains elements from 'seeds'.")}

  # Graphs
  if (class(g1) == "dgCMatrix") { g1 = igraph::graph_from_adjacency_matrix(g1) }
  if (class(g1) == "matrix") { g1 = igraph::graph_from_adjacency_matrix(g1) }
  if (class(g1) != 'igraph') { stop("Input object 'g1' is not an igraph object.") }
  if (class(g2) == "dgCMatrix") { g2 = igraph::graph_from_adjacency_matrix(g2) }
  if (class(g2) == "matrix") { g2 = igraph::graph_from_adjacency_matrix(g2) }
  if (class(g2) != 'igraph') { stop("Input object 'g2' is not an igraph object.") }

  # h
  if (!is.numeric(h)) { stop("Error: Input 'h' must be a positive number.")}
  if (length(h) != 1) { stop("Error: Input 'h' must be a positive number.")}
  if (h <= 0) { stop("Error: Input 'h' must be a positive number.")}

  # ell
  if (!is.numeric(ell)) { stop("Error: Input 'ell' must be a positive number.")}
  if (length(ell) != 1) { stop("Error: Input 'ell' must be a positive number.")}
  if (ell <= 0) { stop("Error: Input 'ell' must be a positive number.")}

  # R
  if (!is.numeric(R)) { stop("Error: Input 'R' must be a number.")}
  if (length(R) != 1) { stop("Error: Input 'R' must be a number.")}
  if (R <= 0) { stop("Error: Input 'R' must be a positive number.")}

  # gamma
  if (!is.numeric(gamma)) { stop("Error: Input 'gamma' must be a number.")}
  if (length(gamma) != 1) { stop("Error: Input 'gamma' must be a number.")}
  if (gamma < 0 || gamma > 1) { stop("Error: Input 'gamma' must be between 0 and 1.")}

  # maxiter
  if (!is.numeric(maxiter)) { stop("Error: Input 'maxiter' must be a positive number.")}
  if (length(maxiter) != 1) { stop("Error: Input 'maxiter' must be a positive number.")}
  if (maxiter <= 0) { stop("Error: Input 'maxiter' must be a positive number.")}

  # Padding
  if (!is.numeric(pad)) { stop("Error: Input 'pad' must be a number.")}

  # Verbosity
  if (!is.logical(verbose)) { stop("Error: Input 'verbose' must be a logical.")}

  # plotF
  if (!is.logical(plotF)) { stop("Error: Input 'plotF' must be a logical.")}

}