R/similarities.R

Defines functions projection_direction bonacich_norm sim_method dist_sim_matrix jaccard similarity_option co_ocurrence fractional_approach

Documented in bonacich_norm co_ocurrence dist_sim_matrix fractional_approach jaccard

#' Fractional approach
#'
#' Matrix transformation from incidence matrices to citation networks, fractional counting for co-citation or fractional counting for bibliographic coupling
#'
#' @param A1   From incidence matrix (e.g. paper and authors)
#' @param A2   To incidence matrix (e.g. author to paper)
#' @param approach    Character string, \dQuote{citation}, \dQuote{cocitation} and \dQuote{bcoupling}
#'
#' @return Return a type of "citation network"
#'
#' @references
#'
#' Batagelj, V. (2020). Analysis of the Southern women network using fractional approach. Social Networks, 68, 229-236 \url{https://doi.org/10.1016/j.socnet.2021.08.001}
#'
#' Batagelj, V., & Cerinšek, M. (2013). On bibliographic networks. Scientometrics, 96(3), 845–864. \url{https://doi.org/10.1007/s11192-012-0940-1}
#'
#' @author Alejandro Espinosa-Rada
#'
#' @examples
#'
#' A1 <- matrix(c(
#'   1, 0, 0, 0,
#'   0, 1, 0, 0,
#'   0, 1, 1, 1,
#'   0, 0, 0, 0,
#'   0, 0, 0, 1
#' ), byrow = TRUE, ncol = 4)
#'
#' A2 <- matrix(c(
#'   1, 1, 1, 0, 0,
#'   0, 0, 1, 0, 0,
#'   0, 0, 1, 1, 0,
#'   0, 0, 0, 1, 1
#' ), byrow = TRUE, ncol = 5)
#'
#' fractional_approach(A1, A2)
#' @export

fractional_approach <- function(A1, A2, approach = c("citation", "cocitation", "bcoupling")) {
  A1 <- as.matrix(A1)
  A2 <- as.matrix(A2)

  similarity <- switch(similarity_option(approach),
    "citation" = 1,
    "cocitation" = 2,
    "bcoupling" = 3
  )

  Ci <- A1 %*% A2
  Ci <- t(Ci) %*% Ci

  # Citation Networks
  if (similarity == 1) {
    return(Ci)
  }

  # Co-citations
  if (similarity == 2) {
    D <- ifelse(rowSums(Ci) > 0, rowSums(Ci), 1)
    D <- diag(1 / D)
    Cin <- t(D %*% Ci)
    coCit <- Cin %*% Ci
    return(coCit)
  }

  # Bibliographic coupling
  if (similarity == 3) {
    D <- ifelse(rowSums(Ci) > 0, rowSums(Ci), 1)
    D <- diag(1 / D)
    biCo <- Ci %*% t(Ci)
    biC <- D %*% biCo
    return(biC)
  }
}

#' Co‐occurrence
#'
#' Co‐occurrence matrix based on overlap function
#'
#' @param A  A matrix
#' @param similarity  The similarities available are either \code{Ochiai} (default) or \code{cosine}.
#' @param occurrence  Whether to treat the matrix as a two-mode structure (a.k.a. rectangular matrix, occurrence matrix, affiliation matrix, bipartite network)
#' @param projection Whether to apply a projection (inner product multiplication) to the matrix
#'
#' @return This function returns the normalisation of a matrix into a symmetrical co‐occurrence matrix
#'
#' @references
#'
#' Borgatti, S. P., Halgin, D. S., 2011. Analyzing affiliation networks. In: J. Scott and P. J. Carrington (Eds.) The Sage handbook of social network analysis (pp. 417-433), Sage.
#'
#' Zhou, Q., & Leydesdorff, L. (2016). The normalization of occurrence and Co-occurrence matrices in bibliometrics using Cosine similarities and Ochiai coefficients. Journal of the Association for Information Science and Technology, 67(11), 2805–2814. \url{https://doi.org/10.1002/asi.23603}
#'
#' @author Alejandro Espinosa-Rada

#' @examples
#'
#' A <- matrix(
#'   c(
#'     2, 0, 2,
#'     1, 1, 0,
#'     0, 3, 3,
#'     0, 2, 2,
#'     0, 0, 1
#'   ),
#'   nrow = 5, byrow = TRUE
#' )
#'
#' co_ocurrence(A)
#' @export

co_ocurrence <- function(A, similarity = c("ochiai", "cosine"),
                         occurrence = TRUE, projection = FALSE) {
  A <- as.matrix(A)
  similarity <- switch(similarity_option(similarity),
    "ochiai" = 1,
    "cosine" = 2
  )

  ### Occurrence matrix
  if (occurrence) {
    # OCHIAI
    if (similarity == 1) {
      Di <- rowSums(A)
      Dj <- colSums(A)
      Ab <- (t(A) %*% A)
      diag(Ab) <- colSums(A) # impute diagonal
      return(Ab / (sqrt(outer(Dj, Dj, "*"))))
    }
    # COSINE
    if (similarity == 2) {
      # coOC <- t(A) %*% A
      # D <- diag(coOC)
      # return(coOC / (sqrt(outer(D, D, "*"))))

      return((t(A) %*% A) / (sqrt(outer(colSums(A^2), colSums(A^2), "*"))))
    }
  }

  ### Co-occurrence matrix based on inner product
  if (!occurrence) {
    IN <- (t(A) %*% A)

    # OCHIAI:
    if (projection) {
      if (similarity == 1) {
        Di <- rowSums(A^2)
        Dj <- colSums(A^2)
        return(IN / (sqrt(outer(Dj, Dj, "*"))))
      }

      # COSINE:
      if (similarity == 2) {
        INb <- IN
        Di <- rowSums(INb^2)
        Dj <- colSums(INb^2)
        return((IN %*% t(IN)) / (sqrt(outer(Di, Dj, "*"))))
      }
    }
    ### Co-occurrence matrix based on minmax_overlap function/OCHIAI
    # COSINE:
    if (!projection) {
      # OCHIAI:
      if (similarity == 1) {
        D <- colSums(A)
        return(OVER / (sqrt(outer(D, D, "*"))))
      }

      if (similarity == 2) {
        OVER <- minmax_overlap(A, row = FALSE)
        OVERb <- OVER
        Di <- rowSums(OVERb^2)
        Dj <- colSums(OVERb^2)
        return(OVER %*% t(OVER) / (sqrt(outer(Di, Dj, "*"))))
      }
    }
  }
}

similarity_option <- function(arg, choices, several.ok = FALSE) {
  if (missing(choices)) {
    formal.args <- formals(sys.function(sys.parent()))
    choices <- eval(formal.args[[deparse(substitute(arg))]])
  }

  arg <- tolower(arg)
  choices <- tolower(choices)

  match.arg(arg = arg, choices = choices, several.ok = several.ok)
}

#' Jaccard similarity
#'
#' Jaccard similarity identifies the changes of ties between two matrices.
#'
#' @param A  Binary matrix A
#' @param B  Binary matrix B
#' @param directed  Whether the matrix is symmetric
#' @param diag  Whether the diagonal should be considered
#' @param coparticipation  Select nodes that co-participate in both matrices
#' @param bipartite  Whether the matrix is incidence
#'
#' @return The output are: \code{jaccard} = Jaccard similarity, \code{proportion} =
#' proportion among the ties present at a given observation of ties that
#' are also present in the other matrix, and \code{table} = a table with the
#' tie changes between matrices.
#'
#' If \code{coparticipation = TRUE}, then
#' also: \code{match} = The number of nodes present in both matrices;
#'  \code{size_matrix1} = The size of the first matrix;
#'  \code{size_matrix2} = The size of the second matrix;
#' \code{coparticipation1} = The percentage of nodes in the first matrix also present in the second matrix;
#' \code{coparticipation2} = The percentage of nodes in the second matrix also present in the first matrix:
#' \code{overlap_actors} = Overlap of nodes between two matrices
#'
#' #' If \code{coparticipation = TRUE} and \code{bipartite = TRUE}, then
#' also: \code{matchM1} = The number of nodes in the first 'mode' present in both matrices;
#' \code{matchM2} = The number of nodes in the second 'mode' present in both matrices;
#'  \code{size_matrix1_M1} = The number of nodes in the first 'mode' of the first matrix;
#'  \code{size_matrix1_M2} = The number of nodes in the second 'mode' of the first matrix;
#'  \code{size_matrix2_M1} = The number of nodes in the first 'mode' of the second matrix;
#'  \code{size_matrix2_M2} = The number of nodes in the second 'mode' of the second matrix;
#' \code{coparticipation1_M2} = The percentage of nodes of the first 'mode' in the first matrix present in the second matrix.
#' \code{coparticipation1_M2} = The percentage of nodes of the second 'mode' in the first matrix present in the second matrix.
#' \code{coparticipation2_M1} = The percentage of nodes of the first 'mode' in the second matrix present in the first matrix.
#'  \code{coparticipation2_M2} = The percentage of nodes of the second 'mode' in the second matrix present in the first matrix.
#' \code{overlap_actors_M1} = Overlap between two matrices (nodes of the first 'mode')
#' \code{overlap_actors_M2} = Overlap between two matrices (nodes of the second 'mode')
#'
#' @references
#'
#' Batagelj, V., and Bren, M. (1995). Comparing resemblance measures. Journal of Classification 12, 73–90.
#'
#'
#' @author Alejandro Espinosa-Rada

#' @examples
#'
#' A <- matrix(c(
#'   0, 1, 1, 0,
#'   1, 0, 0, 0,
#'   1, 0, 0, 0,
#'   0, 0, 1, 0
#' ), byrow = TRUE, ncol = 4)
#' B <- matrix(c(
#'   0, 1, 1, 0,
#'   1, 0, 0, 0,
#'   1, 0, 0, 0,
#'   0, 0, 0, 0
#' ), byrow = TRUE, ncol = 4)
#' jaccard(A, B, directed = TRUE)
#' @export

# TODO: expand for n periods
# TODO: expand for other similarities

jaccard <- function(A, B, directed = TRUE, diag = FALSE,
                    coparticipation = FALSE, bipartite = FALSE) {
  A <- as.matrix(A)
  B <- as.matrix(B)
  if (any(abs(A > 1), na.rm = TRUE)) stop("The matrix should be binary")
  if (any(abs(B > 1), na.rm = TRUE)) stop("The matrix should be binary")

  if (coparticipation) {
    if (!bipartite) {
      if (all(rownames(A) != colnames(A))) stop("The names of rows and columns do not match")
      if (all(rownames(B) != colnames(B))) stop("The names of rows and columns do not match")

      n1t <- ncol(A)
      n2t <- ncol(B)
      name1 <- rownames(A) %in% rownames(B)
      name1 <- rownames(A)[name1 == TRUE]
      A <- A[rownames(A) %in% name1, rownames(A) %in% name1]
      B <- B[rownames(B) %in% name1, rownames(B) %in% name1]

      n1 <- ncol(A)
      n2 <- ncol(B)
    } else {
      # bipartite
      n1_at <- nrow(A)
      n1_bt <- ncol(A)
      n2_at <- nrow(B)
      n2_bt <- ncol(B)

      name1a <- rownames(A) %in% rownames(B)
      name1a <- rownames(A)[name1a == TRUE]

      name1b <- colnames(A) %in% colnames(B)
      name1b <- colnames(A)[name1b == TRUE]

      A <- A[rownames(A) %in% name1a, colnames(A) %in% name1b]
      B <- B[rownames(B) %in% name1a, colnames(B) %in% name1b]

      n1a <- nrow(A)
      n1b <- ncol(A)
    }
  }

  if (bipartite) {
    if (!coparticipation) {
      if (ncol(A) != ncol(B)) {
        stop("The matrices have different dimensions")
      } else {
        if (all(rownames(A) != rownames(B))) stop("The names of nodes do not match")
      }
      if (nrow(A) != nrow(B)) {
        stop("The matrices have different dimensions")
      } else {
        if (all(colnames(B) != colnames(B))) stop("The names of nodes do not match")
      }
    }

    t <- table(A, B, useNA = c("always"))
  } else {
    if (!directed) {
      t <- table(A[lower.tri(A, diag = diag)], B[lower.tri(B, diag = diag)])
    } else {
      if (all(A[lower.tri(A)] == t(A)[lower.tri(A)])) message("The matrix is symmetric")
      A <- c(A[lower.tri(A, diag = diag)], A[upper.tri(A, diag = diag)])
      B <- c(B[lower.tri(B, diag = diag)], B[upper.tri(B, diag = diag)])
      t <- table(A, B, useNA = c("always"))
    }
  }
  n11 <- t[2, 2]
  n10 <- t[2, 1]
  n01 <- t[1, 2]
  n00 <- t[1, 1]

  if (coparticipation) {
    if (!bipartite) {
      return(list(
        jaccard = n11 / (n10 + n01 + n11),
        proportion = n11 / (n10 + n11),
        table = t,
        coparticipation = cbind(
          match = n1,
          size_matrix1 = n1t,
          size_matrix2 = n2t,
          coparticipation1 = n1 / n1t,
          coparticipation2 = n2 / n2t,
          overlap_actors = ((n1 / n1t + n1 / n2t) / 2)
        )
      ))
    } else {
      # bipartite
      return(list(
        jaccard = n11 / (n10 + n01 + n11),
        proportion = n11 / (n10 + n11),
        table = t,
        coparticipation = cbind(
          matchM1 = n1a, # match mode 1
          matchM2 = n1b, # match mode 2

          size_matrix1_M1 = n1_at, # size matrix rows A
          size_matrix1_M2 = n1_bt, # size matrix columns A
          size_matrix2_M1 = n2_at, # size matrix rows B
          size_matrix2_M2 = n2_bt, # size matrix columns B

          coparticipation1_M1 = n1a / n1_at,
          coparticipation1_M2 = n1b / n1_bt,
          coparticipation2_M1 = n1a / n2_at,
          coparticipation2_M2 = n1b / n2_bt,
          overlap_actors_M1 = ((n1a / n1_at + n1a / n2_at) / 2), # overlap
          overlap_actors_M2 = ((n1b / n1_bt + n1b / n2_bt) / 2)
        )
      ))
    }
  } else {
    return(list(
      jaccard = n11 / (n10 + n01 + n11),
      proportion = n11 / (n10 + n11),
      table = t
    ))
  }
}

#' Structural similarities
#'
#' In the literature of social network, Euclidean distance (Burt, 1976) or correlations (Wasserman and Faust, 1994) were considered as measures of structural equivalence.
#'
#' @param A  A matrix
#' @param method  The similarities/distance currently available are either \code{Euclidean} (default), \code{Hamming}, or \code{Jaccard}.
#' @param bipartite  Whether the object is an incidence matrix
#'
#' @return This function returns a distance matrix between nodes of the same matrix.
#'
#' @references
#'
#' Burt, Ronald S. (1976) Positions in networks. Social Forces, 55(1): 93-122.
#'
#' Wasserman, S. and Faust, K. (1994). Social network analysis: Methods and applications. Cambridge University Press.
#'
#' @author Alejandro Espinosa-Rada
#'
#' @examples
#' A <- matrix(c(
#'   0, 1, 0, 0, 1,
#'   0, 0, 0, 1, 1,
#'   0, 1, 0, 0, 1,
#'   0, 0, 1, 1, 0,
#'   0, 1, 0, 0, 0
#' ), nrow = 5, ncol = 5, byrow = TRUE)
#' rownames(A) <- letters[1:nrow(A)]
#' colnames(A) <- rownames(A)
#' dist_sim_matrix(A, method = "jaccard")
#'
#' A <- matrix(c(
#'   0, 0, 3, 0, 5,
#'   0, 0, 2, 0, 4,
#'   5, 4, 0, 4, 0,
#'   0, 3, 0, 1, 0,
#'   0, 0, 0, 0, 2
#' ), nrow = 5, ncol = 5, byrow = TRUE)
#' dist_sim_matrix(A, method = "euclidean")
#' @export

# TODO: Expand for more than one matrix

dist_sim_matrix <- function(A, method = c("euclidean", "hamming", "jaccard"),
                            bipartite = FALSE) {
  A <- as.matrix(A)
  if (!bipartite) {
    if (ncol(A) != nrow(A)) message("The object is an incidence matrix. The `bipartite=TRUE` parameter should be specified.")
  }

  method <- switch(sim_method(method),
    "euclidean" = 1,
    "hamming" = 2,
    "jaccard" = 3
  )
  profile <- list()
  profile2 <- list()

  if (method == 1) { # euclidean

    if (bipartite == TRUE) {
      for (i in 1:nrow(A)) {
        for (j in i:nrow(A)) {
          profile[[j]] <- sqrt(sum((A[i, ] - A[j, ])^2))
        }
        profile2[[i]] <- unlist(profile)
      }
      m1 <- do.call(rbind, profile2)
      m1[lower.tri(m1)] <- t(m1)[lower.tri(m1)] # Symmetrize
      return(m1)
    } else {
      for (i in 1:nrow(A)) {
        for (j in 1:ncol(A)) {
          profile[[j]] <- sqrt(sum((A[i, ] - A[j, ])^2))
        }
        profile2[[i]] <- unlist(profile)
      }
      m1 <- do.call(rbind, profile2)
      return(m1)
    }
  }

  if (method == 2) { # hamming

    if (bipartite == TRUE) {
      for (i in 1:nrow(A)) {
        for (j in i:ncol(A)) {
          profile[[j]] <- sum(A[i, ] != A[j, ])
        }
        profile2[[i]] <- unlist(profile)
      }
      m1 <- do.call(rbind, profile2)
      m1[lower.tri(m1)] <- t(m1)[lower.tri(m1)] # Symmetrize
      return(m1)
    } else {
      for (i in 1:nrow(A)) {
        for (j in 1:ncol(A)) {
          profile[[j]] <- sum(A[i, ] != A[j, ])
        }
        profile2[[i]] <- unlist(profile)
      }
      m1 <- do.call(rbind, profile2)
      return(m1)
    }
  }

  if (method == 3) { # jaccard
    if (bipartite == TRUE) {
      for (i in 1:nrow(A)) {
        for (j in i:nrow(A)) {
          t <- table(A[i, ], A[j, ])
          n11 <- t[2, 2]
          n10 <- t[2, 1]
          n01 <- t[1, 2]
          n00 <- t[1, 1]
          profile[[j]] <- n11 / (n11 + n01 + n10)
        }
        profile2[[i]] <- unlist(profile)
      }
      m1 <- do.call(rbind, profile2)
      m1[lower.tri(m1)] <- t(m1)[lower.tri(m1)] # Symmetrize

      return(1 - m1)
    } else {
      for (i in 1:nrow(A)) {
        for (j in 1:ncol(A)) {
          t <- table(A[i, ], A[j, ])
          n11 <- t[2, 2]
          n10 <- t[2, 1]
          n01 <- t[1, 2]
          n00 <- t[1, 1]
          profile[[j]] <- n11 / (n11 + n01 + n10)
        }
        profile2[[i]] <- unlist(profile)
      }
      m1 <- do.call(rbind, profile2)
      return(1 - m1)
    }
  }
}

sim_method <- function(arg, choices, several.ok = FALSE) {
  if (missing(choices)) {
    formal.args <- formals(sys.function(sys.parent()))
    choices <- eval(formal.args[[deparse(substitute(arg))]])
  }

  arg <- tolower(arg)
  choices <- tolower(choices)

  match.arg(arg = arg, choices = choices, several.ok = several.ok)
}


#' Bonacich normalization
#'
#' The function provide a normalisation provided by Bonacich (1972).
#'
#' @param A  An incidence matrix
#' @param projection  Whether to normalise by \code{rows} (default), or \code{columns} of the matrix.
#' @param normalisation  Normalise the measure
#'
#' @return This function returns the Bonacich normalisation.
#'
#' @references
#'
#' Bonacich, P. (1972). Factoring and weighting approaches to status scores and clique identification. Journal of Mathematical Sociology, 2: 112-120.
#'
#' @source Adapted from Borgatti, S., Everett, M., Johnson, J. and Agneessens, P. (2022) Analyzing Social Networks Using R. Sage.
#'
#' @examples
#' A <- matrix(
#'   c(
#'     1, 1, 1, 1, 1, 1, 0, 1, 1, 0, 0, 0, 0, 0,
#'     1, 1, 1, 0, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0,
#'     0, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0,
#'     1, 0, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0,
#'     0, 0, 1, 1, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0,
#'     0, 0, 1, 0, 1, 1, 0, 1, 0, 0, 0, 0, 0, 0,
#'     0, 0, 0, 0, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0,
#'     0, 0, 0, 0, 0, 1, 0, 1, 1, 0, 0, 0, 0, 0,
#'     0, 0, 0, 0, 1, 0, 1, 1, 1, 0, 0, 0, 0, 0,
#'     0, 0, 0, 0, 0, 0, 1, 1, 1, 0, 0, 1, 0, 0,
#'     0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 0, 1, 0, 0,
#'     0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 0, 1, 1, 1,
#'     0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 0, 1, 1, 1,
#'     0, 0, 0, 0, 0, 1, 1, 0, 1, 1, 1, 1, 1, 1,
#'     0, 0, 0, 0, 0, 0, 1, 1, 0, 1, 1, 1, 0, 0,
#'     0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0,
#'     0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 1, 0, 0, 0,
#'     0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 1, 0, 0, 0
#'   ),
#'   byrow = TRUE, ncol = 14
#' )
#' bonacich_norm(A)
#' @export

bonacich_norm <- function(A, projection = c("rows", "columns"),
                          normalisation = FALSE) {
  projection <- switch(projection_direction(projection),
    "rows" = 1,
    "columns" = 2
  )

  if (projection == 1) {
    P <- matrix_projection(A)[[2]]
    n <- ncol(A)
  }
  if (projection == 2) {
    P <- matrix_projection(A)[[1]]
    n <- nrow(A)
  }

  M <- matrix(0, nrow(P), ncol(P))
  for (i in 1:nrow(P)) {
    for (j in i:ncol(P)) {
      temp1 <- P[i, j] * (n + P[i, j] - P[i, i] - P[j, j])
      temp2 <- (P[i, i] - P[i, j]) * (P[j, j] - P[i, j])

      if (temp1 == temp2) {
        M[i, j] <- 0.5
      } else {
        M[i, j] <- (temp1 - sqrt(temp1 * temp2)) / (temp1 - temp2)
      }
      M[j, i] <- M[i, j]
    }
    M[i, i] <- 1
  }

  if (projection == 1) {
    if (!is.null(rownames(A))) {
      rownames(M) <- rownames(A)
      colnames(M) <- rownames(A)
    }
  }
  if (projection == 2) {
    if (!is.null(colnames(A))) {
      rownames(M) <- colnames(A)
      colnames(M) <- colnames(A)
    }
  }
  if (normalisation) {
    M <- M * 100
  }

  return(M)
}


projection_direction <- function(arg, choices, several.ok = FALSE) {
  if (missing(choices)) {
    formal.args <- formals(sys.function(sys.parent()))
    choices <- eval(formal.args[[deparse(substitute(arg))]])
  }

  arg <- tolower(arg)
  choices <- tolower(choices)

  match.arg(arg = arg, choices = choices, several.ok = several.ok)
}
anespinosa/netmem documentation built on April 5, 2025, 5:02 p.m.