SBT: Fits the Bradley-Terry Model to Potentially Large and Sparse Networks of Comparison Data

Documented in btdata summary.btdata

# Converts a graph representation of wins into a square matrix.
graph_to_matrix <- function(g) {

  # check that graph is a directed igraph object
  if(!igraph::is.igraph(g))  stop("g must be a directed igraph object")
  if(!igraph::is.directed(g))  stop("g must be a directed igraph object")

  # check names
  if(!is.null(igraph::V(g)$name)) {

    arg <- deparse(substitute(g))

    if(anyDuplicated(igraph::V(g)$name) > 0) stop(paste0("Vertex names must be unique. Consider fixing with V(", arg, ")$name <- make.names(V(", arg, ")$name, unique = TRUE)"))
  }

  if (igraph::is.weighted(g)) W <- igraph::as_adjacency_matrix(g, sparse = TRUE, attr = "weight", names = TRUE)
  else W <- igraph::as_adjacency_matrix(g, sparse = TRUE, names = TRUE)

  return(W)

}

# Converts a data frame of paired results into a square matrix.
pairs_to_matrix <- function(df) {
  # Check for Matrix.utils
  if (!requireNamespace("Matrix.utils", quietly = TRUE)) {
    stop("The package Matrix.utils is needed for this function to work. Please install it.",
         call. = FALSE)
  }

  # Check for stringr
  if (!requireNamespace("stringr", quietly = TRUE)) {
    stop("The package stringr is needed for this function to work. Please install it.",
         call. = FALSE)
  }

  # check if data frame
  if(!(is.data.frame(df))) stop ("Argument must be a data frame")

  # ensure df is a data.frame (rather than tbl_df or tbl)
  class(df) <- "data.frame"

  # check number of columns
  if (!(ncol(df) %in% 3:4 )) stop("Argument must be a data frame with three or four columns")

  # get base data
  items <- sort(base::union(df[[1]], df[[2]]))
  n <- length(items)

  # get formula for dMcast
  f <- stats::as.formula(paste(names(df)[1:2], collapse= " ~ "))

  # convert names to factors
  if(!is.factor(df[,1])) {
    df[,1] <- factor(df[,1])
  }

  if(!is.factor(df[,2])) {
    df[,2] <- factor(df[,2])
  }
  
  # create empty mat if all zeros in column 3
  if(all(df[,3] == 0)) {
    mat <- Matrix::Matrix(0, n, n, sparse = TRUE)
  }
  
  # create matrix with wins from column 3
  else {
    # create cross-tabs matrix (not square)
    mat <- Matrix.utils::dMcast(df, f, value.var = names(df)[3], as.factors = TRUE)
  
    # fix colnames
    colnames(mat) <- stringr::str_replace(colnames(mat), names(df)[2], "")
  
    # remove zeros, if any, taking care with dimnames
    summary_mat <- Matrix::summary(mat)
    x <- NULL # hack to avoid CRAN note
    if (any(summary_mat[,3] == 0)) {
      summary_mat <- dplyr::filter(summary_mat, x != 0)
  
      mat_rownames <- rownames(mat)
      mat_colnames <- colnames(mat)
  
      new_mat_rownames <- mat_rownames[sort(unique(summary_mat[,1]))]
      new_mat_colnames <- mat_colnames[sort(unique(summary_mat[,2]))]
  
      mat <- Matrix::sparseMatrix(i = summary_mat[,1], j = summary_mat[,2], x = summary_mat[,3])
  
      nonzero_rows <- which(Matrix::rowSums(mat) != 0)
      nonzero_cols <- which(Matrix::colSums(mat) != 0)
  
      mat <- mat[nonzero_rows, nonzero_cols, drop = FALSE]
      dimnames(mat) <- list(new_mat_rownames, new_mat_colnames)
    }
  
  
    # add in zeros for missing rows
    if (nrow(mat) < n) {
      new_rows <- Matrix::Matrix(0, n - nrow(mat), ncol(mat),
                                 dimnames = list(base::setdiff(items, rownames(mat)), colnames(mat)))
      mat <- rbind(mat, new_rows)
    }
  
    # add in zeros for missing columns
    if (ncol(mat) < n) {
      new_cols <- Matrix::Matrix(0, n, n - ncol(mat),
                                 dimnames = list(rownames(mat), base::setdiff(items, colnames(mat))))
      mat <- cbind(mat, new_cols)
    }
  
    # get rows and columns in same, sorted order and return
    mat <- mat[items,]
    mat <- mat[, rownames(mat)]
  }

  # repeat above steps if in 4-column format (for item2 beating item1)
  # as long as col 4 isn't all zeros
  if (ncol(df) == 4) fourth_all_zero <- all(df[,4] == 0)
  else fourth_all_zero <- TRUE
  
  if (ncol(df) == 4 & !fourth_all_zero) {
    f2 <- stats::as.formula(paste(names(df)[2:1], collapse= " ~ "))
    mat2 <- Matrix.utils::dMcast(df, f2, value.var = names(df)[4], as.factors = TRUE)
    colnames(mat2) <- stringr::str_replace(colnames(mat2), names(df)[1], "")


    # remove zeros, if any, taking care with dimnames
    summary_mat2 <- Matrix::summary(mat2)
    if (any(summary_mat2[,3] == 0)) {
      summary_mat2 <- dplyr::filter(summary_mat2, x != 0)

      mat2_rownames <- rownames(mat2)
      mat2_colnames <- colnames(mat2)

      new_mat2_rownames <- mat2_rownames[sort(unique(summary_mat2[,1]))]
      new_mat2_colnames <- mat2_colnames[sort(unique(summary_mat2[,2]))]

      mat2 <- Matrix::sparseMatrix(i = summary_mat2[,1], j = summary_mat2[,2], x = summary_mat2[,3])

      nonzero_rows2 <- which(Matrix::rowSums(mat2) != 0)
      nonzero_cols2 <- which(Matrix::colSums(mat2) != 0)

      mat2 <- mat2[nonzero_rows2, nonzero_cols2, drop = FALSE]
      dimnames(mat2) <- list(new_mat2_rownames, new_mat2_colnames)
    }

    # add in zeros for missing rows
    if (nrow(mat2) < n) {
      new_rows2 <- Matrix::Matrix(0, n - nrow(mat2), ncol(mat2),
                                  dimnames = list(base::setdiff(items, rownames(mat2)), colnames(mat2)))
      mat2 <- rbind(mat2, new_rows2)
    }

    # add in zeros for missing columns
    if (ncol(mat2) < n) {
      new_cols2 <- Matrix::Matrix(0, n, n - ncol(mat2),
                                  dimnames = list(rownames(mat2), base::setdiff(items, colnames(mat2))))
      mat2 <- cbind(mat2, new_cols2)
    }

    # get rows and columns in same, sorted order and return
    mat2 <- mat2[items,]
    mat2 <- mat2[, rownames(mat2)]

    # add the result to mat
    mat <- mat + mat2
  }

  if(!is.null(colnames(df)[1]) & !is.null(colnames(df)[2])) names(dimnames(mat)) <- colnames(df)[1:2]

  return(mat)
}



#' Create a btdata object
#'
#' Creates a btdata object, primarily for use in the \link{btfit} function.
#'
#' The \code{x} argument to \code{btdata} can be one of four types:
#'
#' \itemize{
#'
#' \item{A matrix (either a base \code{matrix}) or a class from the \code{Matrix} package), dimension \eqn{K} by \eqn{K}, where \eqn{K} is the number of items. The \emph{i,j}-th element is \eqn{w_{ij}}, the number of times item \eqn{i} has beaten item \eqn{j}. Ties can be accounted for by assigning half a win (i.e. 0.5) to each item.}
#' \item{A contingency table of class \code{table}, similar to the matrix described in the above point.}
#' \item{An \code{igraph}, representing the \emph{comparison graph}, with the \eqn{K} items as nodes. For the edges:
#' \itemize{
#' \item{If the graph is unweighted, a directed edge from node \eqn{i} to node \eqn{j} for every time item \eqn{i} has beaten item \eqn{j}}
#' \item{If the graph is weighted, then one edge from node \eqn{i} to node \eqn{j} if item \eqn{i} has beaten item \eqn{j} at least once, with the weight attribute of that edge set to the number of times \eqn{i} has beaten \eqn{j}.}
#' }}
#' \item{
#' If \code{x} is a data frame, it must have three or four columns:
#' \itemize{
#' \item{3-column data frame}{The first column contains the name of the winning item, the second column contains the name of the losing item and the third columns contains the number of times that the winner has beaten the loser. Multiple entries for the same pair of items are handled correctly. If \code{x} is a three-column dataframe, but the third column gives a code for who won, rather than a count, see \code{\link{codes_to_counts}}.}
#' \item{4-column data frame}{The first column contains the name of item 1, the second column contains the name of item 2, the third column contains the number of times that item 1 has beaten item 2 and the fourth column contains the number of times item 2 has beaten item 1. Multiple entries for the same pair of items are handled correctly. This kind of data frame is also the output of \code{\link{codes_to_counts}}.}
#' \item{In either of these cases, the data can be aggregated, or there can be one row per comparison.}
#' \item{Ties can be accounted for by assigning half a win (i.e. 0.5) to each item.}
#' }
#' }
#'
#' }
#'
#' \code{summary.btdata} shows the number of items, the density of the \code{wins} matrix and whether the underlying comparison graph is fully connected. If it is not fully connected, \code{summary.btdata} will additional show the number of fully-connected components and a table giving the frequency of components of different sizes. For more details on the comparison graph, and how its structure affects how the Bradley-Terry model is fitted, see \code{\link{btfit}} and the vignette: \url{https://ellakaye.github.io/BradleyTerryScalable/articles/BradleyTerryScalable.html}.
#'
#' @param x The data, which is either a three- or four-column data frame, a directed igraph object, a square matrix or a square contingency table. See Details.
#' @param return_graph Logical. If TRUE, an igraph object representing the comparison graph will be returned.
#' @return An object of class "btdata", which is a list containing:
#' \item{wins}{A \eqn{K} by \eqn{K} square matrix, where \eqn{K} is the total number of players. The \eqn{i,j}-th element is \eqn{w_{ij}}, the number of times item \eqn{i} has beaten item \eqn{j}. If the items in \code{x} are unnamed, the wins matrix will be assigned row and column names 1:K.}
#' \item{components}{A list of the fully-connected components.}
#' \item{graph}{The comparison graph of the data (if return_graph = TRUE). See Details.}
#' @seealso \code{\link{codes_to_counts}} \code{\link{select_components}}
#' @author Ella Kaye
#' @examples
#' citations_btdata <- btdata(BradleyTerryScalable::citations)
#' summary(citations_btdata)
#' toy_df_4col <- codes_to_counts(BradleyTerryScalable::toy_data, c("W1", "W2", "D"))
#' toy_btdata <- btdata(toy_df_4col)
#' summary(toy_btdata)
#' @export
btdata <- function(x, return_graph = FALSE) {

  # if x is a table, convert it to a matrix
  if (is.table(x)) {
    attr(x, "class") <- NULL
    attr(x, "call") <- NULL
  }

  # if x is a df
  if (is.data.frame(x)) {
    if (!(ncol(x) %in% 3:4 )) stop("If x is a dataframe, it must have 3 or 4 columns.")
    wins <- pairs_to_matrix(x)
    g <- igraph::graph.adjacency(wins, weighted = TRUE, diag = FALSE)
  }

  # if x is a graph
  else if (igraph::is.igraph(x)) {
    if(!igraph::is.directed(x))  stop("If x is a graph, it must be a directed igraph object")

    # check for names
    if(!is.null(igraph::V(x)$name)) {

      arg <- deparse(substitute(x))

      if(anyDuplicated(igraph::V(x)$name) > 0) stop(paste0("If x is a graph, vertex names must be unique. Consider fixing with V(", arg, ")$name <- make.names(V(", arg, ")$name, unique = TRUE)"))
    }

    wins <- graph_to_matrix(x)
    g <- x
  }

  else if ((methods::is(x, "Matrix") | is.matrix(x) )) {

    # check dimensions/content
    if (dim(x)[1] != dim(x)[2]) stop("If x is a matrix or table, it must be a square")
    if(is.matrix(x)) {if (!is.numeric(x)) stop("If x is a matrix or table, all elements must be numeric")}
    if(methods::is(x, "Matrix")) {if (!is.numeric(as.vector(x))) stop("If x is a matrix or table, all elements must be numeric")}
    if (any(x < 0)) stop("If x is a matrix or table, all elements must be non-negative")
    if(!identical(rownames(x), colnames(x))) stop("If x is a matrix or table, rownames and colnames of x should be the same")
    if (anyDuplicated(rownames(x)) > 0) {

      arg <- deparse(substitute(x))
      stop("If x is a matrix or table with row- and column names, these must be unique. Consider fixing with rownames(", arg, ") <- colnames(", arg, ") <- make.names(rownames(", arg, "), unique = TRUE)")
    }

    # ensure wins is a dgCMatrix
    if (is.matrix(x)) wins <- Matrix::Matrix(x, sparse = TRUE)
    else wins <- x
    if (class(wins) != "dgCMatrix") wins <- methods::as(wins, "dgCMatrix")
    g <- igraph::graph.adjacency(wins, weighted = TRUE, diag = FALSE)
  }

  else stop("x must be a 3 or 4 column dataframe, a directed igraph object, or square matrix or contingency table.")


  ## get components
  comp <- igraph::components(g, mode = "strong")
  components <- igraph::groups(comp)

  # name the rows and columns of the wins matrix, if NULL
  if (is.null(unlist(dimnames(wins)))) {
    K <- nrow(wins)
    dimnames(wins) <- list(1:K, 1:K)
  }

  # return
  result <- list(wins = wins, components = components)
  if (return_graph) result$graph <- g
  class(result) <- c("btdata", "list")
  result
}

#' @rdname btdata
#' @param object An object of class "btdata", typically the result \code{ob} of \code{ob <- btdata(..)}.
#' @param ... Other arguments
#' @export
summary.btdata <- function(object, ...){
  if (!inherits(object, "btdata")) stop("object should be a 'btdata' object")
  K <- nrow(object$wins)
  num_comps <- length(object$components)
  connected <- num_comps == 1
  components_greater_than_one <- Filter(function(x) length(x) > 1, object$components)
  my_tab <- table(sapply(object$components, length))
  my_df <- as.data.frame(my_tab)

  colnames(my_df) <- c("Component size", "Freq")

  density <- Matrix::mean(object$wins != 0)

  cat("Number of items:", K, "\n")
  cat("Density of wins matrix:", density, "\n")
  cat("Fully-connected:", connected, "\n")


  if (num_comps > 1) {
    cat("Number of fully-connected components:", num_comps, "\n")
    cat("Summary of fully-connected components: \n")
    print(my_df)
  }
}

EllaKaye/SBT documentation built on May 31, 2022, 8:28 p.m.

rdrr.io home R language documentation Run R code online

CRAN packages Bioconductor packages R-Forge packages GitHub packages

Note that we can't provide technical support on individual packages. You should contact the package authors for that.

EllaKaye/SBT
Fits the Bradley-Terry Model to Potentially Large and Sparse Networks of Comparison Data

R/btdata.R
In EllaKaye/SBT: Fits the Bradley-Terry Model to Potentially Large and Sparse Networks of Comparison Data

Defines functions summary.btdata btdata pairs_to_matrix graph_to_matrix

Documented in btdata summary.btdata

R Package Documentation

Browse R Packages

We want your feedback!

EllaKaye/SBT Fits the Bradley-Terry Model to Potentially Large and Sparse Networks of Comparison Data

R/btdata.R In EllaKaye/SBT: Fits the Bradley-Terry Model to Potentially Large and Sparse Networks of Comparison Data

Defines functions summary.btdata btdata pairs_to_matrix graph_to_matrix

Documented in btdata summary.btdata

R Package Documentation

Browse R Packages

We want your feedback!

EllaKaye/SBT
Fits the Bradley-Terry Model to Potentially Large and Sparse Networks of Comparison Data

R/btdata.R
In EllaKaye/SBT: Fits the Bradley-Terry Model to Potentially Large and Sparse Networks of Comparison Data