R/forEvaluation.R

#' #' Find prototypes given clustering, and radius (maximum distance to prototype)
#' #'
#' #' Given pairwise similarities and links (a clustering), find prototypes for
#' #' each cluster and maximum distance to prototype for that cluster. The output
#' #' is a data frame with one row representing one cluster, and the metric max
#' #' minimax radius for the given clustering is given by max(out$minimaxRadius).
#' #'
#' #' @param allPairwise name of data frame containing all pairwise comparisons.
#' #'   This needs to have at least four columns, one representing the first item
#' #'   in the comparison, one representing the second item, one representing
#' #'   whether the pair is linked in the given clustering, and the last
#' #'   representing a distance or similarity metric. These are enumerated in the
#' #'   next three parameters.
#' #' @param distSimCol name of column in `allPairwise` indicating distances or
#' #'   similarities, input as character, e.g. "l2dist". If this is a similarity
#' #'   and not a difference, input `myDist` parameter to be FALSE. If a similarity
#' #'   measure is used, distance will be calcualted as 1 - similarity.
#' #' @param linkCol name of column in `allPairwise` with links, input as
#' #'   character, e.g. "minimax0.4"
#' #' @param pairColNums vector of length 2 indicating the column numbers in
#' #'   `allPairwise` of 1. item 1 in comparison, 2. item 2 in comparison
#' #' @param myDist is `distSimCol` a distance or similarity measure? Default TRUE,
#' #'   i.e. distance measure
#' #'
#' #' @return data frame with columns `cluster`, `minimaxRadius`, `prototype`. The
#' #'   metric max minimax radius for the given clustering is given by
#' #'   max(out$minimaxRadius)
#' #'
#' #' @importFrom magrittr "%>%"
#' #' @importFrom dplyr group_by summarize
#' #' @export
#'
#' distToPrototype <- function(allPairwise, distSimCol, linkCol, pairColNums, myDist = TRUE) {
#'     myClusts <- getClust(allPairwise, linkCol, pairColNums)
#'
#'     myClusts$maxRadius <- NA # maximum radius if this item is the prototype of its cluster
#'     for (j in 1:nrow(myClusts)) { # if item i is the prototype
#'       tmp <- allPairwise[allPairwise[, linkCol] == 1 & (allPairwise[, pairColNums[1]] == myClusts$item[j] | allPairwise[, pairColNums[2]] == myClusts$item[j]), distSimCol]
#'
#'       if (length(tmp) > 0) { # if item[j] is in a cluster
#'         if (myDist == FALSE) {
#'           tmp <- 1 - tmp
#'         }
#'         myClusts$maxRadius[j] <- max(tmp)
#'       } else { # if item[j] is a singleton
#'         myClusts$maxRadius[j] <- 0
#'       }
#'     }
#'
#'     `%>%` <- magrittr::`%>%`
#'     out <- myClusts %>% dplyr::group_by(cluster) %>% dplyr::summarize(minimaxRadius = min(maxRadius), prototype = item[which.min(maxRadius)])
#'
#'     return(out)
#' }
#'
#' #' Going from links to clusters
#' #'
#' #' Given pairwise links as generated by `makeLinkCol()`, produce a list of
#' #' individual items and their cluster membership
#' #'
#' #' @param allPairwise name of data frame containing all pairwise comparisons.
#' #'   This needs to have at least four columns, one representing the first item
#' #'   in the comparison, one representing the second item, and one representing
#' #'   whether the pair is linked in the given clustering. These are enumerated in
#' #'   the next two parameters.
#' #' @param linkCol name of column in `allPairwise` with links (a binary indicator
#' #'   for whether the pair is linked after hierarchical clustering), input as
#' #'   character, e.g. "minimax0.4"
#' #' @param pairColNums vector of length 2 indicating the column numbers in
#' #'   `allPairwise` of 1. item 1 in comparison, 2. item 2 in comparison
#' #'
#' #' @return data frame with two columns: `item`, the name of the item, and
#' #'   `cluster`, the cluster number the item is a member of
#' #' @export
#'
#' getClust <- function(allPairwise, linkCol, pairColNums) {
#'     distMat <- longToSquare(allPairwise, pairColNums, linkCol, myDist = FALSE) # here dist is for linkCol
#'     distObj <- as.dist(distMat)
#'
#'     hcluster <- hclust(distObj, method = "single") # doesn't matter because everything is already linked properly
#'     set.seed(0)
#'     clustersAll <- cutree(hcluster, h = .5)
#'
#'     hashes <- unique(c(allPairwise[, pairColNums[1]], allPairwise[, pairColNums[2]]))
#'     hashes <- sort(hashes)
#'     # names(clustersAll) <- hashes
#'
#'     outClusters <- data.frame(item = hashes, cluster = clustersAll, stringsAsFactors = FALSE)
#'     rownames(outClusters) <- NULL
#'
#'     return(outClusters)
#' }
#'
#' #' Calculate misclassification rate given pairs, model prediction and true match
#' #' status
#' #'
#' #' @param allPairwise name of data frame containing all pairwise comparisons.
#' #'   This needs to have at least two columns, one representing whether the pair
#' #'   is linked in the given clustering, and one representing the true
#' #'   match/non-match status. These are enumerated in the next two parameters.
#' #' @param linkCol name of column in `allPairwise` with links, input as
#' #'   character, e.g. "minimax0.4"
#' #' @param matchColNum column number of column in `allPairwise` indicating true
#' #'   match/non-match status
#' #'
#' #' @return misclassification rate
#' #' @export
#'
#' misclassRate <- function(allPairwise, linkCol, matchColNum) {
#'     out <- sum(allPairwise[, linkCol] != allPairwise[, matchColNum])/nrow(allPairwise)
#'     return(out)
#' }

#' Calculate precision and recall given pairs, model prediction and true match
#' status
#'
#' @param allPairwise name of data frame containing all pairwise comparisons.
#'   This needs to have at least two columns, one representing whether the pair
#'   is linked in the given clustering, and one representing the true
#'   match/non-match status. These are enumerated in the next two parameters.
#' @param linkCol name of column in `allPairwise` with links, input as
#'   character, e.g. "minimax0.4"
#' @param matchColNum column number of column in `allPairwise` indicating true
#'   match/non-match status
#'
#' @return list with two items, `precision` and `recall`
#' @export

precisionRecall <- function(allPairwise, linkCol, matchColNum) {
    numerator <- sum(allPairwise[, linkCol] >= .5 & allPairwise[, matchColNum] == 1) # preds are 0 or 1 so doesn't matter that i used .5
    denom <- sum(allPairwise[, linkCol] >= .5)
    if (denom == 0) {
      precision <- 1
    } else {
      precision <- numerator/denom
    }
    recall <- numerator/sum(allPairwise[, matchColNum] == 1)

    out <- list(precision = precision, recall = recall)
    return(out)
}
xhtai/cartridges documentation built on June 1, 2019, 2:58 p.m.