#' Confirm image matches in a Shiny app
#'
#' \code{confirm_matches} takes the image matches produced by
#' \code{\link{identify_matches}} and displays them in an interactive Shiny app
#' for visual inspection and confirmation. Image matches with extremely low
#' Hamming distances can be optionally excluded, and pairwise duplicates can be
#' detected and excluded as well.
#'
#' The interface presents pairs of images alongside a best guess as to the match
#' status ("Match" or "No match"). For matches which are correctly identified,
#' no further action is necessary, while incorrect identifications can be
#' corrected by clicking "Match" or "No match" next to the image pair. Images
#' are presented in batches, and at any point the user can click the "Save and
#' exit" button to close the comparison app and retrieve the results up through
#' the last batch which was viewed. This means that even extremely large sets of
#' potential matches can be manually verified over the course of several
#' sessions.
#'
#' Through the "Enable highlighting" button, specific matches can be highlighted
#' for further follow-up after image comparison is finished.
#'
#' The Shiny app will only launch in an interactive R session; if
#' \code{confirm_matches} is called in a non-interactive context, it will
#' identify identical matches according to the `thresholds` argument and return
#' only those results.
#'
#' @param result A data frame produced by \code{\link{identify_matches}}, which
#' has fields `index` (list of integer vectors), `x_sig` (matchr_signature
#' vector), `y_sig` (matchr_signature vector) and `distance` (numeric vector).
#' @param remove_duplicates A logical scalar. Should x-y pairs which are
#' identical to other x-y pairs be reduced to a single x-y pair? This step can
#' be computationally expensive for large datasets, but can dramatically reduce
#' the number of matches to be verified.
#' @param batch_size An integer scalar. The number of images to display at a
#' time in the Shiny app (default 100).
#' @param thresholds A named integer vector. Which Hamming distances establish
#' thresholds for an "Identical" match (default 2L), a "Match" (default 4L), a
#' "Likely match" (default 12L), a "Possible match" (default 15L), and "No
#' match" (remaining values)? Image pairs with a distance equal to or less than
#' the "Identical" threshold will be considered exact duplicates and will not be
#' shown for verification in the comparison app. (Set "Identical" to -1L to
#' force manual verification of all image pairs). Remaining image pairs will be
#' grouped in the comparison app by these thresholds. Image pairs with distances
#' equal to or under the "Likely match" value will be given a default value of
#' "match" in the comparison app, while others will be given a default value of
#' "no match". If `remove_duplicates` is TRUE, the "Identical" threshold will be
#' used to identify duplicated images. (I.e. if the distance between two `x` or
#' two `y` images is <= the "Identical" threshold value, the images will be
#' considered duplicates.) If `thresholds` elements are not named, their names
#' will be inferred by ordering the values from smallest to largest. If
#' `thresholds` elements are not integers, they will be silently converted to
#' integers by truncating all digits to the right of the decimal point.
#' @param previous A logical scalar. Should the results of previous runs of
#' `compare_images` be incorporated into the new results (default TRUE), or
#' should previously compared matches be compared again? If this argument is
#' TRUE, then any rows in `result` with a `confirmed` value of TRUE will be
#' removed from the data frame before processing (and so will not be present
#' in the comparison interface) and then re-added unchanged to the output.
#' @param quiet A logical scalar. Should the function execute quietly, or should
#' it return status updates throughout the function (default)?
#' @return A data frame with the following fields: `index` from the original
#' `result` data frame; a logical vector `new_match_status`, which is TRUE for
#' confirmed matches, FALSE for confirmed non-matches, and NA for matches which
#' were not confirmed; and a logical vector `new_highlight` which is TRUE for
#' any matches which were highlighted using the in-app interface, FALSE for
#' matches which were not highlighted, and NA for matches which were not
#' confirmed. Confirmation is determined by how many pages into the Shiny app
#' the user proceeded, and thus how many pairings were viewed. If all pages are
#' viewed, then the output will have no NA values.
#' @examples
#' \dontrun{
#' # Setup
#' sigs <- create_signature(test_urls)
#' matches <- match_signatures(sigs)
#' result <- identify_matches(matches)
#'
#' # Assign the output of compare_images to retrieve results
#' change_table <- confirm_matches(result)
#' }
#' @export
confirm_matches <- function(result, remove_duplicates = TRUE, batch_size = 100L,
thresholds = c("Identical" = 80L, "Match" = 100L,
"Likely match" = 120L,
"Possible match" = 150L),
previous = TRUE, quiet = FALSE) {
# Check if necessary packages are installed
if (!requireNamespace("shiny", quietly = TRUE) ||
!requireNamespace("shinyjs", quietly = TRUE)) stop(
"`compare_images` requires the \"shiny\" and \"shinyjs\" packages.",
call. = FALSE)
# Error checking and object initialization
stopifnot(is.data.frame(result), is.numeric(c(batch_size, thresholds)),
is.logical(c(remove_duplicates, previous)))
stopifnot("`result` needs `index`, `x_sig`, `y_sig` and `distance` fields." =
all(c("index", "x_sig", "y_sig", "distance") %in% names(result)))
batch_size <- as.integer(batch_size)
stopifnot("`thresholds` needs four values." = length(thresholds) == 4)
thresholds <- thresholds[order(thresholds)]
thresholds <- as.integer(thresholds)
names(thresholds) <- c("Identical", "Match", "Likely match", "Possible match")
# Exit early for zero-row input
if (nrow(result) == 0) {
output <- data.frame(index = list(), new_match_status = character(),
new_highlight = logical())
if (requireNamespace("dplyr", quietly = TRUE)) {
output <- dplyr::as_tibble(output)
}
return(output)
}
# Initialize df
output <- compare_prepare_table(result, previous, thresholds)
df <- output[[1]]
df_all <- output[[2]]
df_prev <- output[[3]]
df_cor <- output[[4]]
# Exit early if df is empty
if (nrow(df) == 0) {
out <- compare_exit(df, remove_duplicates, df_prev, df_cor, df_all)
return(out)
}
# Exit early if not interactive
if (!interactive()) {
warning("The `compare_images` tool only runs in interactive mode.")
out <- compare_exit(df, remove_duplicates, df_prev, df_cor, df_all)
return(out)
}
# Remove duplicates
if (!remove_duplicates) {
df$x_id <- NA_integer_
df$y_id <- NA_integer_
df$x_name <- get_path(df$x_sig)
df$y_name <- get_path(df$y_sig)
df <- df[c(".UID", "x_id", "y_id", "x_name", "y_name", "distance")]
df_dups <- df[0,]
df_full <- df
df$duplicates <- 0L
} else {
# Find duplicates
df <- find_duplicates(df, thresholds[["Identical"]], quiet = quiet)
# Create trimmed df table
df$x_name <- get_path(df$x_sig)
df$y_name <- get_path(df$y_sig)
df <- df[c(".UID", "x_id", "y_id", "x_name", "y_name", "distance")]
df_full <- df
df_b <- df[!is.na(df$x_id) & !is.na(df$y_id),]
df_b <- df_b[order(df_b$x_id, df_b$y_id, df_b$distance),]
df_dups <- df_b[duplicated(df_b[c("x_id", "y_id")]),]
df_b <- df_b[!duplicated(df_b[c("x_id", "y_id")]),]
df_unique <- df[is.na(df$x_id) | is.na(df$y_id),]
df <- rbind(df_b, df_unique)
# Add duplicate counts
dup_list <-
stats::aggregate(df_full, by = list(df_full$x_id, df_full$y_id), length)
dup_list <- dup_list[c("Group.1", "Group.2", "y_name")]
names(dup_list) <- c("x_id", "y_id", "duplicates")
dup_list <- dup_list[dup_list$duplicates >= 2,]
if (nrow(dup_list) > 0)
dup_list$duplicates <- dup_list$duplicates - 1L else
dup_list$duplicates <- as.integer(dup_list$duplicates)
df <- merge(df, dup_list, all = TRUE)
df$duplicates <- ifelse(is.na(df$duplicates), 0L, df$duplicates)
df <- df[order(df$.UID),]
df <- df[c(".UID", "x_id", "y_id", "x_name", "y_name", "distance",
"duplicates")]
if (requireNamespace("dplyr", quietly = TRUE)) df <- dplyr::as_tibble(df)
}
# Make summary table
summary_table <- data.frame(
category = c("Total matches", "Matches previously checked",
"Identical matches", "Matches identified as duplicates",
"Matches to check"),
value = prettyNum(
c(nrow(result), nrow(df_prev), nrow(df_cor), nrow(df_dups),
nrow(result) - nrow(df_prev) - nrow(df_dups) - nrow(df_cor)), ","))
# Prepare Shiny input
match_vector <- rep("No match", nrow(df))
match_vector[df$distance <= thresholds["Possible match"]] <- "Possible match"
match_vector[df$distance <= thresholds["Likely match"]] <- "Likely match"
match_vector[df$distance <= thresholds["Match"]] <- "Match"
dist_vector <- paste("Distance:", df$distance)
dup_vector <- paste(df$duplicates, "duplicates")
# Launch Shiny app then return results
out <- compare_images(x = df$x_name, y = df$y_name, match = match_vector,
info = summary_table, distance = dist_vector,
duplicates = dup_vector)
out <- compare_finish(out, remove_duplicates, df, df_full, df_prev, df_cor,
df_all)
return(out)
}
# ------------------------------------------------------------------------------
compare_prepare_table <- function(result, previous, thresholds) {
# Prepare result table for processing
df <- result
df$.UID <- paste0("id-", formatC(seq_len(nrow(df)), width = floor(log10(
nrow(df))) + 1, flag = "0"))
df_all <- df
# Subset table if previous is TRUE
if (previous && suppressWarnings(!is.null(df$match))) {
df_prev <- df[!is.na(df$match),]
df <- df[is.na(df$match),]
} else {
df_prev <- df[0,]
df_prev$match <- NA
df_prev$highlight <- NA
}
# Remove results with very low distance
df_cor <- df[df$distance <= thresholds["Identical"],]
df <- df[df$distance > thresholds["Identical"],]
return(list(df, df_all, df_prev, df_cor))
}
# ------------------------------------------------------------------------------
compare_finish <- function(out, remove_duplicates, df, df_full, df_prev, df_cor,
df_all) {
x_id <- y_id <- .UID <- NULL
out$.UID <- df$.UID
if (remove_duplicates) {
out$x_id <- df$x_id
out$y_id <- df$y_id
change_groups <- out[c("x_id", "y_id", "new_match_status", "new_highlight")]
change_groups <- subset(change_groups, !is.na(x_id) & !is.na(y_id))
# Get match status for de-duplicated matches
out_b <- merge(df_full, change_groups)
out_b <- out_b[c(".UID", "new_match_status", "new_highlight")]
} else out_b <- data.frame(.UID = character(), new_match_status = logical(),
new_highlight = logical())
out_a <- subset(out, !.UID %in% out_b$.UID)
out_a <- out_a[c(".UID", "new_match_status", "new_highlight")]
# Add previous results
out_prev <- df_prev[c(".UID", "match", "highlight")]
names(out_prev) <- c(".UID", "new_match_status", "new_highlight")
# Add low-distance results
out_cor <- df_cor[".UID"]
out_cor$new_match_status <- TRUE
out_cor$new_highlight <- FALSE
# Combine results
out_IDs <- rbind(out_a, out_b, out_prev, out_cor)
out <- df_all[c("index", ".UID")]
out <- merge(out, out_IDs)
# Return output
out$.UID <- NULL
if (requireNamespace("dplyr", quietly = TRUE)) out <- dplyr::as_tibble(out)
return(out)
}
# ------------------------------------------------------------------------------
compare_exit <- function(df, remove_duplicates, df_prev, df_cor, df_all) {
if (nrow(df) == 0) {
out <- data.frame(x = character(), y = character(),
new_match_status = logical(), new_highlight = logical())
df$x_id <- df$y_id <- df$distance <- df$duplicates <- integer(nrow(out))
df$x_name <- df$y_name <- character()
} else {
out <- df
out$x <- get_path(out$x_sig)
out$y <- get_path(out$y_sig)
out$new_match_status <- NA
out$new_highlight <- NA
out <- out[c("x", "y", "new_match_status", "new_highlight")]
df$x_name <- out$x
df$y_name <- out$y
df$x_id <- df$y_id <- df$duplicates <- integer(nrow(out))
}
df <- df[c(".UID", "x_id", "y_id", "x_name", "y_name", "distance",
"duplicates")]
df_full <- df
df_full$duplicates <- NULL
out <- compare_finish(out, remove_duplicates = FALSE, df, df_full, df_prev,
df_cor, df_all)
return(out)
}
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.