# The handwriter R package performs writership analysis of handwritten documents.
# Copyright (C) 2021 Iowa State University of Science and Technology on behalf of its Center for Statistics and Applications in Forensic Evidence
#
# This program is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program. If not, see <https://www.gnu.org/licenses/>.
# EXPORTED ----------------------------------------------------------------
#' Format Template Data
#'
#' `format_template_data()` formats the template data for use with
#' [`plot_cluster_fill_counts()`]. The output is a list that contains a data frame
#' called `cluster_fill_counts`.
#'
#' @param template A single cluster template created by
#' [`make_clustering_template()`]
#' @return List that contains the cluster fill counts
#'
#' @examples
#' template_data <- format_template_data(template = example_cluster_template)
#' plot_cluster_fill_counts(formatted_data = template_data, facet = TRUE)
#'
#' @export
#' @md
format_template_data <- function(template) {
writer <- doc <- cluster <- count <- NULL
# make dataframe
counts <- data.frame("writer" = template$writers, "doc" = template$doc, "cluster" = template$cluster)
# get cluster fill counts
counts <- counts %>%
dplyr::group_by(writer, doc, cluster) %>%
dplyr::summarize(count = dplyr::n())
# make a column for each cluster
counts <- counts %>%
tidyr::pivot_wider(names_from = cluster, values_from = count, values_fill = 0)
# sort cluster columns
sorted <- as.character(sort(unique(as.integer(colnames(counts[, -c(1, 2)])))))
if ("-1" %in% colnames(counts)){ # outliers
sorted <- sorted[sorted != -1]
counts <- cbind(counts[, c(1, 2)], counts[, "-1"], counts[, sorted])
} else { # no outliers
counts <- cbind(counts[, c(1, 2)], counts[, sorted])
}
# make list
data <- list("cluster_fill_counts" = counts)
return(data)
}
# Internal Functions ------------------------------------------------------
#' Format Model Data
#'
#' `format_model_data()` formats the data need for the rjags model.
#'
#' @param model_clusters Data frame of cluster assignments for a set of model
#' training documents created by `get_clusterassignment()`
#' @param writer_indices Vector of start and end indices for the writer id in
#' the document names.
#' @param doc_indices Vector of start and end indices for the document id in the
#' document names.
#' @param a Scalar
#' @param b Scalar
#' @param c Scalar
#' @param d Scalar
#' @param e Scalar
#' @return List of data formatted for rjags.
#'
#' @noRd
format_model_data <- function(model_clusters, writer_indices, doc_indices, a = 2, b = 0.25, c = 2, d = 2, e = 0.5) {
# bind global variable to fix note
cluster <- NULL
graph_measurements <- model_clusters
# if clusters aren't numbered sequentially, relabel them
if (length(unique(graph_measurements$cluster)) < max(graph_measurements$cluster)) {
graph_measurements <- graph_measurements %>% dplyr::rename("original_cluster" = cluster)
cluster_lookup <- data.frame("original_cluster" = sort(unique(graph_measurements$original_cluster)), "cluster" = 1:length(unique(graph_measurements$original_cluster)))
graph_measurements <- graph_measurements %>% dplyr::left_join(cluster_lookup, by = "original_cluster")
}
# get cluster fill counts ----
# count number of graphs in each cluster for each writer
cluster_fill_counts <- get_cluster_fill_counts(graph_measurements[, c("docname", "writer", "doc", "cluster")])
# format data for rjags ----
rjags_data <- list(
Y = cluster_fill_counts[, -c(1, 2, 3)], # multinomial data
G = ncol(cluster_fill_counts[, -c(1, 2, 3)]), # number of clusters (40)
D = nrow(cluster_fill_counts[, -c(1, 2, 3)]), # total number of documents
W = length(unique(cluster_fill_counts$writer)), # number of unique writers
# docwise
docN = as.integer(apply(cluster_fill_counts[, -c(1, 2, 3)], FUN = sum, MARGIN = 1)), # number of letters in each doc, e.g. N[1] = 354
docwriter = as.factor(cluster_fill_counts$writer), # vector of writers
# letterwise
zero_vec = rep(0, times = length(graph_measurements$pc_wrapped)),
Gsmall = length(unique(graph_measurements$cluster)), # number of clusters (20)
numletters = length(graph_measurements$pc_wrapped), # total number of letters
pc_wrapped = graph_measurements$pc_wrapped, # principal component rotation observations
letterwriter = as.factor(graph_measurements$writer), # vector of writers for each letter
lettercluster = as.integer(graph_measurements$cluster), # vector of cluster assignments, one for each letter
zero_mat = matrix(0, nrow = length(unique(cluster_fill_counts$writer)), ncol = length(unique(graph_measurements$cluster))),
a = a, b = b, c = c, d = d, e = e
)
data <- list(
"graph_measurements" = graph_measurements,
"cluster_fill_counts" = cluster_fill_counts,
"rjags_data" = rjags_data
)
return(data)
}
#' Format Questioned Data
#'
#' `format_questioned_data()` formats the questioned data for analysis with the
#' hierarchical model.
#'
#' @param model A fitted model created by [`fit_model`]
#' @param questioned_clusters Data frame of cluster assignments for a set of
#' questioned documents created by `get_clusterassignment()`
#' @param writer_indices Vector of start and end indices for the writer id in
#' the document names.
#' @param doc_indices Vector of start and end indices for the document id in the
#' document names.
#' @return List of data formatted analysis.
#'
#' @noRd
format_questioned_data <- function(model, questioned_clusters, writer_indices, doc_indices) {
# bind global variable to fix check() note
original_cluster <- cluster <- NULL
graph_measurements <- questioned_clusters
# if model clusters were relabeled, relabel the questioned clusters
if (any(names(model$graph_measurements) == "original_cluster")) {
# make lookup table from model cluster data
cluster_lookup <- model$graph_measurements %>%
dplyr::select(original_cluster, cluster) %>%
dplyr::distinct()
# store clusters as old clusters
graph_measurements <- graph_measurements %>%
dplyr::rename("original_cluster" = cluster)
# get new cluster labels. NOTE: adds NA if questioned doc(s) use
# clusters that model doc(s) did not
graph_measurements <- graph_measurements %>%
dplyr::left_join(cluster_lookup, by = "original_cluster")
# if more than 10% of cluster values are NA, throw an error. Otherwise,
# remove any rows with NA for cluster from graph_measurements
count_nas <- sum(is.na(graph_measurements$cluster))
if (100*count_nas / nrow(graph_measurements) > 10) {
stop("More than 10% of graph_measurements$cluster have NA values.")
} else if (count_nas > 0 && (100*count_nas / nrow(graph_measurements) <= 10)) {
graph_measurements <- graph_measurements %>%
dplyr::filter(!is.na(cluster))
}
}
# get cluster fill counts ----
# count number of graphs in each cluster for each writer
cluster_fill_counts <- get_cluster_fill_counts(graph_measurements[, c("docname", "writer", "doc", "cluster")])
# check for missing clusters
if (ncol(cluster_fill_counts) < ncol(model$cluster_fill_counts)){
# zero data frame
full_cluster_fill_counts <- as.data.frame(matrix(0, nrow = nrow(cluster_fill_counts), ncol = ncol(model$cluster_fill_counts)))
# fill column names
colnames(full_cluster_fill_counts) <- colnames(model$cluster_fill_counts)
# fill writers and docs and docnames
full_cluster_fill_counts$writer <- cluster_fill_counts$writer
full_cluster_fill_counts$doc <- cluster_fill_counts$doc
full_cluster_fill_counts$docname <- cluster_fill_counts$docname
# add missing columns
full_cluster_fill_counts <- dplyr::left_join(cluster_fill_counts, full_cluster_fill_counts) %>%
dplyr::mutate(dplyr::across(dplyr::where(is.numeric), ~ tidyr::replace_na(.x, 0)))
# sort columns
cols <- c(colnames(full_cluster_fill_counts[, c(1, 2, 3)]), sort(as.numeric(colnames(full_cluster_fill_counts[, -c(1, 2, 3)]))))
full_cluster_fill_counts <- full_cluster_fill_counts[, cols]
# rename
cluster_fill_counts <- full_cluster_fill_counts
}
data <- list(
"graph_measurements" = graph_measurements,
"cluster_fill_counts" = cluster_fill_counts
)
return(data)
}
#' angle
#'
#' `angle()` gives a value in (-pi,pi), where negative values come from unit vectors below the x axis (kinda weird/not traditional)
#' https://stackoverflow.com/questions/1897704/angle-between-two-vectors-in-r
#'
#' @param N a vector
#' @param M a vector
#' @return an angle value in (-pi,pi)
#'
#' @noRd
angle <- function(N, M) {
theta <- atan2(N[2], N[1]) - atan2(M[2], M[1])
ifelse(theta > 0, as.numeric(theta), theta + pi)
return(theta)
}
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.