# The handwriter R package performs writership analysis of handwritten documents.
# Copyright (C) 2021 Iowa State University of Science and Technology on behalf of its Center for Statistics and Applications in Forensic Evidence
#
# This program is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program. If not, see <https://www.gnu.org/licenses/>.
# Internal Functions ------------------------------------------------------
addGraphFeatures <- function(comps, img, dims) {
# For each component in comps, adds features, such as apsect info and centroid info, for each graph in the component
n <- length(comps)
for (i in 1:n){
if (length(comps[[i]]$paths$graphList) > 0){
comps[[i]]$paths$graphList <- addGraphFeaturesForComponent(img = img,
graphList = comps[[i]]$paths$graphList,
graphs = comps[[i]]$paths$graphs,
dims = dims)
}
}
return(comps)
}
addGraphFeaturesForComponent <- function(img, graphList, graphs, dims) {
# adds features, such as apsect info and centroid info, for each graph
featureSets <- extract_character_features(img, graphList, dims)
for (i in 1:length(graphs))
{
graphList[[i]]$characterFeatures <- featureSets[[i]]
}
letterPlaces <- matrix(unlist(lapply(featureSets, FUN = function(x) {
c(x$line_number, x$order_within_line)
})), ncol = 2, byrow = TRUE)
letterOrder <- order(letterPlaces[, 1], letterPlaces[, 2])
graphList <- graphList[letterOrder]
return(graphList)
}
#' extract_character_features
#'
#' Parses all characters from a processed image.
#'
#' @param img The thinned image bitmap
#' @param character_lists Output from processHandwriting$letterLists
#' @param dims Dimensions of binary image
#' @return nested lists associating features to respective characters.
#'
#' @noRd
extract_character_features <- function(img, character_lists, dims){
character_features = list()
for(i in 1:length(character_lists)){
cur_features = char_to_feature(character_lists[[i]], dims, i)
character_features = append(character_features, list(cur_features))
}
character_features = add_updown_neighboring_char_dist(character_features, character_lists, img, dims)
character_features = add_line_info(character_features, dims)
character_features = nov_neighboring_char_dist(character_features)
character_features = add_covariance_matrix(character_lists, character_features, dims)
return(character_features)
}
#' char_to_feature
#'
#' Secondary driver of feature extraction
#' Extracts features from a single character
#'
#' @param character character to extract information from
#' @param dims Dimensions of binary image
#' @param uniqueid Unique numerical reference to character
#' @return List containing features of character
#'
#' @noRd
char_to_feature = function(character, dims, uniqueid){
aspect_info = get_aspect_info(character$path,dims)
centroid_info = get_centroid_info(character$path,dims)
features = c(aspect_info, centroid_info)
#persistent index for sorting/rearranging the features list
features$uniqueid = uniqueid
return(features)
}
#' get_aspect_info
#'
#' Extracts aspect ratio & supporting information from a character
#' Relevant Features:
#' Aspect Ratio: Row (Height) over (Column Width)
#' Height, Width (Each measure of pixels)
#' The rest are supporting features that are minor independently.
#'
#' @param character character to extract information from
#' @param dims Dimensions of binary image
#' @return List containing aspect_ratio,
#'
#' @noRd
get_aspect_info = function(character, dims)
{
rowcol = i_to_rci(character,dims)
rows_y = rowcol[,'y']
cols_x = rowcol[,'x']
row_dist = max(rows_y) - min(rows_y) + 1 #vertical distance
col_dist = max(cols_x) - min(cols_x) + 1 #horizontal distance
aspect_info = list(aspect_ratio = row_dist/col_dist,height = row_dist, width = col_dist,topmost_row = min(rows_y),bottom_row = max(rows_y),leftmost_col=min(cols_x),rightmost_col=max(cols_x))
return(aspect_info)
}
#' get_centroid_info
#'
#' Extracts centroid & supporting information from a character
#' Relevant Features:
#' Centroid Index: R Index representation of centroid location
#' Centroid x,y: X,Y representations of the centroid, see ?i_to_rci
#' Centroid Horiz Location: How far along horizontally (Represented as a number between 0 and 1) the centroid is in its respective character.
#' Centroid Vertical Location: How far along vertically (Represented as a number between 0 and 1) the centroid is in its respective character.
#' Slope: 'Letter Lean', slope found between the centroids of each disjoint half in a single character.
#' The letter is split in half, each halve's centroid is calculated independently, the slope is taken between the two.
#' Box Density: (Dimensions of box around letter width height) / (how much of the document it covers) //Might be a more document as opposed to letter based feature
#' Pixel Density: Ratio of black to white pixels found in box drawn around the letter.
#'
#' @param character character to extract information from
#' @param dims Dimensions of binary image
#' @return List containing centroid, pixel density,letter 'lean', and all supporting information
#'
#' @noRd
get_centroid_info = function(character, dims)
{
rowcol = i_to_rci(character,dims)
rows_y = rowcol[,'y']
cols_x = rowcol[,'x']
centroid_row = mean(rows_y)
centroid_col = mean(cols_x)
row_dist = max(rows_y) - min(rows_y) + 1 #vertical distance
col_dist = max(cols_x) - min(cols_x) + 1 #horizontal distance
centroid_index = rc_to_i(centroid_row,centroid_col,dims)
#relative density: draw a box around the letter, ratio of black to white pixels in the box
r_density = length(character)/(row_dist*col_dist)
#box density: dimensions of box around letter / how much of the document it covers
box_density = (row_dist*col_dist) / (dims[1]*dims[2])
centroid_horiz_location = (centroid_col-min(cols_x) + 1) / col_dist
centroid_vert_location = (centroid_row-min(rows_y) + 1) / row_dist
#used for getting skew, assuming centroid is more middle than the median col_x
#probably can be removed, I just want nic to be able to plot them to determine if its an appropriate 'split' in the letter
lHalf = list(rows_y = rows_y[which(cols_x<centroid_col)],cols_x = cols_x[which(cols_x<centroid_col)])
rHalf = list(rows_y = rows_y[which(cols_x>centroid_col)],cols_x = cols_x[which(cols_x>centroid_col)])
lHalfCentroidrc = list(y=mean(lHalf$rows_y),x=mean(lHalf$cols_x))
rHalfCentroidrc = list(y=mean(rHalf$rows_y),x=mean(rHalf$cols_x))
lHalfCentroidi = rc_to_i(mean(lHalf$rows_y),mean(lHalf$cols_x),dims)
rHalfCentroidi = rc_to_i(mean(rHalf$rows_y),mean(rHalf$cols_x),dims)
#indices of each half
lHi = rc_to_i(lHalf$rows_y,lHalf$cols_x,dims)
rHi = rc_to_i(rHalf$rows_y,rHalf$cols_x,dims)
#finding slope, in case of long letters like e in csafe maybe account length?
#errrrr does the y need a +1
slope = ((dims[1] - rHalfCentroidrc$y)-(dims[1] - lHalfCentroidrc$y))/(rHalfCentroidrc$x-lHalfCentroidrc$x+1)
if(length(lHalf[[1]]) == 0 & length(rHalf[[1]]) == 0)
{
slope = 0
}
lHalfCentroid = rc_to_i(mean(lHalf$rows_y),mean(lHalf$cols_x),dims)
centroid_info = list(centroid_index = centroid_index, centroid_y = centroid_row, centroid_x = centroid_col, centroid_horiz_location = centroid_horiz_location,centroid_vert_location = centroid_vert_location,lHalf = lHi,rHalf=rHi,disjoint_centroids = list(left = lHalfCentroidi,right = rHalfCentroidi),slope = slope, pixel_density = r_density,box_density = box_density)
return(centroid_info)
}
#' add_covariance_matrix
#'
#' @param character_lists Output from processHandwriting$letterLists
#' @param character_features Nested lists associating features to respective characters.
#' @param dims Dimensions of binary image
#' @return nested lists associating features to respective characters.
#'
#' @noRd
add_covariance_matrix <- function(character_lists, character_features, dims){
for(i in 1:length(character_lists)){
matrix = i_to_rc(character_lists[[i]]$path, dims)
x = matrix[,2]
y = matrix[,1]
y = dims[1] - y #FLIPS Y VALUE SO IT REPS A REAL COORD PLANE
variance_of_x = stats::var(x)
variance_of_y = stats::var(y)
covariance_of_xy = stats::cov(x,y)
#Add Covariance to the character features
character_features[[i]]$xvar = variance_of_x
character_features[[i]]$yvar = variance_of_y
character_features[[i]]$covar = covariance_of_xy
}
return(character_features)
}
#' add_line_info
#'
#' Associates characters to their respective line numbers
#' Needs improvement if runtime becomes a problem
#'
#' @param character_features All extracted features
#' @param dims Dimensions of binary image
#' @return Appends line information to character features
#'
#' @noRd
add_line_info <- function(character_features,dims){
line_info = line_number_extract(all_down_dists(character_features), all_centroids(character_features), dims)
line_order = lapply(line_info, sort)
for(i in 1:length(character_features)){
cur_letter_index = character_features[[i]]$centroid_index
for(j in 1:length(line_info)){
if(cur_letter_index %in% line_info[[j]]){
character_features[[i]] = c(character_features[[i]],list(line_number = j, order_within_line = which(line_order[[j]] == cur_letter_index)))
}
}
}
return(character_features)
}
#Return a list of the distances from the top of a character to the first thing above it
add_updown_neighboring_char_dist <- function(character_features, character_lists, img, dims){
#For each character
for(i in 1:length(character_lists)){
down_distance = Inf
#Get the lowest point as an index point
lowest_point = character_lists[[i]]$path[[1]]
rci = i_to_rci(character_lists[[i]]$path, dims)
min_y_sorted = rci[order(rci[,1],decreasing=TRUE),]
lowest_index = min_y_sorted[[1,3]]
row = min_y_sorted[[1,1]]
col = min_y_sorted[[1,2]]
#go down until hit another index (don't go past the bottom)
cur_row = row + 1
while(cur_row <= dims[1]){
index_to_check = rc_to_i(cur_row, col, dims)
if(img[[cur_row, col]] == 0){
down_distance = cur_row-row
break;
}
cur_row = cur_row + 1
}
#Do the math on the difference
character_features[[i]]$down_dist = down_distance
}
return(character_features)
}
# Principle: Appending inside of a nested loop
# NOTE: Uses distances between centroids, NOT right edge to left edge
nov_neighboring_char_dist <- function(character_features){
by_line = character_features_by_line(character_features)
for(line in 1:length(by_line)){
for(i in 1:length(by_line[[line]])){
cur_char = by_line[[line]][[i]]
l_neighbor_centroid_dist = NULL
r_neighbor_centroid_dist = NULL
if(i != 1){
prev_char = by_line[[line]][[i-1]]
l_neighbor_centroid_dist = cur_char$centroid_x - prev_char$centroid_x
}
if(i != length(by_line[[line]])){
next_char = by_line[[line]][[i+1]]
r_neighbor_centroid_dist = next_char$centroid_x - cur_char$centroid_x
}
character_features[[cur_char$uniqueid]] = c(character_features[[cur_char$uniqueid]],list(l_neighbor_dist = l_neighbor_centroid_dist, r_neighbor_dist = r_neighbor_centroid_dist))
}
}
return(character_features)
}
#sort character features indexed by their respective line
character_features_by_line <- function(character_features){
max_line = -Inf
for(i in 1:length(character_features)){
max_line = max(max_line, character_features[[i]]$line_number)
}
characters_by_line = rep(list(list()), max_line)
for(j in 1:length(character_features)){
characters_by_line[[character_features[[j]]$line_number]] <- append(characters_by_line[[character_features[[j]]$line_number]], list(character_features[[j]]))
}
return(characters_by_line)
}
#' all_centroids
#'
#' Iterates through extracted character features, extracting
#' all centroids found for later use in line numbering.
#'
#' @param character_features Features extracted from any given document
#' @return All centroids concatenated with one another (unlisted)
#'
#' @noRd
all_centroids <- function(character_features){
centroids = list()
for(i in 1:length(character_features)){
centroids = c(centroids, character_features[[i]]$centroid_index)
}
return(unlist(centroids))
}
#' all_down_dists
#'
#' Iterates through extracted character features, extracting
#' all downward distances found for later use in line separating.
#'
#' @param character_features Features extracted from any given document
#' @return All downdistance concatenated with one another (unlisted)
#'
#' @noRd
all_down_dists <- function(character_features){
down_dists <- list()
for(i in 1:length(character_features)){
down_dists <- c(down_dists, character_features[[i]]$down_dist)
}
return(unlist(down_dists))
}
#' line_number_extract
#'
#' Primary logic unit for line number to character association.
#'
#' @param down_dists how far down to the next character from each character
#' @param all_centroids List of centroids extracted from cumulative character_features
#' @param dims Dimensions of binary image
#' @return List associating line numbers to characters
#'
#' @importFrom stats median
#' @importFrom utils head
#'
#' @noRd
line_number_extract <- function(down_dists, all_centroids, dims){
centroid_rci = matrix(i_to_rci(all_centroids,dims), ncol = 3)
#sorting list based on y
centroid_rci = matrix(centroid_rci[order(centroid_rci[,1]),], ncol = 3)
#Do some down_distance math
sorted_down_dists = sort(down_dists)
inf_removed = sorted_down_dists[!is.na(sorted_down_dists) & !is.infinite(sorted_down_dists)]
length_of_vector = length(inf_removed)
items_to_remove = length_of_vector/5 #Removing top 20% right now
trimmed = head(inf_removed, -items_to_remove)
threshold_num = as.numeric(median(trimmed)/2)
lines = list()
cur_line = vector(mode="double", length=0)
threshold = vector(mode="double", length=0)
i = 1
while(i <= max(dim(centroid_rci)[1], 1)){
tm = mean(threshold)
cur_index = centroid_rci[i,3][[1]]
cur_y = centroid_rci[i,1][[1]] #centroid of current
if(length(threshold)==0){
cur_line = c(cur_line,cur_index)
}
else if (is.na(threshold_num)){
cur_line = c(cur_line,cur_index)
}
else if(abs(tm-cur_y) < threshold_num){
cur_line = c(cur_line,cur_index)
}
else{
lines = c(lines,list(cur_line))
cur_line = vector(mode="double", length=0)
i = i-1
threshold = vector(mode="double", length=0)
}
if(i==dim(centroid_rci)[1]){
lines = c(lines,list(cur_line))
}
threshold = c(threshold,cur_y)
i = i + 1
}
return(lines)
}
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.