R/LetterListFunctions.R

Defines functions rc_hs_to_rc_letter rc_to_i i_hs_to_i_letter i_hs_to_rc_letter i_hs_to_rc_hs i_to_rci i_to_rc i_to_y i_to_x i_to_c i_to_r centeredImageOnCentroid centeredImage loop_extract MakeSamplingDF AddSamplingStrata MakeCenterStarts MakeLetterListLetterSpecific

# The handwriter R package performs writership analysis of handwritten documents. 
# Copyright (C) 2021 Iowa State University of Science and Technology on behalf of its Center for Statistics and Applications in Forensic Evidence
# 
# This program is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
# 
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
# GNU General Public License for more details.
# 
# You should have received a copy of the GNU General Public License
# along with this program.  If not, see <https://www.gnu.org/licenses/>.


# Internal Functions ------------------------------------------------------


#' MakeLetterListLetterSpecific
#'
#' Description
#' 
#' @param letterList List of letters in a handwriting sample
#' @param dims Dimensions of the handwriting sample
#' @return letterList with locations given with respect to each letter
#' @noRd
MakeLetterListLetterSpecific = function(letterList, dims)
{
  # NOTE: There are two ways to specify the location of the paths and nodes in a
  # letter. 1) by index number - each pixel in the letter's image (binary
  # matrix) is numbered moving top to bottom and left to right. index =
  # matrix(1:15, nrow=5, ncol=3) shows the index numbers of the pixels in a 5x3
  # image. 2) by row and column numbers - each pixel in the letter's image can
  # be referred to by its row number and column number. Rows are numbered from
  # top to bottom and columns are numbered from left to right. Much of the code
  # in this function switches between the two location types.
  
  # Make a list of the path from each letter. skeletons[[i]] contains the
  # locations (index numbers) of the path of the i-th letter.
  skeletons = lapply(letterList, function(x) x$path)
  
  # Make a list of the nodes in each letter. nodes[[i]] contains the
  # locations (index numbers) of the nodes in the i-th letter.
  nodes = lapply(letterList, function(x) x$nodes)
  
  # For each letter in the handwriting sample
  for(i in 1:length(letterList))
  { # Find locations (row and column numbers) of the path of the letter
    # relative to the handwriting sample
    path_rc = i_hs_to_rc_hs(index_nums = skeletons[[i]], hs_num_rows = dims[1])
    
    # Find the row number of the top of the letter
    letter_topmost_row = min(path_rc$row)
    
    # Find the row number of the bottom of the letter
    letter_bottom_row = max(path_rc$row)
    
    # Find the column number of the leftmost column of the letter
    letter_leftmost_col = min(path_rc$col)
    
    # Find the number of rows spanned by the letter
    letter_num_rows  = letter_bottom_row - letter_topmost_row + 1
    
    # Find the locations (row and column number) of the nodes relative to the
    # top and left side of the letter
    nodes_rc = i_hs_to_rc_letter(index_nums = nodes[[i]], 
                            hs_num_rows = dims[1], 
                            letter_topmost_row = letter_topmost_row,
                            letter_leftmost_col = letter_leftmost_col)
    
    # Find the locations (index numbers) of each path in the letter relative to the top and left
    # side of the letter
    letterList[[i]]$allPaths = lapply(letterList[[i]]$allPaths, 
                                      function(x){
                                        # Find the locations (index numbers) of the path
                                        # in the letter image
                                        i_hs_to_i_letter(index_nums = x, 
                                                    hs_num_rows = dims[1],
                                                    letter_num_rows = letter_num_rows,
                                                    letter_topmost_row = letter_topmost_row,
                                                    letter_leftmost_col = letter_leftmost_col)
                                        })
    
    # Find the locations (index numbers) of the nodes in the adjacency matrix relative to the 
    # handwriting sample
    nameVect_i = i_hs_to_i_letter(index_nums = as.numeric(colnames(letterList[[i]]$adjMatrix)), 
                           hs_num_rows = dims[1], 
                           letter_num_rows = letter_num_rows,
                           letter_topmost_row = letter_topmost_row,
                           letter_leftmost_col = letter_leftmost_col)
    
    # Change the row and column names of the adjacency matrix to the locations (index numbers)
    # of the nodes relative to the top and left of the letter
    colnames(letterList[[i]]$adjMatrix) = format(nameVect_i, scientific = FALSE, trim = TRUE)
    rownames(letterList[[i]]$adjMatrix) = colnames(letterList[[i]]$adjMatrix)
    
    # Find the locations (row and column numbers) of the letter's centroid relative to the 
    # top and left of the letter
    centroid_rc = rc_hs_to_rc_letter(row_nums = letterList[[i]]$characterFeatures$centroid_y,
                                  col_nums = letterList[[i]]$characterFeatures$centroid_x,
                                  letter_topmost_row = letter_topmost_row,
                                  letter_leftmost_col = letter_leftmost_col)
    letterList[[i]]$characterFeatures$centroid_y = centroid_rc$row
    letterList[[i]]$characterFeatures$centroid_x = centroid_rc$col
    
    # Store the locations (row and column numbers) of lHalf relative to the
    # top and left of the letter.
    lHalf_rc = i_hs_to_rc_letter(index_nums = letterList[[i]]$characterFeatures$lHalf, 
                             hs_num_rows = dims[1], 
                             letter_topmost_row = letter_topmost_row,
                             letter_leftmost_col = letter_leftmost_col)
    letterList[[i]]$characterFeatures$lHalfr = lHalf_rc$row
    letterList[[i]]$characterFeatures$lHalfc = lHalf_rc$col
    
    # Store the locations (row and column numbers) of rHalf relative to the
    # top and left of the letter.
    rHalf_rc = i_hs_to_rc_letter(index_nums = letterList[[i]]$characterFeatures$rHalf, 
                                 hs_num_rows = dims[1], 
                                 letter_topmost_row = letter_topmost_row,
                                 letter_leftmost_col = letter_leftmost_col)
    letterList[[i]]$characterFeatures$rHalfr = rHalf_rc$row
    letterList[[i]]$characterFeatures$rHalfc = rHalf_rc$col
    
    # Find the locations (row and column numbers) of the left and right centroids
    # relative to the top and left of the letter
    letterList[[i]]$characterFeatures$lCentroid = c(mean(lHalf_rc$row), mean(lHalf_rc$col))
    letterList[[i]]$characterFeatures$rCentroid = c(mean(rHalf_rc$row), mean(rHalf_rc$col))
    
    # Store the locations (index number) of lHalf and rHalf relative to the
    # top and left of the letter
    letterList[[i]]$characterFeatures$lHalf = i_hs_to_i_letter(index_nums = letterList[[i]]$characterFeatures$lHalf, 
                                                               hs_num_rows = dims[1], 
                                                               letter_num_rows = letter_num_rows,
                                                               letter_topmost_row = letter_topmost_row,
                                                               letter_leftmost_col = letter_leftmost_col)
    letterList[[i]]$characterFeatures$rHalf = i_hs_to_i_letter(index_nums = letterList[[i]]$characterFeatures$rHalf, 
                                                               hs_num_rows = dims[1], 
                                                               letter_num_rows = letter_num_rows,
                                                               letter_topmost_row = letter_topmost_row,
                                                               letter_leftmost_col = letter_leftmost_col)
    
    # Set the locations (row and column numbers) of the bottom row and rightmost column of the letter
    # relative to the top and left of the letter. Set the topmost row and leftmost column both to 1.
    bottom_right_rc = rc_hs_to_rc_letter(row_nums = letterList[[i]]$characterFeatures$bottom_row,
                                         col_nums = letterList[[i]]$characterFeatures$rightmost_col,
                                         letter_topmost_row = letter_topmost_row,
                                         letter_leftmost_col = letter_leftmost_col)
    letterList[[i]]$characterFeatures$bottom_row = bottom_right_rc$row
    letterList[[i]]$characterFeatures$rightmost_col = bottom_right_rc$col
    letterList[[i]]$characterFeatures$topmost_row = 1
    letterList[[i]]$characterFeatures$leftmost_col = 1
    
    # Set the locations (index numbers) of the path of the letter 
    # relative to the top and left of the letter
    letterList[[i]]$path = i_hs_to_i_letter(index_nums = letterList[[i]]$path,
                                            hs_num_rows = dims[1],
                                            letter_num_rows = letter_num_rows,
                                            letter_topmost_row = letter_topmost_row,
                                            letter_leftmost_col = letter_leftmost_col)
    
    # Set the locations (index numbers) of the nodes in the letter 
    # relative to the top and left of the letter
    letterList[[i]]$nodes = i_hs_to_i_letter(index_nums = letterList[[i]]$nodes,
                                            hs_num_rows = dims[1],
                                            letter_num_rows = letter_num_rows,
                                            letter_topmost_row = letter_topmost_row,
                                            letter_leftmost_col = letter_leftmost_col)
  }
  return(letterList)
}


#' MakeCenterStarts
#'
#' Select K graphs as the starting centers of the K clusters. All graphs from
#' the training handwriting samples are grouped by stratum -- the number of
#' loops if the graph has one or two loops or the number of paths in the graph.
#' Then the number of samples specified by numstrat are randomly selected from
#' each stratum group.
#'
#' @param procList List of handwriting samples
#' @param K Integer number of clusters
#' @param numPathCuts Integer number of cuts to make when comparing segments of paths
#' @return centerstarts List of K graphs
#'
#' @noRd
MakeCenterStarts = function(procList, K, numPathCuts)
{
  # Add the numloops and stratum fields to the characterFeatures for each letter
  # in each handwriting sample. numloops is the number of loops in a letter found
  # with handwriter::loop_extract. stratum is "1loop", "2loops", or the length of
  # the letter's allPaths field
  procList = lapply(procList, function(x){x$process$letterList = AddSamplingStrata(x$process$letterList); return(x)})
  
  # Make sampling dataframe
  samplingdf <- MakeSamplingDF(procList = procList)
  
  centerstarts = list()
  startingIndices = sample(1:length(procList), K, replace = FALSE)
  for(j in 1:K) {
    centerstarts[[j]] = letterToPrototype(procList[[startingIndices[j]]], numPathCuts)
  }
  
  return(centerstarts)
}


#' AddSamplingStrata
#'
#' Add two new fields, numloops and stratum, to the characterFeatures list for
#' each letter in letterList.
#'
#' @param letterList List of letters
#' @return List of letters
#'
#' @noRd
AddSamplingStrata = function(letterList){
  # For each letter in the sample
  for(i in 1:length(letterList)){
    # Count the number of loops in the letter and store as a new field called numloops under characterFields
    letterList[[i]]$characterFeatures$numloops = length(loop_extract(letterList[[i]]$allPaths))
    # Add a new field called stratum under characterFields, where stratum is "1loop", "2loops", or the length of allPaths
    if(letterList[[i]]$characterFeatures$numloops == 2){
      letterList[[i]]$characterFeatures$stratum = "2loop"
    } else if(letterList[[i]]$characterFeatures$numloops == 1){
      letterList[[i]]$characterFeatures$stratum = "1loop"
    } else {
      letterList[[i]]$characterFeatures$stratum = length(letterList[[i]]$allPaths)
    }
  }
  return(letterList)
}


#' MakeSamplingDF
#'
#' Group each letter in procList by stratum -- the number of loops if the letter has one or two loops
#' or the number of paths in the letter -- and randomly sample letters from each group. The numstrat
#' parameter sets the number of letters to sample from each group.
#'
#' @param procList List of handwriting samples
#' @return Dataframe
#' 
#' @importFrom purrr map2
#'
#' @noRd
MakeSamplingDF <- function(procList)
{ stratum <- stratumfac <- data <- n <- samp <- NULL
  
  # Initialize
  doc = letter = stratum_a = c()
  
  # Create a vector called doc that numbers each handwriting sample. Create a
  # vector called letter that numbers each letter within a handwriting sample.
  # Create a vector called stratum_a that shows the number of loops if the
  # letter has one or two loops and otherwise shows the number of paths in the
  # letter
  for(i in 1:length(procList)){
    # For each letter in handwriting sample
    for(j in 1:length(procList[[i]]$process$letterList)){
      doc = c(doc,i)
      letter = c(letter,j)
      stratum_a = c(stratum_a, procList[[i]]$process$letterList[[j]]$characterFeatures$stratum)
    }
  }
  
  # Make a vector of unique values in stratum_a. Make "1loop" and "2loop" the first two entries.
  lvls = c("1loop", "2loop", sort(as.numeric(unique(stratum_a[!(stratum_a %in% c("1loop", "2loop"))]))))
  
  # Number of strat to sample at each level. Pad vector with zeros to make the same length as levels.
  if (length(lvls) > 17){
    numstrat = c(5, 2, 5, 6, 5, 3, 2, 2, 2, rep(1,8), rep(0, length(lvls)-17))
  } else if (length(lvls)==17) {
    numstrat = c(5, 2, 5, 6, 5, 3, 2, 2, 2, rep(1,8))
  } else {
    numstrat = c(5, 2, 5, 6, 5, 3, 2, 2, 2, rep(1,8))
    numstrat = numstrat[1:length(lvls)]
  }
  
  # Make the dataframe
  samplingdf = data.frame(doc = doc, letter = letter, stratum = stratum_a, ind = 1:length(stratum_a))
  
  # Randomly select graphs by stratum level
  samplingdf = samplingdf %>%
    dplyr::mutate(stratumfac = factor(stratum, levels = lvls)) %>%  # convert to factor
    dplyr::group_by(stratumfac) %>%
    tidyr::nest() %>%  # make nested dataframe for each factor level            
    dplyr::ungroup() %>% 
    dplyr::arrange(stratumfac) %>%  # sort by factor levels
    dplyr::mutate(n = numstrat) %>%  # add column with number to sample from each nested dataframe
    dplyr::mutate(samp = purrr::map2(data, n, dplyr::sample_n)) %>%  # sample n graphs from each nested dataframe
    dplyr::select(-data) %>%  # Remove old rows
    tidyr::unnest(samp)  # Keep sampled rows
  
  return(samplingdf)
  
}


#' loop_extract
#'
#' Iterates through all available paths from processHandwriting()
#' Picks out loops for later character association.
#' 
#' @param allPaths All character (formerly letter) paths from processHandwriting()
#' 
#' @noRd
loop_extract = function(allPaths){
  loops = list()
  for(i in 1:length(allPaths)){
    if(length(allPaths)<1){
      next
    }
    if(allPaths[[i]][[1]]==allPaths[[i]][[length(allPaths[[i]])]]){
      loops = c(loops,list(allPaths[[i]]))
    }
  }
  return(loops)
}


#' centeredImage
#'
#' Find the letter's centroid and proptroid relative to the bottom left corner of the
#' letter's image.
#'
#' @param letter A letter
#' @return a named list with fields nodes, centroid, proptroid, image and allPaths
#'
#' @noRd
centeredImage = function(letter)
{
  res = list()
  res$nodes = letter$nodes
  # Find the location (column and row numbers) of the centroid relative
  # to the bottom left of the lettter image. Like (x,y) coordinates with 
  # (0,0) in the bottom left corner of the image.
  res$centroid = round(c(letter$characterFeatures$centroid_x, letter$characterFeatures$height - letter$characterFeatures$centroid_y + 1))
  # Calculate the proptroid
  res$proptroid = c(letter$characterFeatures$centroid_x, letter$characterFeatures$height - letter$characterFeatures$centroid_y + 1)/c(letter$characterFeatures$width, letter$characterFeatures$height)
  res$image = letter$image
  res$allPaths = letter$allPaths
  return(res)
}

#' centeredImageOnCentroid
#'
#' Place the letter's centroid at (0,0) and find the locations (column and row
#' numbers) of a letter's paths and path ends relative to the letter's centroid.
#'
#' @param letter A letter from a handwriting sample
#' @return The letter with it's paths and path ends locations given relative to the 
#' letter's centroid.
#'
#' @noRd
centeredImageOnCentroid <- function(letter){
  # column number of node from the left of the letter image
  nodes_c = ((letter$nodes-1) %/% dim(letter$image)[1]) + 1
  # row number of node from the bottom of the letter image
  nodes_r = dim(letter$image)[1] - ((letter$nodes-1) %% dim(letter$image)[1])
  # Make a matrix of column and row pairs. Like (x,y) coordinates with 
  # (0,0) in the bottom left corner of the letter image
  letter$nodesrc = cbind(nodes_c, nodes_r)
  # Shift the nodes so that the origin (0,0) is on the centroid. Do this by subtracting the centroid from 
  # each node. 
  letter$nodesrc = letter$nodesrc - matrix(rep(letter$centroid, each = dim(letter$nodesrc)[1]), ncol = 2)
  # Find the ends locations (column and row numbers) of the ends of each relative to the bottom left corner
  # of the letter image
  letter$pathEndsrc = lapply(letter$allPaths, function(z){cbind(((z[c(1, length(z))]-1) %/% dim(letter$image)[1]) + 1, dim(letter$image)[1] - ((z[c(1,length(z))]-1) %% dim(letter$image)[1]))})
  # Shift the path ends so that the origin (0,0) is on the centroid. Do this by subtracting the centroid from 
  # each set of path ends.
  letter$pathEndsrc = lapply(letter$pathEndsrc, function(z){z - matrix(rep(letter$centroid, each = 2), ncol = 2)})
  return(letter)
}


#' i_to_r
#'
#' Convert index number location in a matrix to 
#' the row number location in that matrix.
#' 
#' @param index_num Numeric Index number location in matrix
#' @param num_rows Number of rows in matrix
#' @return Row number location in matrix
#' 
#' @noRd
i_to_r = function(index_num, num_rows){
  r = ((index_num-1) %% num_rows) + 1
  return(r)
}


#' i_to_c
#'
#' Convert index number location in a matrix to 
#' the column number location in that matrix.
#' 
#' @param index_num Numeric Index number location in matrix
#' @param num_rows Number of rows in matrix
#' @return Column number location in matrix
#' @noRd
i_to_c = function(index_num, num_rows){
  c = ((index_num-1) %/% num_rows) + 1
  return(c)
}


#' i_to_x
#'
#' Convert index number location in a matrix to 
#' the x-coordinate location in that matrix where (0,0)
#' is the bottom left corner of the matrix
#' 
#' @param index_num Numeric Index number location in matrix
#' @param num_rows Number of rows in matrix
#' @return x-coordinate location in matrix
#' @noRd
i_to_x = function(index_num, num_rows){
  x = ((index_num-1) %/% num_rows) + 1
  return(x)
}


#' i_to_y
#'
#' Convert index number location in a matrix to 
#' the y-coordinate location in that matrix where (0,0)
#' is the bottom left corner of the matrix
#' 
#' @param index_num Numeric Index number location in matrix
#' @param num_rows Number of rows in matrix
#' @return y-coordinate location in matrix
#' @noRd
i_to_y = function(index_num, num_rows){
  y = num_rows - (index_num-1) %% num_rows
  return(y)
}

#' i_to_rc
#'
#' Function for converting indices to respective row, col.
#' 
#' @param nodes nodes to be converted.
#' @param dims dimensions of binary image
#' @return returns matrix mapping nodes to respective row, 
#' @noRd
i_to_rc = function(nodes, dims)
{
  cs = (nodes-1)%/%dims[1] + 1
  rs = (nodes-1)%%dims[1] + 1
  return(matrix(c(rs,cs), ncol = 2))
}

#' i_to_rci
#'
#' Function for converting indices to respective row, col and associates the original index.
#' 
#' @param nodes nodes to be converted.
#' @param dims dimensions of binary image
#' @param fixed instead of normal computation of rows, put it in a fixed location.
#' @return returns matrix mapping nodes' indices to respective row, col
#' @noRd
i_to_rci = function(nodes, dims, fixed = FALSE)
{
  cs = (nodes-1)%/%dims[1] + 1
  rs = (nodes-1)%%dims[1] + 1
  if(fixed) rs = dims[1] - rs + 1
  rowcolmatrix = cbind(rs,cs,nodes)
  colnames(rowcolmatrix) = c('y','x','index')
  return(rowcolmatrix)
}

#' i_hs_to_rc_hs
#'
#' Convert index number(s) locations in a handwriting sample to 
#' the row and column number(s) locations in that handwriting sample.
#' 
#' @param index_nums Vector of index number(s) in handwriting sample
#' @param hs_num_rows Number of rows in handwriting sample
#' @return List of row and column number(s) locations in handwriting sample
#' @noRd
i_hs_to_rc_hs = function(index_nums, hs_num_rows){
  rc = list()
  rc[['row']] = ((index_nums-1) %% hs_num_rows) + 1
  rc[['col']] = ((index_nums-1) %/% hs_num_rows) + 1
  return(rc)
}


#' i_hs_to_rc_letter
#'
#' Convert index number(s) locations in a handwriting sample to 
#' the row and column number(s) locations in a letter in that 
#' handwriting sample.
#' 
#' @param index_nums Vector of index number(s) in handwriting sample
#' @param hs_num_rows Integer number of rows in handwriting sample
#' @param letter_topmost_row Integer number of the top row of letter
#' @param letter_leftmost_col Integer number of the left column of letter
#' @return List of row and column number(s) locations in letter
#' @noRd
i_hs_to_rc_letter = function(index_nums, hs_num_rows, letter_topmost_row, letter_leftmost_col){
  rc = list()
  
  # Find locations (row and column numbers) in handwriting sample
  rc = i_hs_to_rc_hs(index_nums = index_nums, hs_num_rows = hs_num_rows)
  
  # Find locations (row and column numbers) in letter
  rc[['row']] <- rc$row - letter_topmost_row + 1
  rc[['col']] <- rc$col - letter_leftmost_col + 1
  return(rc)
}


#' i_hs_to_i_hs
#'
#' Convert index number(s) locations in a handwriting sample to 
#' index number(s) locations in a letter in that handwriting sample.
#' 
#' @param index_nums Vector of index number(s) in handwriting sample
#' @param hs_num_rows Integer number of rows in handwriting sample
#' @param letter_num_rows Integer number of rows in letter
#' @param letter_topmost_row Integer number of the top row of letter
#' @param letter_leftmost_col Integer number of the left column of letter
#' @return Integer index number(s) locations in letter
#' @noRd
i_hs_to_i_letter = function(index_nums, hs_num_rows, letter_num_rows, letter_topmost_row, letter_leftmost_col){
  
  # Find locations (row and column numbers) in letter
  rc <- i_hs_to_rc_letter(index_nums, hs_num_rows, letter_topmost_row, letter_leftmost_col)
  
  # Find the locations (index numbers) in letter
  i <- rc$row + (rc$col - 1)*letter_num_rows
  
  return(i)
}

#' rc_to_i
#'
#' Convert rows and columns to their respective indices.
#' This is index sensitive, so row_y[[1]] should correspond to col_x[[1]]
#' 
#' @param row_y Row(s) to be converted to an index
#' @param col_x Columns(s) to be converted to an index
#' @param dims Dimensions of binary image
#' @param fixed Logical value asking if row_y is fixed to a point.
#' @return Returns index(icies) of all row_y's and col_x's
#' @noRd
rc_to_i = function(row_y,col_x,dims, fixed = FALSE)
{
  row_y = as.integer(row_y)
  if(fixed) row_y = dims[1] - row_y + 1
  col_x = as.integer(col_x)
  return((col_x-1)*dims[1]+row_y)
}

#' rc_hs_to_rc_letter
#'
#' Convert row and column number(s) locations in a handwriting sample to 
#' row and column number(s) locations in a letter in that handwriting sample.
#' 
#' @param row_nums Integer row numbers in handwriting sample
#' @param col_nums Integer column numbers in handwriting sample
#' @param letter_topmost_row Integer number of the top row of letter
#' @param letter_leftmost_col Integer number of the left column of letter
#' @return List of row and column number(s) locations in letter
#' @noRd
rc_hs_to_rc_letter = function(row_nums, col_nums, letter_topmost_row, letter_leftmost_col){
  rc = list()
  
  # Find locations (row and column numbers) in letter
  rc[['row']] <- row_nums - letter_topmost_row + 1
  rc[['col']] <- col_nums - letter_leftmost_col + 1
  return(rc)
}
CSAFE-ISU/handwriter documentation built on March 24, 2024, 6:23 p.m.