In srvanderplas/bulletsamplr: Resample bullet signatures

knitr::opts_chunk$set(
  collapse = TRUE,
  echo = FALSE,
  cache = T,
  comment = "#>",
  dpi = 300,
  dev = "png",
  out.width = "\\textwidth",
  fig.width = 8, fig.height = 5,
  message = F, warning = F
)

library(bulletsamplr)
library(bulletxtrctr)

library(dplyr)
library(tidyr)
library(stringr)
library(purrr)
library(furrr)
plan(multicore, .cleanup = T)

library(ggplot2)
library(gridExtra)
library(grid)
library(cowplot)
theme_set(theme_bw())

ccdata_align <- function(ccd1, ccd2) {
  sig_align(ccd1$sig, ccd2$sig)
}

plot_align_multistudy <- function(df, framed = NULL, show_phase = F, plot_title = sprintf("Barrel %02d, Bullet 1 & 2, across 3 Hamby studies", unique(df$Barrel) %>% as.numeric())) {

  df <- df %>% mutate_at(vars(matches("Study")), ~str_replace_all(., "Hamby|Rescan", "")) %>%
    mutate(bs = sprintf("Study %02s\nBullet %s", Study, Bullet),
           bs1 = sprintf("Study %02s\nBullet %s", Study1, Bullet1))
  framed <- framed %>% 
    ungroup() %>% 
    mutate_at(vars(matches("Study")), ~str_replace_all(., "Hamby|Rescan", "")) %>%
    mutate(bs = sprintf("Study %02s\nBullet %s", Study, Bullet),
           bs1 = sprintf("Study %02s\nBullet %s", Study1, Bullet1))

  p <- df %>%
    ggplot() + 
    geom_tile(aes(x = Land, y = Land1, fill = ccf)) + 
    facet_grid(bs1 ~ bs) + 
    scale_fill_gradient2(low = "gray", high = "darkorange", mid = "white", midpoint = .5) + 
    ggtitle(plot_title) + 
    coord_fixed() + 
    theme(axis.title = element_blank())

  if (!is.null(framed)) {
      p <- p + geom_tile(aes(x = Land, y = Land1), 
                color = "darkorange", size = .75, fill = 'transparent', 
                data = framed)
  }

  if (!is.null(framed) & show_phase) {
    tmp <- framed %>% select(bs, bs1, phase) %>% unique() %>%
      mutate(x = ifelse(phase %in% c("C", "D", "E"), 3.5, 5),
             y = ifelse(phase %in% c("C", "D", "E"), 3.5, 2))
    p <- p + geom_text(aes(x = x, y = y, label = phase), data = tmp)
  }

  p
}

aligned_barrels <- c(1:10) %>% as.list()
match_all_studies <- function(i, con) {
  barrel <- sprintf(c("%d", "%02d"), i)
  barrel <- tbl(con, "bullet.crosssection") %>%
    filter(Barrel %in% barrel) %>%
    collect()

  barrel <- barrel %>% group_by(Study, Barrel, Bullet, Land, id, source) %>%
    nest(x, y, value, sig, .key = 'ccdata') %>%
    ungroup()

  barrel_align <- tidyr::crossing(barrel, barrel) %>%
    filter(!(Study == Study1 & Bullet == Bullet1)) %>%
    mutate(align = furrr::future_map2(ccdata, ccdata1, ccdata_align))

  barrel_align <- barrel_align %>%
    mutate(ccf = sapply(align, function(.) .$ccf),
           lag = sapply(align, function(.) .$lag))

  aligned_barrels[[i]] <<- barrel_align

  plot_align_multistudy(barrel_align, i)
}

circular_slice <- function(x, offset) {
  n <- ifelse(is.vector(x), length(x), nrow(x))
  offset <- (offset %% n)
  if (offset > 0) {
    idx <- c((offset + 1):n, 1:offset)
  } else {
    idx <- 1:n
  }

  if (is.vector(x)) {
    return(x[idx])
  } else if (length(dim(x)) > 1) {
    return(x[idx,])
  } else return(NULL)
}

compute_best_phase <- function(df, phasevars = c("Land", "Land1"), 
                               val_var = "ccf", 
                               group_vars = c("Study", "Study1", 
                                              "Barrel", "Barrel1", 
                                              "Bullet", "Bullet1")) {
  v1 <- df %>% magrittr::extract2(phasevars[1]) %>% unique()
  v2 <- df %>% magrittr::extract2(phasevars[2]) %>% unique()
  phase <- LETTERS[1:length(v1)]
  n1 <- length(v1)
  n2 <- length(v2)

  nms <- list("v1", "v2") %>% set_names(phasevars)

  phasedata <- tibble(
    v1 = rep(v1, times = n2),
    v2 = sapply(1:n2 - 1, circular_slice, x = v2) %>% as.vector(),
    phase = rep(phase, each = n1)
  )  %>%
    rename_(.dots = nms)

  bestphase <- phasedata %>%
    left_join(df, by = phasevars) %>%
    group_by_(.dots = as.list(c(group_vars, "phase"))) %>%
    mutate_(.dots = list("val" = val_var)) %>%
    summarize(val = mean(val, na.rm = T)) %>%
    arrange(desc(val)) %>%
    filter(row_number() == 1)

  bestphase %>% left_join(phasedata, by = "phase") %>% ungroup()
}

# Functions to align all lands 
barrel_setup <- function(i, land_equiv, con) {
  barrel <- sprintf("%02d", i)
  barrelquery <- c(barrel, sprintf("% 2d", i), sprintf("%d", i)) %>% unique()

  crossing(
    tibble(Study = c("Hamby224", "Hamby36", "Hamby44Rescan")),
    tibble(Bullet = c("1-", "2-"))) %>%
  mutate(Barrel = str_pad(as.character(i), width = 2, pad = ifelse(Study == "Hamby224", " ", "0")),
         id = paste0(Barrel, "-", Bullet)) %>%
  mutate(start_at = land_equiv) %>%
  crossing(tibble(land = -1:4, letter = letters[1:6])) %>%
  mutate(land = (land + start_at) %% 6 + 1)  %>%
  mutate(id = paste0(id, land)) %>%
  arrange(letter) %>%
  select(-start_at, -land, -Barrel, -Bullet) %>%
  left_join(
    tbl(con, "bullet.crosssection") %>%
      filter(Barrel %in% barrelquery) %>%
      collect()
  ) %>%
  mutate(ref = paste(Study, id, sep = "_")) %>%
    group_by(ref) %>%
    mutate(allNA = sum(!is.na(sig))) %>%
    ungroup() %>%
    filter(allNA > 0)
}

barrel_align <- function(x, y) {
    xname <- unique(x$ref)
    yname <- unique(y$ref)
    tmp <- sig_align(x$sig, y$sig)
    tmp$lands <- tmp$lands %>%
      magrittr::set_names(c("x", xname, yname))

    if (is.na(tmp$lands[1,xname])) {
      tmp$lands$x <- tmp$lands$x - which.min(is.na(tmp$lands[,xname])) + 1
    }

    tibble(b1 = xname,
               b2 = yname,
               ccf = tmp$ccf,
               lag = tmp$lag,
               lands = list(tmp$lands))
}

barrel_cross <- function(df) {
  df %>%
    select(-Barrel, -Bullet, -Land, -source) %>%
    nest(-Study, -id, -letter) %>%
    crossing(., .) %>%
    filter(letter == letter1) %>%
    group_by(letter) %>%
    filter(!(Study == Study1 & id == id1)) %>% # Same data comparisons
    filter(id == id[1]) # Get only one set of comparisons for each matching land
}

barrel_cross_align <- function(df) {
  df %>% 
    barrel_cross() %>%
    mutate(align = future_map2(data, data1, barrel_align))
}

full_join_all <- function(lx, by = NULL) {
  if (length(lx) == 1) {
    lx[[1]]
  } else {
    full_join(lx[[1]], full_join_all(lx[-1], by = by), by = by)
  }
}

plot_aligned_seq <- function(letter, long, med) {
  barrel <- unique(long$Barrel)

  long <- long %>%
    mutate(set = str_replace_all(set, "[^\\d]", "")) %>%
    mutate(fac = sprintf("Set%s-%s", set, Bullet)) %>%
    mutate(fac_id = factor(fac) %>% as.numeric())

  suppressWarnings({
    ggplot(long, aes(x = x, y = sig, color = factor(fac_id))) + 
      geom_line() + 
      geom_line(aes(x = x, y = med, color = NA), data = med) + 
      # facet_grid(fac~., labeller = label_both) +
      scale_color_discrete(na.value = 'grey', guide = 'none') + 
      theme(axis.title = element_blank()) + 
      ggtitle(sprintf("Hamby Barrel %02s Bullets, Land Set %s", barrel, letter))
  })
}

if (!file.exists("barrel_match.rda")) {
  barrel_match <- as.list(1:10)
  con <- odbc::dbConnect(odbc::odbc(), "bullets", timeout = 10)
  barrel_match[[1]] <- barrel_setup(1, c(1, 6, 3, 6, 6, 4), con = con)
  barrel_match[[2]] <- barrel_setup(2, c(1, 6, 6, 6, 3, 5), con = con)
  barrel_match[[3]] <- barrel_setup(3, c(1, 3, 5, 5, 2, 2), con = con)
  barrel_match[[4]] <- barrel_setup(4, c(1, 6, 6, 6, 1, 5), con = con)
  barrel_match[[5]] <- barrel_setup(5, c(1, 5, 2, 5, 5, 6), con = con)
  barrel_match[[6]] <- barrel_setup(6, c(1, 4, 4, 4, 3, 3), con = con) # Really good matches
  barrel_match[[7]] <- barrel_setup(7, c(1, 2, 2, 2, 1, 3), con = con)
  barrel_match[[8]] <- barrel_setup(8, c(1, 5, 1, 2, 6, 3), con = con)
  barrel_match[[9]] <- barrel_setup(9, c(1, 5, 1, 1, 2, 5), con = con)
  barrel_match[[10]] <- barrel_setup(10, c(1, 3, 4, 4, 5, 5), con = con)

  saveRDS(barrel_match, "barrel_match.rda")
} else {
  barrel_match <- readRDS("barrel_match.rda")
}

if (!file.exists("barrel_align.rda")) {
  barrel_align <- map(barrel_match, barrel_cross_align) %>% 
    bind_rows() %>%
    mutate(Barrel = id %>% str_extract("^ ?\\d{1,}") %>% as.numeric())
  saveRDS(barrel_align, "barrel_align.rda")
} else {
  barrel_align <- readRDS("barrel_align.rda")
}

if (!file.exists("barrel_align_plot.rda")) {
  barrel_align_plotable <- barrel_align %>%
    unnest(align) %>%
    select(-data, -data1, -letter1) %>%
    group_by(Barrel, letter) %>%
    summarize(lands = list(full_join_all(lands))) %>%
    mutate(
      longalign = map(lands, gather, key = id, value = sig, -x),
      longalign = map(longalign, extract, col = id, 
                      into = c("set", "Barrel", "Bullet", "Land"), 
                      regex = "(.*)_([ \\d]{2})-(\\d)-(\\d)", remove = F)
    ) %>%
    select(-lands) %>%
    unnest() %>%
    group_by(Barrel, letter) %>%
    mutate(line_id = factor(id) %>% as.numeric()) %>%
    ungroup() %>%
    mutate(LandSet = letter)

  barrel_align_cycle <- barrel_align_plotable %>% 
  select(Barrel, LandSet, letter, id, line_id, set, x, sig) %>%
  mutate(xold = x, sigold = sig) %>%
  split(.$id) %>%
  purrr::map_df(., ~crosscut_slice(.) %>% bind_rows()) %>%
  mutate(sig_cycle = sig, sig = sigold, x_cycle = x, x = xold)

  barrel_align_plotable <- left_join(barrel_align_plotable, barrel_align_cycle)

  saveRDS(barrel_align_plotable, "barrel_align_plot.rda")
} else {
  barrel_align_plotable <- readRDS("barrel_align_plot.rda")
}

if (!file.exists("barrel_align_med.rda")) {
  # Median/mean data only - smaller than long form, easier to plot single line
  barrel_align_med <- barrel_align_plotable  %>%
    group_by(Barrel, LandSet, x) %>%
    summarize(
      n_not_na = sum(!is.na(sig)),
      med = ifelse(n_not_na >= 3, median(sig, na.rm = T), NA),
      mean = ifelse(n_not_na >= 3, mean(sig, na.rm = T), NA)
    ) %>%
    filter(n_not_na >= 3) %>%
    ungroup()
  saveRDS(barrel_align_med, "barrel_align_med.rda")
} else {
  barrel_align_med <- readRDS("barrel_align_med.rda")
}

Introduction

A common question in forensics (whether statistical or examiner based) is the error rate of the assessment procedure. The term ``error rate'' includes the statistical concepts of false positives (declaring a match when the two pieces of evidence are from different sources) and false negatives (declaring that two pieces are from different sources when they are from the same source). Statistical methods for forensic comparisons take the following form:

Reduce the evidence to some set of numeric data used for comparison
Use the numeric data to calculate a match score
Compare the match score to reference distributions derived from known matches (KM) and known non matches (KNM)
Calculate a score-based likelihood ratio[@lund_likelihood_2017] for a known match: $$LR(KM) = \frac{\text{Probability of observing }x | \text { known match}}{\text{Probability of observing }x | \text{ known non match}}$$

Derivation of this likelihood ratio depends heavily on the reference distributions derived from known matches and known non matches. These reference distributions are assembled from collected data, which may be sparse for a given set of class characteristics (firearm, barrel manufacturing process, ammunition type).

In order to present evidence in court, it is particularly important to assess the false positive rate (the probability of declaring a match when the data are from different sources). For score-based problems, the false positive rate is tied to the quality of the estimate of the denominator above, that is, the probability of observing $x$ when it is known that the two pieces of evidence are from different sources. It is generally impractical to derive or even bound this probability experimentally, as that requires the collection and comparison of very large amounts of data from different sources stemming from the population of interest. For bullet comparisons, for instance, it would be necessary to gather bullets fired from a large number of barrels with class characteristics similar to those referenced, compare each fired bullet to each other bullet, and determine the number of bullet matches which have a score greater than $x$ observed score.

It is much more reasonable to statistically derive the distribution of scores under known match or known non-match conditions by using a bootstrap or resampling technique to generate the numeric data produced in step 1 above. \citet{bachrach_development_2013} generated bullet signatures using wavelets and fractals to mimic the identified features of land engraved area signatures. Other data generation and augmentation techniques have been used for handwriting data \citep{sesa-nogueras_writer_2013, rabasse_new_2008}, iris data\citep{wecker_multiresolution_2010}, and facial images \citep{leibe_we_2016}. With generated data, we can be reasonably certain that any correspondence between two generated patterns is entirely random. As long as important characteristics of the data are preserved, these randomly assembled pattern data provide a much quicker method to assess the distribution of known non-match and known match pattern scores.

Statistical Analysis of Bullet Land Engraved Areas

Comparison of bullet land engraved areas (LEAs) typically involves distilling the 3D scan of the area of interest into a two-dimensional ``signature" that removes artifacts originating from the grooves as well as the curvature of the bullet. \citet{aoas} provides one method for distilling bullet scans into signatures; alternate methods used in \citet{vorburgerApplicationsCrosscorrelationFunctions2011} should also be compatible with the approach to data generation described in this paper. \todo{cite CMPS paper, Will's implementation} Important characteristics of the LEA signature depend in part on the method used to transform the collected 3D image into a signature. We would not expect that different filtering and noise removal methods would produce signatures with exactly the same shapes or properties. By working with comparisons at the signature level, rather than the full image level, we also do not have to simulate the entire 3D image, as in \citet{bachrach_development_2013}.

It is important that the presence of problematic features, such as tank rash and missing values due to microscope error, occur with approximately the same frequency in the generated sequences as in sequences from real data. This ensures that the reference distributions for known matches and known non-matches account for real-world constraints. The method for synthetic signature generation proposed in \citet{bachrach_development_2013} produces sequences and 3D images which do not mimic the problems seen in collected data. Examples of problematic lands in the Hamby 44 test set are shown in \Cref{fig:tankrash}. To address this issue, this paper proposes a method for generating sequences from examiner test kits and persistence studies using resampling techniques designed for data with temporal or spatial autocorrelation.

problem_crosscuts <- filter(barrel_align_plotable, 
                            id %in% c("Hamby44Rescan_02-2-5", "Hamby44Rescan_08-2-2")) %>%
  mutate(type = "crosscut")
problem_crosscut_med <- left_join(select(problem_crosscuts, Barrel, LandSet) %>% unique(),
                                  barrel_align_med, by = c("Barrel", "LandSet")) %>%
  mutate(sig = med, type = "median")
problem_crosscuts <- bind_rows(problem_crosscuts, problem_crosscut_med)

titles <- tibble(Barrel = unique(problem_crosscuts$Barrel),
                 Problem = c("tank rash", "pitting"),
                 imgs = c("./images/B2-B2-L5.png", "./images/B8-B2-L2.png"))

for (i in unique(problem_crosscuts$Barrel)) {
  tmp <- titles %>%
    filter(Barrel == i) 

  p1 <- ggdraw() + draw_image(tmp$imgs, clip = T)
  p2 <- ggplot(data = filter(problem_crosscuts, Barrel == i)) + 
      geom_line(aes(x = x, y = sig, color = type)) + 
      scale_color_manual("Crosscut", values = c("crosscut" = "black", "median" = "blue"), labels = c("This Land", "Median")) + 
      theme_bw() + 
      theme(axis.title = element_blank(),  legend.position = c(.99, .99), legend.justification = c(.99, .99), legend.direction = 'horizontal', legend.background = element_rect(fill = "transparent"))

  print(plot_grid(p1, p2, ncol = 2, align = "v"))
}

Threshold Bootstrap

The threshold bootstrap was proposed by \citet{park_threshold_1999} as an alternative to the moving block bootstrap that preserves continuity at block endpoints. Typically used in time series analysis to preserve complex dependency structures, it can be repurposed here because of our desire to maintain spatial continuity. Using the threshold bootstrap ensures that we get a continuous sequence that, while not guaranteed to be differentiable, adequately mimics the smoothness of the source data.

As described in \citet{park_threshold_1999}, the threshold bootstrap divides a weakly dependent, stationary series $\mathbf{X}={X_1, ..., X_n}$ with a threshold $\overline X$ that divides the sequence into $R$ chunks $C_i = {X_{i,1}, ..., X_{i,n_i}}$, where the ith chunk has size $n_i$, $i = 1, ..., R$ and $\sum_{i=1}^R n_i = n$. Samples are constructed at random with replacement from the set ${C_1, ..., C_R}$ to produce a sequence of the desired length.

\Cref{fig:threshold-bootstrap-cycle-demo} shows a bullet signature, with the signature mean value marked; the bottom two images shows the blocks produced for use in the threshold bootstrap, where each block contains 1 or 3 cycles, respectively, as defined by \citet{park_threshold_1999}.

```r; the signature was created using the \texttt{bulletxtrctr} package and the method described in \citet{aoas}.", fig.height = 6, fig.width = 8, warning = F} theme_set(theme_bw()) data(sig) threshold <- median(sig$sig, na.rm = T) p2 <- ggplot(aes(x = x, y = sig), data = sig) + geom_hline(aes(yintercept = threshold), color = "red") + scale_y_continuous(expand = expansion(mult = .05, add = c(0, .5))) + geom_line() + ggtitle("Original Signature + Threshold used to create cycles") + theme(axis.title = element_blank(), axis.text.x = element_blank())

sig_slices <- sig %>% mutate(xold = x, sigold = sig) %>% crosscut_slice() %>% bind_rows() %>% group_by(.chunk) %>% ungroup()

sig_slices2 <- sig %>% mutate(xold = x, sigold = sig) %>% crosscut_slice(., ncycle = 3) %>% bind_rows() %>% group_by(.chunk) %>% ungroup()

shuffle <- function(x) { sample(x, size = length(x), replace = F) }

sig_slices_sum <- sig_slices %>% select(.chunk, xold, sigold) %>% group_by(.chunk) %>% mutate(chunksplit = (row_number() == 1), labelmax = max(sigold, na.rm = T), chunkmid = mean(xold, na.rm = T), chunkmin = min(xold), chunkmax = max(xold)) %>% ungroup() %>% filter(chunksplit) %>% select(-chunksplit)

sig_slices_sum2 <- sig_slices2 %>% select(.chunk, xold, sigold) %>% group_by(.chunk) %>% mutate(chunksplit = (row_number() == 1), labelmax = max(sigold, na.rm = T), chunkmid = mean(xold, na.rm = T), chunkmin = min(xold), chunkmax = max(xold)) %>% ungroup() %>% filter(chunksplit) %>% select(-chunksplit)

p3 <- ggplot(data = sig_slices, aes(x = xold, y = sigold, color = factor(.chunk))) + geom_segment(aes(x = chunkmin, xend = chunkmax, y = labelmax + 0.05, yend = labelmax + 0.05), data = sig_slices_sum, arrow = arrow(ends = "both", type = "closed", length = unit(0.15, "lines"))) + geom_text(aes(x = chunkmid, y = labelmax + .07, label = .chunk), vjust = -0.1, data = sig_slices_sum, inherit.aes = F) + geom_hline(aes(yintercept = threshold), color = "grey40") + geom_line() + scale_color_discrete("Chunk", guide = 'none') + scale_y_continuous(expand = expansion(mult = .05, add = c(0, .5))) + ggtitle("Chunks (and boundary regions), 1 cycle/chunk") + theme(axis.title = element_blank(), axis.text.x = element_blank())

p4 <- ggplot(data = sig_slices2, aes(x = xold, y = sigold, color = factor(.chunk))) + geom_segment(aes(x = chunkmin, xend = chunkmax, y = labelmax + 0.05, yend = labelmax + 0.05), data = sig_slices_sum2, arrow = arrow(ends = "both", type = "closed", length = unit(0.15, "lines"))) + geom_text(aes(x = chunkmid, y = labelmax + .07, label = .chunk), vjust = -0.1, data = sig_slices_sum2, inherit.aes = F) + geom_hline(aes(yintercept = threshold), color = "grey40") + geom_line() + scale_color_discrete("Chunk", guide = 'none') + scale_y_continuous(expand = expansion(mult = .05, add = c(0, .5))) + ggtitle("Cycles (and boundary regions), 3 cycles/chunk") + theme(axis.title = element_blank(), axis.text.x = element_blank())

grid.arrange(p2, p3, p4, ncol = 1)

This paper demonstrates the use of the threshold bootstrap to create simulated LEA signatures under known match and known non-match conditions, using several different sets of bullet signatures derived from examiner test kits. We describe first the creation of known non-match sequences, and then the creation of known match sequences, by applying the threshold bootstrap process systematically to a library of LEA signatures.

<!-- Three Hamby test sets (36, 44, 252) \todo{add in houston persistence} were used to generate the ``library" of chunk data fed into the threshold bootstrap to produce the simulated signatures. In general, it is important to have source data which is composed of both same-source and different-source sequences: the same-source sequences are used to create known match LEA signatures, while the different-source sequences are used to create known non-match LEA signatures.  -->

<!-- ## Hamby studies -->

<!-- The Hamby studies work well to demonstrate the threshold bootstrap because of their structure: each set consists of 35 bullets fired from 10 barrels. The study is a `closed set' study, which means that all unknown bullets were actually fired from the same barrels which produced the known bullets; that is, each unknown bullet does match a set of known bullets. In each Hamby set, there are 20 known bullets; 2 for each of the 10 barrels used in the study. There are also 15 unknown bullets which can be matched to one of the 10 barrels.  -->

<!-- The input data are LEA signatures taken from three Hamby studies (36, 44, 224). The bullets were scanned using a Sensofar Sneox1 confocal microscope at 20x resolution. The scans were processed using the `bulletxtrctr` package \citep{bulletxtrctr} with default smoothing parameters; grooves were adjusted manually after automatic identification. The `bulletxtrctr` package provides an open-source implementation of the algorithm described in \citet{aoas}.  -->

<!-- With the Hamby data sets, we have separated the barrels into two groups consisting of odd and even-numbered barrels respectively. Bootstrap sequences are assembled from segments of signatures in one group and compared to sequences assembled from segments of signatures in another.  -->

# Simulating Bullet LEA Signatures

## Notation

Let us define notation to describe the data used in this study. We have  test sets $h = 1, ... H$ which may be developed under different conditions and which may use different types of firearms and ammunition. Each study uses firearms (or more specifically, barrels) $i = 1, ..., I_h$, with bullets $j = 1, ..., J_{hi}$ fired from each barrel in the study. Each bullet has $k = 1, ..., K_{hi}$ land engraved areas.

We will define the index $k$ such that any signatures $S_{hijk}$ with the same $h$ and $i$ index have the same land index $k$ (that is, we assume that lands are indexed such that all bullets fired from the same barrel have the same index scheme corresponding to the same land in the rifled barrel). This condition significantly simplifies the notation without loss of generality. In practice, when aligning these sets, we do have to re-index the bullets from the scanned sequence of lands, but this is not a difficult process.

```r
df <- tidyr::crossing(Bullet = 1, Bullet1 = 2, Land = 1:6, Land1 = 1:6, phase = LETTERS[1:6]) %>%
  mutate(ccf = rnorm(n()))
phasedata <- tibble(
  Bullet = 1, Bullet1 = 2,
  Land = rep(1:6, times = 6),
  Land1 = sapply(1:6 - 1, circular_slice, x = 1:6) %>% as.vector(),
  phase = rep(LETTERS[1:6], each = 6)
) 

tmp <- phasedata %>% select(-Land, -Land1) %>% unique() %>%
  mutate(x = ifelse(phase %in% c("C", "D", "E"), 3.5, 5),
         y = ifelse(phase %in% c("C", "D", "E"), 3.5, 2))

ggplot() + 
  geom_tile(aes(x = Land, y = Land1, fill = ccf), data = df, color = 'grey', size = .25) + 
  geom_tile(aes(x = Land, y = Land1), fill = 'blue', data = phasedata) + 
  facet_grid(.~phase) + 
  scale_x_continuous("Sequence 2", breaks = 1:6) + scale_y_continuous("Sequence 1", breaks = 1:6) + 
  # theme(axis.title = element_blank()) + 
  scale_fill_gradient2(guide = 'none', low = "blue", high = "red", mid = "white", limits = c(-500, 500)) + 
  coord_fixed() + 
  ggtitle("Alignment phases between two bullets")

We will split each signature $S_{hijk}$ into $m$ chunks $${{S_{hijk1}, ..., S_{hijk(\ell_1 -1)}}, {S_{hijk\ell_1}, ..., S_{hijk(\ell_2 - 1)}}, ..., {S_{hijk\ell_m},..., S_{hijkL}}}.$$ That is, let $m = 1, ..., M$ index the chunks, and $\ell = 1, ..., L$ index the sequence S. By indexing the chunks this way, we implicitly get both the chunk length and the chunk order.

sig_slices_sum2 <- sig_slices_sum2 %>%
  mutate(idxmin = chunkmin/1.5625, idxmax = chunkmax/1.5625)

breaks <- unique(c(sig_slices_sum2$chunkmin, (sig_slices_sum2$chunkmax + 1.5625)))
idx <- breaks/1.5625

ggplot(data = sig_slices2, aes(x = xold, y = sigold, color = factor(.chunk))) +
  geom_segment(aes(x = chunkmin, xend = chunkmax, 
                   y = labelmax + 0.05, yend = labelmax + 0.05), 
               data = sig_slices_sum2, 
               arrow = arrow(ends = "both", type = "closed", 
                             length = unit(0.15, "lines"))) + 
  geom_text(aes(x = chunkmid, y = labelmax + .07, label = paste("m = ", .chunk)), 
            vjust = -0.1, data = sig_slices_sum2, inherit.aes = F) +
  geom_hline(aes(yintercept = threshold), color = "grey40") + 
  geom_line() + 
  scale_color_discrete("Chunk", guide = 'none') + 
  scale_y_continuous(name = "", expand = expansion(mult = .05, add = c(0, .5))) + 
  ggtitle("Indexing signatures and chunks in \u2113 and m") + 
  scale_x_continuous("Signature index \u2113", breaks = breaks, labels = idx)
  # theme(axis.title = element_blank(), axis.text.x = element_blank())

Source Data

This method requires signatures from known match and known non-match bullets; that is, from bullets fired from the same barrel as well as bullets fired from different barrels. One of the strengths of utilizing the threshold bootstrap for sequence generation is that we can be confident that features of the LEA sequences which are attributable to the pre-processing method used are preserved in the data. This is, of course, a trade-off: the feature distributions which are the end result of the bootstrapping process will not generalize to other methods for creating LEA signatures. Similarly, if there are differences in the features of signatures produced from different ammunition or barrel manufacturing methods, it may be unwise to use distributions generated from different source materials, particularly because it is known that different combinations of firearms and ammunition mark differently \citep{bonfanti_influence_1999}. Ideally, the distributional results would be relatively robust to these changes, as there are several levels of abstraction between the data and the resulting match score, but that is left to some future study.

Provided that relevant source data exist, the threshold bootstrap is applicable to any automatic processing method which reduces a 3D scan of a land to a single sequence of points describing a cross-section of the land engraved area. The choice to demonstrate the method using one specific processing method is primarily one of convenience.

In this paper, we will use data collected from 3 Hamby sets \citep{hamby}: Hamby 36, Hamby 44, and Hamby 224. Each set was produced by firing the same 10 barrels to create a set of 20 known and 15 unknown bullets (thus, we are working with between 60 and 105 bullets, depending on whether we include the unknown bullets in the alignments; each barrel will have at least 6 scans). These sets were scanned using a Sensofar confocal light microscope, and processed using the method described in \citet{aoas}. Once scans were reduced to signatures, bullets fired from each barrel were aligned by maximizing the cross-correlation over the 6-land sequence.

Threshold Bootstrap Procedure

Chunk Creation

Each bullet signature $S_{hijk}$ is split into $m$ chunks $${{S_{hijk1}, ..., S_{hijk(\ell_1 -1)}}, {S_{hijk\ell_1}, ..., S_{hijk(\ell_2 - 1)}}, ..., {S_{hijk\ell_m},..., S_{hijkL}}}$$ around the sequence mean or median. Each chunk consists of $t$ complete cycles around the middle line value. We can reference these chunks (rather than the underlying sequence) as $C_{hijkm}$, if it is so desired. Boundary chunks $C_{hijk1} := (S_{hijk1}, ..., S_{hijk(\ell_1-1)})$ and $C_{hijkm} := (S_{hijk\ell_m},..., S_{hijkL})$ may not contain the full $t$ cycles; these chunks are handled separately and $C_{hijkm}$ is reversed so that it has the same orientation as $C_{hijk1}$ - this allows for a bit more flexibility in generating the sequence ends using the cycle bootstrap.

Bootstrap Signature Creation\label{threshold-bootstrap-alg}

Then a signature generated from the threshold bootstrap, $S^\ast$, of target length $\mathscr{T}_t^\ast$ is assembled from $Z^\ast$ chunks as follows:

\begin{enumerate} \item The sign of the sequence, $a$, is selected randomly from $-1, +1$, where $-1$ means that full chunks will initially be negative and the last value will be positive. \item Boundary chunks $C^\ast_0, C^\ast_Z$ are sampled from the collection of boundary chunks. These chunks have length $\mathscr{L}0$ and $\mathscr{L}_Z$ respectively. \item The total number of missing values at the beginning of the boundary chunks, $m^\ast$ is calculated. \item $C^\ast_0$ is arranged so as to be compatible with the chosen sign $a$. \item $C^\ast_Z$ is reversed and arranged to be compatible with the chosen sign $a$. \item The list of all complete chunks ${C}{0 < l < L}$ is permuted and truncated such that the cumulative length of the chunks is less than the remaining length excluding missing values, $\mathscr{L}{0} + \mathscr{L}{Z} - m^\ast$, or the first chunk in the case that the first chunk selected is longer than the remaining length. \item The chunks are assembled by index \item The sequence is either truncated or padded with NAs so that it has target length $\mathscr{T}_t^\ast$ \end{enumerate}

Generation of Known Non-Match Sequences

For known non-matching sequences, the input sequences $S$ are the computed signatures from each of the lands in the collection of imaged data and are indexed by $i$, the barrel number, $j$, the sequence number for each barrel $i$, and $k$, the land number.

barrel_align_cycle_split <- barrel_align_plotable %>% 
  select(id, .chunk, xold, line_id, Barrel, LandSet) %>%
  group_by(id, .chunk) %>% 
  mutate(chunksplit = (row_number() == 1 & .chunk != 0)) %>%
  ungroup() %>%
  filter(chunksplit) %>%
  mutate(xold = xold - .5)

ggplot() + 
  theme_bw() +
  facet_grid(line_id~LandSet, labeller = label_both) +
  geom_line(aes(x = x, y = sigold), data = filter(barrel_align_plotable, Barrel == 1, LandSet == 'a')) + 
  geom_vline(aes(xintercept = xold), data = filter(barrel_align_cycle_split, Barrel == 1, LandSet == 'a')) + 
  theme(axis.title = element_blank(), axis.text.y = element_blank()) + 
  ggtitle("Cycle Breaks for Hamby Barrel 1 Aligned LEAs")

As signatures from the same physical land in the same barrel have similar features, we need to ensure that the bootstrap sampling scheme used does not allow for incidental overlap due to the fact that our signatures are not independent. As shown in \Cref{fig:aligned-chunks}, the chunk boundaries of aligned sequences tend to be aligned, though the correspondence is not exact due to slight variations in how the signature median intersects the signature. Even with these variations, simple random sampling from the entire set of available signature chunks would introduce the potential for unacceptable correspondence between sequences which are intended to be non-matching. Thus, a more restrictive sampling method which does not allow for the possibility of sampling from signatures originating from the same source barrel is necessary.

The simplest way to ensure non-matching sequences is to compare signatures sampled from a collection of barrels $G_1$ to signatures sampled from a collection of barrels $G_2$, where $G_1 \cap G2 = \emptyset$. This ensures that incidental matches from different signatures originating from the same land of the same barrel do not occur, because same-barrel comparisons are not possible under this sampling strategy. Any arrangement such that generated sequences $S^\ast_1$ and $S^\ast_2$ are do not contain chunks from the same barrel is permissible, however, it is much simpler to consider a sampling scheme where $S^\ast_1$ is generated from chunks originating from barrels in $G_1$ and $S^\ast_2$ is generated from chunks originating from barrels in $G_2$ than to consider the more general case.

Generation of Known Match Sequences

Generation of known match sequences is somewhat more complicated, in part because it is necessary to generate two separate sequences: the pattern which might be thought of as originating from the physical object (in this case, toolmarks on the lands of the barrel), which is consistent across sequences, and the variability around this pattern which reflects the differences in the firing process, any damage to the bullet, and any measurement error. To generate these two separate sequences, we must first have a set of co-aligned signatures from the same land of the same barrel. These sets of lands will be used to generate a set of consensus sequences which will be resampled to generate the underlying structure corresponding to the physical object. Deviations from the consensus sequence will be resampled separately to produce individual signature differences from the consensus sequence. In both cases, resampling will take place using the general threshold bootstrap framework described in \Cref{threshold-bootstrap-alg}.

hb2 <- filter(barrel_align_plotable, Barrel == 2) %>%
  mutate(LandSet = paste0("Aligned Land ", match(LandSet, letters), "*"))

hb2m <- hb2 %>% group_by(LandSet, x) %>% summarize(na = sum(is.na(sig)), 
                                                   sig = mean(sig, na.rm = T)) %>%
  filter(na < 4)

ggplot(data = hb2, aes(x = x, y = sig)) + 
  geom_line(aes(color = interaction(set, Bullet)), data = hb2) +  
  geom_line(data = hb2m) +  
  facet_wrap(~LandSet, nrow = 4) + 
  scale_color_discrete(guide = 'none') + 
  scale_y_continuous(expression(paste("Signature Height (", mu, "m)")), limits = c(-5, 5)) + 
  scale_x_continuous("") + 
  ggtitle("Hamby Barrel 2 Consensus Sequences")

Dividing the sequences shown in \Cref{fig:known-match-align} into the mean structure and variability around the mean produces two separate sets of information which can be resampled separately. \svp{Cite Basu's theorem saying these are point-wise independent?}

hb2 <- hb2 %>%
  left_join(rename(hb2m, mean_sig = sig))

hb2 <- hb2 %>% 
  mutate(variability = sig - mean_sig)

hb2long <- hb2 %>%
  pivot_longer(c(mean_sig, variability), names_to = "type", values_to = "value") %>%
  mutate(type = str_remove(type, "_sig"))

p1 <- ggplot(data = filter(hb2long, str_detect(LandSet, "[1-3]")), aes(x = x, y = value, group = interaction(set, Bullet))) + 
  geom_line(alpha = .5) + 
  coord_cartesian(ylim = c(-5, 5), xlim = c(-200, 3000)) + 
  facet_grid(type ~ LandSet) + 
  scale_y_continuous(expression(paste("Height (", mu, "m)"))) + 
  scale_x_continuous("") + 
  ggtitle("Hamby Barrel 2 Consensus Sequences and Variability")
p2 <- ggplot(data = filter(hb2long, str_detect(LandSet, "[4-6]")), aes(x = x, y = value, group = interaction(set, Bullet))) + 
  geom_line(alpha = .5) + 
  coord_cartesian(ylim = c(-5, 5), xlim = c(-200, 3000)) + 
  facet_grid(type ~ LandSet) + 
  scale_y_continuous(expression(paste("Height (", mu, "m)"))) + 
  scale_x_continuous("") + 
  theme(plot.title = element_blank())

gridExtra::grid.arrange(p1, p2, ncol = 1)

# Goal: Show minimal association between mean and sd with partitions like this

hball <- barrel_align_plotable %>%
  mutate(LandSet = paste0("Aligned Land ", match(LandSet, letters), "*"))

hballm <- hball %>% 
  group_by(LandSet, x) %>% summarize(na = sum(is.na(sig)), 
                                     sig = mean(sig, na.rm = T)) %>%
  filter(na < 4)

hball <- hball %>% 
  left_join(rename(hballm, mean_sig = sig)) %>% 
  mutate(variability = sig - mean_sig)

decorrelated_res <- filter(hball, x %% 120 == 15) %>% # Take every nth point to decorrelate
group_by(Barrel, LandSet, x) %>% 
summarize(mean = mean(mean_sig), sd = sd(sig)) %>%
  filter(!is.na(mean)) %>%
  group_by(Barrel, LandSet) %>%
  mutate(n = n(), acf = purrr::map(mean, acf, plot = F))

decorrelated_res %>%
ggplot(aes(x = mean, y = sd)) + geom_point() + 
  geom_smooth()

summary(lm(sd ~ mean, data = decorrelated_res))
hball %>%
  group_by(Barrel, LandSet, x) %>%
  summarize(n = sum(!is.na(sig)), mean = mean(sig), sd = sd(sig)) %>%
  filter(n > 3) %>% 
  pivot_longer(c(mean, sd), names_to = "type", values_to = "value") %>%
ggplot(aes(x = x, y = value, color = type, group = interaction(type, Barrel))) + 
  geom_line(alpha = .5) + 
  coord_cartesian(ylim = c(-5, 5), xlim = c(-200, 3000)) + 
  facet_grid(Barrel ~ LandSet) + 
  scale_y_continuous(expression(paste("Height (", mu, "m)"))) + 
  scale_x_continuous("")

Consecutive Matching Striae and resampling

Theoretical probability of signature overlap due to same-piece resampling

Computational assessment

Case study: Resampling Hamby bullet signatures

Conclusion

srvanderplas/bulletsamplr documentation built on June 10, 2021, 7:55 a.m.

rdrr.io home R language documentation Run R code online

CRAN packages Bioconductor packages R-Forge packages GitHub packages

Note that we can't provide technical support on individual packages. You should contact the package authors for that.

srvanderplas/bulletsamplr
Resample bullet signatures

In srvanderplas/bulletsamplr: Resample bullet signatures

Introduction

Statistical Analysis of Bullet Land Engraved Areas

Threshold Bootstrap

Source Data

Threshold Bootstrap Procedure

Chunk Creation

Bootstrap Signature Creation\label{threshold-bootstrap-alg}

Generation of Known Non-Match Sequences

Generation of Known Match Sequences

Consecutive Matching Striae and resampling

Theoretical probability of signature overlap due to same-piece resampling

Computational assessment

Case study: Resampling Hamby bullet signatures

Conclusion

R Package Documentation

Browse R Packages

We want your feedback!

srvanderplas/bulletsamplr Resample bullet signatures

In srvanderplas/bulletsamplr: Resample bullet signatures

Introduction

Statistical Analysis of Bullet Land Engraved Areas

Threshold Bootstrap

Source Data

Threshold Bootstrap Procedure

Chunk Creation

Bootstrap Signature Creation\label{threshold-bootstrap-alg}

Generation of Known Non-Match Sequences

Generation of Known Match Sequences

Consecutive Matching Striae and resampling

Theoretical probability of signature overlap due to same-piece resampling

Computational assessment

Case study: Resampling Hamby bullet signatures

Conclusion

R Package Documentation

Browse R Packages

We want your feedback!

srvanderplas/bulletsamplr
Resample bullet signatures