In srvanderplas/ShoeScrubR: Functions for cleaning data from CSAFE's Longitudinal Shoe Study

knitr::opts_chunk$set(echo = F, cache = T, cache.lazy = F, dpi = 300)
library(EBImage)
library(ShoeScrubR)
library(tidyverse)

set.seed(3142095)

lss_dir <- "/lss/research/csafe-shoeprints/ShoeImagingPermanent"

# For a bunch of images...
full_imglist <- list.files("/lss/research/csafe-shoeprints/ShoeImagingPermanent/",
                           pattern = "0[01]\\d{4}[RL]_\\d{8}_5_1_1_.*_.*_.*", full.names = T)
dir <- "/tmp/film-prints"
if (!dir.exists(dir)) dir.create(dir)

file.copy(full_imglist, file.path(dir, basename(full_imglist)))
imglist <- file.path(dir, basename(full_imglist))

shoe_info <- read_csv("~/Projects/CSAFE/2018_Longitudinal_Shoe_Project/Clean_Data/shoe-info.csv") %>%
  filter(ShoeID %in% str_sub(basename(imglist), 1, 3)) %>%
  select(ShoeID, Brand, Size) %>%
  mutate(Size = str_remove(Size, "[ MW]") %>% parse_number()) %>%
  crossing(tibble(Mask_foot = c("R", "L"), Shoe_foot = c("L", "R"))) %>%
  mutate(mask = purrr::pmap(list(Brand, Size, Mask_foot, ppi = 300), shoe_mask))

scan_info <- tibble(
  file = imglist,
  ShoeID = str_extract(basename(file), "^\\d{3}"),
  Shoe_foot = str_extract(basename(file), "\\d{6}[RL]") %>% str_remove_all("\\d"),
  date = str_extract(basename(file), "\\d{8}") %>% parse_date(format = "%Y%m%d")
) %>%
  left_join(select(shoe_info, ShoeID, Brand, Size, Shoe_foot)) %>%
  group_by(Shoe_foot, Brand) %>%
  sample_n(2) %>%
  ungroup() %>%
  group_by(ShoeID, Shoe_foot) %>%
  # arrange(desc(date)) %>%
  # filter(row_number() == 1) %>%
  ungroup() %>%
  mutate(
    img = purrr::map(file, EBImage::readImage, all = F),
    img = purrr::map(img, EBImage::channel, "luminance"),
    im_dim = purrr::map(img, dim)
  )

Parameters in the Rough Alignment algorithm

Image Exaggeration

Gaussian blur
- kernel size
- kernel shape
- sigma
Threshold for binarization
Opening diameter
Closing diameter

Selection of Binarization Threshold

par(mfrow = c(1, 2))
plot(scan_info$img[[1]])
hist(scan_info$img[[1]])

Use EM Algorithm - each image is a mixture of signal and noise pixels, with different intensity distributions.

em_thresh <- function(img, scale = 10, ngroups = 3) {
  imsmall <- img_resize(img, w = floor(dim(img)[1]/scale), h = floor(dim(img)[2]/scale))
  em <- mixtools::normalmixEM(imsmall, k = ngroups)
  values <- cbind(em$x, em$posterior)
  # values <- values[order(values[,1]),]
  # values[values[,2]*100 < values[,3], ]

  mean_idx <- order(em$mu, decreasing = F)

  # get probs for full img
  img_dens_functions <- lapply(mean_idx, function(x) 0*img + dnorm(img, mean = em$mu[x], sd = em$sigma[x]))

  img_dens_total <- abind::abind(img_dens_functions, along = 3) %>%
    apply(c(1, 2), sum)

  img_dens_ratio <- lapply(img_dens_functions, function(x) as.Image(x/(img_dens_total - x)))

  return(list(img_densities = img_dens_functions, img_ratios = img_dens_ratio, em = em))
}

emt_adidas <- em_thresh(scan_info$img[[1]])
emt <- em_thresh(scan_info$img[[2]])
Image(abind::abind(emt_adidas$img_ratios, along = 3)) %>% plot(all = T, nx = 3)
Image(abind::abind(emt$img_ratios, along = 3)) %>% plot(all = T, nx = 3)

The first image shows pixels(in white) identified as most likely providing signal data. The third image shows pixels identified as most likely belonging to the background. The second image shows intermediate pixels, which are not necessarily consistent with background or signal.

One primary advantage of using only the first image is that it is nearly discrete - the values are the likelihood ratio of belonging to signal vs. other (background + intermediate), and are almost completely separated between infinite and 0.

df <- tribble(~type, ~values,
              "adidas", as.numeric(emt_adidas$img_ratios[[1]]),
              "nike", as.numeric(emt$img_ratios[[1]])) %>%
  unnest(values)

df.inf <- df %>%
  group_by(type) %>%
  summarize(inf = sum(is.infinite(values)))

ggplot() + 
  geom_histogram(aes(x = values), data = df, binwidth = .1) + 
  scale_x_log10() + 
  geom_text(aes(x = 1, y = Inf, label = sprintf("%d infinite values", inf)), data = df.inf, hjust = 0, vjust = 1.2) + 
  facet_wrap(~type)

This allows us to get away from selecting a single threshold value in favor of computing an implicit automatic threshold via the EM algorithm.

Diameter selection

By run length

Idea - use rle to count runs in rows and columns to get an approximate idea of which parameters to use for tuning. E.g. using binarized image, runs of 0 will be either relatively short (adidas stripes, between Nike blobs) or relatively long (outside of the shoe).

bin_img <- normalize(emt$img_ratios[[1]])
bin_img[is.nan(bin_img)] <- 1

# bin_img[1000,] %>% as.numeric() %>% rle() %>% do.call("bind_cols", .) %>% filter(values == 1) %>% `[[`(1) %>% median()

col_runs <- apply(bin_img, 1, function(x) rle(x) %>% do.call("bind_cols", .)) %>%
  purrr::map2_df(., 1:length(.), ~dplyr::mutate(.x, idx = .y, type = "col", prop_lengths = lengths/dim(bin_img)[2]))

row_runs <- apply(bin_img, 2, function(x) rle(x) %>% do.call("bind_cols", .)) %>%
  purrr::map2_df(., 1:length(.), ~dplyr::mutate(.x, idx = .y, type = "row", prop_lengths = lengths/dim(bin_img)[1]))

runs <- bind_rows(col_runs, row_runs) %>%
  group_by(type) %>%
  mutate(pos = idx/max(idx)) %>%
  ungroup()

In this plot, there is a cluster of 0-runs of length 100(ish) and 250(ish), while the 1-runs are at most 60 pixels long, and usually closer to 10-20. The better question is how do we "see" that automatically?

ggplot(data = runs, aes(y = lengths, x = pos, fill = factor(values))) + geom_jitter(alpha = .05) + facet_wrap(~type + values, scales = 'free')

By blob size

# Clean up just enough to label well
bin_img_clean <- bin_img %>%
  erode(makeBrush(size = 1)) %>%
  dilate(matrix(c(1, 0, 1, 0, 1, 0, 1, 0, 1), byrow = T, nrow = 3)) %>%
  dilate(makeBrush(size = 3, shape = "line")) %>%
  # dilate(makeBrush(size = 3)) %>%
  erode(makeBrush(size = 3, shape = "disc")) %>%
  dilate(makeBrush(size = 9, shape = "disc"))

bin_img_label <- bwlabel(bin_img) %>%
  fillHull()
bin_img_clean_label <- bwlabel(bin_img_clean) %>%
  fillHull()

label_counts <- table(bin_img_label) %>% as.data.frame() %>% set_names(c("label", "freq")) 
tiny_labels <- label_counts %>% filter(freq <= 15)

clean_label_counts <- table(bin_img_clean_label) %>% as.data.frame() %>% set_names(c("label", "freq")) 
tiny_clean_labels <- clean_label_counts %>% filter(freq <= 2 * sum(makeBrush(size = 9, "disc")))

bin_big_clean_labels <- bin_img_clean_label
bin_big_clean_labels[bin_big_clean_labels %in% tiny_clean_labels$label] <- 0

bin_big_labels <- bin_img_label
bin_big_labels[bin_big_labels %in% tiny_labels$label] <- 0

par(mfrow = c(2, 3))
plot(bin_img)
colorLabels(bin_img_label) %>% plot()
colorLabels(bin_big_labels) %>% plot()
plot(bin_img_clean)
colorLabels(bin_img_clean_label) %>% plot()
colorLabels(bin_big_clean_labels) %>% plot()

bind_rows(mutate(label_counts, type = "binary image"),
          mutate(clean_label_counts, type = "cleaned binary image")) %>%
  filter(label != 0) %>%
  ggplot(aes(x = freq, colour = type)) + 
  stat_density(geom = "line") + 
  # coord_cartesian(xlim = c(1, 16000), ylim = c(0, .015)) + 
  scale_x_log10() + 
  xlab("Blob size in pixels")

par(mfrow = c(1, 6))
purrr::walk(c(100, 1000, 2500, 5000, 10000, 15000), function(z) {
  interesting_blobs <- clean_label_counts %>% filter(freq > z) 
  small_interesting_blobs_clean <- bin_img_clean_label * (bin_img_clean_label %in% as.numeric(as.character(interesting_blobs$label)))

  colorLabels(small_interesting_blobs_clean) %>% plot()
  text(x = dim(bin_img)[1]/2, y = dim(bin_img)[2] - 300, label = sprintf("Blobs bigger \n than %d", z), col = "white", adj = c(.5, .5))
})

Unfortunately, filtering by blobs may not be particularly useful when dealing with Adidas shoes.

For a bunch of images...

scan_info <- mutate(scan_info, em = purrr::map(img, em_thresh, ngroups = 3))

scan_info <- mutate(scan_info,
                    bin_img = purrr::map(em, ~{
                      z <- .$img_ratios[[1]]
                      z <- z > 10
                      as.Image(normalize(z))
                    }))

par(mfrow = c(2, 8))
purrr::walk(scan_info$img, plot)
purrr::walk(scan_info$bin_img, plot)

Filtering each image to include only points with a ratio of P(signal)/P(not signal) greater than 10 removes most of the noise in the images without scale-dependent parameters, but the resulting image is a bit fragmented spatially, because the EM algorithm is performed on point intensity only; it does not include any spatial context because including 2 additional dimensions vastly increases memory requirements and computational time.

Some additional cleaning may be useful, but ideally, would use parameters which are small enough relative to the image to not depend on image scaling within reason (e.g. an image which has dimensions 1.5x the length/width of a similar image).

From that point, we can label the image, so that each self contained region has a different numeric label. We can use these regions to facilitate certain additional assumptions:

small regions in the corner of the scan are not shoe
any region which is closed should be filled in
very small specks (e.g. containing fewer pixels than ~2x the maximum dilation brush region) are likely not part of the shoe

easy_clean <- function(img) {
  img %>%
  erode(makeBrush(size = 1)) %>%
  # dilate(matrix(c(1, 0, 1, 0, 1, 0, 1, 0, 1), byrow = T, nrow = 3)) %>%
  dilate(makeBrush(size = 3, shape = "line")) %>%
  # dilate(makeBrush(size = 3)) %>%
  erode(makeBrush(size = 5, shape = "disc")) %>%
  dilate(makeBrush(size = 9, shape = "disc"))
}

clean_img_corners <- function(labeled_img, len = 50) {
  tmp <- bind_rows(
    mutate(as.data.frame(table(labeled_img[1:len, 1:len]), stringsAsFactors = F), corner = "1"),
    mutate(as.data.frame(table(labeled_img[1:len, (ncol(labeled_img) - len):ncol(labeled_img)]), stringsAsFactors = F), corner = "2"),
    mutate(as.data.frame(table(labeled_img[(nrow(labeled_img) - len):nrow(labeled_img), 1:len]), stringsAsFactors = F), corner = "3"),
    mutate(as.data.frame(table(labeled_img[(nrow(labeled_img) - len):nrow(labeled_img), (ncol(labeled_img) - len):ncol(labeled_img)]), stringsAsFactors = F), corner = "4")
  ) %>%
    set_names(c("label", "freq", "corner")) %>%
    mutate(label = as.numeric(as.character(label)))

  pct_label <- tibble(label = unique(tmp$label), pct = lapply(unique(tmp$label), function(x) mean(labeled_img == x)))

  pct_label <- filter(pct_label, pct <= .1)

  labeled_img[labeled_img %in% pct_label$label] <- 0
  labeled_img
}


scan_info <- scan_info %>%
  mutate(clean_img = purrr::map(bin_img, easy_clean),
         clean_img_hull = purrr::map(clean_img, fillHull),
         labeled_img = purrr::map(clean_img, bwlabel),
         labeled_img = purrr::map(labeled_img, clean_img_corners),
         labeled_fill = purrr::map(labeled_img, fillHull),
         label_counts = purrr::map(labeled_fill, ~table(.) %>% as.data.frame(stringsAsFactors = F) %>% set_names(c("label", "freq"))))

par(mfrow = c(1, 8)) 
purrr::walk(scan_info$labeled_fill, ~colorLabels(.) %>% plot())

The default parameters (erode 1px, dilate 3px in a line, erode 5px in a disc, dilate 9px in a disc) seem to work fairly well on both shoe models.

At this point, we can use the labeled image as a mask: if the label is nonzero, we take the value of the pixel in the original image.

scan_info <- mutate(scan_info, censored_img = map2(img, labeled_fill, ~{.x * (.y > 0) + (.y == 0)}))

purrr::walk(scan_info$censored_img, ~plot(normalize(.)))

In order to create an exaggerated mask of the shoe print to align with the default mask, we start by blurring the image using a gaussian filter. Rather than using predetermined parameters, we instead select the sigma of the gaussian distribution to be 1/50th of the size of the dimensions of a square image with the same area as our image. Thresholding the blurred image at the median provides a first pass exaggerated shoe mask.

scan_info <- mutate(scan_info, blurred_img = purrr::map2(censored_img, im_dim, ~gblur(.x, sigma = sqrt(prod(.y))/50, boundary = "replicate")))
par(mfrow = c(3, 8))
purrr::walk(scan_info$blurred_img, plot)
purrr::walk(scan_info$blurred_img, ~plot(. < median(.)))
purrr::walk(scan_info$blurred_img, ~plot(colorLabels(bwlabel(. < median(.)))))

Labeling the resulting image's distinct regions, we can select the region that is not the background with the largest area, and then fill any holes in that region. Finally, another image-area based operation can be conducted: shrinking the image by 1/5 of the sqrt(area) of our image, then enlarging the image by the same amount. This smoothes out the image boundary and also minimizes the effects of interactions with the page border.

scan_info <- mutate(scan_info, exag_mask = purrr::map(blurred_img, ~{
  z <- (. < median(.))
  z <- bwlabel(z)
  labels <- table(z[z!=0]) %>% sort(decreasing = T)
  fillHull(z == as.numeric(names(labels[1]))) %>%
    opening(makeBrush(size = floor(sqrt(length(.))/5), "disc"))
}))
par(mfrow = c(1, 8))
purrr::walk(scan_info$exag_mask, plot)

Image Aggregation

get_subimage <- function(img, row, col, width = 5, height = 5) {
  idx_row <- pmin(nrow(img), pmax(1, row:(row + height)))
  idx_col <- pmin(ncol(img), pmax(1, col:(col + width)))

  img[idx_col, idx_row]
}

get_coord_sample <- function(img, N = 50000) {
  tibble::tibble(row = sample(1:nrow(img), N, replace = T), col = sample(1:ncol(img), N, replace = T)) %>%
    mutate(subimg = purrr::map2(row, col, ~get_subimg(img = img, row = .x, col = .y)))
}

bin_img_sample <- get_coord_sample(bin_img)

srvanderplas/ShoeScrubR documentation built on Nov. 27, 2019, 2:09 p.m.

rdrr.io home R language documentation Run R code online

CRAN packages Bioconductor packages R-Forge packages GitHub packages

Note that we can't provide technical support on individual packages. You should contact the package authors for that.

srvanderplas/ShoeScrubR
Functions for cleaning data from CSAFE's Longitudinal Shoe Study

In srvanderplas/ShoeScrubR: Functions for cleaning data from CSAFE's Longitudinal Shoe Study

Parameters in the Rough Alignment algorithm

Selection of Binarization Threshold

Diameter selection

By run length

By blob size

For a bunch of images...

Image Aggregation

R Package Documentation

Browse R Packages

We want your feedback!

srvanderplas/ShoeScrubR Functions for cleaning data from CSAFE's Longitudinal Shoe Study

In srvanderplas/ShoeScrubR: Functions for cleaning data from CSAFE's Longitudinal Shoe Study

Parameters in the Rough Alignment algorithm

Selection of Binarization Threshold

Diameter selection

By run length

By blob size

For a bunch of images...

Image Aggregation

R Package Documentation

Browse R Packages

We want your feedback!

srvanderplas/ShoeScrubR
Functions for cleaning data from CSAFE's Longitudinal Shoe Study