phenoptr: inForm Helper Functions

Documented in count_touching_cells

# Find, count and image touching cells by morphological analysis
# of the membrane mask.


#' Find and count touching cells for pairs of phenotypes.
#'
#' `count_touching_cells` uses morphological analysis of nuclear and
#' membrane segmentation maps to find touching cells of paired phenotypes.
#' It reports the number of touching cells found and, optionally,
#' writes image files showing the touching cells.
#'
#' This function requires a cell seg data file and a matching
#' segmentation map file. If `write_images` is true,
#' a composite image is required.
#' If the cell seg data uses micron units, a composite data file is also
#' required.
#'
#' Cells are considered to touch if they have any amount of common membrane
#' as determined by the inForm membrane segmentation. Cells which meet only
#' at a corner, like black squares on a checkerboard, are not counted as
#' touching.
#'
#' The number of touching cells is reported in three ways. For a pair of
#' phenotypes A and B, this function reports the number of A touching a B,
#' the number of B touching an A, and the number of mutually touching pairs.
#' Note that the number of mutual touches is often larger than the count of
#' either "A touching B" or "B touching A" because a single touching cell
#' may be part of multiple pairs.
#'
#' The image files written show cells of the selected phenotypes on a background
#' of the composite. Touching cells are filled in the provided color;
#' cells which are not touching the other phenotype are outlined.
#' Image files are written as
#' TIFF files to preserve the fine detail of cell boundaries.
#'
#' Images are only written when both phenotypes of the pair are represented.
#'
#' See the tutorial
#' [Selecting cells within a cell segmentation table](https://akoyabio.github.io/phenoptr/articles/selecting_cells.html)
#' for more on
#' the use of `pairs` and `phenotype_rules`.
#'
#' @param cell_seg_path The path to the cell seg data file. The same directory
#'   must also contain `_memb_seg_map.tif` or `_binary_seg_maps.tif` and, if
#'   `write_images` is true, a  TIFF or JPEG composite image from inForm.
#' @param pairs A list of pairs of phenotypes. Each entry is a two-element
#'   vector. The result will contain one line for each pair showing the
#'   number of cells and number of touches. Pairs must match two different
#'   sets of cells; in particular, touches of a phenotype to itself are
#'   not supported.
#' @param colors A named list of phenotype colors to use when drawing
#'   the output. Only used when `write_images` is `TRUE`.
#' @param phenotype_rules (Optional) A named list.
#'   Item names are phenotype names and must match entries in `pairs`.
#'   Item values are selectors for [select_rows].
#' @param categories If given, a vector or list of tissue category names.
#' Categories not in the list will be excluded from the analysis.
#' @param write_images If `TRUE`, for each pair, write an image showing the
#' touching pairs. Requires `colors` and a composite image in the same
#' directory as the cell seg table.
#' @param output_base Base path for image output.
#' If `NULL`, output will be to the same
#' directory as the cell table.
#' @return Returns a `tibble` with one row for each pair in
#' `pairs`, containing these columns:
#'   \describe{
#'    \item{\code{slide_id}}{Slide ID from the data file, if available.}
#'    \item{\code{source}}{Base file name of the source file with
#'    `_cell_seg_data.txt` stripped off for brevity.}
#'    \item{\code{phenotype1}}{The name of the first phenotype in
#'    the touching pair.}
#'    \item{\code{phenotype2}}{The name of the second phenotype in
#'    the touching pair.}
#'    \item{\code{total1}}{The total number of `phenotype1` cells
#'    in the image.}
#'    \item{\code{total2}}{The total number of `phenotype2` cells
#'    in the image.}
#'    \item{\code{p1_touch_p2}}{The number of `phenotype1` cells
#'    touching a `phenotype2` cell.}
#'    \item{\code{p2_touch_p1}}{The number of `phenotype2` cells
#'    touching a `phenotype1` cell.}
#'    \item{\code{touch_mutual}}{The number of mutually touching pairs.}
#'  }
#' @examples
#' \dontrun{
#' # This example creates an image in a subdirectory of the
#' # current user's directory.
#' cell_seg_path <- sample_cell_seg_path()
#'
#' pairs <- list(c("CD68+", "CD8+"))
#' colors <- c("CD68+"='magenta', "CD8+"='yellow')
#' output_base <- path.expand('~/touches')
#'
#' count_touching_cells(cell_seg_path, pairs, colors,
#'   output_base=output_base)
#'
#' # This example will count and image all files in the `base_path` directory.
#' base_path <- '/path/to/data'
#' output_base <- file.path(base_path, 'touches')
#' files <- list_cell_seg_files(base_path)
#'
#' # The phenotype pairs to locate. This will find CD8 cells touching
#' # tumor cells, and, separately, CD8 cells touching CD68 cells.
#' pairs <- list(c("CD8+", "CK+"),
#'              c("CD8+", "CD68+"))
#'
#' # Colors for all the phenotypes mentioned in pairs
#' colors <- list(
#'   'CD8+' = 'yellow',
#'   'CK+' = 'cyan',
#'   'CD68+' = 'magenta'
#' )
#'
#' # Count and visualize touching cells
#' touch_counts <- purrr::map_df(files, function(path) {
#'   cat('Processing', path, '\n')
#'   count_touching_cells(path, pairs, colors, output_base=output_base)
#' })
#'
#' # Save the result
#' touches_path <- file.path(output_base, 'TouchCounts.csv')
#' readr::write_csv(touch_counts, touches_path)
#'
#' # The phenotype definitions can be more complex. The default is to use
#' # the names in `pairs`. Using `phenotype_rules`, the definition can be
#' # anything allowed by select_rows().
#'
#' # You can also limit the tissue category.
#'
#' # For example, find all touches between lymphocytes and tumor cells
#' # within the tumor:
#' pairs <- list(c('Tumor', 'Lymphocyte'))
#' colors <- list(Tumor='cyan', Lymphocyte='yellow')
#' phenotype_rules <- list(
#'   Lymphocyte=c('CD8+', 'FoxP3+')
#' )
#'
#' touch_counts <- map_df(files, function(path) {
#'   cat('Processing', path, '\n')
#'   count_touching_cells(path, pairs, colors, phenotype_rules,
#'                        categories='tumor',
#'                        output_base=output_base)
#' })
#'
#' # Then write the results as above.
#' }
#' @md
#' @export
#' @family distance functions
#' @importFrom magrittr %>%
count_touching_cells <- function(cell_seg_path, pairs, colors=NULL,
                                phenotype_rules=NULL, categories=NULL,
                                write_images=!is.null(colors), output_base=NULL)
{
  # Allow a single pair to be specified as a plain vector
  if (is.character(pairs) && length(pairs)==2)
    pairs = list(pairs)

  # Don't allow pairs of self to self
  equal_pairs = purrr::map_lgl(pairs, ~.x[1]==.x[2])
  if (any(equal_pairs)) {
    warning('count_touching_cells does not count self touching self.\n',
            'Omitting ',
            paste(purrr::map_chr(pairs[equal_pairs],
                                 ~paste(.x, collapse=' - ')),
                  collapse=', '))
    pairs = pairs[!equal_pairs]
  }

  # Make phenotype_rules for any not already specified
  phenotypes = unique(do.call(c, pairs))
  phenotype_rules = make_phenotype_rules(phenotypes, phenotype_rules)

  # Check the requirements for writing images, if requested
  if (write_images) {
    if (is.null(colors))
      stop('count_touching_cells requires colors when write_images is TRUE.')
    if (!all(phenotypes %in% names(colors)))
      stop('count_touching_cells requires colors for all phenotypes.')

    composite_path = find_composite_image(cell_seg_path)
  }

  # Make the output directory
  if (!is.null(output_base) && !file.exists(output_base))
    dir.create(output_base, showWarnings=FALSE, recursive=TRUE)

  # Read the data. We don't want to convert to microns here, we need
  # image coordinates
  name = basename(cell_seg_path)
  name = sub('_cell_seg_data.txt', '', name)
  csd = read_cell_seg_data(cell_seg_path, pixels_per_micron=NA)
  csd = force_pixel_locations(csd, cell_seg_path)

  # Filter out unwanted tissue categories
  if (!is.null(categories)) {
    if (!'Tissue Category' %in% names(csd))
      stop('Cell seg data does not include "Tissue Category" column.')
    csd = csd %>% dplyr::filter(`Tissue Category` %in% categories)
  }

  slide = ifelse('Slide ID' %in% names(csd),
                 as.character(csd[1, 'Slide ID']), NA)

  masks = read_masks(cell_seg_path)

  # Make images for the cells in each phenotype by filling in the membrane mask
  # at each cell, then removing the membrane.
  # Fill with a unique ID value for each cell (here, the cell ID)
  cell_images = lapply(phenotypes, function(phenotype) {
    rule = phenotype_rules[[phenotype]]
    d = csd[select_rows(csd, rule), ]
    make_cell_image(d, masks$nuclei, masks$membrane)
  })
  names(cell_images) = phenotypes

  # Will be a data frame with counts etc
  result = NULL

  # Process each pair of phenotypes
  for (pair in pairs) {
    # Get the names and images for each phenotype
    pheno_name1 = pair[1]
    cell_image1 = cell_images[[pheno_name1]]

    pheno_name2 = pair[2]
    cell_image2 = cell_images[[pheno_name2]]

    p1_count = sum(select_rows(csd, phenotype_rules[[pheno_name1]]))
    p2_count = sum(select_rows(csd, phenotype_rules[[pheno_name2]]))

    touches_found = 0

    if (p1_count == 0 || p2_count == 0) {
      # No data for one of the phenotypes in this pair
      # Report empty result and go on
      result = rbind(result,
                   tibble::tibble(
                      slide_id=slide,
                      source=name,
                      phenotype1=pheno_name1,
                      phenotype2=pheno_name2,
                      total1=p1_count,
                      total2=p2_count,
                      p1_touch_p2=0,
                      p2_touch_p1=0,
                      touch_pairs=0))
      if (write_images)
        warning('No image for ', name, ', ', pheno_name1,
                ' touching ', pheno_name2)
      next
    }

    # We need a bigger dilation kernel if the membrane is two pixels wide
    extra_size = ifelse(masks$membrane_width==1, 0, 2)
    touch_pairs = find_touching_cell_pairs(cell_image1, cell_image2, extra_size)

    # Need individual IDs for imaging and counting
    touching_ids = list(unique(touch_pairs[, 1]), unique(touch_pairs[, 2]))
    p1_touching_count = length(touching_ids[[1]])
    p2_touching_count = length(touching_ids[[2]])
    touches_found = nrow(touch_pairs)

    result = rbind(result,
                 tibble::tibble(
                    slide_id=slide,
                    source=name,
                    phenotype1=pheno_name1,
                    phenotype2=pheno_name2,
                    total1=p1_count,
                    total2=p2_count,
                    p1_touch_p2=p1_touching_count,
                    p2_touch_p1=p2_touching_count,
                    touch_pairs=touches_found))

    if (write_images) {
      composite = make_touching_image(pheno_name1, pheno_name2,
                                      cell_image1, cell_image2,
                                      touching_ids, masks,
                                      composite_path, colors)

      tag = paste0(replace_invalid_path_characters(pheno_name1, '_'), '_touch_',
                   replace_invalid_path_characters(pheno_name2, '_'), '.tif')
      composite_out = sub('composite_image.(tif|jpg)', tag, composite_path)
      if (!is.null(output_base))
        composite_out = file.path(output_base, basename(composite_out))

      EBImage::writeImage(composite, composite_out, compression='LZW')
    }
  }

  # Return the data
  result
}

# Find the composite image as TIFF or JPEG
find_composite_image = function(cell_seg_path) {
  composite_path =
    sub('cell_seg_data.txt', 'composite_image.tif', cell_seg_path)
  if (!file.exists(composite_path))
    composite_path = sub('tif', 'jpg', composite_path)

  if (!file.exists(composite_path))
    stop('count_touching_cells requires a matching TIFF or JPEG ',
         'composite image when write_images is TRUE.')

  composite_path
}

# Read the membrane and nuclear masks.
# Convert the membrane to single values.
# Use 0.5 to avoid conflict with cell labeling
# Returns a list with `nuclei`, `membrane` and `membrane_width` members.
read_masks = function(cell_seg_path) {
  mask_path = sub('cell_seg_data.txt', 'memb_seg_map.tif', cell_seg_path)

  if (file.exists(mask_path)) {
    # Old-style membrane mask
    membrane = EBImage::readImage(mask_path)
    membrane_width = 1 # Old-style membrane mask is always one pixel thick
    membrane[membrane>0] = 0.5

    nuc_path = sub('cell_seg_data.txt', 'nuc_seg_map.tif', cell_seg_path)
    if (!file.exists(nuc_path))
      stop('nuc_seg_map file not found.')

    # Don't use readImage to read nuclear mask, it converts to 0-1 scale!
    nuclei = tiff::readTIFF(nuc_path, as.is=TRUE)
    nuclei = t(nuclei)
  } else {
    mask_path = sub('cell_seg_data.txt', 'binary_seg_maps.tif', cell_seg_path)
    if (!file.exists(mask_path))
      stop('count_touching_cells requires a segmentation map file.')

    masks = read_maps(mask_path)
    if (!all(c('Membrane', 'Nucleus') %in% names(masks)))
      stop('binary_seg_maps file must contain ',
           'nuclear and membrane segmentation.')

    membrane = masks[['Membrane']]

    # New-style membrane masks may be two pixels thick. If so, it will be
    # a label image with max > 1. Otherwise it will be a simple 0/1 mask image.
    membrane_width = ifelse(max(membrane) > 1, 2, 1)

    nuclei = masks[['Nucleus']]
    rm(masks)
    membrane[membrane>0] = 0.5
    membrane = t(membrane)
    nuclei = t(nuclei)
  }

  # inForm cell seg doesn't always draw the membrane to the edge of the image.
  # That is a disaster for flood fill. Draw a border around the membrane
  # to prevent this.
  membrane[1, ] = membrane[, 1] =
    membrane[dim(membrane)[1], ] =
    membrane[, dim(membrane)[2]] = 0.5

  list(nuclei=nuclei, membrane=membrane, membrane_width=membrane_width)
}

# Given a data frame of cells and membrane and nuclear masks,
# make a label image with a region for each cell. Returns NULL if d is empty.
make_cell_image <- function(d, nuclei, membrane) {
  stopifnot('Cell ID' %in% names(d))

  if (nrow(d)==0) return(NULL) # No cells in this data

  image = membrane
  cell_ids = d$`Cell ID`

  # Find interior points of all cells
  nuc_locations = purrr::map(cell_ids, ~find_interior_point(nuclei, .x))

  # Filter out missing cells
  cell_ids = cell_ids[!is.null(nuc_locations)]
  nuc_locations = nuc_locations[!is.null(nuc_locations)]

  # Fill each cell with its ID
  if (utils::packageVersion('EBImage') >= '4.19.9') {
    # Optimized version only needs one call to floodFill
    image = EBImage::floodFill(image, nuc_locations,
                               as.list(cell_ids))
  } else {
    # Older version - one call per fill
    for (i in seq_len(length(nuc_locations))) {
      image = EBImage::floodFill(image, nuc_locations[i], cell_ids[i])
    }
  }

  # Remove the membrane outlines
  image[membrane==0.5] = 0
  image
}

# This uses a distance map to find the interior-most point in a nucleus
find_interior_point = function(nuclei, cell_id) {
  # Locate the nucleus in the overall map and extract it as a patch.

  # Where is the nucleus?
  nuc_locations = which(nuclei==cell_id, arr.ind=TRUE, useNames=FALSE)
  if (nrow(nuc_locations)==0) return(NULL) # Didn't find this cell

  # Figure out the bounds of the patch. Include a 1-pixel border if possible
  row_min = max(1, range(nuc_locations[, 1])[1]-1)
  row_max = min(dim(nuclei)[1], range(nuc_locations[, 1])[2]+1)
  row_range =  seq.int(row_min, row_max)

  col_min = max(1, range(nuc_locations[, 2])[1]-1)
  col_max = min(dim(nuclei)[2], range(nuc_locations[, 2])[2]+1)
  col_range =  seq.int(col_min, col_max)

  # Extract the actual patch
  n = nuclei[row_range, col_range]

  # Clear out extra stuff and a border and compute a distance map on the patch.
  # Clearing the border prevents finding a max on the edge of the full image.
  n[1, ] = n[, 1] = n[dim(n)[1], ] = n[, dim(n)[2]] = n[n!=cell_id] = 0
  dm = EBImage::distmap(n)

  # Where is the interior-most point? We are happy to take the first one.
  ix_raw = which.max(dm)

  # Arrays are stored column-wise in R
  col_length = dim(dm)[1]
  row = ix_raw %% col_length # modulus
  col = ix_raw %/% col_length + 1 # integer division

  # Offset by the patch origin. Subtract one because we are adding
  # two one-based indices
  c(row+row_min-1, col+col_min-1)
}

# Given two cell images (from make_cell_image), find the IDs of the cells in
# each image that touch each other. Returns a matrix with two columns,
# the cell numbers in i1 and i2
find_touching_cell_pairs <- function(i1, i2, extra_size) {
  # Dilate i1 and look for intersections with i2
  # Order doesn't matter here, we are looking for kissing pairs.
  i1_big = EBImage::dilate(i1, EBImage::makeBrush(3, shape='diamond'))
  i2_big = EBImage::dilate(i2,
                           EBImage::makeBrush(3+extra_size, shape='diamond'))

  # Find p1s touching p2s as pairs.
  overlap = cbind(as.numeric(i1_big), as.numeric(i2_big))
  overlap = overlap[overlap[, 1]>0.1 & overlap[, 2]>0.1, , drop=FALSE]
  overlap = unique(overlap)
  overlap
}

make_touching_image <- function(pheno_name1, pheno_name2,
                                cell_image1, cell_image2,
                                touching_ids, masks,
                                composite_path, colors) {
  # Make a pretty picture showing the touch points.
  # First make a mask containing just the touching cells by searching for
  # the touching IDs in the original cell images. Then dilate to overlap the
  # membrane mask and mask out anything not in the membrane mask.
  both = EBImage::Image(dim=dim(cell_image1))
  both[cell_image1 %in% touching_ids[[1]]] = 1
  both[cell_image2 %in% touching_ids[[2]]] = 1

  kern3 = EBImage::makeBrush(3, shape='diamond')
  both = EBImage::dilate(both, kern3)
  both[masks$membrane==0] = 0

  composite = EBImage::readImage(composite_path, all=FALSE)
  if (!is.null(colors)) {
    # If we have colors, outline all cells of a type
    # and fill the touching cells
    i1_touching = cell_image1
    i1_touching[!cell_image1 %in% touching_ids[[1]]] = 0

    composite = EBImage::paintObjects(cell_image1, composite,
                            col=c(colors[[pheno_name1]], NA))
    composite = EBImage::paintObjects(i1_touching, composite,
                            col=c(colors[[pheno_name1]], colors[[pheno_name1]]))

    i2_touching = cell_image2
    i2_touching[!cell_image2 %in% touching_ids[[2]]] = 0

    composite = EBImage::paintObjects(cell_image2, composite,
                           col=c(colors[[pheno_name2]], NA))
    composite = EBImage::paintObjects(i2_touching, composite,
                           col=c(colors[[pheno_name2]], colors[[pheno_name2]]))
  }

  # Draw the cell outlines onto the composite image
  composite = EBImage::paintObjects(both, composite, col='white')
  composite
}

# Replace invalid path characters
# This replaces Windows invalid characters and space.
# @param s A string to clean
# @param repl The replacement character
replace_invalid_path_characters = function(s, repl) {
  re = '[<>:"/\\|?* ]'
  if (grepl(re, repl)) stop('Replacement character contains invalid characters')
  gsub(re, repl, s)
}