In akoyabio/phenoptrReports: Create reports using Phenoptics data

suppressPackageStartupMessages(library(dplyr))
library(ggplot2)
suppressPackageStartupMessages(library(purrr))
library(readxl)
suppressPackageStartupMessages(library(tidyr))

knitr::opts_chunk$set(echo=FALSE,fig.width=10, fig.height=5.5, 
                      comment=NA, warning=FALSE, message=FALSE)

workbook_path = params$workbook_path
max_slides_per_plot = params$max_slides_per_plot
max_heatmaps_per_plot = params$max_heatmaps_per_plot
.by_str = params$.by
.by = rlang::sym(.by_str)

# Note: Using readxl rather than openxlsx for reading because
# the check.names parameter of openxlsx::readWorkbook is kind of broken.
# https://github.com/awalker89/openxlsx/issues/102
sheet_names = readxl::excel_sheets(workbook_path)

# Boilerplate for ggplot theming
scale_fill_phenoptr = scale_fill_manual(values=phenoptr_colors)
scale_x_expand = scale_x_discrete(expand=c(0, 1, 0, 1)) # expand_scale(add=1)
base_line = geom_hline(yintercept=0, color='grey50')

theme_phenoptr = theme_minimal() +
  theme(strip.text.y=element_text(face='bold', size=12),
        strip.text.x=element_text(face='bold', angle=90),
        #strip.background.y=element_rect(color='grey90', fill='white', linetype=1),
        strip.background.y=element_blank(),
        axis.ticks=element_blank(),
        axis.text.x=element_blank(),
        axis.text.y=element_text(size=8),
        axis.title=element_text(face='bold'),
        panel.grid.major=element_blank(),
        panel.grid.minor=element_blank(),
        panel.spacing.x = unit(0, "null"), # Take out horizontal space 
        legend.key.size = unit(12, 'points'),
        legend.title=element_text(face='bold'),
        legend.justification = "top"
  )

# If d has more than max_slides_per_plot slides, 
# split it into pieces no bigger than max_per_plot.
# If `pack` is FALSE, try to make the pieces the same size.
# If `pack` is TRUE, make all pieces except the last contain `max_per_plot`
# items.
split_slides = function(d, max_per_plot=max_slides_per_plot, pack=FALSE) {
  slides = unique(d[[.by_str]])
  n_slides = length(slides)
  if (n_slides <= max_per_plot)
    return(list(d))

  n_groups = ceiling(n_slides/max_per_plot)
  group_size = ifelse(pack, max_per_plot, ceiling(n_slides/n_groups))
  groups = rep(1:n_groups, each=group_size)
  slide_groups = split(slides, groups[1:n_slides])
  map(slide_groups, ~(d %>% filter(!!.by %in% .x)))
}

# Create a tissue category factor which has Tumor first and Total / All last
order_tissue_categories = function(cats) {
  cats = factor(cats)
  if ('Tumor' %in% cats)
    cats = forcats::fct_relevel(cats, 'Tumor')
  if ('Total' %in% cats)
    cats = forcats::fct_relevel(cats, 'Total', after=Inf)
  if ('All' %in% cats)
    cats = forcats::fct_relevel(cats, 'All', after=Inf)
  cats
}

if ('Cell Counts' %in% sheet_names) {
  counts = read_xlsx(workbook_path, 'Cell Counts', skip=1)
  tall = counts %>%
    select(-starts_with('TMA ')) %>% 
    gather('Phenotype', 'Cell Count', -(!!.by), -`Tissue Category`) %>%
    filter(!Phenotype %in% c('Other')) %>% 
    mutate(`Tissue Category` = order_tissue_categories(`Tissue Category`))

  # If there were phenotypes defined, exclude 'Total/All Cells' from the chart(s)
  if (dplyr::n_distinct(tall$Phenotype) > 1)
    tall = tall %>% filter(!Phenotype %in% c('Total Cells', 'All Cells'))

  # If any count > 2000 we will make two sets of plots
  plot_limits = if (max(tall$`Cell Count`, na.rm=TRUE) > 2000) c(NA, 1000) else NA

  tall = tall %>% split_slides

  for (plot_limit in plot_limits) {
    walk2(seq_along(tall), tall, function(i, d) {
      limit_text = if(is.na(plot_limit)) '' else ' (limited range)'
      if (length(tall) > 1) {
        cat(stringr::str_glue('## Phenotype cell counts per tissue category and {.by_str}{limit_text} ({i} of {length(tall)})\n\n'))
      } else {
        cat(stringr::str_glue('## Phenotype cell counts per tissue category and {.by_str}{limit_text}\n\n'))
      }
      p = ggplot(d, aes(Phenotype, `Cell Count`, fill=Phenotype)) +
        base_line +
        geom_col(na.rm=TRUE, position='identity') +
        facet_grid(vars(`Tissue Category`), vars(!!.by), switch='x') +
        scale_fill_phenoptr + scale_x_expand +
        labs(x=.by_str, y='Cell Count') +
        theme_phenoptr

      if (!is.na(plot_limit))
        p = p + coord_cartesian(ylim=c(0, plot_limit))

      print(p)
      cat('\n\n')
    })
  }
}

if ('Cell Densities' %in% sheet_names) {
  density = read_xlsx(workbook_path, 'Cell Densities', skip=1)

  # Reshape for plotting
  tall = density %>% 
    select(-starts_with('TMA ')) %>% 
    gather('Phenotype', 'Density', -(!!.by), -`Tissue Category`) %>%
    filter(!Phenotype %in% c('Tissue Area (mm2)', 'Other')) %>% 
    mutate(`Tissue Category` = order_tissue_categories(`Tissue Category`)) 

  # If there were phenotypes defined, exclude 'Total/All Cells' from the chart(s)
  if (dplyr::n_distinct(tall$Phenotype) > 1)
    tall = tall %>% filter(!Phenotype %in% c('Total Cells', 'All Cells'))

  # If any Density > 2000 we will make two sets of plots
  plot_limits = if (max(tall$Density, na.rm=TRUE) > 2000) c(NA, 1000) else NA

  tall = tall %>% split_slides

  for (plot_limit in plot_limits) {
    walk2(seq_along(tall), tall, function(i, d) {
        limit_text = if(is.na(plot_limit)) '' else ' (limited range)'
      if (length(tall) > 1) {
        cat(stringr::str_glue('## Phenotype cell density per tissue category and {.by_str}{limit_text} ({i} of {length(tall)})\n\n'))
      } else {
        cat(stringr::str_glue('## Phenotype cell density per tissue category and {.by_str}{limit_text}\n\n'))
      }
      p = ggplot(d,
                 aes(Phenotype, Density, fill=Phenotype)) +
        base_line +
        geom_col(na.rm=TRUE, position='identity') +
        facet_grid(vars(`Tissue Category`), vars(!!.by), switch='x') +
        scale_fill_phenoptr + scale_x_expand +
        labs(x=.by_str, y=expression(bold(paste('Cell Density (', cells/mm^2, ')')))) +
        theme_phenoptr

      if (!is.na(plot_limit))
        p = p + coord_cartesian(ylim=c(0, plot_limit))

            print(p)
      cat('\n\n')
    })
  }
}

if ('Mean Expression' %in% sheet_names) {
  expression = read_xlsx(workbook_path, 'Mean Expression', skip=1)

  # Reshape for plotting
  tall = expression %>% 
    select(-starts_with('TMA ')) %>% 
    gather('Measure', 'Mean', -(!!.by), -`Tissue Category`) %>%
    mutate(`Tissue Category` = order_tissue_categories(`Tissue Category`)) %>%
    extract(Measure, into=c('Phenotype', 'Measure'), 
            regex='(Total Cells|All Cells|[^ ]+) (.*)') %>% 
    mutate(Measure = stringr::str_replace(Measure, '([^(]*) (\\(.*\\) )?Mean', 'Mean \\1 Expression'))

  # We may have multiple measures, first split on Measure
  tall_by_measure = split(tall, tall$Measure)

  walk(tall_by_measure, function(tall_one_measure) {
    # Now split to fit slides per page
    tall_one_measure = split_slides(tall_one_measure)

    walk2(seq_along(tall_one_measure), tall_one_measure, function(i, d) {
      if (length(tall_one_measure) > 1) {
        cat(stringr::str_glue('## {d$Measure[1]} ({i} of {length(tall_one_measure)})\n\n'))
      } else {
        cat(stringr::str_glue('## {d$Measure[1]}\n\n'))
      }
      p = ggplot(d, aes(Phenotype, Mean, fill=Phenotype)) +
        base_line +
        geom_col(na.rm=TRUE, position='identity') +
        facet_grid(vars(`Tissue Category`), vars(!!.by), switch='x') +
        scale_fill_phenoptr + scale_x_expand +
        labs(x=.by_str, y=d$Measure[1]) +
        theme_phenoptr

      print(p)
      cat('\n\n')
    })
  })
}

# There may be multiple H-Score sheets
h_score_names = sheet_names %>% purrr::keep(~startsWith(.x, 'H-Score'))
for (sheet_name in h_score_names) {
  if ('.name_repair' %in% names(formals(read_xlsx))) {
    # readxl version 1.2.0 adds a .name_repair option. We don't want name repair
    # and have to turn it off if present. We will get duplicate column names
    # that we handle by selecting by index.
    h_score = read_xlsx(workbook_path, sheet_name, skip=2, .name_repair='minimal')
  } else {
    # Older readxl appends '__1' to column names
    h_score = read_xlsx(workbook_path, sheet_name, skip=2)%>%
      rename_all(~stringr::str_remove(.x, '__1'))
  }

  # Select needed columns and order tissue category
  # `dplyr::select` requires unique names, use base subsetting
  tma_cols = startsWith(names(h_score), 'TMA ')
  h_score = h_score[, !tma_cols]
  h_score = h_score[, c(1:2, 7:11)] %>% 
    mutate(`Tissue Category` = order_tissue_categories(`Tissue Category`))

  # Find the marker from the first row, if present
  h_score_title = read_xlsx(workbook_path, sheet_name, range='C1', 
                            col_names='title') %>% 
    unlist()
  if (stringr::str_detect(h_score_title, 'H-Score,')) {
    safe_name = stringr::str_remove(h_score_title, "H-Score,") %>% 
      phenoptrReports:::escape_markdown()
    title = stringr::str_glue('Cumulative Percent of Cells in {safe_name} Bins')
  } else title = 'Cumulative Percent of Cells in Expression Bins'

  # Clean up duplicate names and reshape for plotting
  tall = h_score %>%
    select(-`H-Score`) %>% 
    gather('Bin', 'Percent',  -(!!.by), -`Tissue Category`)

  # Cumulative percent plots as one bar per slide so we don't have to split_slides
  cat('##', title, '\n\n')
  p = ggplot(tall,
         aes(!!.by, Percent, fill=Bin)) +
    geom_col(position=position_stack(reverse=TRUE), na.rm=TRUE) +
    facet_grid(vars(`Tissue Category`), switch='x') +
    scale_fill_manual(values=phenoptr_colors[c(1, 2, 4, 3)]) + 
    scale_y_continuous(labels=scales::percent) +
    labs(x='', y='Percent Total Cells per Bin') +
    theme_phenoptr +
    theme(axis.text.x=element_text(face='bold', angle=90))

    print(p)
    cat('\n\n')

  # H-Score is also one plot
  cat('##', h_score_title, '\n\n')
  p = ggplot(h_score, aes(!!.by, `H-Score`)) +
    geom_col(fill=phenoptr_colors[1]) +
    facet_grid(vars(`Tissue Category`), switch='x') +
    labs(x='', y='H-Score') +
    theme_phenoptr +
    theme(axis.text.x=element_text(face='bold', angle=90))

    print(p)
    cat('\n\n')
}

# This makes labels for heatmaps that split the names into two rows
# so they are readable. The regex splits between _ and [ while 
# preserving both.
heatmap_labeller = function(d) {
  stringr::str_split(d[[1]], '(?<=_)(?=\\[)') %>% purrr::transpose()
}

if ('Nearest Neighbors' %in% sheet_names) {
  # Make heat maps of the median distance between phenotypes,
  # one heat map per slide and tissue category.
  neighbors = read_xlsx(workbook_path, 'Nearest Neighbors', skip=1) %>% 
    select(-starts_with('TMA '))

  # Filter out Total Cells, they are not interesting and add clutter
  neighbors = neighbors %>% 
    filter(From != 'Total Cells', To != 'Total Cells')

  # Find overall max so we can use the same fill scale on every plot
  max_median = max(neighbors$Median, na.rm=TRUE)

  # Split to multiple pages
  categories = levels(order_tissue_categories(neighbors$`Tissue Category`))
  neighbors = split_slides(neighbors, max_heatmaps_per_plot, pack=TRUE)
  walk(categories, function(category) {
    iwalk(neighbors, function(d, i) {
      if (length(neighbors) > 1) {
        cat(stringr::str_glue(
          '## Median Distance to Nearest Neighbor in {category} Tissue ',
          '({i} of {length(neighbors)})\n\n'))
      } else {
        cat(stringr::str_glue('## Median Distance to Nearest Neighbor in {category} Tissue\n\n'))
      }
      d = d %>% 
        filter(`Tissue Category`==category) %>% 
        mutate(From=factor(From, levels=rev(sort(unique(From)))))
      p = ggplot(d, aes(To, From, fill=Median)) +
        geom_raster(na.rm=TRUE) +
        facet_wrap(vars(!!.by), nrow=2, labeller=heatmap_labeller) +
        scale_fill_gradientn('Median\nDistance', na.value='grey90', 
                             limits=c(0, max_median),
                             colors=RColorBrewer::brewer.pal(9, 'RdYlBu')) +
        theme_phenoptr +
        # Overrides for theme_phenoptr elements
        theme(strip.text.x=element_text(face='plain', angle=0,
                                        margin=margin(b=1)),
              axis.text.x=element_text(size=8, angle=90, hjust=1)) +
        coord_equal()


      print(p)
      cat('\n\n')
    })
  })
}

if ('Count Within' %in% sheet_names) {
  # Make heat maps of the "From with" count for every combination of
  # slide, radius and tissue category.
  counts = read_xlsx(workbook_path, 'Count Within', skip=1) %>% 
    select(-starts_with('TMA '))

  # Filter out Total Cells, they are not interesting and add clutter,
  # and From == To rows, they can have much higher counts that other
  # combinations which make it hard to interpret the rest of the values.
  counts = counts %>% filter(From != 'Total Cells', To != 'Total Cells')
  counts$`From with`[counts$From==counts$To] = NA

  # We will group the plots by radius and tissue category,
  # then split by pages.
  radii = sort(unique(counts$Radius))
  categories = levels(order_tissue_categories(counts$`Tissue Category`))

  # Find max count per radius and category so we can use the same 
  # fill scale on every page within each group.
  max_counts = counts %>% group_by(Radius, `Tissue Category`) %>% 
    summarize(max=max(`From with`, na.rm=TRUE))

  # Split to multiple pages
  split_counts = split_slides(counts, max_heatmaps_per_plot, pack=TRUE)

  walk(radii, function(radius) {
    walk(categories, function(category) {
      iwalk(split_counts, function(d, i) {
        d = d %>% filter(`Tissue Category`==category, radius==Radius) %>% 
              mutate(From=factor(From, levels=rev(sort(unique(From)))))

        if (length(split_counts) > 1) {
          cat(stringr::str_glue(
            '## Count of "From" cells in {category} Tissue with a "To" cell ',
            'within {radius} microns ({i} of {length(split_counts)})\n\n'))
        } else {
          cat(stringr::str_glue(
            '## Count of "From" cells in {category} Tissue with a "To" cell ',
            'within {radius} microns\n\n'))
        }

        # Find the max count for this group
        max_count = max_counts %>% 
          filter(`Tissue Category`==category, radius==Radius) %>% pluck('max')

        p = ggplot(d, aes(To, From, fill=`From with`)) +
          geom_raster(na.rm=TRUE) +
          facet_wrap(vars(!!.by), nrow=2, labeller=heatmap_labeller) +
          scale_fill_gradientn('Cell\nCount', na.value='grey90', 
                               limits=c(0, max_count),
                               colors=rev(RColorBrewer::brewer.pal(9, 'RdYlBu'))) +
          theme_phenoptr +
          # Overrides for theme_phenoptr elements that don't apply here
          theme(strip.text.x=element_text(face='plain', angle=0,
                                      margin=margin(b=1)),
                axis.text.x=element_text(size=8, angle=90, hjust=1)) +
          coord_equal()

        print(p)
        cat('\n\n')
      })
    })
  })
}