suppressPackageStartupMessages(library(dplyr)) library(ggplot2) suppressPackageStartupMessages(library(purrr)) library(readxl) suppressPackageStartupMessages(library(tidyr)) knitr::opts_chunk$set(echo=FALSE,fig.width=10, fig.height=5.5, comment=NA, warning=FALSE, message=FALSE) workbook_path = params$workbook_path max_slides_per_plot = params$max_slides_per_plot max_heatmaps_per_plot = params$max_heatmaps_per_plot .by_str = params$.by .by = rlang::sym(.by_str) # Note: Using readxl rather than openxlsx for reading because # the check.names parameter of openxlsx::readWorkbook is kind of broken. # https://github.com/awalker89/openxlsx/issues/102 sheet_names = readxl::excel_sheets(workbook_path) # Boilerplate for ggplot theming scale_fill_phenoptr = scale_fill_manual(values=phenoptr_colors) scale_x_expand = scale_x_discrete(expand=c(0, 1, 0, 1)) # expand_scale(add=1) base_line = geom_hline(yintercept=0, color='grey50') theme_phenoptr = theme_minimal() + theme(strip.text.y=element_text(face='bold', size=12), strip.text.x=element_text(face='bold', angle=90), #strip.background.y=element_rect(color='grey90', fill='white', linetype=1), strip.background.y=element_blank(), axis.ticks=element_blank(), axis.text.x=element_blank(), axis.text.y=element_text(size=8), axis.title=element_text(face='bold'), panel.grid.major=element_blank(), panel.grid.minor=element_blank(), panel.spacing.x = unit(0, "null"), # Take out horizontal space legend.key.size = unit(12, 'points'), legend.title=element_text(face='bold'), legend.justification = "top" ) # If d has more than max_slides_per_plot slides, # split it into pieces no bigger than max_per_plot. # If `pack` is FALSE, try to make the pieces the same size. # If `pack` is TRUE, make all pieces except the last contain `max_per_plot` # items. split_slides = function(d, max_per_plot=max_slides_per_plot, pack=FALSE) { slides = unique(d[[.by_str]]) n_slides = length(slides) if (n_slides <= max_per_plot) return(list(d)) n_groups = ceiling(n_slides/max_per_plot) group_size = ifelse(pack, max_per_plot, ceiling(n_slides/n_groups)) groups = rep(1:n_groups, each=group_size) slide_groups = split(slides, groups[1:n_slides]) map(slide_groups, ~(d %>% filter(!!.by %in% .x))) } # Create a tissue category factor which has Tumor first and Total / All last order_tissue_categories = function(cats) { cats = factor(cats) if ('Tumor' %in% cats) cats = forcats::fct_relevel(cats, 'Tumor') if ('Total' %in% cats) cats = forcats::fct_relevel(cats, 'Total', after=Inf) if ('All' %in% cats) cats = forcats::fct_relevel(cats, 'All', after=Inf) cats }
if ('Cell Counts' %in% sheet_names) { counts = read_xlsx(workbook_path, 'Cell Counts', skip=1) tall = counts %>% select(-starts_with('TMA ')) %>% gather('Phenotype', 'Cell Count', -(!!.by), -`Tissue Category`) %>% filter(!Phenotype %in% c('Other')) %>% mutate(`Tissue Category` = order_tissue_categories(`Tissue Category`)) # If there were phenotypes defined, exclude 'Total/All Cells' from the chart(s) if (dplyr::n_distinct(tall$Phenotype) > 1) tall = tall %>% filter(!Phenotype %in% c('Total Cells', 'All Cells')) # If any count > 2000 we will make two sets of plots plot_limits = if (max(tall$`Cell Count`, na.rm=TRUE) > 2000) c(NA, 1000) else NA tall = tall %>% split_slides for (plot_limit in plot_limits) { walk2(seq_along(tall), tall, function(i, d) { limit_text = if(is.na(plot_limit)) '' else ' (limited range)' if (length(tall) > 1) { cat(stringr::str_glue('## Phenotype cell counts per tissue category and {.by_str}{limit_text} ({i} of {length(tall)})\n\n')) } else { cat(stringr::str_glue('## Phenotype cell counts per tissue category and {.by_str}{limit_text}\n\n')) } p = ggplot(d, aes(Phenotype, `Cell Count`, fill=Phenotype)) + base_line + geom_col(na.rm=TRUE, position='identity') + facet_grid(vars(`Tissue Category`), vars(!!.by), switch='x') + scale_fill_phenoptr + scale_x_expand + labs(x=.by_str, y='Cell Count') + theme_phenoptr if (!is.na(plot_limit)) p = p + coord_cartesian(ylim=c(0, plot_limit)) print(p) cat('\n\n') }) } }
if ('Cell Densities' %in% sheet_names) { density = read_xlsx(workbook_path, 'Cell Densities', skip=1) # Reshape for plotting tall = density %>% select(-starts_with('TMA ')) %>% gather('Phenotype', 'Density', -(!!.by), -`Tissue Category`) %>% filter(!Phenotype %in% c('Tissue Area (mm2)', 'Other')) %>% mutate(`Tissue Category` = order_tissue_categories(`Tissue Category`)) # If there were phenotypes defined, exclude 'Total/All Cells' from the chart(s) if (dplyr::n_distinct(tall$Phenotype) > 1) tall = tall %>% filter(!Phenotype %in% c('Total Cells', 'All Cells')) # If any Density > 2000 we will make two sets of plots plot_limits = if (max(tall$Density, na.rm=TRUE) > 2000) c(NA, 1000) else NA tall = tall %>% split_slides for (plot_limit in plot_limits) { walk2(seq_along(tall), tall, function(i, d) { limit_text = if(is.na(plot_limit)) '' else ' (limited range)' if (length(tall) > 1) { cat(stringr::str_glue('## Phenotype cell density per tissue category and {.by_str}{limit_text} ({i} of {length(tall)})\n\n')) } else { cat(stringr::str_glue('## Phenotype cell density per tissue category and {.by_str}{limit_text}\n\n')) } p = ggplot(d, aes(Phenotype, Density, fill=Phenotype)) + base_line + geom_col(na.rm=TRUE, position='identity') + facet_grid(vars(`Tissue Category`), vars(!!.by), switch='x') + scale_fill_phenoptr + scale_x_expand + labs(x=.by_str, y=expression(bold(paste('Cell Density (', cells/mm^2, ')')))) + theme_phenoptr if (!is.na(plot_limit)) p = p + coord_cartesian(ylim=c(0, plot_limit)) print(p) cat('\n\n') }) } }
if ('Mean Expression' %in% sheet_names) { expression = read_xlsx(workbook_path, 'Mean Expression', skip=1) # Reshape for plotting tall = expression %>% select(-starts_with('TMA ')) %>% gather('Measure', 'Mean', -(!!.by), -`Tissue Category`) %>% mutate(`Tissue Category` = order_tissue_categories(`Tissue Category`)) %>% extract(Measure, into=c('Phenotype', 'Measure'), regex='(Total Cells|All Cells|[^ ]+) (.*)') %>% mutate(Measure = stringr::str_replace(Measure, '([^(]*) (\\(.*\\) )?Mean', 'Mean \\1 Expression')) # We may have multiple measures, first split on Measure tall_by_measure = split(tall, tall$Measure) walk(tall_by_measure, function(tall_one_measure) { # Now split to fit slides per page tall_one_measure = split_slides(tall_one_measure) walk2(seq_along(tall_one_measure), tall_one_measure, function(i, d) { if (length(tall_one_measure) > 1) { cat(stringr::str_glue('## {d$Measure[1]} ({i} of {length(tall_one_measure)})\n\n')) } else { cat(stringr::str_glue('## {d$Measure[1]}\n\n')) } p = ggplot(d, aes(Phenotype, Mean, fill=Phenotype)) + base_line + geom_col(na.rm=TRUE, position='identity') + facet_grid(vars(`Tissue Category`), vars(!!.by), switch='x') + scale_fill_phenoptr + scale_x_expand + labs(x=.by_str, y=d$Measure[1]) + theme_phenoptr print(p) cat('\n\n') }) }) }
# There may be multiple H-Score sheets h_score_names = sheet_names %>% purrr::keep(~startsWith(.x, 'H-Score')) for (sheet_name in h_score_names) { if ('.name_repair' %in% names(formals(read_xlsx))) { # readxl version 1.2.0 adds a .name_repair option. We don't want name repair # and have to turn it off if present. We will get duplicate column names # that we handle by selecting by index. h_score = read_xlsx(workbook_path, sheet_name, skip=2, .name_repair='minimal') } else { # Older readxl appends '__1' to column names h_score = read_xlsx(workbook_path, sheet_name, skip=2)%>% rename_all(~stringr::str_remove(.x, '__1')) } # Select needed columns and order tissue category # `dplyr::select` requires unique names, use base subsetting tma_cols = startsWith(names(h_score), 'TMA ') h_score = h_score[, !tma_cols] h_score = h_score[, c(1:2, 7:11)] %>% mutate(`Tissue Category` = order_tissue_categories(`Tissue Category`)) # Find the marker from the first row, if present h_score_title = read_xlsx(workbook_path, sheet_name, range='C1', col_names='title') %>% unlist() if (stringr::str_detect(h_score_title, 'H-Score,')) { safe_name = stringr::str_remove(h_score_title, "H-Score,") %>% phenoptrReports:::escape_markdown() title = stringr::str_glue('Cumulative Percent of Cells in {safe_name} Bins') } else title = 'Cumulative Percent of Cells in Expression Bins' # Clean up duplicate names and reshape for plotting tall = h_score %>% select(-`H-Score`) %>% gather('Bin', 'Percent', -(!!.by), -`Tissue Category`) # Cumulative percent plots as one bar per slide so we don't have to split_slides cat('##', title, '\n\n') p = ggplot(tall, aes(!!.by, Percent, fill=Bin)) + geom_col(position=position_stack(reverse=TRUE), na.rm=TRUE) + facet_grid(vars(`Tissue Category`), switch='x') + scale_fill_manual(values=phenoptr_colors[c(1, 2, 4, 3)]) + scale_y_continuous(labels=scales::percent) + labs(x='', y='Percent Total Cells per Bin') + theme_phenoptr + theme(axis.text.x=element_text(face='bold', angle=90)) print(p) cat('\n\n') # H-Score is also one plot cat('##', h_score_title, '\n\n') p = ggplot(h_score, aes(!!.by, `H-Score`)) + geom_col(fill=phenoptr_colors[1]) + facet_grid(vars(`Tissue Category`), switch='x') + labs(x='', y='H-Score') + theme_phenoptr + theme(axis.text.x=element_text(face='bold', angle=90)) print(p) cat('\n\n') }
# This makes labels for heatmaps that split the names into two rows # so they are readable. The regex splits between _ and [ while # preserving both. heatmap_labeller = function(d) { stringr::str_split(d[[1]], '(?<=_)(?=\\[)') %>% purrr::transpose() } if ('Nearest Neighbors' %in% sheet_names) { # Make heat maps of the median distance between phenotypes, # one heat map per slide and tissue category. neighbors = read_xlsx(workbook_path, 'Nearest Neighbors', skip=1) %>% select(-starts_with('TMA ')) # Filter out Total Cells, they are not interesting and add clutter neighbors = neighbors %>% filter(From != 'Total Cells', To != 'Total Cells') # Find overall max so we can use the same fill scale on every plot max_median = max(neighbors$Median, na.rm=TRUE) # Split to multiple pages categories = levels(order_tissue_categories(neighbors$`Tissue Category`)) neighbors = split_slides(neighbors, max_heatmaps_per_plot, pack=TRUE) walk(categories, function(category) { iwalk(neighbors, function(d, i) { if (length(neighbors) > 1) { cat(stringr::str_glue( '## Median Distance to Nearest Neighbor in {category} Tissue ', '({i} of {length(neighbors)})\n\n')) } else { cat(stringr::str_glue('## Median Distance to Nearest Neighbor in {category} Tissue\n\n')) } d = d %>% filter(`Tissue Category`==category) %>% mutate(From=factor(From, levels=rev(sort(unique(From))))) p = ggplot(d, aes(To, From, fill=Median)) + geom_raster(na.rm=TRUE) + facet_wrap(vars(!!.by), nrow=2, labeller=heatmap_labeller) + scale_fill_gradientn('Median\nDistance', na.value='grey90', limits=c(0, max_median), colors=RColorBrewer::brewer.pal(9, 'RdYlBu')) + theme_phenoptr + # Overrides for theme_phenoptr elements theme(strip.text.x=element_text(face='plain', angle=0, margin=margin(b=1)), axis.text.x=element_text(size=8, angle=90, hjust=1)) + coord_equal() print(p) cat('\n\n') }) }) }
if ('Count Within' %in% sheet_names) { # Make heat maps of the "From with" count for every combination of # slide, radius and tissue category. counts = read_xlsx(workbook_path, 'Count Within', skip=1) %>% select(-starts_with('TMA ')) # Filter out Total Cells, they are not interesting and add clutter, # and From == To rows, they can have much higher counts that other # combinations which make it hard to interpret the rest of the values. counts = counts %>% filter(From != 'Total Cells', To != 'Total Cells') counts$`From with`[counts$From==counts$To] = NA # We will group the plots by radius and tissue category, # then split by pages. radii = sort(unique(counts$Radius)) categories = levels(order_tissue_categories(counts$`Tissue Category`)) # Find max count per radius and category so we can use the same # fill scale on every page within each group. max_counts = counts %>% group_by(Radius, `Tissue Category`) %>% summarize(max=max(`From with`, na.rm=TRUE)) # Split to multiple pages split_counts = split_slides(counts, max_heatmaps_per_plot, pack=TRUE) walk(radii, function(radius) { walk(categories, function(category) { iwalk(split_counts, function(d, i) { d = d %>% filter(`Tissue Category`==category, radius==Radius) %>% mutate(From=factor(From, levels=rev(sort(unique(From))))) if (length(split_counts) > 1) { cat(stringr::str_glue( '## Count of "From" cells in {category} Tissue with a "To" cell ', 'within {radius} microns ({i} of {length(split_counts)})\n\n')) } else { cat(stringr::str_glue( '## Count of "From" cells in {category} Tissue with a "To" cell ', 'within {radius} microns\n\n')) } # Find the max count for this group max_count = max_counts %>% filter(`Tissue Category`==category, radius==Radius) %>% pluck('max') p = ggplot(d, aes(To, From, fill=`From with`)) + geom_raster(na.rm=TRUE) + facet_wrap(vars(!!.by), nrow=2, labeller=heatmap_labeller) + scale_fill_gradientn('Cell\nCount', na.value='grey90', limits=c(0, max_count), colors=rev(RColorBrewer::brewer.pal(9, 'RdYlBu'))) + theme_phenoptr + # Overrides for theme_phenoptr elements that don't apply here theme(strip.text.x=element_text(face='plain', angle=0, margin=margin(b=1)), axis.text.x=element_text(size=8, angle=90, hjust=1)) + coord_equal() print(p) cat('\n\n') }) }) }) }
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.