In broadinstitute/cdsr_biomarker: R toolkit for biomarker analysis

title: r params$title date: "r format(Sys.time(), '%d %B %Y')"

library(taigr)
library(htmltools)
library(DT)
library(plotly)
library(crosstalk)
library(here)
library(cowplot)
library(tidyverse)
library(heatmaply)

knitr::opts_chunk$set(echo = FALSE, warning = FALSE, message = FALSE, fig.width = 7, fig.height = 7, cache = F)
htmltools::img(src = knitr::image_uri(system.file("reports","CDS_logo.png",package = "cdsrbiomarker")), 
               alt = 'logo', 
               width="150px", height="170px",
               style = 'position:absolute; top:10px; left:75%')

bscols(plot_ly(),datatable(tibble(),style="bootstrap"))

if (is.null(params$in_path)) {stop("You must specify the results directory")}
df <- read_csv(str_c(params$in_path,"/data.csv"))
y <- as.matrix(df[,-1])
rownames(y) <- df[[1]]
disc_table <- read_csv(str_c(params$in_path,"/discrete_table.csv"))
lin_table <- read_csv(str_c(params$in_path,"/lin_association_table.csv"))
rf_table <- read_csv(str_c(params$in_path,"/rf_table.csv"))

if(disc_table$pert %>% unique() %>% length() < 2){stop("Biomarker results tables must contain more than one perturbation. Try single_profile_biomarker_report instead.")}

This report presents biomarker analysis results for two or more response profiles.
To navigate through this report click on the blue tabs under each heading.

Response profiles {.tabset}

These tabs provide information about the response profiles included in this report.

Info

if(file.exists(str_c(params$in_path,"/meta_data.csv"))){
  meta_data <- read_csv(str_c(params$in_path,"/meta_data.csv"))
  datatable(meta_data, style="bootstrap", width="100%",options = list(lengthChange = FALSE))
}

Correlation

cor <- cor(y, use = "complete.obs")
diag(cor) <- NA
p <- plot_ly(
    x = colnames(cor), y = rownames(cor),
    z = cor, type = "heatmap"
)
p %>% colorbar(title = "Correlation") %>% layout(xaxis = list(tickangle = 90)) %>% config(displayModeBar = F)

Discrete test {.tabset}

These tabs presents results for discrete features. The effect size is the difference between each feature and all others and the q-value is a t-test corrected for multiple hypotheses testing.

for (cur_feature_type in disc_table$feature_type %>% unique()) {
  cat(sprintf('\n\n## %s {.tabset .tabset-pills}\n\n', cur_feature_type))
  for (cur_pert in disc_table$pert %>% unique()) {
    cat(sprintf('\n\n### %s \n\n', cur_pert))
    cat('\n\n You can scroll through the table to select features to 
      highlight in the volcano plot or select points in the volcano plot to view 
      them in the table.\n\n')

    df <- disc_table %>% 
      dplyr::filter(feature_type == cur_feature_type & pert == cur_pert & q.value < .1) %>%
      arrange(q.value) %>% head(2000) %>% 
      transmute(feature = feature,`effect size` = round(effect_size,4),
                `-log10(q-value)` = round(-log10(q.value),4),
                color = q.value < .01)

    colors <- df %>% arrange(color) %>% 
      mutate(color = ifelse(color,"darkorange","darkgrey")) %>% .[["color"]] %>% unique()

    sd <- highlight_key(select(df,-color),~feature)

    updatemenus <- list(list(showactive = FALSE,type = 'buttons',x = 1,y = 1.15,
                             buttons = list(
      list(label = "Reset",method = "relayout",args = list(list(shapes = c()))))))

    p <- plot_ly(type = "scatter", data = sd, x = ~`effect size`, y = ~`-log10(q-value)`,
               text = ~feature, hoverinfo = "text", color = df$color, 
               colors = colors,
               marker = list(line = list(width = 0))) %>% 
      layout(xaxis = list(title = "effect size"), yaxis = list(title = "-log10(q-value)"),
             showlegend = FALSE,updatemenus = updatemenus) %>% config(displayModeBar = F)

     p2 <-bscols(
      ggplotly(p) %>%
        highlight(color = rgb(0,0.4588,0.6901),on = "plotly_click",
                  off = "plotly_relayout",opacityDim = 1),
      datatable(sd, style="bootstrap", width="100%",
                options = list(lengthChange = FALSE,scrollY = "300px",paging = FALSE)),
      widths = c(6,6))

     c('\n\n')
     cat(htmltools::knit_print.shiny.tag(p2))
     c('\n\n')
  }
  cat('\n\n### shared \n\n')
  cat('\n\n To generate this plot biomarkers for each response profile are ranked by q-value. 
      The 50 biomarkers with the lowest average rank across the response profiles are shown. \n\n')
  ranks <- disc_table %>% filter(feature_type == cur_feature_type) %>% dplyr::group_by(pert) %>% 
    dplyr::mutate(rank = dense_rank(q.value)) %>% select(feature,rank,pert) %>% 
    spread(key = "pert",value = "rank") %>% 
    tibble::column_to_rownames(var = "feature") %>% as.matrix()
  ranks[is.na(ranks)] <- max(ranks, na.rm = T)
  top_shared_biomarkers <- ranks %>% rowMeans() %>% sort() %>% head(50) %>% names()
  fig <- heatmaply(t(ranks[top_shared_biomarkers,]),dendrogram = "none",colors = viridis(n = 256, direction = -1),
                   column_text_angle = 90,na.rm = FALSE,key.title = "rank") %>% config(displayModeBar = F)
  c('\n\n')
  cat(htmltools::knit_print.shiny.tag(fig))
  c('\n\n')
}

Linear association {.tabset}

These tabs presents results for continuous features. The effect size and q-value are calculated using linear modeling approach. Specifically, the effect size is the mean moderated effect size based on adaptive shrinkage.

for (cur_feature_type in lin_table$feature_type %>% unique()) {
  cat(sprintf('\n\n## %s {.tabset .tabset-pills}\n\n', cur_feature_type))
  for (cur_pert in lin_table$pert %>% unique()) {
    cat(sprintf('\n\n### %s \n\n', cur_pert))
    cat('\n\n You can scroll through the table to select features to 
      highlight in the volcano plot or select points in the volcano plot to view 
      them in the table.\n\n')

    df <- lin_table %>% 
      dplyr::filter(feature_type == cur_feature_type & pert == cur_pert & qvalue < .1) %>%
      arrange(qvalue) %>% head(2000) %>% 
      transmute(feature = str_sub(ind.var,0,20),`effect size` = round(PosteriorMean,4),
                `-log10(q-value)` = round(-log10(qvalue),4),
                color = qvalue < .01)

    sd <- highlight_key(select(df,-color),~feature)

    updatemenus <- list(list(showactive = FALSE,type = 'buttons',x = 1,y = 1.15,
                             buttons = list(
      list(label = "Reset",method = "relayout",args = list(list(shapes = c()))))))

    colors <- df %>% arrange(color) %>% 
      mutate(color = ifelse(color,"darkorange","darkgrey")) %>% .[["color"]] %>% unique()

    p <- plot_ly(type = "scatter", data = sd, x = ~`effect size`, y = ~`-log10(q-value)`,
               text = ~feature, hoverinfo = "text", color = df$color, 
               colors = colors,
               marker = list(line = list(width = 0))) %>% 
      layout(xaxis = list(title = "effect size"), yaxis = list(title = "-log10(q-value)"),
             showlegend = FALSE,updatemenus = updatemenus) %>% config(displayModeBar = F)

     p2 <-bscols(
      ggplotly(p) %>%
        highlight(color = rgb(0,0.4588,0.6901),on = "plotly_click",
                  off = "plotly_relayout",opacityDim = 1),
      datatable(sd, style="bootstrap", width="100%",
                options = list(lengthChange = FALSE,scrollY = "300px",paging = FALSE)),
      widths = c(6,6))

     c('\n\n')
     cat(htmltools::knit_print.shiny.tag(p2))
     c('\n\n')
  }
  cat('\n\n### shared \n\n')
  cat('\n\n To generate this plot biomarkers for each response profile are ranked by q-value. 
      The 50 biomarkers with the lowest average rank across the response profiles are shown. \n\n')
  ranks <- lin_table %>% filter(feature_type == cur_feature_type) %>% dplyr::group_by(pert) %>% 
    dplyr::mutate(rank = rank(qvalue)) %>% select(ind.var,rank,pert) %>% 
    spread(key = "pert",value = "rank") %>% 
    tibble::column_to_rownames(var = "ind.var") %>% as.matrix()
  ranks[is.na(ranks)] <- max(ranks, na.rm = T)
  ranks[ranks > 500] <- 500
  top_shared_biomarkers <- ranks %>% rowMeans() %>% sort() %>% head(50) %>% names()
  fig <- heatmaply(t(ranks[top_shared_biomarkers,]),dendrogram = "none",colors = viridis(n = 256, direction = -1),
                   column_text_angle = 90,na.rm = FALSE,key.title = "rank") %>% config(displayModeBar = F)
  c('\n\n')
  cat(htmltools::knit_print.shiny.tag(fig))
  c('\n\n')
}

Random forest {.tabset}

These tabs presents random forest results for different feature sets. Feature importance is the importance of the feature in the random forstest and stability is the stability of that feature are folds in 10-fold cross validation.

cat('\n\n## model accuracy\n\n')
cat('\n\n This plot shows the pearson correlation between the random forest predictions 
    and the measured values.\n\n')

rf_table %>% group_by(pert,feature_set) %>% 
  dplyr::summarize(pearson = first(PearsonScore)) %>% 
  ggplot(aes(pert,pearson,fill = feature_set)) + geom_col(position = "dodge") + 
  theme_cowplot(10) + theme(axis.text.x = element_text(angle = 90, hjust = 1)) +
  labs(x = "perturbation", fill = "feature set") 

for (cur_feature_set in rf_table$feature_set %>% unique()) {
  cat(sprintf('\n\n## %s {.tabset .tabset-pills}\n\n', cur_feature_set))
  for (cur_pert in rf_table$pert %>% unique()) {
    cat(sprintf('\n\n### %s \n\n', cur_pert))
    cat('\n\n You can scroll through the table to select features to 
      highlight in the plot or select points in the plot to view 
      them in the table.\n\n')

    df <- rf_table %>% 
      dplyr::filter(feature_set == cur_feature_set & pert == cur_pert) %>% 
      transmute(feature = feature,`feature importance` = round(RF.imp.mean,4),
                stability = round(RF.imp.stability,4),
                color = str_split_fixed(feature,pattern = "_",n = 2)[,1])

    sd <- highlight_key(select(df,-color),~feature)

    updatemenus <- list(list(showactive = FALSE,type = 'buttons',x = 1.2,y = 1.15,
                             buttons = list(
      list(label = "Reset",method = "relayout",args = list(list(shapes = c()))))))

    p <- plot_ly(type = "scatter", data = sd, x = ~`feature importance`, y = ~stability,
               text = ~feature, hoverinfo = "text", color = df$color, colors = "Set1",
               marker = list(line = list(width = 0))) %>% 
      layout(xaxis = list(title = "feature importance"), yaxis = list(title = "stability"),
             showlegend = TRUE,updatemenus = updatemenus) %>% config(displayModeBar = F)

     p2 <-bscols(
      ggplotly(p) %>%
        highlight(color = rgb(0,0.4588,0.6901),on = "plotly_click",
                  off = "plotly_relayout",opacityDim = 1),
      datatable(sd, style="bootstrap", width="100%",
                options = list(lengthChange = FALSE,scrollY = "300px",paging = FALSE)),
      widths = c(6,6))

     c('\n\n')
     cat(htmltools::knit_print.shiny.tag(p2))
     c('\n\n')
  }
  cat('\n\n### shared \n\n')
  cat('\n\n To generate this plot biomarkers for each response profile are ranked by q-value. 
      The 50 biomarkers with the lowest average ranks across the response profiles are shown. \n\n')
  ranks <- rf_table %>% filter(feature_set == cur_feature_set) %>% dplyr::group_by(pert) %>% 
    dplyr::mutate(rank = rank(-RF.imp.mean)) %>% select(feature,rank,pert) %>% 
    spread(key = "pert",value = "rank") %>% 
    tibble::column_to_rownames(var = "feature") %>% as.matrix()
  ranks[is.na(ranks)] <- max(ranks, na.rm = T)
  top_shared_biomarkers <- ranks %>% rowMeans() %>% sort() %>% head(50) %>% names()
  fig <- heatmaply(t(ranks[top_shared_biomarkers,]),dendrogram = "none",colors = viridis(n = 256, direction = -1),
                   column_text_angle = 90,na.rm = FALSE,key.title = "rank") %>% config(displayModeBar = F)
  c('\n\n')
  cat(htmltools::knit_print.shiny.tag(fig))
  c('\n\n')
}