suppressPackageStartupMessages(library(magrittr)) if (rlang::is_true(getOption("knitr.in.progress"))) { params_ <- scdrake::scdrake_list(params) } drake_cache_dir <- params_$drake_cache_dir drake::loadd( config_main, config_integration, hvg_plots_int_df, single_samples_df, common_features_count, hvg_int_list, gene_annotation, int_diagnostics_df, sce_int_pca_df_for_report, sce_int_dimred_plots_df, selected_markers_int_plots_files, selected_markers_int_plots_files_out, path = drake_cache_dir ) cfg <- config_integration details_template <- scdrake::str_line( "\n<details>", " <summary class='used-functions'>Show method details \u25be</summary>", " <hr />", " Used function: {fn_link}\n\n", " {text}", " <hr />", "</details>\n" )
This table is showing which data were integrated. Before integration, these data were processed by the single-sample pipeline.
single_samples_df
The first step in integration was subsetting the input data to contain only a common set of features ("universe"):
Common features count: r common_features_count
Then each sample was rescaled to adjust for differences in sequencing depth.
The batchelor::multiBatchNorm()
function recomputes log-normalized expression values after
adjusting the size factors for systematic differences in coverage between SingleCellExperiment
objects.
(Size factors only remove biases between cells within a single batch.)
This improves the quality of the correction by removing one aspect of the technical differences between batches.
r scdrake::format_used_functions("batchelor::multiBatchNorm()")
HVGs identified in individual samples are combined before integration.
hvg_int <- hvg_int_list$hvg_int hvg_combination <- hvg_int$hvg_combination hvg_int_with_cc <- hvg_int_list$hvg_int_with_cc if (!rlang::is_null(hvg_int_with_cc)) { hvg_rm_cc_genes_ids <- hvg_int$hvg_rm_cc_genes_ids hvg_rm_cc_genes_ann <- gene_annotation[hvg_rm_cc_genes_ids, ] scdrake::catg0("{length(hvg_rm_cc_genes_ids)} cell cycle related genes were removed before HVG selection.\n\n") cc_genes_table <- hvg_rm_cc_genes_ann[, c("ENSEMBL", "SYMBOL")] %>% scdrake::render_bootstrap_table(row.names = FALSE) %>% as.character() cat("<details>\n <summary class='used-functions'>Show cell cycle-related genes \u25be</summary>\n\n") cat(cc_genes_table) cat("\n\n</details>") } if (hvg_combination == "hvg_metric") { hvg_metric <- hvg_int$hvg_metric hvg_selection <- hvg_int$hvg_selection hvg_selection_value <- hvg_int$hvg_selection_value scdrake::catg0( "**HVG metric (common for all samples)**: '{hvg_metric}' -> combined by {hvg_combination_fn}\n\n", hvg_combination_fn = dplyr::if_else(hvg_int$hvg_metric == "gene_var", "`scran::combineVar()`", "`scran::combineCV2()`") ) scdrake::catg0('Based on "{hvg_metric}", HVGs were selected by: ') if (hvg_selection == "top") { scdrake::catg0("top {hvg_selection_value} HVGs.\n\n") } else if (hvg_selection == "significance") { scdrake::catg0("FDR < {hvg_selection_value}\n\n") } else if (hvg_selection == "threshold") { scdrake::catg0("{hvg_metric} > {hvg_selection_value}\n\n") } } else { scdrake::catg0("HVGs were combined by {hvg_combination} of HVGs within individual samples.\n\n") } res <- scdrake::lapply_rows(hvg_plots_int_df, FUN = function(par) { if (par$hvg_rm_cc_genes) { scdrake::md_header("HVGs with removed CC genes", 2) } else { scdrake::md_header("All HVGs", 2) } scdrake::catg0("**Found {length(par$hvg_int$hvg_ids)} HVGs.**\n\n") print(par$hvg_plot) return(list()) })
if (hvg_int$hvg_metric == "gene_var") { res <- scdrake::format_used_functions("scran::combineVar()", do_cat = TRUE) } else if (hvg_int$hvg_metric == "gene_cv2") { res <- scdrake::format_used_functions("scran::combineCV2()", do_cat = TRUE) }
scran::denoisePCA()
, which takes the estimates returned by scran::modelGeneVar()
.res <- dplyr::group_by(sce_int_pca_df_for_report, name) %>% dplyr::group_map(function(group, key) { int_method_name <- key$name int_method_description <- scdrake::get_int_method_description(int_method_name) scdrake::md_header(int_method_description$header, 2, extra = "{.tabset}") dplyr::group_by(group, hvg_rm_cc_genes) %>% dplyr::group_map(function(group, key) { hvg_rm_cc_genes <- key$hvg_rm_cc_genes if (hvg_rm_cc_genes) { scdrake::md_header("Removed cell cycle genes from HVGs", 3) } else { scdrake::md_header("With all HVGs", 3) } print(group$pca_selected_pcs_plot[[1]]) scdrake::catg0("\n\n**{group$pca_selected_pcs} PCs were selected using the '{group$pca_selection_method}' method.**\n\n") } ) })
r scdrake::format_used_functions(c("scater::runPCA()", "PCAtools::findElbowPoint()", "scran::getDenoisedPCs()", "batchelor::multiBatchPCA()"))
Here you can quickly check how samples overlap after integration.
res <- dplyr::group_by(sce_int_dimred_plots_df, name) %>% dplyr::group_map(function(group, key) { int_method_name <- key$name int_method_description <- scdrake::get_int_method_description(int_method_name) scdrake::md_header(int_method_description$header, 2, extra = "{.tabset}") scdrake::catg0( details_template, fn_link = int_method_description$fn_link, text = int_method_description$description ) by_hvg_rm_cc_genes <- dplyr::group_by(group, hvg_rm_cc_genes) dplyr::group_map(by_hvg_rm_cc_genes, function(group, key) { hvg_rm_cc_genes <- key$hvg_rm_cc_genes if (hvg_rm_cc_genes) { scdrake::md_header("Removed cell cycle genes from HVGs", 3, extra = "{.tabset}") } else { scdrake::md_header("With all HVGs", 3, extra = "{.tabset}") } by_dimred <- dplyr::group_by(group, dimred_name) dplyr::group_map(by_dimred, function(group, key) { dimred_name <- key$dimred_name scdrake::md_header(stringr::str_to_upper(dimred_name), 4) if (!is_null(selected_markers_int_plots_files)) { selected_markers_file <- selected_markers_int_plots_files %>% dplyr::filter(name == int_method_name, hvg_rm_cc_genes == !!hvg_rm_cc_genes, dimred_name == !!dimred_name) %>% dplyr::pull(out_pdf_file) cat("\n\n") scdrake::create_a_link( href = selected_markers_file, text = "Selected markers PDF", href_rel_start = fs::path_dir(cfg$INTEGRATION_REPORT_HTML_FILE), do_cat = TRUE ) cat("\n\n") } by_colour_by <- dplyr::group_by(group, colour_by) dplyr::group_map(by_colour_by, function(group, key) { print(group$plot[[1]]) }) }) }) })
Graph-based clustering was used for diagnostics below.
res <- dplyr::group_by(int_diagnostics_df, name) %>% dplyr::group_map(function(group, key) { int_method_name <- key$name int_method_description <- scdrake::get_int_method_description(int_method_name) scdrake::md_header(int_method_description$header, 2, extra = "{.tabset}") dplyr::group_by(group, hvg_rm_cc_genes) %>% dplyr::group_map(function(group, key) { hvg_rm_cc_genes <- key$hvg_rm_cc_genes if (hvg_rm_cc_genes) { scdrake::md_header("Removed cell cycle genes from HVGs", 3, extra = "{.tabset}") } else { scdrake::md_header("With all HVGs", 3, extra = "{.tabset}") } scdrake::md_header("Cells's assignment", 4) cells_per_batch_cluster <- group$cells_per_batch_cluster[[1]] cluster_has_zeros <- apply(dplyr::select(cells_per_batch_cluster, -cluster), 1, FUN = function(row) any(row == 0)) cluster_has_zeros <- cells_per_batch_cluster$cluster[cluster_has_zeros] if (!is_empty(cluster_has_zeros)) { scdrake::catg0( "**WARNING**: The following clusters have zero number of assigned cells in some samples: ", "{str_comma(cluster_has_zeros)}\n\n\n" ) } print( kableExtra::kable(group$cells_per_batch_cluster_percentages[[1]], row.names = FALSE) %>% kableExtra::kable_styling(bootstrap_options = c("striped", "hover", "condensed")) ) %>% cat() scdrake::md_header("Cells's assignment (plots)", 4) print(group$plot_batch_cluster_absolute[[1]]) print(group$plot_batch_cluster_ratio[[1]]) scdrake::md_header("Variance in cluster assignment", 4) cat( "The variation in the log-abundances to rank the clusters with the greatest variability", "in their proportional abundances across batches. We can then focus on batch-specific clusters", "that may be indicative of incomplete batch correction. Obviously, though, this diagnostic is", "subject to interpretation as the same outcome can be caused by batch-specific populations;", "some prior knowledge about the biological context is necessary to distinguish between these two possibilities.\n\n", sep = " " ) print( kableExtra::kable(group$batch_cluster_var_df[[1]], row.names = FALSE) %>% kableExtra::kable_styling(bootstrap_options = c("striped", "hover", "condensed")) ) %>% cat() scdrake::md_header("Rand indices", 4) cat( "Rand index is used to evaluate biological heterogeneity preservation by summarizing the agreement between clusterings.", "This provides a simple metric that we can use to assess the preservation of variation by different correction methods.", "Larger rand indices (i.e., closer to 1) are more desirable, though this must be balanced against the ability of each", "method to actually remove the batch effect.\n\n", sep = " " ) print( kableExtra::kable(group$rand_indices[[1]], row.names = FALSE) %>% kableExtra::kable_styling(bootstrap_options = c("striped", "hover", "condensed")) ) %>% cat() } ) })
Rand indices are used to evaluate biological heterogeneity preservation by summarizing the agreement between clusterings. This provides a simple metric that we can use to assess the preservation of variation by different correction methods. Larger Rand indices (i.e., closer to 1) are more desirable, though this must be balanced against the ability of each method to actually remove the batch effect.
int_diagnostics_df$rand_indices %>% dplyr::bind_rows() %>% dplyr::bind_cols( dplyr::select(int_diagnostics_df, name, hvg_rm_cc_genes), . ) %>% dplyr::arrange(hvg_rm_cc_genes, name)
r scdrake::format_used_functions("bluster::pairwiseRand()")
Show input parameters
Main config
print(config_main)
print(config_integration)
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.