An example how to build multiple super-cell-like objects, including 'exact' (Super-cells obtained with the exact coarse-gaining), 'approx' (Super-cells obtained with the aaproximate coarse-graining), 'metacell(.*)' (Metacell build on the same genes as super-cells -- 'metacell_SC_like'; and Metacell build in a default set of genes -- 'metacell_default') for a set of graining levels and random seeds.
# if (!requireNamespace("BiocManager", quietly = TRUE)) # install.packages("BiocManager") # # BiocManager::install("SingleCellExperiment") # # if (!requireNamespace("remotes")) install.packages("remotes") # remotes::install_github("GfellerLab/SuperCell") # remotes::install_github("mariiabilous/SuperCellBM") library(SingleCellExperiment) library(SuperCell) library(SuperCellBM)
Such as .gamma.seq
for the set of fraining levels, .seed.seq
for the set of random seeds, adata.folder
and fig.folder
for the folders where to write data and plots.
For the full list of the default parameters, see ./examples/config/Tian_config.R
.
source("./examples/config/cell_lines_config.R")
Whether to compute super-cell (ToComputeSC
) or whether to compute super-cell gene expression (ToComputeSC_GE
) or load saved files. Make sure, these file exists :)
Flag ToTestPackage
is used to run stript in 2 modes: package testing (ToTestPackage == TRUE
) or generating super-cell structure and super-cell gene expression for the further analyses (ToTestPackage == FALSE
).
ToComputeSC <- F ToComputeSC_GE <- F ToTestPackage <- F # @Loc, This is just to test whether package works (on reduced set of graining levels and seeds, turn it to FALSE to get real data @ all grainig levels and seeds) filename_suf <- "" # variable to add a suffix to the saved files in case of testing of the package if(ToTestPackage){ testing_gamma_seq <- c(1, 10, 100) testing_seed_seq <- .seed.seq[1:3] warning(paste("The reduced set of graining leveles and seeds will be used, to get real output, turn it ti FALSE")) warning(paste("Original set of graining levels is:", paste(.gamma.seq, collapse = ", "), "but used testing set is:", paste(testing_gamma_seq, collapse = ", "))) warning(paste("Original set of seeds is:", paste(.seed.seq, collapse = ", "), "but used testing set is:", paste(testing_seed_seq, collapse = ", "))) .gamma.seq <- testing_gamma_seq .seed.seq <- testing_seed_seq filename_suf = "_testing_package" } else { .seed.seq = .seed.seq[1:5] }
cell_lines
data from Tian et al., 2019.RData.file.path <- file.path(data.folder, 'cell_lines_git.RData') if(!file.exists(RData.file.path)){ if(!dir.exists(data.folder)) dir.create(data.folder, recursive = T) download.file('https://github.com/LuyiTian/sc_mixology/blob/master/data/sincell_with_class_5cl.RData?raw=true', RData.file.path) } load(RData.file.path) # keep used dataset cell_lines_SCE <- sce_sc_10x_5cl_qc #remove not-used datasets rm(sc_Celseq2_5cl_p1, sc_Celseq2_5cl_p2, sc_Celseq2_5cl_p3, sce_sc_10x_5cl_qc)
Get and set the main variables, such as single-cell gene expression (sc.GE
), single-cell counts (sc.counts
),
number of single cells (N.c
) and total number of genes (N.g
).
Set matrix column names to cellIDs (cell.ids
) and row names to gene names (gene.names
).
cell.ids <- cell_lines_SCE@colData@rownames N.c <- cell_lines_SCE@colData@nrows gene.names <- cell_lines_SCE@int_elementMetadata$external_gene_name N.g <- length(gene.names) sc.GE <- cell_lines_SCE@assays$data$logcounts colnames(sc.GE) <- cell.ids rownames(sc.GE) <- gene.names sc.counts <- cell_lines_SCE@assays$data$counts colnames(sc.counts) <- cell.ids rownames(sc.counts) <- gene.names if(0){ saveRDS(sc.counts, file.path(data.folder, "sc_counts.Rds")) cell.meta.df <- cell_lines_SCE@colData saveRDS(cell.meta.df, file.path(data.folder, "sc_meta_data_df.Rds")) } ## this is not needed at this point, but will be used later GT.cell.type <- cell_lines_SCE@colData$cell_line_demuxlet names(GT.cell.type) <- cell.ids N.clusters <- length(unique(GT.cell.type)) GT.cell.type.names <- names(table(GT.cell.type)) GT.cell.type.2.num <- 1:length(unique(GT.cell.type)) names(GT.cell.type.2.num) <- GT.cell.type.names GT.cell.type.num <- GT.cell.type.2.num[GT.cell.type] names(GT.cell.type.num) <- names(GT.cell.type) ## uncomment this when needed #.pal.GT <- .color.tsne.Tian ## to Global Config #scales::show_col(.pal.GT) #mito.genes <- grep(pattern = "^MT", x = gene.names, value = TRUE) #ribo.genes <- grep(pattern = "^RP[LS]", x = gene.names, value = TRUE) #mito.ribo.genes <- c(mito.genes, ribo.genes) #length(mito.ribo.genes) #gene.meta <- data.frame(name = gene.names, inNcells = rowSums(sc.GE>0), mean.expr = rowMeans(sc.GE), sd = rowSds(sc.GE)) #head(gene.meta)
for the Exact, Aprox (Super-cells obtained with the exact or approximate coarse-graining), Subsampling or Random (random grouping of cells into super-cells).
filename <- paste0('initial', filename_suf) SC.list <- compute_supercells( sc.GE, ToComputeSC = ToComputeSC, data.folder = data.folder, filename = filename, gamma.seq = .gamma.seq, n.var.genes = .N.var.genes, k.knn = .k.knn, n.pc = .N.comp, approx.N = .approx.N, fast.pca = TRUE, genes.use = .genes.use, genes.exclude = .genes.omit, seed.seq = .seed.seq ) # min sizes of supre-cells lapply(SC.list$Exact, function(x){ min(x[[1]][["supercell_size"]]) }) #set min sizes of metacells min_mc_size_seq <- c(1, 5, 10, 15, 20) cat(paste("Super-cell computed for:", paste(names(SC.list), collapse = ", "), "\nat graining levels:", paste(names(SC.list[['Approx']]), collapse = ", "), "\nfor seeds:", paste(names(SC.list[['Approx']][[1]]), collapse = ", "), "\n", "\nand saved to / loaded from", paste0(filename, ".Rds")))
SC.mc <- compute_supercells_metacells_with_min_mc_size( sc.counts = sc.counts, gamma.seq = .gamma.seq, SC.list = SC.list, min_mc_size_seq = min_mc_size_seq, proj.name = proj.name, ToComputeSC = ToComputeSC, mc.k.knn = 100, T_vm_def = 0.08, MC.folder = "MC", MC_gene_settings = c('Metacell_default', 'Metacell_SC_like') # do not change )
additional_gamma_seq <- get_actual_gammas_metacell(SC.mc[1]) cat(paste("Metacells were computed in", length(names(SC.mc)), "settings:", paste(names(SC.mc), collapse = ", "), "\nfor Gammas:", paste(names(SC.mc[[1]]), collapse = ", "), "\nbut actual gammas are:", paste(additional_gamma_seq, collapse = ", ") )) # manually expand MC because later we will have 2 different setting for MC profile: fp - footpring of MC, av - averaged SC.mc.fp <- SC.mc names(SC.mc.fp) <- sapply(names(SC.mc), FUN = function(x){paste0(x, '_fp')}) SC.mc.av <- SC.mc names(SC.mc.av) <- sapply(names(SC.mc), FUN = function(x){paste0(x, '_av')}) SC.mc.expanded <- c(SC.mc.fp, SC.mc.av) names(SC.mc.expanded) rm(SC.mc.fp, SC.mc.av, SC.mc)
So that we can dirrectly compare the results of Super-cells and Metacells at the same graining levels.
filename <- paste0('additional_gammas', filename_suf) SC.list <- compute_supercells_additional_gammas( SC.list, additional_gamma_seq = additional_gamma_seq, ToComputeSC = ToComputeSC, data.folder = data.folder, filename = filename, approx.N = .approx.N, fast.pca = TRUE ) cat(paste("Super-cells of methods:", paste(names(SC.list), collapse = ", "), "\nwere computed at aggitional graining levels:", paste(additional_gamma_seq, collapse = ", "), "\nand added to SC.list" ))
SC.list <- c(SC.list, SC.mc.expanded) rm(SC.mc.expanded) filename <- paste0("all", filename_suf) saveRDS(SC.list, file = file.path(data.folder, "SC", paste0(filename, ".Rds"))) cat(paste( "Metacell data added to SC.list \nand now it contains:", paste(names(SC.list), collapse = ", "), "\nSC.list was saved to", file.path(data.folder, "SC", paste0(filename, ".Rds")) ))
SC.list <- annotate_supercells_to_cluster( SC.list = SC.list, sc.annotation = GT.cell.type, annotation.name = 'cell_line' )
purity.df <- plot_annotation_purity( SC.list, annotation.name = 'cell_line', w = 2.5, h = 2.5, )
GE profile for the super-cell data is computede:
metacell_(.*)_av
2) using the default output of Metacell maned footprint -> metacell_(.*)_fp
filename <- paste0("all", filename_suf) SC.GE.list <- compute_supercells_GE( sc.GE = sc.GE, SC.list = SC.list, ToComputeSC_GE = ToComputeSC_GE, data.folder = data.folder, filename = filename ) cat(paste("Gene expression profile computed for:", paste(names(SC.GE.list), collapse = ", "), "\nat graining levels:", paste(sort(as.numeric(names(SC.GE.list[['Approx']]))), collapse = ", "), "\nfor seeds:", paste(names(SC.GE.list[['Approx']][[1]]), collapse = ", "), "\nand saved to / loaded from", paste0(filename, ".Rds") ))
SC.list <- compute_supercells_PCA( SC.list = SC.list, SC.GE.list = SC.GE.list, N.comp = .N.comp ) SC.list <- remove_mc_fields(SC.list) saveRDS(SC.list, file = file.path(data.folder, 'SC', 'all_PCA.Rds'))
SC.list <- readRDS(file.path(data.folder, 'SC', 'all_PCA.Rds'))
# clustering run at HPC if(0){ SC.list <- compute_supercells_clustering( SC.list, N.comp = 10, N.clusters.seq = c(2:10), pca_name = 'SC_PCA' ) saveRDS(SC.list, file = file.path(data.folder, 'SC', 'all_PCA_clustering.Rds')) } else { SC.list <- readRDS(file.path(data.folder, "SC", "all_PCA_clustering.Rds")) }
# clustering run at HPC genes.use <- SC.list$Exact$`2`$`12345`$genes.use if(length(.N.comp) == 1) {N.comp <- 1:.N.comp} else {N.comp <- .N.comp} if(0){ sc.pca <- prcomp(t(sc.GE[genes.use,]), scale. = T, center = T) sc.dist <- dist(sc.pca$x[,N.comp]) sc.clustering <- hclust(sc.dist, method = "ward.D2") sc.clustering <- cutree(sc.clustering, k = length(unique(GT.cell.type))) saveRDS(sc.clustering, file = file.path(data.folder, 'sc_clustering.Rds')) } else { sc.clustering <- readRDS(file.path(data.folder, 'sc_clustering.Rds')) }
devtools::document() devtools::install() detach('package:SuperCellBM', unload = TRUE) library(SuperCellBM)
# clustering run at HPC if(0){ alt.clusterings <- compute_alternative_clustering( sc.pca = sc.pca$x, N.comp = .N.comp, N.clusters.seq = .N.clusters.seq, seed.seq = .seed.seq ) saveRDS(alt.clusterings, file = file.path(data.folder, 'alt_clustering.Rds')) } else { alt.clusterings <- readRDS(file.path(data.folder, 'alt_clustering.Rds')) }
# run at HPC if(0){ clust.consistency <- compute_consistency_of_supercell_clustering( SC.list = SC.list, sc.annotation = GT.cell.type, sc.clustering = sc.clustering, sc.alternative.clustering = alt.clusterings ) saveRDS(clust.consistency, file = file.path(data.folder, "clustering_consistency.Rds")) } else { clust.consistency <- readRDS(file.path(data.folder, "clustering_consistency.Rds")) }
clust.consistency_summary <- plot_clustering_consistency( clust.consistency, min.value.alt.clustering = 0.5, error_bars = "sd" ) clust.consistency_summary
set.seed(123) supercell_plot(SC.list$Exact$`20`$`12345`$graph.supercells)
N.clusters clust.name <- paste0("hclust_", N.clusters) ## add clustering result field for(meth in names(SC.list)){ for(gamma.ch in names(SC.list[[meth]])){ for(seed.i.ch in names(SC.list[[meth]][[gamma.ch]])){ SC.list[[meth]][[gamma.ch]][[seed.i.ch]][[clust.name]] <- as.character(SC.list[[meth]][[gamma.ch]][[seed.i.ch]][["hclust"]][[as.character(N.clusters)]]) } } } ## map clustering result to GT cell type (cell line) SC.list <- map_clustering_to_cell_type( SC.list = SC.list, GT.cell.type = GT.cell.type, clust.name = clust.name ) clust.name.dea <- paste0(clust.name, "_mapped_to_GT") ### map single-cell clustering to GT annotation sc.clustering.mapped_to_GT <- GT.cell.type.names[diag_consistency_mtrx( m = as.matrix(table(sc.clustering, GT.cell.type)) )$vcol[as.character(sc.clustering)]]
# run DEA at HPC if(0){ DEA.SC <- compute_supercells_DEA( SC.list = SC.list, SC.GE.list = SC.GE.list, cluster.name = clust.name.dea, verbose = TRUE ) saveRDS(DEA.SC, file.path(data.folder, "DEA_SC.Rds")) } else { DEA.SC <- readRDS(file.path(data.folder, "DEA_SC.Rds")) }
# run at HPC if(0){ DEA.sc <- compute_singglecell_DEA( sc.GE, clusters = sc.clustering.mapped_to_GT ) saveRDS(DEA.sc, file.path(data.folder, "DEA_singlecell.Rds")) } else { DEA.sc <- readRDS(file.path(data.folder, "DEA_singlecell.Rds")) } # run at HPC if(0){ DEA.GT <- compute_singglecell_DEA( sc.GE, clusters = GT.cell.type ) saveRDS(DEA.GT, file.path(data.folder, "DEA_GT.Rds")) } else { DEA.GT <- readRDS(file.path(data.folder, "DEA_GT.Rds")) }
GT.DEA.bool <- compute_DEA_GT_bool( DEA.GT, all.genes = gene.names, logFC.thresh = .logFC.thresh, pval.thresh = .pval.thresh )
DEA.consistency <- compute_consistency_of_supercell_DEA( DEA.list = DEA.SC, GT.DEA.bool = GT.DEA.bool, sc.DEA = DEA.sc, verbose = T ) DEA.consistency.df.plot <- plot_DEA_consistency( DEA.consistency, consistency.index.name = 'TPR', error_bars = "quartiles" )
table( SC.list$Exact$`10`$`12345`$hclust_5_mapped_to_GT[SC.list$Exact$`10`$`12345`$membership], GT.cell.type ) table( SC.list$Exact$`10`$`12345`$hclust_5[SC.list$Exact$`10`$`12345`$membership], GT.cell.type )
print("Done! Congrats!") if(ToTestPackage){ warning(paste("(!) Script was run in a test mode, to get real cell_line super-cell data, run this script with ToTestPackage <- FALSE")) }
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.