knitr::opts_chunk$set(echo = TRUE, comment = "") # https://github.com/kgellatl/SignalCell library(SignalCell)
# load("~/Dropbox (UMass Medical School)/UMASS/GitHub/R/SignalCell/data/tst_dat.Rdata") # sparse_dat <- as(tst_dat, "dgCMatrix") load("~/Dropbox (UMass Medical School)/Lab_inDrop/mDC_vignette_data/mDC_dat.Rdata") sparse_dat <- as(mDC_dat, "dgRMatrix") object.size(mDC_dat) object.size(sparse_dat) rm(mDC_dat)
### http://bioconductor.org/packages/release/bioc/html/SingleCellExperiment.html sce <- construct_sce(sparse_dat) rm(sparse_dat) class(sce) class(assay(sce, "counts")) sce
counts <- assay(sce, "counts") libsizes <- colSums(counts) size.factors <- libsizes/mean(libsizes) logcounts(sce) <- log2(t(t(counts)/size.factors) + 1) assayNames(sce) get_def_assay(sce) sce <- set_def_assay(sce, "logcounts") get_def_assay(sce)
colData(sce)$Timpoint <- "0hr" sce$Timpoint[grep("1", colnames(sce))] <- "1hr" sce$Timpoint[grep("4", colnames(sce))] <- "4hr" sce <- calc_libsize(sce, assay = "counts", label = "UMI_sum") colData(sce) sce
counts <- apply(assay(sce, "counts"), 1, function(c)sum(c!=0)) rowData(sce)$gene_counts <- counts rowData(sce) # use for vector annotations sce
sce <- calc_var_genes(sce, method = "Malhanobis") sce <- calc_var_genes(sce, method = "CV") sce <- calc_var_genes(sce, method = "Gini") rowData(sce) sce
gene_subset <- get_var_genes(sce, method = "Malhanobis", cutoff = 0.85) gene_subset[100:110] sce <- dim_reduce(input = sce, genelist = gene_subset, method = "iPCA") sce <- embed_cells(input = sce, lem = "iPCA", method = "tSNE") colData(sce)
###### Example 1 : Linear Embedding Matrix (LEM) Class, Single Embedding ##### # First we run dimension reduction # When NULL, the name of the lem defaults to the method, in this case iPCA sce <- dim_reduce(input = sce, genelist = gene_subset, method = "iPCA", reducedDim_key = NULL) # We can see what LEM results exist with reducedDimNames reducedDimNames(sce) # We can access the LEM with reducedDim and providing a LEM name LEM_iPCA <- reducedDim(sce, "iPCA") # Structure of the LEM # SVD (like) Matrices head(sampleFactors(LEM_iPCA)) # cells = rows head(featureLoadings(LEM_iPCA)) # gene = rows head(factorData(LEM_iPCA)) # diagonal # There is also a metadata field. This stores the parameters of the LEM. # As of now, the embedding field is empty metadata_iPCA <- metadata(LEM_iPCA) str(metadata_iPCA) # Next we embed the cells. Right now tSNE is supported, the default method. We can add UMAP support # This embedding is stored internally to the input LEM, so a lem must be provided as an argument # As above, when no embedding_key is provided, the name of the embedding defaults to the method, in this case tSNE # When TRUE, write_colData will write the x & y coordinates to colData() # When NULL the colData_labs default to x and y pasted with the method sce <- embed_cells(input = sce, lem = "iPCA", embedding_key = NULL, write_colData = T, method = "tSNE", seed = 10) colData(sce) # Now within the metadata of the LEM, we have an embedding stored, along with its parameters # To simplify usage, there is a function, embeddings(), to list available LEMs, and their embedding_key # There is another function, embedding(), to retrieve a specific embedding with its embedding_key # This requires a LEM argument, and the embedding_key embeddings(sce) head(embedding(sce, lem = "iPCA", embedding = "tSNE")) # Parameters for the embedding are nested within the LEM metadata, and then the embedding LEM_iPCA <- reducedDim(sce, "iPCA") metadata_iPCA <- metadata(LEM_iPCA) metadata_iPCA$embeddings$tSNE$parameters ###### Example 2 : Second LEM, Customization, multiple embeddings ###### # You can store as many LEM objects you want. # In this case we use reducedDim_key to name it specifically. # We run PCA for 20 components sce <- dim_reduce(input = sce, reducedDim_key = "PCA_20comp", genelist = gene_subset, method = "PCA", nComp = 20) # Now we have 2 LEMs stored reducedDimNames(sce) # Now when we call embed_cells, we call the new reducedDim() object, "PCA_20comp" # However we only select 15 components # When embedding we can also specify the column names written to colData() # This way as many x + y embeddings can be stored as you desire within colData (as long as they are uniquely named) sce <- embed_cells(input = sce, lem = "PCA_20comp", embedding_key = "PCA_20_15", colData_labs = c("PCA_20_15_x", "PCA_20_15_y"), comp_select = 1:15, iterations = 500, tSNE_perp = 50) colData(sce) # On a single LEM object we can try as many embeddings as desired # To prevent them overwriting, we can provide a new embedding_key # With our second embedding, we tweak parameters sce <- embed_cells(input = sce, lem = "PCA_20comp", embedding_key = "PCA_embed2", colData_labs = c("PCA_embed2_x", "PCA_embed2_y"), comp_select = 1:10, tSNE_perp = 50) colData(sce) # Now our object has 2 sets of LEMS, and within the second LEM are 2 different embeddings embeddings(sce) # If write_colData is FALSE when you run embed_cells(), than the colData will not get flooded with all embeddings you run # You can always write a single embedding result to colData, the one you prefer int_embedding <- embedding(input = sce, lem = "iPCA", embedding = "tSNE") colnames(int_embedding) <- c("x", "y") colData(sce) <- cbind(colData(sce), int_embedding) colData(sce)
reducedDims(sce) sce <- cluster_cells(sce, dims = "Comp", lem = "iPCA", method = "spectral", k = 6) plot_metadata(sce, color_by = "Cluster") # 2D clustering can be done on colData() sce <- cluster_cells(sce, dims = "2d", method = "density", xy = c("tSNE_x", "tSNE_y")) colData(sce) # 2D clustering can also be done on an embedding not stored in colData, but requires more arguments because the embedding is stored with the LEM # We also provide a name argument for this new colData() column to prevent overwriting the default "Cluster" # Notice the end result clustering on the colData() or lem are the same, because the underlying data is the same embeddings(sce) sce <- cluster_cells(sce, dims = "2d", method = "density", lem = "iPCA", embedding = "tSNE", name = "PCA_20_embed2_cluster") colData(sce)[,c("Cluster", "PCA_20_embed2_cluster")] # A New argument allows calculation of clustering statistics (cluster_stats()). These are stored in the internal metadata of the sce ks <- 3:13 reps <- 1:7 for(i in 1:length(ks)) { for (j in reps) { sce <- cluster_cells(sce, dims = "Comp", lem = "iPCA", method = "spectral", k = ks[i], cluster_stats = T) } } c_metrics <- int_metadata(sce)$cluster_metrics c_metrics <- pivot_longer(c_metrics, cols = 1:4, values_to = "value", names_to = "metric") c_metrics$metric <- as.factor(c_metrics$metric) c_metrics$k <- as.factor(as.numeric(c_metrics$k)) g <- ggplot(c_metrics) g <- g + geom_violin(aes(x = k, y = value), scale = "width") g <- g + geom_jitter(aes(x = k, y = value)) g <- g + facet_wrap(~metric, scales = "free") plot(g) # average.between = average distance between clusters. # average.within = average distance within clusters. # within.cluster.ss = a generalisation of the within clusters sum of squares # avg.silwidth = average silhouette width. #pearsongamma = correlation between distances and a 0-1-vector where 0 means same cluster, 1means different clusters # dunn = minimum separation / maximum diameter. # dunn2 = minimum average dissimilarity between two cluster / maximum average withincluster dissimilarity # entropy = entropy of the distribution of cluster memberships # wb.ratio = average.within/average.between # ch = Calinski and Harabasz inde # https://medium.com/@haataa/how-to-measure-clustering-performances-when-there-are-no-ground-truth-db027e9a871c # https://cran.r-project.org/web/packages/fpc/fpc.pdf
###### NEEDS TO BE CHANGED #### EITHER EMBEDDING OR LEM! # sce <- set_xy(sce, "iPCA") # # sce <- set_xy(sce, "iPCA") # head(colData(sce))[,c("x", "y")]
###### NEEDS TO BE CHANGED #### EITHER EMBEDDING OR LEM! plot_metadata(sce, color_by = "Cluster") # # plot_metadata(sce, color_by = "Cluster", facet_by = "Timepoint", coords = c("iPCA", 1, 2))
###### NEEDS TO BE CHANGED #### EITHER EMBEDDING OR LEM! # plot_tsne_gene(sce, gene = "Ccr7") # # plot_tsne_gene(sce, gene = c("Ccr7", "Lcn2")) # # plot_tsne_gene(sce, gene = c("Tnf"), facet_by = "Timepoint")
# plot_violin(sce, gene = "Ccr7", color_by = "Cluster") # # plot_violin(sce, gene = "Ccr7", color_by = "Cluster", assay = "counts") # # plot_violin(sce, gene = c("Ccr7", "Lcn2"), color_by = "Cluster") # NEW # # plot_violin(sce, gene = c("Tnf"), facet_by = "Timepoint", color_by = "Cluster")
# sce@elementMetadata # use for bulk values??
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.