In kgellatl/SignalCell: Network Analysis for single cell RNA sequencing Data (sc-RNASeq)

knitr::opts_chunk$set(echo = TRUE, comment = "")
# https://github.com/kgellatl/SignalCell
library(SignalCell)

Load Data

# load("~/Dropbox (UMass Medical School)/UMASS/GitHub/R/SignalCell/data/tst_dat.Rdata")
# sparse_dat <- as(tst_dat, "dgCMatrix")
load("~/Dropbox (UMass Medical School)/Lab_inDrop/mDC_vignette_data/mDC_dat.Rdata")
sparse_dat <- as(mDC_dat, "dgRMatrix")
object.size(mDC_dat)
object.size(sparse_dat)
rm(mDC_dat)

SCE Class

Construct SingleCellExperiment

### http://bioconductor.org/packages/release/bioc/html/SingleCellExperiment.html
sce <- construct_sce(sparse_dat)
rm(sparse_dat)

class(sce)
class(assay(sce, "counts"))
sce

Enhanced exprs Like functionality

counts <- assay(sce, "counts")
libsizes <- colSums(counts)
size.factors <- libsizes/mean(libsizes)
logcounts(sce) <- log2(t(t(counts)/size.factors) + 1)
assayNames(sce)

get_def_assay(sce)
sce <- set_def_assay(sce, "logcounts")
get_def_assay(sce)

pData Like functionality

colData(sce)$Timpoint <- "0hr"
sce$Timpoint[grep("1", colnames(sce))] <- "1hr"
sce$Timpoint[grep("4", colnames(sce))] <- "4hr"

sce <- calc_libsize(sce, assay = "counts", label = "UMI_sum")
colData(sce)
sce

fData Like functionality

counts <- apply(assay(sce, "counts"), 1, function(c)sum(c!=0))
rowData(sce)$gene_counts <- counts
rowData(sce) # use for vector annotations
sce

Basic Analysis

Finding variable genes

sce <- calc_var_genes(sce, method = "Malhanobis")
sce <- calc_var_genes(sce, method = "CV")
sce <- calc_var_genes(sce, method = "Gini")

rowData(sce)
sce

Bare Minimum Example

gene_subset <- get_var_genes(sce, method = "Malhanobis", cutoff = 0.85)
gene_subset[100:110]
sce <- dim_reduce(input = sce, genelist = gene_subset, method = "iPCA")
sce <- embed_cells(input = sce, lem = "iPCA", method = "tSNE")
colData(sce)

Dimension reduction Detailed

###### Example 1 : Linear Embedding Matrix (LEM) Class, Single Embedding #####

# First we run dimension reduction
# When NULL, the name of the lem defaults to the method, in this case iPCA
sce <- dim_reduce(input = sce, genelist = gene_subset, method = "iPCA", reducedDim_key = NULL)

# We can see what LEM results exist with reducedDimNames
reducedDimNames(sce)

# We can access the LEM with reducedDim and providing a LEM name
LEM_iPCA <- reducedDim(sce, "iPCA")

# Structure of the LEM
# SVD (like) Matrices
head(sampleFactors(LEM_iPCA)) # cells = rows
head(featureLoadings(LEM_iPCA))  # gene = rows
head(factorData(LEM_iPCA)) # diagonal

# There is also a metadata field. This stores the parameters of the LEM.
# As of now, the embedding field is empty
metadata_iPCA <- metadata(LEM_iPCA)
str(metadata_iPCA)

# Next we embed the cells. Right now tSNE is supported, the default method. We can add UMAP support
# This embedding is stored internally to the input LEM, so a lem must be provided as an argument
# As above, when no embedding_key is provided, the name of the embedding defaults to the method, in this case tSNE
# When TRUE, write_colData will write the x & y coordinates to colData()
# When NULL the colData_labs default to x and y pasted with the method

sce <- embed_cells(input = sce, lem = "iPCA", embedding_key = NULL, write_colData = T, method = "tSNE", seed = 10)
colData(sce)

# Now within the metadata of the LEM, we have an embedding stored, along with its parameters
# To simplify usage, there is a function, embeddings(), to list available LEMs, and their embedding_key
# There is another function, embedding(), to retrieve a specific embedding with its embedding_key
# This requires a LEM argument, and the embedding_key

embeddings(sce)
head(embedding(sce, lem = "iPCA", embedding = "tSNE"))

# Parameters for the embedding are nested within the LEM metadata, and then the embedding
LEM_iPCA <- reducedDim(sce, "iPCA")
metadata_iPCA <- metadata(LEM_iPCA)
metadata_iPCA$embeddings$tSNE$parameters

######  Example 2 : Second LEM, Customization, multiple embeddings ######

# You can store as many LEM objects you want.
# In this case we use reducedDim_key to name it specifically.
# We run PCA for 20 components
sce <- dim_reduce(input = sce, reducedDim_key = "PCA_20comp", genelist = gene_subset, method = "PCA", nComp = 20)

# Now we have 2 LEMs stored
reducedDimNames(sce)

# Now when we call embed_cells, we call the new reducedDim() object, "PCA_20comp"
# However we only select 15 components
# When embedding we can also specify the column names written to colData()
# This way as many x + y embeddings can be stored as you desire within colData (as long as they are uniquely named)
sce <- embed_cells(input = sce, lem = "PCA_20comp", embedding_key = "PCA_20_15", colData_labs = c("PCA_20_15_x", "PCA_20_15_y"), comp_select = 1:15, iterations = 500, tSNE_perp = 50)
colData(sce)

# On a single LEM object we can try as many embeddings as desired
# To prevent them overwriting, we can provide a new embedding_key
# With our second embedding, we tweak parameters
sce <- embed_cells(input = sce, lem = "PCA_20comp", embedding_key = "PCA_embed2", colData_labs = c("PCA_embed2_x", "PCA_embed2_y"), comp_select = 1:10, tSNE_perp = 50)
colData(sce)

# Now our object has 2 sets of LEMS, and within the second LEM are 2 different embeddings
embeddings(sce)

# If write_colData is FALSE when you run embed_cells(), than the colData will not get flooded with all embeddings you run
# You can always write a single embedding result to colData, the one you prefer
int_embedding <- embedding(input = sce, lem = "iPCA", embedding = "tSNE")
colnames(int_embedding) <- c("x", "y")
colData(sce) <- cbind(colData(sce), int_embedding)
colData(sce)

Clustering

reducedDims(sce)
sce <- cluster_cells(sce, dims = "Comp", lem = "iPCA",  method = "spectral", k = 6)
plot_metadata(sce, color_by = "Cluster")

# 2D clustering can  be done on colData()
sce <- cluster_cells(sce, dims = "2d", method = "density", xy = c("tSNE_x", "tSNE_y"))
colData(sce)

# 2D clustering can also be done on an embedding not stored in colData, but requires more arguments because the embedding is stored with the LEM
# We also provide a name argument for this new colData() column to prevent overwriting the default "Cluster"
# Notice the end result clustering on the colData() or lem are the same, because the underlying data is the same
embeddings(sce)
sce <- cluster_cells(sce, dims = "2d", method = "density", lem = "iPCA",  embedding = "tSNE", name = "PCA_20_embed2_cluster")

colData(sce)[,c("Cluster", "PCA_20_embed2_cluster")]

# A New argument allows calculation of clustering statistics (cluster_stats()). These are stored in the internal metadata of the sce

ks <- 3:13
reps <- 1:7

for(i in 1:length(ks)) {
  for (j in reps) {
    sce <- cluster_cells(sce, dims = "Comp", lem = "iPCA",  method = "spectral", k = ks[i], cluster_stats = T)
  }
}

c_metrics <- int_metadata(sce)$cluster_metrics
c_metrics <- pivot_longer(c_metrics, cols = 1:4, values_to = "value", names_to = "metric")
c_metrics$metric <- as.factor(c_metrics$metric)
c_metrics$k <- as.factor(as.numeric(c_metrics$k))

g <- ggplot(c_metrics)
g <- g + geom_violin(aes(x = k, y = value), scale = "width")
g <- g + geom_jitter(aes(x = k, y = value))
g <- g + facet_wrap(~metric, scales = "free")
plot(g)

# average.between = average distance between clusters.
# average.within = average distance within clusters.
# within.cluster.ss = a generalisation of the within clusters sum of squares
# avg.silwidth = average silhouette width. 
#pearsongamma = correlation between distances and a 0-1-vector where 0 means same cluster, 1means different clusters
# dunn = minimum separation / maximum diameter.
# dunn2 = minimum average dissimilarity between two cluster / maximum average withincluster dissimilarity
# entropy = entropy of the distribution of cluster memberships
# wb.ratio = average.within/average.between
# ch = Calinski and Harabasz inde

# https://medium.com/@haataa/how-to-measure-clustering-performances-when-there-are-no-ground-truth-db027e9a871c
# https://cran.r-project.org/web/packages/fpc/fpc.pdf

Basic Plots

Setting default coordinates

###### NEEDS TO BE CHANGED
#### EITHER EMBEDDING OR LEM!
# sce <- set_xy(sce, "iPCA")
# 
# sce <- set_xy(sce, "iPCA")
# head(colData(sce))[,c("x", "y")]

tSNE metadata Plot

###### NEEDS TO BE CHANGED
#### EITHER EMBEDDING OR LEM!

plot_metadata(sce, color_by = "Cluster")
# 
# plot_metadata(sce, color_by = "Cluster", facet_by = "Timepoint",  coords = c("iPCA", 1, 2))

tSNE gene Plot

###### NEEDS TO BE CHANGED
#### EITHER EMBEDDING OR LEM!

# plot_tsne_gene(sce, gene = "Ccr7")
# 
# plot_tsne_gene(sce, gene = c("Ccr7", "Lcn2")) 
# 
# plot_tsne_gene(sce, gene = c("Tnf"), facet_by = "Timepoint")

Violin Plot

# plot_violin(sce, gene = "Ccr7", color_by = "Cluster")
# 
# plot_violin(sce, gene = "Ccr7", color_by = "Cluster", assay = "counts")
# 
# plot_violin(sce, gene = c("Ccr7", "Lcn2"), color_by = "Cluster") # NEW
# 
# plot_violin(sce, gene = c("Tnf"), facet_by = "Timepoint", color_by = "Cluster")

Bulk Analysis

# sce@elementMetadata # use for bulk values??

To Do

Change the dim reduce

Quantitative clustering

Test file Sizes on LARGE Data!

dgC vs dgR matrix performance test

kgellatl/SignalCell documentation built on Sept. 3, 2020, 8:45 a.m.

rdrr.io home R language documentation Run R code online

CRAN packages Bioconductor packages R-Forge packages GitHub packages

Note that we can't provide technical support on individual packages. You should contact the package authors for that.

kgellatl/SignalCell
Network Analysis for single cell RNA sequencing Data (sc-RNASeq)

In kgellatl/SignalCell: Network Analysis for single cell RNA sequencing Data (sc-RNASeq)

Load Data

SCE Class

Construct SingleCellExperiment

Enhanced exprs Like functionality

pData Like functionality

fData Like functionality

Basic Analysis

Finding variable genes

Bare Minimum Example

Dimension reduction Detailed

Clustering

Basic Plots

Setting default coordinates

tSNE metadata Plot

tSNE gene Plot

Violin Plot

Bulk Analysis

To Do

Change the dim reduce

Quantitative clustering

Test file Sizes on LARGE Data!

dgC vs dgR matrix performance test

R Package Documentation

Browse R Packages

We want your feedback!

kgellatl/SignalCell Network Analysis for single cell RNA sequencing Data (sc-RNASeq)

In kgellatl/SignalCell: Network Analysis for single cell RNA sequencing Data (sc-RNASeq)

Load Data

SCE Class

Construct SingleCellExperiment

Enhanced exprs Like functionality

pData Like functionality

fData Like functionality

Basic Analysis

Finding variable genes

Bare Minimum Example

Dimension reduction Detailed

Clustering

Basic Plots

Setting default coordinates

tSNE metadata Plot

tSNE gene Plot

Violin Plot

Bulk Analysis

To Do

Change the dim reduce

Quantitative clustering

Test file Sizes on LARGE Data!

dgC vs dgR matrix performance test

R Package Documentation

Browse R Packages

We want your feedback!

kgellatl/SignalCell
Network Analysis for single cell RNA sequencing Data (sc-RNASeq)