knitr::opts_chunk$set(echo = TRUE, comment = "")

Package and Data Loading

# https://github.com/UMMS-Biocore/bootcamp/blob/master/session8/session8.md
# https://github.com/kgellatl/SignallingSingleCell
# https://rpubs.com/kgellatl729/774488

# devtools::install_github("kgellatl/SignallingSingleCell")

library("SignallingSingleCell")

load(url("https://galaxyweb.umassmed.edu/pub/class/mDC_UMI_Table.Rdata"))
load(url("https://galaxyweb.umassmed.edu/pub/class/ex_sc_skin.Rdata"))

Preprocessing

Constructing the ExpressionSet Class

The ExpressionSet class (ex_sc) is a convenient data structure that contains 3 dataframes. These dataframes contain expression data, cell information, and gene information respectively.

exprs(ex_sc) is the expression data, where rows are genes and columns are cells. pData(ex_sc) is cell information, where rows are cells and columns are metadata. fData(ex_sc) is gene information, where rows are genes and columns are metadata.

ex_sc <- construct_ex_sc(input = mDC_UMI_Table) # mDC_UMI_Table == Digital Gene Expression Matrix

class(mDC_UMI_Table)
class(ex_sc) 

ex_sc # pData() and fData() are empty

exprs(ex_sc)[1:5,1:5]
pData(ex_sc)[1:5,]
fData(ex_sc)[1:5,]
rm(mDC_UMI_Table)

Data Annotation

Often we have metadata about the experiment that can be valuable in the analysis! Writing that information now may be appropriate. Our experiment consists of a time course with LPS stimulation. The cell names in the DGE Matrix contain a substring encoding this information.

rownames(pData(ex_sc))[c(1,2000,3000)]

pData(ex_sc)$Timepoint <- NA # initialize a new pData column

table(pData(ex_sc)$Timepoint)

pData(ex_sc)[grep("0hr", rownames(pData(ex_sc))),"Timepoint"] <- "0hr"
pData(ex_sc)[grep("1hr", rownames(pData(ex_sc))),"Timepoint"] <- "1hr"
pData(ex_sc)[grep("4hr", rownames(pData(ex_sc))),"Timepoint"] <- "4hr"

head(pData(ex_sc))

table(ex_sc$Timepoint)

Filtering

Often you will want to filter your data to remove low quality cells. There are many ways to do this, often using the number of captured transcripts, unique genes per cell, or the percent of mitochrondrial genes to remove low quality cells.

ex_sc <- calc_libsize(ex_sc, suffix = "raw") # sums counts for each cell
ex_sc <- pre_filter(ex_sc, minUMI = 1000, maxUMI = 10000, threshold = 1, minCells = 10,  print_progress = TRUE) # filters cells and genes

ex_sc <- calc_libsize(ex_sc, suffix = "raw_filtered")

plot_density(ex_sc, title = "UMI Density",  val = "UMI_sum_raw_filtered", statistic = "mean")  

head(pData(ex_sc))

plot_density_ridge(ex_sc, title = "UMI Density",  val = "UMI_sum_raw_filtered", color_by = "Timepoint")  

Basic scRNA-seq analysis

Dimension reduction

Dimensionality reduction is necessary in order to bring the cells from a high dimensional gene expression space (~10k dimensions, one dimension per gene) down to a more reasonable number. Typically this is done first with PCA to bring it down to ~5-15 dimensions, before a final embedding is done using tSNE or UMAP to bring it down to 2 dimensions.

?subset_genes

gene_subset <- subset_genes(ex_sc, method = "PCA", threshold = 1, minCells = 20, nComp = 10, cutoff = 0.85)

dim(ex_sc)
length(gene_subset)

ex_sc <- dim_reduce(ex_sc, genelist = gene_subset, pre_reduce = "vPCA", nVar = .9, nComp = 30, iterations = 500, print_progress=TRUE) 

plot_tsne_metadata(ex_sc, color_by = "Timepoint") 

ex_sc <- dim_reduce(ex_sc, genelist = gene_subset, pre_reduce = "iPCA", nComp = 12, tSNE_perp = 30, iterations = 500, print_progress=FALSE) 

plot_tsne_metadata(ex_sc, color_by = "Timepoint") 

plot_tsne_metadata(ex_sc, color_by = "iPC_Comp1", title = "PC1 cell loadings") 

plot_tsne_metadata(ex_sc, color_by = "iPC_Comp2", title = "PC2 cell loadings") 

plot_tsne_metadata(ex_sc, color_by = "iPC_Comp3", title = "PC3 cell loadings") 

Excersise 1 : Dimension reduction on Skin Data

Now let us try dimension for the skin data!! First we can inspect the skin data to get a sense of what we are working with. Once we get a sense for the data we can then use the same functions as above to perform gene selection and dimension reduction.

dim(ex_sc_skin)

colnames(pData(ex_sc_skin))
colnames(fData(ex_sc_skin))

table(pData(ex_sc_skin))

# Use the subset_genes function to find variable genes in the skin data. Be sure to provide the input argument and method argument. Use ?subset_genes() to view help pages for functions. Try different methods and compare the number of genes you get out for each method.

?subset_genes()

gene_subset <- subset_genes(input = ex_sc_skin, method = "PCA", threshold = 1, minCells = 30)

# Use the dim_reduce function to create a 2D representation of the skin data. Be sure to provide the input argument and a gene list. Try using different pre_reduce method, as well as different numbers of components.

ex_sc_skin <- dim_reduce(ex_sc_skin, genelist = gene_subset, pre_reduce = "iPCA", nComp = 10, iterations = 500)

# Now you can plot metadata from pData() or genes of interest, onto the tSNE mapping.

plot_tsne_metadata(ex_sc_skin, color_by = "Skin")
plot_tsne_metadata(ex_sc_skin, color_by = "Patient")

# You can also try some of your favorite genes. Try the search_gene() function to find some genes that may be of interest to you. Examples are below.

# search_gene(ex_sc_skin, "^CD")

search_gene(ex_sc_skin, "^CD")[17:19] # Some CD Genes use in immunonology
plot_tsne_gene(ex_sc_skin, gene = "CD207")

search_gene(ex_sc_skin, "^CD")[40:41] # Some CD Genes use in immunonology
plot_tsne_gene(ex_sc_skin, gene = "CD3D")

Clustering

Now that we have dimension reduced data we can try clustering it! For dimensions, both Comp and 2d are supported. There will determine if the clustering is done on principal components, or on the 2D representation. There are also 2 clustering algorithms available, density and spectral. Typically we recommend spectral clustering on PCA components, or density clustering on the 2d representation. Try both!

ex_sc <- cluster_sc(ex_sc, dimension = "Comp", method = "spectral", num_clust = 4) 
ex_sc$cluster_spectral <- ex_sc$Cluster

ex_sc <- cluster_sc(ex_sc, dimension = "2d", method = "density", num_clust = 4) 
ex_sc$cluster_density <- ex_sc$Cluster

plot_tsne_metadata(ex_sc, color_by = "cluster_spectral")
plot_tsne_metadata(ex_sc, color_by = "cluster_density")

Excercise 2 : Clustering on the Skin Data

Now let us try clustering for the skin data!! Try both density based and spectral clustering!

# Use the cluster_sc function to cluster the skin data. Try varying the number of clusters. Try varying either Comp or 2d for the dimension. You can also try spectral or density for the method. You can also try adjusting the number of clusters that you get out.

ex_sc_skin <- cluster_sc(ex_sc_skin, dimension = "Comp", method = "spectral", num_clust = 4)

plot_tsne_metadata(ex_sc_skin, color_by = "Cluster")

Cell Type identification

There are many possible ways to identify cell types based on their gene expression. The id_markers function will identify genes that are highly expressed in a high proportion of a given cluster, relative to the other clusters.

ex_sc <- id_markers(ex_sc, print_progress = TRUE) 

ex_sc <- calc_agg_bulk(ex_sc, aggregate_by = "Cluster")

markers <- return_markers(ex_sc, num_markers = 15) 

plot_heatmap(ex_sc, genes = unique(unlist(markers)), type = "bulk")

Excercise 3 : Cell Type identification Skin Data

Now try to identify the cell types in the skin data! Often when you get your gene lists of markers back, focus on the first few, and use google to help you! See if you can find out the rough classification of the 4 clusters in the skin data.

ex_sc_skin <- id_markers(ex_sc_skin, id_by = "Cluster")

markers <- return_markers(ex_sc_skin)

markers

ex_sc_skin <- calc_agg_bulk(ex_sc_skin, aggregate_by = "Cluster")

plot_heatmap(ex_sc_skin, type = "bulk", genes = unique(unlist(markers)))

Normalization

Now that the data has preliminary clusters, we can normalize. SCRAN normalization will first normalize internally in clusters, before normalizing across clusters. Once the data is normalized we can run the same steps as above before visualization. The first step is to select the genes to be used for normalization. One method would be to first only use genes expressed in more than n cells, and then remove the most variable genes. This method can be computationally expensive, and is currently commented out. A simpler approach, counts per million, is also provided below.

table(pData(ex_sc)$Cluster)

# ex_sc_norm <- norm_sc(ex_sc, pool_sizes = c(20,25,30,35,40))

x <- exprs(ex_sc)
cSum <- apply(x,2,sum) # sum counts for each cell
x <- as.matrix(sweep(x,2,cSum,FUN='/'))*1e4 # normalize to UMIs per 10k
ex_sc_norm <- construct_ex_sc(x) # Make a new expression set with normalized counts
pData(ex_sc_norm) <- pData(ex_sc) # Copy over the pData

ex_sc_norm <- calc_libsize(ex_sc_norm, suffix = "CP10k")

plot_tsne_metadata(ex_sc_norm, color_by = "UMI_sum_raw")
plot_tsne_metadata(ex_sc_norm, color_by = "UMI_sum_CP10k")

Excercise 4 : Process the normalized mouse data!

Now that we have normalized, it is time to reprocess the data as before, this time on the normalized counts! Use the ex_sc_norm normalized counts from above to run through the same processing steps as above. The core functions are outlined below. If you get stuck look at the examples above that used the mouse data. You can also use the ? function to find the help pages, for example, ?subset_genes

?subset_genes

# gene_subset <- subset_genes(input = ex_sc_norm, method = "")

# ex_sc_norm <- dim_reduce(input = ex_sc_norm, genelist = gene_subset, pre_reduce = "")

# ex_sc_norm <- cluster_sc(ex_sc_norm, dimension = "", method = "", num_clust = "")

# plot_tsne_metadata(ex_sc_norm color_by = "")

# plot_tsne_gene(ex_sc_norm, gene = "")

# ex_sc_norm <- id_markers(ex_sc_norm, id_by = "")

# markers <- return_markers(ex_sc_norm, return_by = "")

# markers

# ex_sc_norm <- calc_agg_bulk(ex_sc_norm, aggregate_by = "")

# plot_heatmap(ex_sc_norm, type = "", genes = unique(unlist(markers)))

Advanced scRNA-seq analysis

Supervised Analysis

From the above analysis, it is clear that some clusters are formed based on their cell type, while others are based on their experimental condition. In these cases it can be useful to incorporate prior information in order to obtain clusters and annotations that are grounded in biological significance. Below, we can assign "panels" similar to flow cytometry, that will enable cell groupings based on the expression of genes that you believe to signify biologically relevant cell types.

panel1 <- c("S100a9", "Mmp9") # Neutrophil Markers
panel2 <- c("Ccr7", "Fscn1", "Flt3") # DC
panel3 <- c("Mertk", "Csf1r") # Mac

panels <- list(panel1, panel2, panel3)

plot_tsne_gene(ex_sc_norm, gene = panels[[1]], title = "", log_scale = T)
plot_tsne_gene(ex_sc_norm, gene = panels[[2]], title = "", log_scale = T)
plot_tsne_gene(ex_sc_norm, gene = panels[[3]], title = "", log_scale = T)

names(panels) <- c("Neutrophil", "Dendritic", "Macrophage")

panels

ex_sc_norm <- flow_filter(ex_sc_norm, panels = panels, title = "Flow Pass Cells")

ex_sc_norm <- flow_svm(ex_sc_norm, pcnames = "Comp")

plot_tsne_metadata(ex_sc_norm, color_by = "cluster_spectral", facet_by = "Timepoint")
plot_tsne_metadata(ex_sc_norm, color_by = "SVM_Classify")

DE analysis

Now that cells are grouped by their cell type, we can run DE in order to determine which genes are change in association with our experimental conditions.

For simplicity we can subset to 0hr and 4hr, to find the genes that change between these conditions.

It should be noted that DE should always be run on raw counts, not on the normalized counts!

ex_sc_norm_0_4 <- subset_ex_sc(ex_sc_norm, variable = "Timepoint", select = c("0hr", "4hr"))

table(pData(ex_sc_norm_0_4)[,c("SVM_Classify", "Timepoint")])

findDEgenes(input = ex_sc,
            pd = pData(ex_sc_norm_0_4),
            DEgroup = "Timepoint",
            contrastID = "4hr",
            facet_by = "SVM_Classify",
            outdir = "~/Downloads/")

# Macrophages

g <- plot_volcano(de_path = "~/Downloads/", de_file = "Macrophage_4hr_DEresults.tsv", fdr_cut = 1E-150, logfc_cut = 1)
plot(g)
mac_de <- unique(g$data$label)
mac_de <- mac_de[-which(mac_de == "")]

plot_violin(ex_sc_norm_0_4, color_by = "Timepoint", facet_by = "SVM_Classify", gene = "Rsad2")

# DCs

g <- plot_volcano(de_path = "~/Downloads/", de_file = "Dendritic_0hr_DEresults.tsv", fdr_cut = 0.0001, logfc_cut = 2)
plot(g)
dc_de <- unique(g$data$label)
dc_de <- dc_de[-which(dc_de == "")]

plot_violin(ex_sc_norm_0_4, color_by = "Timepoint", facet_by = "SVM_Classify", gene = "Ifit1")

# Neutrophils

g <- plot_volcano(de_path = "~/Downloads/", de_file = "Neutrophil_0hr_DEresults.tsv", fdr_cut = 1E-5, logfc_cut = 3)
plot(g)
neut_de <- unique(g$data$label)
neut_de <- neut_de[-which(neut_de == "")]

plot_violin(ex_sc_norm_0_4, color_by = "Timepoint", facet_by = "SVM_Classify", gene = "Mmp9")

# Heatmap of these DE genes

all_de <- c(mac_de, dc_de, neut_de)
unique(all_de)

ex_sc_norm <- calc_agg_bulk(ex_sc_norm, aggregate_by = c("Timepoint", "SVM_Classify"))

plot_heatmap(ex_sc_norm, genes = c(mac_de, dc_de, neut_de), type = "bulk", facet_by = "SVM_Classify", gene_names = F)

Homework

Now try to run DE between the 0hr and 1hr timepoints on the mouse data. Then make a volcano plot of the genes that are significantly changed (FDR < 0.001, logfc_cut >= 3) within Dendritic cells. The basic steps are outlined below.

# ex_sc_norm_0_1 <- subset_ex_sc()

# findDEgenes() 

# plot_volcano()


kgellatl/SignallingSingleCell documentation built on Dec. 29, 2021, 4:12 p.m.