knitr::opts_chunk[["set"]]( collapse = TRUE, comment = "#>" # fig.width = 5L, # fig.height = 5L )
library(COTAN) library(zeallot) library(data.table) library(Rtsne) library(qpdf) library(GEOquery) options(parallelly.fork.enable = TRUE)
This tutorial contains the same functionalities as the first release of the COTAN tutorial but done using the new and updated functions.
Download the data-set for "mouse cortex E17.5"
.
dataDir <- tempdir() GEO <- "GSM2861514" fName <- "GSM2861514_E175_Only_Cortical_Cells_DGE.txt.gz" dataSetFile <- file.path(dataDir, GEO, fName) if (!file.exists(dataSetFile)) { getGEOSuppFiles(GEO, makeDirectory = TRUE, baseDir = dataDir, fetch_files = TRUE, filter_regex = fName) sample.dataset <- read.csv(dataSetFile, sep = "\t", row.names = 1L) }
Define a directory where the output will be stored.
outDir <- tempdir() # Log-level 2 was chosen to showcase better how the package works # In normal usage a level of 0 or 1 is more appropriate setLoggingLevel(2L) # This file will contain all the logs produced by the package # as if at the highest logging level setLoggingFile(file.path(outDir, "vignette_v2.log")) message("COTAN uses the `torch` library when asked to `optimizeForSpeed`") message("Run the command 'options(COTAN.UseTorch = FALSE)'", " in your session to disable `torch` completely!") # this command does check whether the torch library is properly installed c(useTorch, deviceStr) %<-% COTAN:::canUseTorch(TRUE, "cuda") if (useTorch) { message("The `torch` library is availble and ready to use") if (deviceStr == "cuda") { message("The `torch` library can use the `CUDA` GPU") } else { message("The `torch` library can only use the CPU") message("Please ensure you have the `OpenBLAS` libraries", " installed on the system") } } rm(useTorch, deviceStr)
Initialize the COTAN
object with the row count table and
the metadata for the experiment.
cond <- "mouse_cortex_E17.5" obj <- COTAN(raw = sample.dataset) obj <- initializeMetaDataset(obj, GEO = GEO, sequencingMethod = "Drop_seq", sampleCondition = cond) logThis(paste0("Condition ", getMetadataElement(obj, datasetTags()[["cond"]])), logLevel = 1L)
Before we proceed to the analysis, we need to clean the data. The analysis will use a matrix of raw UMI counts as the input. To obtain this matrix, we have to remove any potential cell doublets or multiplets, as well as any low quality or dying cells.
We can check the library size (UMI number) with an empirical cumulative distribution function
ECDPlot(obj, yCut = 700L)
cellSizePlot(obj)
genesSizePlot(obj)
mit <- mitochondrialPercentagePlot(obj, genePrefix = "^Mt") mit[["plot"]]
During the cleaning, every time we want to remove cells or genes
we can use the dropGenesCells()
function.
To drop cells by cell library size:
cells_to_rem <- getCells(obj)[getCellsSize(obj) > 6000L] obj <- dropGenesCells(obj, cells = cells_to_rem)
To drop cells by gene number: high genes count might indicate doublets...
cells_to_rem <- getCells(obj)[getNumExpressedGenes(obj) > 3000L] obj <- dropGenesCells(obj, cells = cells_to_rem) genesSizePlot(obj)
To drop cells by mitochondrial percentage:
to_rem <- mit[["sizes"]][["mit.percentage"]] > 1.5 cells_to_rem <- rownames(mit[["sizes"]])[to_rem] obj <- dropGenesCells(obj, cells = cells_to_rem) mit <- mitochondrialPercentagePlot(obj, genePrefix = "^Mt") mit[["plot"]]
If we do not want to consider the mitochondrial genes we can remove them before starting the analysis.
genes_to_rem <- getGenes(obj)[grep("^Mt", getGenes(obj))] cells_to_rem <- getCells(obj)[which(getCellsSize(obj) == 0L)] obj <- dropGenesCells(obj, genes_to_rem, cells_to_rem)
We want also to log the current status.
logThis(paste("n cells", getNumCells(obj)), logLevel = 1L)
The clean()
function estimates all the parameters for the data. Therefore, we have to run it again every time we remove any genes or cells from the data.
n_it <- 1L obj <- clean(obj) c(pcaCellsPlot, pcaCellsData, genesPlot, UDEPlot, nuPlot, zoomedNuPlot) %<-% cleanPlots(obj) pcaCellsPlot
genesPlot
We can observe here that the red cells are really enriched in hemoglobin genes so we prefer to remove them. They can be extracted from the pcaCellsData
object and removed.
cells_to_rem <- rownames(pcaCellsData)[pcaCellsData[["groups"]] == "B"] obj <- dropGenesCells(obj, cells = cells_to_rem) n_it <- 2L obj <- clean(obj) c(pcaCellsPlot, pcaCellsData, genesPlot, UDEPlot, nuPlot, zoomedNuPlot) %<-% cleanPlots(obj) pcaCellsPlot
To color the PCA based on nu
(so the cells' efficiency)
UDEPlot
UDE (color) should not correlate with principal components! This is very important.
The next part is used to remove the cells with efficiency too low.
plot(nuPlot)
We can zoom on the smallest values and, if COTAN detects a clear elbow, we can decide to remove the cells.
plot(zoomedNuPlot)
We also save the defined threshold in the metadata and re-run the estimation
yset <- 0.28 obj <- addElementToMetaDataset(obj, "Threshold low UDE cells:", yset) cells_to_rem <- getCells(obj)[getNu(obj) < yset] obj <- dropGenesCells(obj, cells = cells_to_rem)
Repeat the estimation after the cells are removed
n_it <- 3L obj <- clean(obj) c(pcaCellsPlot, pcaCellsData, genesPlot, UDEPlot, nuPlot, zoomedNuPlot) %<-% cleanPlots(obj) pcaCellsPlot
logThis(paste("n cells", getNumCells(obj)), logLevel = 1L)
In this part, all the contingency tables are computed
and used to get the statistics necessary to COEX
evaluation and storing
obj <- proceedToCoex(obj, calcCoex = TRUE, optimizeForSpeed = TRUE, cores = 6L, saveObj = TRUE, outDir = outDir)
Since saveObj = TRUE
, this step can be skipped as the COTAN
object has
already been saved in the outDir
.
# saving the structure saveRDS(obj, file = file.path(outDir, paste0(cond, ".cotan.RDS")))
It is also possible to run directly a single function if we don't want to clean anything.
obj2 <- automaticCOTANObjectCreation( raw = sample.dataset, GEO = GEO, sequencingMethod = "Drop_seq", sampleCondition = cond, calcCoex = TRUE, cores = 6L, optimizeForSpeed = TRUE, saveObj = TRUE, outDir = outDir)
To calculate the GDI
we can run:
quantP <- calculateGDI(obj) head(quantP)
The next function can either plot the GDI
for the dataset directly or
use the pre-computed dataframe.
It marks a 1.43
threshold (in red) and
the two highest quantiles (in blue) by default.
We can also specify some gene sets (three in this case) that
we want to label explicitly in the GDI
plot.
genesList <- list( "NPGs" = c("Nes", "Vim", "Sox2", "Sox1", "Notch1", "Hes1", "Hes5", "Pax6"), "PNGs" = c("Map2", "Tubb3", "Neurod1", "Nefm", "Nefl", "Dcx", "Tbr1"), "hk" = c("Calm1", "Cox6b1", "Ppia", "Rpl18", "Cox7c", "Erh", "H3f3a", "Taf1", "Taf2", "Gapdh", "Actb", "Golph3", "Zfr", "Sub1", "Tars", "Amacr") ) # needs to be recalculated after the changes in nu/dispersion above #obj <- calculateCoex(obj, actOnCells = FALSE) GDIPlot(obj, GDIIn = quantP, cond = cond, genes = genesList, GDIThreshold = 1.43)
The percentage of cells expressing the gene in the third column of this data-frame is reported.
To perform the Gene Pair Analysis, we can plot a heatmap of the COEX
values
between two gene sets.
We have to define the different gene sets (list.genes
) in a list.
Then we can choose which sets to use in the function parameter sets
(for example, from 1 to 3).
We also have to provide an array of the file name prefixes for each condition
(for example, "mouse_cortex_E17.5").
In fact, this function can plot genes relationships across many different
conditions to get a complete overview.
print(cond)
heatmapPlot(genesLists = genesList, sets = (1L:3L), conditions = c(cond), dir = outDir)
We can also plot a general heatmap of COEX
values based on some markers like
the following one.
genesHeatmapPlot(obj, primaryMarkers = c("Satb2", "Bcl11b", "Vim", "Hes1"), pValueThreshold = 0.001, symmetric = TRUE)
genesHeatmapPlot(obj, primaryMarkers = c("Satb2", "Bcl11b", "Fezf2"), secondaryMarkers = c("Gabra3", "Meg3", "Cux1", "Neurod6"), pValueThreshold = 0.001, symmetric = FALSE)
Sometimes we can also be interested in the numbers present directly in the contingency tables for two specific genes. To get them we can use two functions:
contingencyTables()
to produce the observed and expected data
c(observedCT, expectedCT) %<-% contingencyTables(obj, g1 = "Satb2", g2 = "Bcl11b") print("Observed CT") observedCT print("Expected CT") expectedCT
Another useful function is getGenesCoex()
. This can be used to extract
the whole or a partial COEX
matrix from a COTAN
object.
# For the whole matrix coex <- getGenesCoex(obj, zeroDiagonal = FALSE) coex[1L:5L, 1L:5L]
# For a partial matrix coex <- getGenesCoex(obj, genes = c("Satb2", "Bcl11b", "Fezf2")) head(coex)
COTAN
provides a way to establish genes' clusters given some lists of markers
layersGenes <- list( "L1" = c("Reln", "Lhx5"), "L2/3" = c("Satb2", "Cux1"), "L4" = c("Rorb", "Sox5"), "L5/6" = c("Bcl11b", "Fezf2"), "Prog" = c("Vim", "Hes1") ) c(gSpace, eigPlot, pcaClustersDF, treePlot) %<-% establishGenesClusters(obj, groupMarkers = layersGenes, numGenesPerMarker = 25L, kCuts = 5L) plot(eigPlot)
plot(treePlot)
UMAPPlot(pcaClustersDF[, 1L:10L], clusters = pcaClustersDF[["hclust"]], elements = layersGenes, title = "Genes' clusters UMAP Plot")
It is possible to obtain a cell clusterization based on the concept of
uniformity of expression of the genes across the cells.
That is the cluster satisfies the null hypothesis of the COTAN
model:
the genes expression is not dependent on the cell in consideration.
There are two functions involved into obtaining a proper clusterization:
the first is cellsUniformClustering
that uses standard tools clusterization
methods, but then discards and re-clusters any non-uniform cluster.
Please not that the most important parameter for the users is the
GDIThreshold
: it defines how strict is the uniformity check,
good values are between 1.42 (more strict) and 1.43 (less strict).
c(splitClusters, splitCoexDF) %<-% cellsUniformClustering(obj, GDIThreshold = 1.43, initialResolution = 0.8, optimizeForSpeed = TRUE, cores = 6L, saveObj = TRUE, outDir = outDir) obj <- addClusterization(obj, clName = "split", clusters = splitClusters, coexDF = splitCoexDF)
In the case one already has an existing clusterization, it is possible to
calculate the DEA data.frame
and add it to the COTAN
object.
splitCoexDF <- DEAOnClusters(obj, clusters = splitClusters) obj <- addClusterization(obj, clName = "split", clusters = splitClusters, coexDF = splitCoexDF)
However since the algorithm that creates the clusters is not geared directly
to achieve cluster uniformity, there might be clusters that can be merged back
together and still be uniform. This is the purpose of the function
mergeUniformCellsClusters
that takes a clusterization and tries to merge
cluster pairs after checking that together the pair forms a uniform cluster.
In order to avoid the massive possible checks the function relies on a related
distance the find the cluster pairs that will be most likely merged.
Again the most important parameter for the users is the GDIThreshold
, it is
supposed to be the same as the one used to generate the clusterization in the
first place, but if needed it can be slightly relaxed in order to reduce the
number of clusters.
c(mergeClusters, mergeCoexDF) %<-% mergeUniformCellsClusters(obj, clusters = splitClusters, GDIThreshold = 1.43, optimizeForSpeed = TRUE, cores = 6L, saveObj = TRUE, outDir = outDir) # merges are: # 1 <- '-1' + 1 + 4 # 2 <- 2 + 3 + 8 # 3 <- 5 # 4 <- 6 # 5 <- 7 obj <- addClusterization(obj, clName = "merge", clusters = mergeClusters, coexDF = mergeCoexDF)
In the following graph, it is possible to check that the found clusters align very well to the expression of the layers' genes defined above...
clustersMarkersHeatmapPlot(obj, groupMarkers = layersGenes, clName = "split", kCuts = 5L)
clustersMarkersHeatmapPlot(obj, groupMarkers = layersGenes, clName = "merge", kCuts = 5L)
The next few lines are just to clean.
if (file.exists(file.path(outDir, paste0(cond, ".cotan.RDS")))) { #Delete file if it exists file.remove(file.path(outDir, paste0(cond, ".cotan.RDS"))) } unlink(file.path(outDir, cond), recursive = TRUE) file.remove(dataSetFile) # stop logging to file setLoggingFile("") file.remove(file.path(outDir, "vignette_v1.log")) options(parallelly.fork.enable = FALSE) sessionInfo()
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.