HandlingClusterizations: Handling cells' _clusterization_ and related functions
In seriph78/COTAN: COexpression Tables ANalysis

HandlingClusterizations

R Documentation

Handling cells' clusterization and related functions

Description

These functions manage the clusterizations and their associated cluster COEX data.frames.

A clusterization is any partition of the cells where to each cell it is assigned a label; a group of cells with the same label is called cluster.

For each cluster is also possible to define a COEX value for each gene, indicating its increased or decreased expression in the cluster compared to the whole background. A data.frame with these values listed in a column for each cluster is stored separately for each clusterization in the clustersCoex member.

The formulae for this In/Out COEX are similar to those used in the calculateCoex() method, with the role of the second gene taken by the In/Out status of the cells with respect to each cluster.

Usage

## S4 method for signature 'COTAN'
estimateNuLinearByCluster(objCOTAN, clName = "", clusters = NULL)

## S4 method for signature 'COTAN'
getClusterizations(objCOTAN, dropNoCoex = FALSE, keepPrefix = FALSE)

## S4 method for signature 'COTAN'
getClusterizationName(objCOTAN, clName = "", keepPrefix = FALSE)

## S4 method for signature 'COTAN'
getClusterizationData(objCOTAN, clName = "")

getClusters(objCOTAN, clName = "")

## S4 method for signature 'COTAN'
getClustersCoex(objCOTAN)

## S4 method for signature 'COTAN'
addClusterization(
  objCOTAN,
  clName,
  clusters,
  coexDF = data.frame(),
  override = FALSE
)

## S4 method for signature 'COTAN'
addClusterizationCoex(objCOTAN, clName, coexDF)

## S4 method for signature 'COTAN'
dropClusterization(objCOTAN, clName)

DEAOnClusters(objCOTAN, clName = "", clusters = NULL)

clusterGeneContingencyTables(objCOTAN, gene, cells)

pValueFromDEA(coexDF, numCells, adjustmentMethod = "none")

logFoldChangeOnClusters(
  objCOTAN,
  clName = "",
  clusters = NULL,
  floorLambdaFraction = 0.05
)

distancesBetweenClusters(
  objCOTAN,
  clName = "",
  clusters = NULL,
  coexDF = NULL,
  useDEA = TRUE,
  distance = NULL
)

UMAPPlot(
  dataIn,
  clusters = NULL,
  elements = NULL,
  title = "",
  colors = NULL,
  numNeighbors = 0L,
  minPointsDist = NaN
)

cellsUMAPPlot(
  objCOTAN,
  clName = "",
  clusters = NULL,
  dataMethod = "",
  genesSel = "HVG_Seurat",
  numGenes = 2000L,
  colors = NULL,
  numNeighbors = 0L,
  minPointsDist = NA
)

clustersMarkersHeatmapPlot(
  objCOTAN,
  groupMarkers = list(),
  clName = "",
  clusters = NULL,
  coexDF = NULL,
  kCuts = 3L,
  adjustmentMethod = "bonferroni",
  condNameList = NULL,
  conditionsList = NULL
)

clustersSummaryData(
  objCOTAN,
  clName = "",
  clusters = NULL,
  condName = "",
  conditions = NULL
)

clustersSummaryPlot(
  objCOTAN,
  clName = "",
  clusters = NULL,
  condName = "",
  conditions = NULL,
  plotTitle = ""
)

clustersTreePlot(
  objCOTAN,
  kCuts,
  clName = "",
  clusters = NULL,
  useDEA = TRUE,
  distance = NULL,
  hclustMethod = "ward.D2"
)

findClustersMarkers(
  objCOTAN,
  n = 10L,
  markers = NULL,
  clName = "",
  clusters = NULL,
  coexDF = NULL,
  adjustmentMethod = "bonferroni"
)

geneSetEnrichment(clustersCoex, groupMarkers = list())

reorderClusterization(
  objCOTAN,
  clName = "",
  clusters = NULL,
  coexDF = NULL,
  reverse = FALSE,
  keepMinusOne = TRUE,
  useDEA = TRUE,
  distance = NULL,
  hclustMethod = "ward.D2"
)

Arguments

`objCOTAN`	a `COTAN` object
`clName`	The name of the clusterization. If not given the last available clusterization will be used, as it is probably the most significant!
`clusters`	A clusterization to use. If given it will take precedence on the one indicated by `clName`
`dropNoCoex`	When `TRUE` drops the names from the clusterizations with empty associated coex `data.frame`
`keepPrefix`	When `TRUE` returns the internal name of the clusterization: the one with the `CL_` prefix.
`coexDF`	a `data.frame` where each column indicates the `COEX` for each of the clusters of the clusterization
`override`	When `TRUE` silently allows overriding data for an existing clusterization name. Otherwise the default behavior will avoid potential data losses
`gene`	a gene
`cells`	a sub-set of the cells
`numCells`	the number of overall cells in all clusters
`adjustmentMethod`	p-value multi-test adjustment method. Defaults to `"bonferroni"`; use `"none"` for no adjustment
`floorLambdaFraction`	Indicates the lower bound to the average count sums inside or outside the cluster for each gene as fraction of the relevant `lambda` parameter. Default is `5\%`
`useDEA`	Boolean indicating whether to use the DEA to define the distance; alternatively it will use the average Zero-One counts, that is faster but less precise.
`distance`	type of distance to use. Default is `"cosine"` for DEA and `"euclidean"` for Zero-One. Can be chosen among those supported by `parallelDist::parDist()`
`dataIn`	The `matrix` to plot. It must have a row names containing the given elements (the columns are features)
`elements`	a named `list` of elements to label. Each array in the list will be shown with a different color
`title`	a string giving the plot title. Will default to UMAP Plot if not specified
`colors`	an `array` of colors to use in the plot. If not sufficient colors are given it will complete the list using colors from `getColorsVector()`
`numNeighbors`	Overrides the default `n_neighbors` value
`minPointsDist`	Overrides the default `min_dist` value
`dataMethod`	selects the method to use to create the `data.frame` to pass to the `UMAPPlot()`. To calculate, for each cell, a statistic for each gene based on available data/model, the following methods are supported: `⁠"RW", "Raw", "RawData"⁠` uses the raw counts `⁠"NN", "NuNorm", "Normalized"⁠` uses the `\nu`-normalized counts `⁠"LN", "LogNorm", "LogNormalized"⁠` uses the log-normalized counts (default) `⁠"BI", "Bin", "Binarized"⁠` uses the binarized data matrix `⁠"BD", "BinDiscr", "BinarizedDiscrepancy"⁠` uses the difference between the binarized data matrix and the estimated probability of one `⁠"AB", "AdjBin", "AdjBinarized"⁠` uses the absolute value of the binarized discrepancy above `⁠"LH", "Like", "Likelihood"⁠` uses the likelihood of binarized data matrix `⁠"LL", "LogLike", "LogLikelihood"⁠` uses the log-likelihood of binarized data matrix `⁠"DL", "DerLogL", "DerivativeLogLikelihood"⁠` uses the derivative of the log-likelihood of binarized data matrix `⁠"SL", "SignLogL", "SignedLogLikelihood"⁠` uses the signed log-likelihood of binarized data matrix For the last four options see `calculateLikelihoodOfObserved()` for more details
`genesSel`	Decides whether and how to perform gene-selection. It can be a straight list of genes or a string indicating one of the following selection methods: `"HGDI"` Will pick-up the genes with highest GDI. Since it requires an available `COEX` matrix it will fall-back to `"HVG_Seurat"` when the matrix is not available `"HVG_Seurat"` Will pick-up the genes with the highest variability via the Seurat package (the default method) `"HVG_Scanpy"` Will pick-up the genes with the highest variability according to the `Scanpy` package (using the Seurat implementation)
`numGenes`	the number of genes to select using the above method. Will be ignored when no selection have been asked or when an explicit list of genes was passed in
`groupMarkers`	an optional named `list` with an element for each group comprised of one or more marker genes
`kCuts`	the number of estimated cluster (this defines the height for the tree cut)
`condNameList`	a `list` of conditions' names to be used for additional columns in the final plot. When none are given no new columns will be added using data extracted via the function `clustersSummaryData()`
`conditionsList`	a `list` of conditions to use. If given they will take precedence on the ones indicated by `condNameList`
`condName`	The name of a condition in the `COTAN` object to further separate the cells in more sub-groups. When no condition is given it is assumed to be the same for all cells (no further sub-divisions)
`conditions`	The conditions to use. If given it will take precedence on the one indicated by `condName` that will only indicate the relevant column name in the returned `data.frame`
`plotTitle`	The title to use for the returned plot
`hclustMethod`	It defaults is `"ward.D2"` but can be any of the methods defined by the `stats::hclust()` function.
`n`	the number of extreme `COEX` values to return
`markers`	a `list` of marker genes
`clustersCoex`	the `COEX` `data.frame`
`reverse`	a flag to the output order
`keepMinusOne`	a flag to decide whether to keep the cluster `"-1"` (representing the non-clustered cells) untouched

Details

estimateNuLinearByCluster() does a linear estimation of nu: cells' counts averages normalized cluster by cluster

getClusterizations() extracts the list of the clusterizations defined in the COTAN object.

getClusterizationName() normalizes the given clusterization name or, if none were given, returns the name of last available clusterization in the COTAN object. It can return the clusterization internal name if needed

getClusterizationData() extracts the asked clusterization and its associated COEX data.frame from the COTAN object

getClusters() extracts the asked clusterization from the COTAN object

getClustersCoex() extracts the full clusterCoex member list

addClusterization() adds a clusterization to the current COTAN object, by adding a new column in the metaCells data.frame and adding a new element in the clustersCoex list using the passed in COEX data.frame or an empty data.frame if none were passed in.

addClusterizationCoex() adds a clusterization COEX data.frame to the current COTAN object. It requires the named clusterization to be already present.

dropClusterization() drops a clusterization from the current COTAN object, by removing the corresponding column in the metaCells data.frame and the corresponding COEX data.frame from the clustersCoex list.

DEAOnClusters() is used to run the Differential Expression analysis using the COTAN contingency tables on each cluster in the given clusterization

clusterGeneContingencyTables() returns the observed and expected contingency tables for a given gene and a given set of cells (a cluster). The implementation runs the same algorithms used to calculate the full observed/expected contingency tables used for DEA, but restricted to only the relevant gene and cluster, thus much faster and less memory intensive

pValueFromDEA() is used to convert to p-value the Differential Expression analysis using the COTAN contingency tables on each cluster in the given clusterization

logFoldChangeOnClusters() is used to get the log difference of the expression levels for each cluster in the given clusterization against the rest of the data-set

distancesBetweenClusters() is used to obtain a distance between the clusters. Depending on the value of the useDEA flag will base the distance on the DEA columns or the averages of the Zero-One matrix.

UMAPPlot() plots the given data.frame containing genes information related to clusters after applying the umap transformation via Seurat::RunUMAP()

cellsUMAPPlot() returns a ggplot2 plot where the given clusters are placed on the base of their relative distance. Also if needed calculates and stores the DEA of the relevant clusterization.

clustersMarkersHeatmapPlot() returns the heatmap plot of a summary score for each cluster and each gene marker in the given clusterization. It also returns the numerosity and percentage of each cluster on the right and a clusterization dendogram on the left, as returned by the function clustersTreePlot(). The heatmap cells' colors express the DEA, that is whether a gene is enriched or depleted in the cluster, while the stars are aligned to the corresponding adjusted p-value: ⁠***⁠ for p < 0.1\%, ⁠**⁠ for p < 1\%, * for p < 5\%, . for p < 10\%

clustersSummaryData() calculates various statistics about each cluster (with an optional further condition to separate the cells).

clustersSummaryPlot() calculates various statistics about each cluster via clustersSummaryData() and puts them together into a plot.

clustersTreePlot() returns the dendogram plot where the given clusters are placed on the base of their relative distance. Also if needed calculates and stores the DEA of the relevant clusterization.

findClustersMarkers() takes in a COTAN object and a clusterization and produces a data.frame with the n most positively enriched and the n most negatively enriched genes for each cluster. The function also provides whether and the found genes are in the given markers list or not. It also returns the adjusted p-value for multi-tests using the stats::p.adjust()

geneSetEnrichment() returns a cumulative score of enrichment in a cluster over a gene set. In formulae it calculates \frac{1}{n}\sum_i(1-e^{-\theta X_i}), where the X_i are the positive values from DEAOnClusters() and \theta = -\frac{1}{0.1} \ln(0.25)

reorderClusterization() takes in a clusterizations and reorder its labels so that in the new order near labels indicate near clusters according to a DEA (or Zero-One) based distance

Value

estimateNuLinearByCluster() returns the updated COTAN object

getClusterizations() returns a vector of clusterization names, usually without the CL_ prefix

getClusterizationName() returns the normalized clusterization name or NULL if no clusterizations are present

getClusterizationData() returns a list with 2 elements:

"clusters" the named cluster labels array
"coex" the associated COEX data.frame. This will be an empty data.frame when not specified for the relevant clusterization

getClusters() returns the named cluster labels array

getClustersCoex() returns the list with a COEX data.frame for each clusterization. When not empty, each data.frame contains a COEX column for each cluster.

addClusterization() returns the updated COTAN object

addClusterizationCoex() returns the updated COTAN object

dropClusterization() returns the updated COTAN object

DEAOnClusters() returns the co-expression data.frame for the genes in each cluster

clusterGeneContingencyTables() returns a list containing the observed and expected contingency tables

pValueFromDEA() returns a data.frame containing the p-values corresponding to the given COEX adjusted for multi-test

logFoldChangeOnClusters() returns the log-expression-change data.frame for the genes in each cluster

distancesBetweenClusters() returns a dist object

UMAPPlot() returns a ggplot2 object

cellsUMAPPlot() returns a list with 2 objects:

"plot" a ggplot2 object representing the umap plot
"cellsPCA" the data.frame PCA used to create the plot

clustersMarkersHeatmapPlot() returns a list with:

"heatmapPlot" the complete heatmap plot
"dataScore" the data.frame with the score values
"pValueDF" the data.frame with the corresponding adjusted p-values

clustersSummaryData() returns a data.frame with the following statistics: The calculated statistics are:

"clName" the cluster labels
"condName" the relevant condition (that sub-divides the clusters)
"CellNumber" the number of cells in the group
"MeanUDE" the average "UDE" in the group of cells
"MedianUDE" the median "UDE" in the group of cells
"ExpGenes25" the number of genes expressed in at the least 25% of the cells in the group
"ExpGenes" the number of genes expressed at the least once in any of the cells in the group
"CellPercentage" fraction of the cells with respect to the total cells

clustersSummaryPlot() returns a list with a data.frame and a ggplot objects

"data" contains the data,
"plot" is the returned plot

clustersTreePlot() returns a list with 2 objects:

"dend" a ggplot2 object representing the dendrogram plot
"objCOTAN" the updated COTAN object

findClustersMarkers() returns a data.frame containing n genes for each cluster scoring top/bottom COEX scores. The data.frame also contains:

"CL" the cluster
"Gene" the gene
"Score" the COEX score of the gene
"adjPVal" the p-values associated to the COEX adjusted for multi-testing
"DEA" the differential expression of the gene
"IsMarker" whether the gene is among the given markers
"logFoldCh" the log-fold-change of the gene expression inside versus outside the cluster from logFoldChangeOnClusters()

geneSetEnrichment() returns a data.frame with the cumulative score

reorderClusterization() returns a list with 2 elements:

"clusters" the newly reordered cluster labels array
"coex" the associated COEX data.frame
"permMap" the reordering mapping

Examples

data("test.dataset")
objCOTAN <- COTAN(raw = test.dataset)
objCOTAN <- proceedToCoex(objCOTAN, cores = 6L, calcCoex = FALSE,
                          optimizeForSpeed = TRUE, saveObj = FALSE)

data("test.dataset.clusters1")
clusters <- test.dataset.clusters1

coexDF <- DEAOnClusters(objCOTAN, clusters = clusters)

groupMarkers <- list(G1 = c("g-000010", "g-000020", "g-000030",
                            "g-000150", "g-000160", "g-000170"),
                     G2 = c("g-000300", "g-000330", "g-000450",
                            "g-000460", "g-000470"),
                     G3 = c("g-000510", "g-000530", "g-000550",
                            "g-000570", "g-000590"))

geneClusters <- rep(1:3, each = 240)[1:600]
names(geneClusters) <- getGenes(objCOTAN)

umapPlot <- UMAPPlot(coexDF, clusters = NULL, elements = groupMarkers)
plot(umapPlot)

objCOTAN <- addClusterization(objCOTAN, clName = "first_clusterization",
                              clusters = clusters, coexDF = coexDF)

lfcDF <- logFoldChangeOnClusters(objCOTAN, clusters = clusters)
umapPlot2 <- UMAPPlot(lfcDF, clusters = geneClusters)
plot(umapPlot2)

objCOTAN <- estimateNuLinearByCluster(objCOTAN, clusters = clusters)

clSummaryPlotAndData <-
  clustersSummaryPlot(objCOTAN, clName = "first_clusterization",
                      plotTitle = "first clusterization")
plot(clSummaryPlotAndData[["plot"]])

if (FALSE) {
  objCOTAN <- dropClusterization(objCOTAN, "first_clusterization")
}

clusterizations <- getClusterizations(objCOTAN, dropNoCoex = TRUE)
stopifnot(length(clusterizations) == 1)

cellsUmapPlotAndDF <- cellsUMAPPlot(objCOTAN, dataMethod = "LogNormalized",
                                    clName = "first_clusterization",
                                    genesSel = "HVG_Seurat")
plot(cellsUmapPlotAndDF[["plot"]])

enrichment <- geneSetEnrichment(clustersCoex = coexDF,
                                groupMarkers = groupMarkers)

clHeatmapPlotAndData <- clustersMarkersHeatmapPlot(objCOTAN, groupMarkers)

conditions <- as.integer(substring(getCells(objCOTAN), 3L))
conditions <- factor(ifelse(conditions <= 600, "L", "H"))
names(conditions) <- getCells(objCOTAN)

clHeatmapPlotAndData2 <-
  clustersMarkersHeatmapPlot(objCOTAN, groupMarkers, kCuts = 2,
                             condNameList = list("High/Low"),
                             conditionsList = list(conditions))

clName <- getClusterizationName(objCOTAN)

clusterDataList <- getClusterizationData(objCOTAN, clName = clName)

clusters <- getClusters(objCOTAN, clName = clName)

allClustersCoexDF <- getClustersCoex(objCOTAN)

summaryData <- clustersSummaryData(objCOTAN)

treePlotAndObj <- clustersTreePlot(objCOTAN, 2)
objCOTAN <- treePlotAndObj[["objCOTAN"]]
plot(treePlotAndObj[["dend"]])

clMarkers <- findClustersMarkers(objCOTAN, markers = list(),
                                 clusters = clusters)

seriph78/COTAN documentation built on June 1, 2025, 4:57 p.m.