options(htmltools.dir.version = FALSE)

Intuition behind the approach


Formulation

$$P = {p_1, p_2, \dots, p_N}$$

$$P = \begin{bmatrix} p_{1, 1} & p_{1, 2} & \cdots & p_{1, N} \ p_{2, 1} & p_{2, 2} & \cdots & p_{2, N} \ \vdots & \vdots & \ddots & \vdots \ p_{n, 1} & p_{n, 2} & \cdots & p_{n, N} \ \end{bmatrix}$$


Formulation


Formulation

Lets for a moment describe density as a function of given simplex V and given set of points P.

--


Barycentric coordinates

For any set of $n + 1$ affinely independent points in $n$-dimensional space $V$ we can describe any point $s$ in $n$-dimensional space using barycentric coordinates $\alpha_i$, i.e.

$$\forall\ V = {v_1, \dots, v_{n + 1}\ \in !R^n \ : \ rank(v_{2} - v_1, \dots,v_{n} - v_1, v_{n + 1} - v_1) = n}$$ $$\forall s \in !R^n\ \exists!(\alpha_1, \alpha_2, \dots, \alpha_n, \alpha_{n+1}) : s = \alpha_1 v_1 + \dots +\alpha_{n + 1} v_{n + 1} \ |\ \sum_{i = 1}^{n + 1}{\alpha_i} = 1$$

--

Easy to see, that for any point $p$ in $P$ and a given simplex $\Delta^V$, $p$ is in $\Delta^V$ if and only if all the barycentric coordinates (with basis of $V$) of p are non-negative.

Let $$p = \alpha_1 v_1 + \dots +\alpha_{n + 1} v_{n + 1} \ |\ \sum_{i = 1}^{n + 1}{\alpha_i}=1$$ then $$p \in \Delta^V \Longleftrightarrow \forall{a_i} \geq 0$$


Barycentric coordinates transform

Luckily we can tranform any point $p$ from $n$-dimensional space to $(n + 1)$-dimensional barycentric coordinates using linear transformation (aka left matrix multiplication). This allows us to perform this matrix operator to all the points at once which reduces computation time pretty significantly.

This method was performed (and tested) first at dummy data of 12k points and some triangle as a given simplex.


Show case: generated data

library(ggplot2)
source("~/backup/ClusDec/clusdec/R/countInside.R")

set.seed(1)
exampleEndpoints <- matrix(c(
 c(10, 10), c(11, 12), c(15, 14)
), nrow=2)

x_min <- min(exampleEndpoints[1, ])
x_max <- max(exampleEndpoints[1, ])
y_min <- min(exampleEndpoints[2, ])
y_max <- max(exampleEndpoints[2, ])

pointsToCheck <- matrix(c(
    runif(12000, min = x_min, max = x_max),
    runif(12000, min = y_min, max = y_max)
), ncol=2)
pointsToCheck <- t(pointsToCheck)


pointsToPlot <- as.data.frame(t(pointsToCheck))
colnames(pointsToPlot)  <- c("x", "y")

toPlot <- as.data.frame(t(exampleEndpoints))
colnames(toPlot) <- c("x", "y")


ggplot(toPlot, aes(x=x, y=y)) + geom_point() + theme_bw() +
    geom_point(data=pointsToPlot) +
    geom_polygon(color="grey", fill="grey", alpha=0.5)

Show case: results

start <- Sys.time()
isInside <- countInside(exampleEndpoints, pointsToCheck)
Sys.time() - start

Show case: results

pointsToPlot$inside <- isInside

toPlot <- as.data.frame(t(exampleEndpoints))
colnames(toPlot) <- c("x", "y")

ggplot(toPlot, aes(x=x, y=y)) + geom_point() + theme_bw() +
    geom_point(data=pointsToPlot, aes(color=inside))
    # geom_polygon(color="grey", fill="grey", alpha=0.5)

Now back to density

Once implemented quick counting of points inside of a simplex and since hypervolume of simplex can be easily calcualted as determinant of matrix devided by factorial of dimensionality we can easily write a function to calculate a density of a given simplex in context of set of points.

pointsToPlot$inside <- isInside

toPlot <- as.data.frame(t(exampleEndpoints))
colnames(toPlot) <- c("x", "y")

ggplot(toPlot, aes(x=x, y=y)) + geom_point() + theme_bw() +
    geom_point(data=pointsToPlot, aes(color=inside), size=0.3)
    # geom_polygon(color="grey", fill="grey", alpha=0.5)
simplexDensity(exampleEndpoints, pointsToCheck)

Now back to density

Once implemented quick counting of points inside of a simplex and since hypervolume of simplex can be easily calcualted as determinant of matrix devided by factorial of dimensionality we can easily write a function to calculate a density of a given simplex in context of set of points.

exampleEndpoints <- matrix(c(
    c(10, 10), c(10, 14), c(15, 14)
), nrow=2)
isInside <- countInside(exampleEndpoints, pointsToCheck)
pointsToPlot$inside <- isInside

toPlot <- as.data.frame(t(exampleEndpoints))
colnames(toPlot) <- c("x", "y")

ggplot(toPlot, aes(x=x, y=y)) + geom_point() + theme_bw() +
    geom_point(data=pointsToPlot, aes(color=inside), size=0.3)
    # geom_polygon(color="grey", fill="grey", alpha=0.5)
simplexDensity(exampleEndpoints, pointsToCheck)

Densities are pretty similar -- no surpise

Density function seems to be working


Simulation data

library(data.table)
set.seed(1)

sampleFromSimplexUniformly <- function(k=3, M=10000) {
    x <- rep(0, k + 1)
    x[k + 1] <- M

    x[2:k] <- sample(1:(M-1), k - 1)
    x <- sort(x)
    y <- (x - shift(x))[2:(k + 1)]
    return(y / M)
}

samples <- 40
cellTypes <- 3
proportions <- matrix(0, ncol=samples, nrow=cellTypes)
colnames(proportions) <- paste0("Sample ", 1:samples)
rownames(proportions) <- paste0("Cell type ", 1:cellTypes)

for (j in 1:samples) proportions[, j] <- sampleFromSimplexUniformly(cellTypes)
half <- proportions[3, ] / 2
proportions[1, ] <- proportions[1, ] + half
proportions[3, ] <- proportions[3, ] - half

genes <- 12000
basis <- matrix(0, nrow=genes, ncol=cellTypes)
rownames(basis) <- paste0("Gene ", 1:genes)
colnames(basis) <- paste0("Cell type ", 1:cellTypes)

for (i in 1:genes) {
    basis[i, ] <- sampleFromSimplexUniformly() * runif(1, min=200, max=10000)
}

data <- basis %*% proportions

Lets get back to simulation data. At first we gonna check the density approach without noise. But anyway how are we going to find the endpoints of simplex? We can use simulated annealing to perform the search of perfect points in our space.


Simulation data, how does it look like?

library(scatterplot3d)
source("~/backup/ClusDec/clusdec/R/simplex_utils.R")

dataReduced <- dimensionality_reduction(data, 3, projection = T)
dataMean <- colMeans(dataReduced)
dataShifted <- t(t(dataReduced) - dataMean)
proj <- dimensionality_reduction(dataShifted, 2)
proj <- as.data.frame(proj)
colnames(proj) <- c("x", "y")
ggplot(proj, aes(x=x, y=y)) + geom_point(size=0.3) + theme_bw()

Simulation data, how does it look like?

library(scatterplot3d)
library(unmixR)
library(amap)
source("~/backup/ClusDec/clusdec/R/simplex_utils.R")

dataReduced <- dimensionality_reduction(data, 3, projection = T)
pp <- point_selection(dataReduced, 3, 1)
dataMean <- colMeans(dataReduced)
dataShifted <- t(t(dataReduced) - dataMean)
proj <- dimensionality_reduction(dataShifted, 2)
proj <- as.data.frame(proj)
colnames(proj) <- c("x", "y")
ggplot(proj, aes(x=x, y=y)) + geom_point(size=0.3) + theme_bw()

Simulated annealing

We can use simulated annealing to find regions of high density

In simulation dataset we gotta find 3 points in 2d space with highest possible density.

library(GenSA)

P <- as.matrix(t(proj))
xMin <- min(P[1, ])
yMin <- min(P[2, ])
xMax <- max(P[1, ])
yMax <- max(P[2, ])

lower <- rep(c(xMin, yMin), 3)
upper <- rep(c(xMax, yMax), 3)

# First approx is VCA points
V <- P[, rownames(pp)]
colnames(V) <- NULL

Simulated annealing



ctlab/ClusDec documentation built on May 14, 2019, 12:29 p.m.