#' Clusters trajectory data
#'
#' @param trajectory_data data to cluster
#' @param P degree of bezier curve
#' @param K number of clusters
#' @param niter number of iteration to run it for
#' @importFrom magrittr %>%
#' @return results of the clustering
#' @export
cluster_trajectory_data <- function(trajectory_data, P = 3, K = 5, niter = 20){
# create data for em algorithm
prepared_trajectory_data <-
trajectory_data %>%
tidyr::nest(data = c(x, y)) %>%
dplyr::mutate(curve_i = row_number()) %>%
dplyr::mutate(n_i = purrr::map_dbl(data, nrow),
t_i = purrr::map(n_i, ~ tibble::tibble(t = (1:. - 1)/(.-1))),
T_i = purrr::map(t_i, ~ dplyr::mutate(., p = list(0:P)) %>%
tidyr::unnest(cols = c(p)) %>%
dplyr::mutate(D_p_of_t = choose(P, p) * t^p * (1 - t)^(P - p)) %>%
tidyr::spread(p, D_p_of_t, sep = "_") %>%
dplyr::select(-t))) %>%
dplyr::select(data, curve_i, n_i, t_i, T_i) %>%
dplyr::arrange(curve_i) %>%
ungroup()
# create X matrix
X <-
prepared_trajectory_data %>%
dplyr::select(T_i) %>%
tidyr::unnest(cols = c(T_i)) %>%
Matrix::as.matrix()
# create Y matrix
Y <-
prepared_trajectory_data %>%
dplyr::select(data) %>%
tidyr::unnest(cols = data) %>%
Matrix::as.matrix()
SEQ <-
prepared_trajectory_data %>%
dplyr::select(n_i) %>%
dplyr::mutate(n_i = cumsum(n_i)) %>%
Matrix::as.matrix()
INDEX <-
prepared_trajectory_data %>%
dplyr::select(curve_i, t_i) %>%
tidyr::unnest(cols = c(t_i)) %>%
dplyr::select(curve_i)
##################### INIT
kmean_data <-
prepared_trajectory_data %>%
dplyr::transmute(ends = purrr::map(data, ~ filter(., row_number() == max(row_number())) %>% dplyr::select(x, y)),
ends = purrr::map(data, ~ summarise(., x = mean(x), y = mean(y)))) %>%
tidyr::unnest(cols = c(ends))
kmeans_results <- kmeans(kmean_data, centers = K, iter.max = 100)
# kmean_data %>%
# dplyr::mutate(cluster = kmeans_results$cluster) %>%
# ggplot2::ggplot(aes(x = x, y = y, colour = factor(cluster))) +
# ggplot2::geom_point() +
# ggplot2::theme_bw()
#
# K means
init_clusters <-
prepared_trajectory_data %>%
dplyr::mutate(cluster = kmeans_results$cluster)
# random
# init_clusters <-
# prepared_trajectory_data %>%
# dplyr::mutate(cluster = sample(rep(1:K, ceiling(nrow(.) / K)), size = nrow(.), replace = F))
Alpha <-
init_clusters %>%
dplyr::group_by(cluster) %>%
dplyr::summarise(n = n()) %>%
dplyr::mutate(prop = n/sum(n)) %>%
dplyr::pull(prop)
c <-
init_clusters %>%
tidyr::unnest(data) %>%
dplyr::select(cluster) %>%
dplyr::pull(cluster)
calc_Piik <- function(data, Sigma){
data %>%
transmute(Piik = purrr::pmap_dbl(list(x, y, x1, y1), ~ emdbook::dmvnorm(x = c(..3, ..4),
mu = c(..1, ..2),
Sigma))) %>%
dplyr::bind_cols(as_tibble(INDEX))
}
Beta <- list()
Sigma <- list()
tic()
for(k in 1:K){
Beta[[k]] <- solve(t(X[k == c, ]) %*% X[k == c, ]) %*% t(X[k == c, ]) %*% Y[k == c, ]
Sigma[[k]] <- diag(diag(t(Y[k == c, ] - X[k == c, ] %*% Beta[[k]]) %*% (Y[k == c, ] - X[k == c, ] %*% Beta[[k]])/ nrow(X[k == c, ])))
}
toc()
n <- length(SEQ)
N <- SEQ[n]
curve_lengths <- prepared_trajectory_data %>% dplyr::select(n_i)
#################################################################################
l_hood <- -Inf
tic()
for(i in 1:niter){
print(i)
#############################################################################
# Expectation Step
print("e_step time")
tic()
data_Piik <-
tibble::tibble(Beta, Sigma) %>%
dplyr::mutate(k = row_number()) %>%
#partition() %>%
dplyr::mutate(X_Beta = purrr::map(Beta,
~ Matrix::as.matrix(X) %*% .x %>%
tibble::as_tibble() %>%
dplyr::bind_cols(rename(tibble::as_tibble(Matrix::as.matrix(Y)), x1 = x, y1 = y)))) %>%
dplyr::mutate(Piik = purrr::map2(X_Beta, Sigma, calc_Piik)) %>%
dplyr::select(k, Piik) %>%
#collect() %>%
tidyr::unnest(cols = c(Piik))
scale_m <-
data_Piik %>%
dplyr::ungroup() %>%
dplyr::summarise(mean = mean(Piik)) %>%
dplyr::pull(mean)
Pik <-
data_Piik %>%
group_by(curve_i) %>%
dplyr::mutate(Piik = Piik/mean(Piik)) %>%
dplyr::group_by(curve_i, k) %>%
dplyr::summarise(Pik = prod(Piik)) %>%
tidyr::spread(k, Pik) %>%
dplyr::ungroup() %>%
dplyr::select(-curve_i) %>%
Matrix::as.matrix()
Pik[is.infinite(Pik) & Pik > 0] <-.Machine$double.xmax
Pik <- Pik * Alpha
toc()
#############################################################################
# Calculate Log Likelihood
# Calculate Probability of data over all clusters
s <- rowSums(Pik)
# Since we're not on the log scale we might get 0
if(any(s == 0)){
# replace 0 with the smallest number possible
# then weight by the alphas
Pik[s == 0, ] <- .Machine$double.xmin * Alpha
# recalculate the probability of observing this data over all clusters
s <- rowSums(Pik)
}
# Now calculate the new log likelihood
l_hood_new <- sum(log(s)) + N * log(scale_m)
# If we've reached our tolerance stop the loop
if(i> 1 & abs(l_hood_new - l_hood)/l_hood < 1e-6){
break
}
# For monitoring
#print(l_hood)
# overwrite the old log likelihood
l_hood <- l_hood_new
# Calculate the Pi_ik
Pik <- Pik/s
# Perform Maximization Step
print("m_step time")
tic()
#############################################################################
Alpha <- colSums(Pik) / n
weights <-
Pik %>%
tibble::as_tibble() %>%
dplyr::mutate(curve_i = row_number()) %>%
dplyr::right_join(INDEX, by = "curve_i") %>%
dplyr::select(matches("\\d")) %>%
as.list()
param_updates <-
tibble::tibble(k = 1:K, weights = weights) %>%
dplyr::mutate(weights = purrr::map(weights, ~ Matrix::Diagonal(x = .))) %>%
#partition() %>%
dplyr::mutate(Beta_new = purrr::map(weights, ~ calc_new_beta(., X, Y)),
Sigma_new = purrr::map2(weights, Beta_new, ~ calc_new_sigma(.x, X, Y, .y)),
Beta_new = purrr::map(Beta_new, as.matrix)) %>%
#collect() %>%
dplyr::arrange(k)
Beta <- param_updates %>% dplyr::pull(Beta_new)
Sigma <- param_updates %>% dplyr::pull(Sigma_new)
#############################################################################
toc()
}
toc()
em_results <-
list("l_hood" = l_hood_new,
"Pik" = Pik,
"Beta" = Beta,
"Sigma" = Sigma,
"Alpha" = Alpha)
em_results <-
em_results %>%
reorder_clusters()
return(em_results)
}
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.