Description Usage Arguments Value References Examples
View source: R/maskmeans-package.R
This is the primary function to run either the agglomeration or splitting version of the multi-view K-means clustering algorithm
1 2 3 4 5 6 7 8 9 |
mv_data |
Multi-view data, either in the form of a concatenated matrix (where the columns of the views have
been combined) or in the form of a list, where each element contains one of the views. In the former case, the argument `mv` must also be specified, giving the number of columns in each view.
|
clustering_init |
Initial hard or soft clustering to be used for aggregating or splitting clusters. |
type |
Either "aggregation" or "splitting" |
parallel |
If |
BPPARAM |
Optional parameter object passed internally to |
verbose |
If |
... |
Additional optional parameters. See |
Output from either mv_aggregation
or mv_splitting
, according to the type
of algorithm
specified above.
For the aggregation algorithm, the outputs are as follows:
merged_clusters |
Matrix providing each pair of merged clusters at each iteration of the algorithm |
hclust |
Object of class "hclust" to be used for plotting cluster aggregations |
weights |
Matrix of dimension |
criterion |
Value taken on by the agglomerative criterion at each iteration |
final_classification |
Final classification of observations using |
final_K |
Number of clusters chosen via model selection by the data-driven slope estimation (DDSE) slope heuristics |
For the splitting algorithm, the outputs are as follows:
split_clusters |
Matrix providing the history of each cluster splitting at each iteration of the algorithm |
weights |
If |
criterion |
Value taken on by the splitting criterion at each iteration |
withinss |
The within sum-of-squares for each cluster at the last iteration |
ksplit |
Vector identifying which cluster was split at each iteration of the algorithm |
final_classification |
Final classification of observations using |
final_K |
Number of clusters chosen via model selection by the data-driven slope estimation (DDSE) slope heuristics |
all_probapost |
List of conditional probabilities for each split for the soft splitting algorithm |
final_probapost |
Matrix of conditional probabilities of cluster membership for
each observation for the model with |
Godichon-Baggioni, A., Maugis-Rabusseau, C. and Rau, A. (2020) Multi-view cluster aggregation and splitting, with an application to multi-omic breast cancer data. Annals of Applied Statistics, 14:2, 752-767.
## Simulate data
# Fix the RNG state so the simulated data sets below are reproducible.
set.seed(12345)

# One simulated multi-view data set per design, plus a "D6" variant with
# non-default simulation settings.
sim_1 <- mv_simulate(type = "D1")
sim_2 <- mv_simulate(type = "D2")
sim_3 <- mv_simulate(type = "D3")
sim_4 <- mv_simulate(type = "D4")
sim_5 <- mv_simulate(type = "D5")
sim_6a <- mv_simulate(type = "D6")
sim_6b <- mv_simulate(
  type = "D6", beta = 7, n = 200, K = 5, sigma = 0.5
)

# Objects shared by the examples that follow.
# `mv` holds the number of columns of `X` belonging to each of the six views;
# `Xlist` is the same data stored as one matrix per view.
X <- sim_6a$data
mv <- c(2, 2, 2, 1, 1, 2)
gamma <- 2
Xlist <- list(
  X[, 1:2],
  X[, 3:4],
  X[, 5:6],
  matrix(X[, 7], ncol = 1),
  matrix(X[, 8], ncol = 1),
  X[, 9:10]
)

# Internal helper (hence `:::`); presumably rescales each view -- its result
# is the input given to the legacy internal functions used in the checks.
X_scale <- maskmeans:::scaleview(X, mv)
# ---------------------------------------------------------------------------
# Regression checks: maskmeans() versus the legacy internal functions.
# Each all.equal() call compares one output of the new interface with the
# matching component returned by the older implementation.
# ---------------------------------------------------------------------------

# Test 1: hard clustering aggregation ----
# Initial hard partition: k-means with 20 clusters on the first view only.
cluster_init <- kmeans(Xlist[[1]], 20)$cluster

set.seed(12345)
hard_agglom <- maskmeans(
  mv_data = X, mv = mv, clustering_init = cluster_init,
  type = "aggregation", gamma = gamma
)

# Same seed before the legacy call so both runs start from the same RNG state.
set.seed(12345)
hard_agglom_old <- maskmeans:::hmv1(
  X_scale,
  mv = mv, gamma = gamma,
  cluster.init = cluster_init,
  weightsopt = TRUE
)

all.equal(
  hard_agglom$weights, hard_agglom_old$weights,
  check.attributes = FALSE
)
all.equal(hard_agglom$criterion, hard_agglom_old$CRIT)
all.equal(hard_agglom$merged_clusters, hard_agglom_old$merge)
# Test 2: soft clustering aggregation ----
# Initial soft clustering: random membership matrix with 20 columns, with each
# row rescaled to sum to one.
set.seed(12345)
proba_init <- matrix(runif(nrow(X) * 20), ncol = 20)
proba_init <- proba_init / rowSums(proba_init)
# Alternative initialisation, left disabled:
# library(fclust)
# proba_init <- FKM(X, k=5)$U

soft_agglom <- maskmeans(
  mv_data = X, mv = mv, clustering_init = proba_init,
  type = "aggregation", gamma = gamma
)

set.seed(12345)
soft_agglom_old <- maskmeans:::hmvprobapost(
  X_scale,
  mv = mv, gamma = gamma,
  probapost.init = proba_init
)

all.equal(
  soft_agglom$weights, soft_agglom_old$weights,
  check.attributes = FALSE
)
all.equal(soft_agglom$criterion, soft_agglom_old$CRIT)
all.equal(soft_agglom$merged_clusters, soft_agglom_old$merge)
# Test 3: hard clustering splitting ----
# Initial hard partition: k-means with 5 clusters on the first view only.
set.seed(12345)
cluster_init <- kmeans(Xlist[[1]], 5)$cluster

# NOTE(review): `gamma` is not passed here, so maskmeans() falls back on its
# default, whereas the legacy call below receives `gamma` explicitly --
# confirm that the two values agree.
hard_split <- maskmeans(
  mv_data = X, mv = mv, clustering_init = cluster_init,
  type = "splitting", Kmax = 20,
  perCluster_mv_weights = FALSE
)

set.seed(12345)
hard_split_old <- maskmeans:::splittingClusters(
  X = X_scale, mv = mv, gamma = gamma,
  Kmax = 20, cluster.init = cluster_init,
  weightsopt = TRUE, testkmeans = TRUE
)

all.equal(
  hard_split$weights, hard_split_old$weights,
  check.attributes = FALSE
)
# The first element of the legacy criterion is dropped before comparing.
all.equal(hard_split$criterion, hard_split_old$CRIT[-1])

# The three comparisons below report differences that are only due to
# label switching.
all.equal(hard_split$split_clusters, hard_split_old$clustersplithist)
all.equal(
  hard_split$ksplit, hard_split_old$ksplit,
  check.attributes = FALSE
)
all.equal(
  hard_split$withinss, hard_split_old$withinss,
  check.attributes = FALSE
)
# Test 4: hard clustering splitting with per-cluster weights ----
# Initial hard partition: k-means with 10 clusters on the first view only.
set.seed(12345)
cluster_init <- kmeans(Xlist[[1]], 10)$cluster

hard_split_perCluster <- maskmeans(
  mv_data = X, mv = mv,
  clustering_init = cluster_init, type = "splitting",
  Kmax = 20, perCluster_mv_weights = TRUE, gamma = 1
)

set.seed(12345)
hard_split_old_perCluster <- maskmeans:::splittingClustersbis(
  X = X_scale, mv = mv, gamma = 1,
  Kmax = 20, cluster.init = cluster_init
)

# The weights are compared element by element. They are not expected to be
# identical, because there was an error in the original (legacy) code.
mapply(
  all.equal,
  hard_split_perCluster$weights,
  hard_split_old_perCluster$weights,
  check.attributes = FALSE
)
all.equal(
  hard_split_perCluster$criterion,
  hard_split_old_perCluster$CRIT[-1]
)
all.equal(
  hard_split_perCluster$split_clusters,
  hard_split_old_perCluster$clustersplithist
)
all.equal(
  hard_split_perCluster$ksplit, hard_split_old_perCluster$ksplit,
  check.attributes = FALSE
)
all.equal(
  hard_split_perCluster$withinss, hard_split_old_perCluster$withinss,
  check.attributes = FALSE
)
# Test 5: soft clustering splitting ----
## Not run:
# Initial soft clustering: random membership matrix with 5 columns, with each
# row rescaled to sum to one.
set.seed(12345)
proba_init <- matrix(runif(nrow(X) * 5), ncol = 5)
proba_init <- proba_init / rowSums(proba_init)

# NOTE(review): Kmax is 16 here but 7 in the legacy call below -- confirm
# whether the two runs are meant to be comparable.
soft_split <- maskmeans(
  mv_data = X, mv = mv, clustering_init = proba_init,
  type = "splitting", gamma = gamma, delta = 2,
  perCluster_mv_weights = FALSE, Kmax = 16,
  verbose = TRUE, parallel = TRUE
)

set.seed(12345)
soft_split_old <- maskmeans:::splittingProbapost(
  X = X_scale, mv = mv,
  gamma = gamma, delta = 2, Kmax = 7,
  probapost.init = proba_init
)

all.equal(
  soft_split$weights, soft_split_old$weights,
  check.attributes = FALSE
)
all.equal(soft_split$criterion, soft_split_old$CRIT)

# Columns are not in the same order here, but the values are otherwise equal.
# NOTE(review): the documented splitting outputs are `all_probapost` and
# `final_probapost`; verify that `$probapost` is the intended component.
all.equal(soft_split$probapost, soft_split_old$probapost)
## End(Not run)
# Test 6: soft clustering splitting with per-cluster weights ----
## Not run:
# Relies on the `proba_init` matrix created in the soft splitting example
# (Test 5), which is itself inside a not-run section.
# NOTE(review): Kmax is 16 here but 7 in the legacy call below -- confirm
# whether the two runs are meant to be comparable.
set.seed(12345)
soft_split_perCluster <- maskmeans(
  mv_data = X, mv = mv, clustering_init = proba_init,
  type = "splitting", gamma = gamma, delta = 2,
  perCluster_mv_weights = TRUE, Kmax = 16,
  parallel = FALSE
)

set.seed(12345)
soft_split_old_perCluster <- maskmeans:::splittingProbapostbis(
  X = X_scale, mv = mv,
  gamma = gamma, delta = 2, Kmax = 7,
  probapost.init = proba_init
)

all.equal(
  soft_split_perCluster$weights, soft_split_old_perCluster$weights,
  check.attributes = FALSE
)
all.equal(
  soft_split_perCluster$criterion,
  soft_split_old_perCluster$CRIT
)
all.equal(
  soft_split_perCluster$probapost,
  soft_split_old_perCluster$probapost
)
## End(Not run)
# Other testing idea: list input (`Xlist`) instead of a matrix ----
# With a list of views, no `mv` argument is passed to maskmeans().
cluster_init <- kmeans(Xlist[[1]], 10)$cluster
hard_agglom_list <- maskmeans(
  mv_data = Xlist, clustering_init = cluster_init,
  type = "aggregation", gamma = gamma
)

# Cluster sizes after cutting the aggregation result at K = 6.
table(
  maskmeans_cutree(
    hard_agglom_list,
    K = 6, clustering_init = cluster_init
  )$classif
)
# Plot functions ----

# Plots of the simulated data, using the first column of the simulated labels.
mv_plot(mv_data = sim_6a$data, mv = mv, labels = sim_6a$labels[, 1])
mv_plot(mv_data = sim_1$data, mv = c(2, 2, 2, 2), labels = sim_1$labels[, 1])
mv_plot(mv_data = sim_2$data, mv = c(2, 2, 2, 2), labels = sim_2$labels[, 1])
mv_plot(mv_data = Xlist, labels = sim_6a$labels[, 1])

# Plots of the aggregation results.
p <- maskmeans_plot(hard_agglom)
p <- maskmeans_plot(soft_agglom)

# Plots of the hard splitting results, including the tree displays.
p <- maskmeans_plot(hard_split)
p <- maskmeans_plot(hard_split, type = "tree")
p <- maskmeans_plot(hard_split, type = "tree", edge_arrow = FALSE)
p <- maskmeans_plot(hard_split_perCluster, type = "tree")
p <- maskmeans_plot(hard_split_perCluster, type = "tree_perClusterWeights")
p <- maskmeans_plot(hard_split_perCluster)

# Tree plots look weird for the soft splitting results.
## Not run:
p <- maskmeans_plot(soft_split, type = "tree")
p <- maskmeans_plot(soft_split_perCluster)
## End(Not run)

# Plot the weights in the final splits originating from initial cluster 8
# (using final_K).
s <- split_zoom(
  hard_split_perCluster,
  initial_cluster = 8, mv_names = letters[1:6]
)
|
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.