In this vignette, we introduce a cross-validation approach using mask tensor [@mask1; @mask2] to estimate the optimal rank parameter for matrix/tensor decomposition. Mask matrix/tensor has the same size of data matrix/tensor and contains only 0 or 1 elements, 0 means not observed value, and 1 means observed value. In this approach, only non-zero and non-NA elements are randomly specified as 0 and estimated by other elements reconstructed by the result of matrix/tensor decomposition. This can be considered a cross-validation approach because the elements specified as 1 are the training dataset and the elements specified as 0 are the test dataset.
Here, we use these packages.
library("nnTensor") library("ggplot2") library("dplyr")
Here, we use three types of demo data as follows:
data_matrix <- toyModel("NMF") data_matrices <- toyModel("siNMF_Easy") data_tensor <- toyModel("CP")
To set mask matrix/tensor, here we use kFoldMaskTensor, which divides only the elements other than NA and 0 into k mask matrices/tensors. In NMF, the mask matrix can be specified as M and the rank parameter (J) with the smallest test error is considered the optimal rank. Here, three mask matrices are prepared for each rank, and the average is used to estimate the optimal rank.
out_NMF <- expand.grid(replicate=1:3, rank=factor(1:10), value=0) count <- 1 for(i in 1:10){ masks_NMF <- kFoldMaskTensor(data_matrix, k=3) for(j in 1:3){ out_NMF[count, 3] <- rev( NMF(data_matrix, M = masks_NMF[[j]], J = i)$TestRecError)[1] count <- count + 1 } }
Looking at the average test error for each rank, the optimal rank appears to be around 8 to 10 (with some variation depending on random seeds).
ggplot(out_NMF, aes(x=rank, y=value)) + geom_point() + stat_summary(fun = mean, geom = "point", shape=21, size=3, fill="blue") + stat_summary(fun = mean, geom = "line", colour = "blue", aes(group=1)) + xlab("Rank") + ylab("Test Reconstruction Error")
(group_by(out_NMF, rank) |> summarize(Avg = mean(value)) -> avg_test_error_NMF) avg_test_error_NMF[which(avg_test_error_NMF$Avg == min(avg_test_error_NMF$Avg))[1], ]
Same as NMF, mask matrix M can be specified in NMTF, and the rank parameter (rank) with the smallest test error is considered the optimal rank. The following code is time-consuming and should be executed in your own environment.
out_NMTF <- expand.grid(replicate=1:3, rank2=1:10, rank1=1:10, value=0) rank_NMTF <- paste0(out_NMTF$rank1, "-", out_NMTF$rank2) out_NMTF <- cbind(out_NMTF, rank=factor(rank_NMTF, levels=unique(rank_NMTF))) count <- 1 for(i in 1:10){ for(j in 1:10){ masks_NMTF <- kFoldMaskTensor(data_matrix, k=3) for(k in 1:3){ out_NMTF[count, 4] <- rev( NMTF(data_matrix, M = masks_NMTF[[k]], rank = c(i, j))$TestRecError)[1] count <- count + 1 } } }
ggplot(out_NMTF, aes(x=rank, y=value)) + geom_point() + stat_summary(fun = mean, geom = "point", shape=21, size=3, fill="blue") + stat_summary(fun = mean, geom = "line", colour = "blue", aes(group=1)) + xlab("Rank") + ylab("Test Reconstruction Error") + theme(axis.text.x = element_text(angle = 90, hjust = 1))
(out_NMTF |> group_by(rank1, rank2) |> summarize(Avg = mean(value)) -> avg_test_error_NMTF) avg_test_error_NMTF[which(avg_test_error_NMTF$Avg == min(avg_test_error_NMTF$Avg))[1], ]
Same as NMF, mask matrices M can be specified in siNMF, and the rank parameter (J) with the smallest test error is considered the optimal rank. The following code is time-consuming and should be executed in your own environment.
out_siNMF <- expand.grid(replicate=1:3, rank=factor(1:10), value=0) count <- 1 for(i in 1:10){ masks_siNMF <- lapply(1:3, function(x){ list( kFoldMaskTensor(data_matrices[[1]], k=3)[[x]], kFoldMaskTensor(data_matrices[[2]], k=3)[[x]], kFoldMaskTensor(data_matrices[[3]], k=3)[[x]]) }) for(j in 1:3){ out_siNMF[count, 3] <- rev( siNMF(data_matrices, M = masks_siNMF[[j]], J = i)$TestRecError)[1] count <- count + 1 } }
ggplot(out_siNMF, aes(x=rank, y=value)) + geom_point() + stat_summary(fun = mean, geom = "point", shape=21, size=3, fill="blue") + stat_summary(fun = mean, geom = "line", colour = "blue", aes(group=1)) + xlab("Rank") + ylab("Test Reconstruction Error")
(out_siNMF |> group_by(rank) |> summarize(Avg = mean(value)) -> avg_test_error_siNMF) avg_test_error_siNMF[which(avg_test_error_siNMF$Avg == min(avg_test_error_siNMF$Avg))[1], ]
Same as NMF, mask matrices M can be specified in jNMF, and the rank parameter (J) with the smallest test error is considered the optimal rank. The following code is time-consuming and should be executed in your own environment.
out_jNMF <- expand.grid(replicate=1:3, rank=factor(1:10), value=0) count <- 1 for(i in 1:10){ masks_jNMF <- lapply(1:3, function(x){ list( kFoldMaskTensor(data_matrices[[1]], k=3)[[x]], kFoldMaskTensor(data_matrices[[2]], k=3)[[x]], kFoldMaskTensor(data_matrices[[3]], k=3)[[x]]) }) for(j in 1:3){ out_jNMF[count, 3] <- rev( jNMF(data_matrices, M = masks_jNMF[[j]], J = i)$TestRecError)[1] count <- count + 1 } }
ggplot(out_jNMF, aes(x=rank, y=value)) + geom_point() + stat_summary(fun = mean, geom = "point", shape=21, size=3, fill="blue") + stat_summary(fun = mean, geom = "line", colour = "blue", aes(group=1)) + xlab("Rank") + ylab("Test Reconstruction Error")
(out_jNMF |> group_by(rank) |> summarize(Avg = mean(value)) -> avg_test_error_jNMF) avg_test_error_jNMF[which(avg_test_error_jNMF$Avg == min(avg_test_error_jNMF$Avg))[1], ]
Same as NMF, mask tensor M can be specified in NTF, and the rank parameter (rank) with the smallest test error is considered the optimal rank. The following code is time-consuming and should be executed in your own environment.
out_NTF <- expand.grid(replicate=1:3, rank=factor(1:10), value=0) count <- 1 for(i in 1:10){ masks_NTF <- kFoldMaskTensor(data_tensor, k=3) for(j in 1:3){ out_NTF[count, 3] <- rev( NTF(data_tensor, M = masks_NTF[[j]], rank = i)$TestRecError)[1] count <- count + 1 } }
ggplot(out_NTF, aes(x=rank, y=value)) + geom_point() + stat_summary(fun = mean, geom = "point", shape=21, size=3, fill="blue") + stat_summary(fun = mean, geom = "line", colour = "blue", aes(group=1)) + xlab("Rank") + ylab("Test Reconstruction Error")
(group_by(out_NTF, rank) |> summarize(Avg = mean(value)) -> avg_test_error_NTF) avg_test_error_NTF[which(avg_test_error_NTF$Avg == min(avg_test_error_NTF$Avg))[1], ]
Same as NMF, mask tensor M can be specified in NTD, and the rank parameter (rank) with the smallest test error is considered the optimal rank. The following code is time-consuming and should be executed in your own environment.
out_NTD <- expand.grid(replicate=1:3, rank3=1:5, rank2=1:5, rank1=1:5, value=0) rank_NTD <- paste0(out_NTD$rank1, "-", out_NTD$rank2, "-", out_NTD$rank3) out_NTD <- cbind(out_NTD, rank=factor(rank_NTD, levels=unique(rank_NTD))) count <- 1 for(i in 1:5){ for(j in 1:5){ for(k in 1:5){ masks_NTD <- kFoldMaskTensor(data_tensor, k=3) for(k in 1:3){ out_NTD[count, 5] <- rev( NTD(data_tensor, M = masks_NTD[[k]], rank = c(i, j, k))$TestRecError)[1] count <- count + 1 } } } }
ggplot(out_NTD, aes(x=rank, y=value)) + geom_point() + stat_summary(fun = mean, geom = "point", shape=21, size=3, fill="blue") + stat_summary(fun = mean, geom = "line", colour = "blue", aes(group=1)) + xlab("Rank") + ylab("Test Reconstruction Error") + theme(axis.text.x = element_text(angle = 90, hjust = 1))
(out_NTD |> group_by(rank1, rank2, rank3) |> summarize(Avg = mean(value)) -> avg_test_error_NTD) avg_test_error_NTD[which(avg_test_error_NTD$Avg == min(avg_test_error_NTD$Avg))[1], ]
sessionInfo()
Any scripts or data that you put into this service are public.
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.