sumots: Helper functions for Sumo Analytics

Documented in get_gmm_clusters

#' Function to create clusters using time series features and model based clustering
#'
#' @param data Data to be used for clustering. Has to include columns: id, date and outcome
#' @param max_clust Maximum number of clusters
#'


get_gmm_clusters <- function(data, max_clust) {

    require(mclust)
    require(feasts)

    data_features <- data %>%
        as.data.frame() %>%
        drop_na() %>%
        as_tsibble(key = id, index = date) %>%
        features(
            outcome,
            feature_set(
                tags = c("tile", "acf", "stl", "spectral", "lumpiness", "roll", "count")
            )) %>%
        select(id, trend_strength, spikiness, linearity, curvature, acf1, n_crossing_points, longest_flat_spot, var_tiled_var,
               shift_level_index, shift_kl_index, spectral_entropy, var_tiled_var, var_tiled_mean) %>%
        column_to_rownames(var = "id") %>%
        drop_na() %>%
        scale()


    mc <- Mclust(data_features, G = max_clust)

    message("Optimal number of clusters found: ", mc$G)

    gmm_cluster_tbl <- mc$classification %>%
        as.data.frame() %>%
        rownames_to_column("id") %>%
        set_names("id", "cluster") %>%
        as_tibble()

    gmm_cluster_prob_tbl <- mc$z %>%
        as.data.frame() %>%
        rownames_to_column("id") %>%
        as_tibble() %>%
        pivot_longer(-id) %>%
        group_by(id) %>%
        filter(value == max(value)) %>%
        ungroup() %>%
        select(-name)



    return_list <- list()
    return_list$cluster <- gmm_cluster_tbl
    return_list$prob    <- gmm_cluster_prob_tbl

    return(return_list)

}