R/global_similarity.R

Defines functions global_similarity

Documented in global_similarity

#' Heuristic method of comparing if two datasets are similar
#'
#' @param egalitarian egalitarian object
#' @param n number of runs of kmeans algorithm
#'
#' @return data frame
#'
#' @importFrom stats as.formula kmeans model.matrix
#'
#' @export
#'

global_similarity <- function(egalitarian, n) {
  target_name <- dataset_ <- NULL
  kmeans_results <- lapply(1:n,
                           function(x)
                             kmeans(model.matrix(as.formula(paste0(target_name, "~.")),
                                                 data = select(egalitarian$data, -dataset_)),
                                    centers = 2))
  kmeans_compare <- lapply(kmeans_results,
                           function(x)
                             data.frame(predicted = x$cluster,
                                        true = egalitarian$data$dataset_))
  results <- lapply(kmeans_compare, function(x) {
    kmeans_acc <- prop.table(table(x), margin = 2)
    if(kmeans_acc[1, 1] >= kmeans_acc[2, 1]) {
      training_accuracy <- kmeans_acc[1, 1]
      validation_accuracy <- kmeans_acc[2, 2]
    } else {
      training_accuracy <- kmeans_acc[2, 1]
      validation_accuracy <- kmeans_acc[1, 2]
    }
    data.frame(training_detected = training_accuracy,
               vaidation_detected = validation_accuracy)
  })
  results <- dplyr::bind_rows(results)
  dplyr::summarise_all(results, mean)
}
mstaniak/egalitaRian documentation built on Aug. 26, 2019, 11:11 p.m.