R/mdr_dist.R

#' Main function collecting the functions of MDRDist
#'
#' @param data A data.frame with integer categories, that contains the
#'    observations which should be clustered. Missing data are to be encoded
#'    as integer -9, all other categories have to be positive.
#'    If the data contain a column "status", this column will be discarded.
#' @param working_dir An existing, empty directory in which to drop intermediate
#'    stages
#' @param n_rounds The number of repeated MBMDR-calls
#' @param use_existing_models check, whether the working directory is empty and
#'    throw an error, if not -- unless former models are allowed by this flag.
#' @param evaluate_models an option that stops computation after calling MB-MDR.
#'    This function is intended to be used to calculate models once and run
#'    several with several similarity weights afterwards.
#' @param similarity_weights how should different interaction-class-labels be
#'    combined? This is a list of lists as generated by function similarity_weights
#' @param comparison_distribution which distribution should be assumed for
#'    generated data? This argument should be passed as function out of the set
#'    ( "compdist_estimated_uncorellated", "compdist_uniform")
#' @param model_assumption how should different interaction classes be
#'    interpreted with respect to similarity? This argument should be passed as
#'    function out of the set ( "all_same", "all_different")
#' @param further_options a structure to manipulate further options.
#'    It is passed as a list of two-element-lists, where the first element describes
#'    one of the parameters in function "set_options()" and the second element
#'    the new value.
#'
#' @return The function returns
#'    dist: a distance matrix (in dist-structure) and
#'    importance: a table containing the names and frequency of most appearing
#'       interactions
#' @export
#'
#' @import parallelMap
#' @import reshape2
#' @import checkmate
#'
mdr_dist <- function(data,
                     working_dir,
                     n_rounds,
                     use_existing_models = FALSE,
                     evaluate_models = TRUE,
                     similarity_weights = MDRDist_weights("first_test"),
                     comparison_distribution = compdist_estimated_uncorellated,
                     model_assumption = all_different,
                     further_options = NULL){

  checkmate::assertDataFrame(x = data, types = "numeric", any.missing = FALSE)
  checkmate::assertDirectory(x = working_dir)
  checkmate::assertNumber(x = n_rounds, lower = 0, upper = 10000)
  checkmate::assertLogical(x = use_existing_models)
  checkmate::assertLogical(x = evaluate_models)
  checkmate::assertClass(x = similarity_weights, classes = "similarity_weights")
  checkmate::assertFunction(x = model_assumption)
  checkmate::assertFunction(x = comparison_distribution)
  checkmate::assertList(x = further_options, null.ok = TRUE)

  working_dir <- path.expand(working_dir)

  if(length(x = dir(working_dir)) > 0){
    checkmate::assert(use_existing_models,
                      .var.name = "empty working directory")
    warning("The working directory you choose was not empty. The programm
            cannot guarantee, that the results are meaningful in any way!")
  }

  ### end of checks
  if(!is.null(further_options)){
    names(further_options) <- sprintf("mdrdist_%s", names(further_options))
  }

  set_options()
  options(further_options)

  if(n_rounds > 0){
    nothing_to_save <- parallelMap::parallelLapply(x = 1:n_rounds,
                                                   fun = repeat_mbmdr,
                                                   data = data,
                                                   working_dir = working_dir,
                                                   comparison_distribution = comparison_distribution,
                                                   further_options = further_options)
  }

  if(evaluate_models){
    # read in models
    calculated_models <- read_mbmdr(list.dirs(path = working_dir,
                                              full.names = TRUE,
                                              recursive = FALSE))
    if(nrow(calculated_models) < 1) {
      # catch the case, that no model is significant
      dist <- matrix(1, nrow = nrow(data), ncol = nrow(data),
                     dimnames = list(rownames(data), rownames(data)))
      calculated_models_reduced <- list()
    } else {
      # else continue with evaluation

      calculated_models_reduced <- reduce_calculated_models(calculated_models = calculated_models)

      calculated_models_reduced$models_int <- lapply(X = calculated_models_reduced$models,
                                                     FUN = model_assumption)

      classified_data <- classify_data(mbmdr_return = calculated_models_reduced,
                                       data = data)

      similarity <- calculate_similarity(classified_data = classified_data,
                                         similarity_weights = similarity_weights,
                                         interaction_importance_weights = calculated_models_reduced$count)
      if(max(as.vector(x=similarity)) > 1){
        warning(sprintf(fmt = "Similarity was not scaled properly to 1! The maximum was actually %f.",
                        max(as.vector(x=similarity))))
      }

      dist <- (abs(1 - similarity) ^ getOption("mdrdist_dissimilarity_exponent"))
    }

    res <- list(dist = dist,
                importance = calculated_models_reduced,
                interaction = calculated_models)
    class(res) <- "MDRDist-distance"
    return(res)
  } else{
    print(sprintf("The models are saved in %s.",
                  working_dir))
    return(working_dir)
  }
}

repeat_mbmdr <- function(x, data, working_dir, comparison_distribution, further_options){

  set_options()
  options(further_options)

  Data2 <- build_supervised_sample(data = data,
                                   fraction_of_real_data = getOption("mdrdist_fraction_of_real_data"),
                                   fraction_of_artificial_data = getOption("mdrdist_fraction_of_artificial_data"),
                                   do_bootstrapping = getOption("mdrdist_do_bootstrapping"),
                                   method = comparison_distribution)

  nothing_to_save <- call_mbmdr(formula = status ~ .,
                                data = Data2,
                                working_dir = working_dir)
  return(TRUE)
}
imbs-hl/MDRDist documentation built on May 18, 2019, 4:45 a.m.