R/framework.R

Defines functions opticskxi_pipeline get_best_kxi fetch_dimred .fetch_dimred

Documented in get_best_kxi opticskxi_pipeline

#' OPTICS k-Xi models comparison pipeline 
#'
#' Computes OPTICS k-Xi models based on a parameter grid, binds results in a
#' data frame, and computes distance based metrics for each model.
#'
#' @param m_data    Data matrix
#' @param df_params Parameter grid for the OPTICS k-Xi function call and
#'                  optional dimension reduction.
#'                  Required columns: n_xi, pts, dist.
#'                  Optonal columns: dim_red, n_dim_red.
#' @param n_cores   Number of cores
#' @return Input parameter data frame with with results binded in columns
#'         optics, clusters and metrics.
#' @seealso \link{get_best_kxi}, \link{ggplot_kxi_metrics},
#' \link{gtable_kxi_profiles}
#' @examples
#' data('hla')
#' m_hla <- hla[-c(1:2)] %>% scale
#' df_params_hla <- expand.grid(n_xi = 3:5, pts = c(20, 30),
#'   dist = c('manhattan', 'euclidean'))
#' df_kxi_hla <- opticskxi_pipeline(m_hla, df_params_hla)
#' ggplot_kxi_metrics(df_kxi_hla, n = 8)
#' gtable_kxi_profiles(df_kxi_hla) %>% plot
#'
#' best_kxi_hla <- get_best_kxi(df_kxi_hla, rank = 2)
#' clusters_hla <- best_kxi_hla$clusters
#' fortify_pca(m_hla, sup_vars = data.frame(Clusters = clusters_hla)) %>%
#'   ggpairs('Clusters', ellipses = TRUE, variables = TRUE)
#' @export
opticskxi_pipeline <- function(m_data,
  df_params = expand.grid(n_xi = 1:10, pts = c(20, 30, 40),
    dist = c('euclidean', 'abscorrelation'),
    dim_red = c('identity', 'PCA', 'ICA'), n_dimred_comp = c(5, 10, 20)),
  n_cores = 1) {

  if (!all(c('n_xi', 'pts', 'dist') %in% names(df_params))) { 
    stop('Missing required columns in parameter grid.')
  }

  # add identity if no dim_red column
  if (!'dim_red' %in% names(df_params)) {
    df_params %<>% cbind(dim_red = 'identity')
  }
  # get dimred and distance
  uniq_names <- c('dim_red', 'n_dimred_comp')
  df_params <- fetch_dimred(m_data, df_params, uniq_names, n_cores)
  uniq_names %<>% c('dist')
  df_params %<>% derive_column(uniq_names, c('m_dimred', 'dist'), amap::Dist,
    'm_dist', c('x', 'method'), mc.cores = n_cores)

  # optics
  uniq_names %<>% c('pts')
  df_params %<>% derive_column(uniq_names, c('m_dist', 'pts'), dbscan::optics,
    'optics', c('x', 'minPts'), mc.cores = n_cores)
  df_params <- fetch_opticskxi(df_params, n_cores)

  # get metrics
  m_dist <- dist(m_data) %>% as.matrix
  df_params$metrics <- parallel::mclapply(df_params$clusters,
      get_kxi_metrics, m_dist, mc.cores = n_cores)

  df_params
}

#' Get best k-Xi model
#'
#' Select k-Xi clustering model based on a metric and a rank
#'
#' @param df_kxi Data frame returned by opticsxi_pipeline
#' @param metric Metric to choose best model
#' @param rank   Rank(s) of model to choose, ordered by decreasing metric
#' @return df_kxi row with specified metric and rank,
#'         simplified to a list if only one rank selected
#' @seealso \link{opticskxi_pipeline}
#' @export
get_best_kxi <- function(df_kxi, metric = 'avg.silwidth', rank = 1) {
  if (all(unlist(df_kxi$metrics) == 0)) stop('All metrics equal 0')
  metric_value <- as.matrix(as.data.frame(df_kxi$metrics))[metric, ]
  if (all(metric_value == 0)) stop(paste('All', metric, 'values equal 0'))

  df_kxi %<>% `[`(order(metric_value, decreasing = TRUE)[rank], )
  if (length(rank) == 1) {
    df_kxi %<>% as.list
    df_kxi[c('optics', 'clusters', 'metrics')] %<>% lapply(`[[`, 1)
  }

  df_kxi 
}

fetch_dimred <- function(m_data, df_params, uniq_names, n_cores) {
  # remove duplicates, number of components if no dim red, and rownames
  df_params[df_params$dim_red == 'identity', 'n_dimred_comp'] <- NA
  df_params %<>% unique %>% `rownames<-`(NULL)

  # fetch dimension reductions
  df_params$m_data <- list(m_data)
  df_params <- derive_column(df_params, uniq_names,
    c('m_data', 'dim_red', 'n_dimred_comp'), .fetch_dimred, 'm_dimred',
    mc.cores = n_cores)
}

.fetch_dimred <- function(m_data, dim_red, n_dimred_comp) {
  n_dimred_comp <- as.numeric(n_dimred_comp)
  switch(dim_red, identity = m_data,
    PCA = stats::prcomp(m_data)$x[, seq_len(n_dimred_comp)],
    ICA = {
      set.seed(0)
      fastICA::fastICA(m_data, n_dimred_comp)$S %>%
        `colnames<-`(paste0('IC', seq_len(ncol(.))))
    })
}

fetch_opticskxi <- function(df_params, n_cores) {
  # call opticsxi on each row
  df_opt <- data.frame(t(df_params[c('optics', 'n_xi', 'pts')]),
    stringsAsFactors = FALSE)
  df_params$clusters <- parallel::mclapply(df_opt,
    function(params) do.call(opticskxi, params), mc.cores = n_cores)

  df_params
}

# apply fun function to each unique row of dataframe df's columns df_names,
# store in column col_name, and merge to full dataframe df
# ... is passed to parallel::mclapply which calls function fun on unique rows
derive_column <- function(df_prm, uniq_names, df_names, fun, col_name,
  param_names = df_names, ...) {

  uniq_idxs <- which(!duplicated(df_prm[uniq_names]))
  uniq_params <- df_prm[uniq_idxs, df_names]
  uniq_df <- df_prm[uniq_idxs, uniq_names]
  uniq_df[[col_name]] <- parallel::mclapply(
    data.frame(t(uniq_params), stringsAsFactors = FALSE), .derive_column, fun,
    param_names, ...)

  # drop the data object, the first parameter, from the dataframe and merge
  df_prm %<>% `[`(-match(df_names[1], names(.)))
  merge(uniq_df, df_prm, by = uniq_names)
}

.derive_column <- function(params, fun, param_names) {
  do.call(fun, as.list(stats::setNames(params, param_names)))
}


get_kxi_metrics <- function(clusters, m_dist,
  metrics = c('avg.silwidth', 'bw.ratio', 'ch', 'pearsongamma', 'dunn',
    'dunn2', 'entropy', 'widestgap', 'sindex')) {
  if (length(table(clusters)) < 2) return(rep(0, length(metrics)))
  ids <- which(!is.na(clusters))

  m_dist[ids, ids] %>% fpc::cluster.stats(clusters[ids]) %>%
    `[[<-`('bw.ratio', 1 / .$wb.ratio) %>% `[`(metrics) %>% unlist
}

Try the opticskxi package in your browser

Any scripts or data that you put into this service are public.

opticskxi documentation built on July 19, 2019, 1:02 a.m.