R/dist.R

Defines functions get_dist fviz_dist

Documented in fviz_dist get_dist

#' @include utilities.R
NULL
#' Enhanced Distance Matrix Computation and Visualization
#' @description Clustering methods classify data samples into groups of similar 
#'   objects. This process requires some methods for measuring the distance or 
#'   the (dis)similarity between the observations. Read more: 
#'   \href{http://www.sthda.com/english/wiki/clarifying-distance-measures-unsupervised-machine-learning}{STHDA
#'    website - clarifying distance measures.}. \itemize{ \item get_dist(): 
#'   Computes a distance matrix between the rows of a data matrix. Compared to
#'   the standard \code{\link[stats]{dist}}() function, it supports 
#'   correlation-based distance measures including "pearson", "kendall" and 
#'   "spearman" methods. \item fviz_dist(): Visualizes a distance matrix }
#' @param x a numeric matrix or a data frame.
#' @param method the distance measure to be used. This must be one of 
#'   "euclidean", "maximum", "manhattan", "canberra", "binary", "minkowski", 
#'   "pearson", "spearman" or "kendall".
#' @param stand logical value; default is FALSE. If TRUE, then the data will be 
#'   standardized using the function scale(). Measurements are standardized for 
#'   each variable (column), by subtracting the variable's mean value and 
#'   dividing by the variable's standard deviation.
#' @param ... other arguments to be passed to the function dist() when using get_dist().
#' @return \itemize{ \item get_dist(): returns an object of class "dist". \item 
#'   fviz_dist(): returns a ggplot2 }
#' @seealso \code{\link[stats]{dist}}
#' @author Alboukadel Kassambara \email{alboukadel.kassambara@@gmail.com}
#' @examples
#' data(USArrests)
#' res.dist <- get_dist(USArrests, stand = TRUE, method = "pearson")
#' 
#' fviz_dist(res.dist, 
#'    gradient = list(low = "#00AFBB", mid = "white", high = "#FC4E07"))
#' @name dist
#' @rdname dist
#' @export
get_dist <- function(x, method = "euclidean",  stand = FALSE, ...){
  
  if(stand) x <- scale(x)
  if(method %in% c("pearson", "spearman", "kendall")){
    res.cor <- stats::cor(t(x),  method = method)
    res.dist <- stats::as.dist(1 - res.cor, ...)
  }
  else res.dist <- stats::dist(x, method = method, ...)
  
  res.dist
}


#' @param dist.obj an object of class "dist" as generated by the function dist() or get_dist().
#' @param order logical value. if TRUE the ordered dissimilarity image (ODI) is shown.
#' @param show_labels logical value. If TRUE, the labels are displayed.
#' @param lab_size the size of labels.
#' @param gradient a list containing three elements specifying the colors for low, mid and high values in 
#'  the ordered dissimilarity image. The element "mid" can take the value of NULL.
#' @rdname dist
#' @export
fviz_dist<- function(dist.obj, order = TRUE, show_labels = TRUE, lab_size = NULL,        
                     gradient = list(low = "red", mid = "white", high = "blue")
                     )
{
  
  if(!inherits(dist.obj, "dist"))
    stop("An object of class dist is required.")
  
  if(order){
  res.hc <- stats::hclust(dist.obj, method = "ward.D2")
  dist.obj <- as.matrix(dist.obj)[res.hc$order, res.hc$order]
  }
  else dist.obj <- as.matrix(dist.obj)
  
  rownames(dist.obj ) <- colnames(dist.obj ) <- paste0(rownames(dist.obj), "-")
  
  d <- reshape2::melt(dist.obj)
  p <- ggplot(d, aes_string(x = "Var1", y = "Var2"))+ 
    ggplot2::geom_tile(aes_string(fill="value")) 
  if(is.null(gradient$mid)) p <- p + ggplot2::scale_fill_gradient(low=gradient$low, high=gradient$high)
  else p <- p + ggplot2::scale_fill_gradient2(midpoint=mean(dist.obj), low=gradient$low, 
                                                    mid=gradient$mid, high=gradient$high, space = "Lab")
  if(show_labels) p <- p + theme(axis.title.x = element_blank(), 
                                       axis.title.y = element_blank(),
                                       axis.text.x = element_text(angle = 45, hjust = 1, size = lab_size),
                                       axis.text.y = element_text(size = lab_size))
  else p <- p + 
    theme(axis.text = element_blank(), axis.ticks = element_blank(),
          axis.title.x = element_blank(),
          axis.title.y = element_blank())
  
  return(p)
}
kassambara/factoextra documentation built on April 3, 2020, 8:28 a.m.