# R/k_means.R

library(tidyverse)
library(deldir)
library(cluster)
library(Rtsne)
library(magrittr)
library(diptest)
library(factoextra)

k_means_description <- "\"k-Means\" is a clustering algorithm which forms groups based upon a data point's distance to the average of each group, placing a data point into the group whose average it is closest to; the aim is to find clusters whose members are similar to one another. The algorithm repeats, updating the group averages and therefore the group memberships, until a stopping condition is met. Because the first step of the algorithm chooses random data points as the initial group averages, we can expect different results each time the algorithm is run unless we provide a starting \"seed\" which fixes the same starting point. The quality of the final clusters depends upon how well the properties of the data match the assumptions of the algorithm; if these assumptions do not hold then another clustering algorithm may be a better choice, or we should look at ways to transform our data so that the assumptions become valid."

k_means_detailed <- "\"k-Means\" is an iterative clustering algorithm which assigns a data point to the cluster with the minimum Euclidean distance between the point and the cluster centre; iterations continue, updating the centres, until the assignments do not change from iteration n-1 to iteration n. Whilst the algorithm is non-parametric and so makes no distributional assumptions, its use of Euclidean distance means it performs best in particular circumstances: continuous data, roughly spherical clusters, similar variance within each cluster (homoscedasticity), and roughly equal cluster membership probabilities. N.B. Because the algorithm is initialised using randomly selected data points, the results will vary unless a predetermined seed is set."
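
# Illustrative sketch (not part of this package's API): base stats::kmeans() with and
# without a fixed seed, showing why a predetermined seed is needed for reproducible
# assignments. The built-in iris data is used purely as an example.
# dat <- scale(iris[, 1:4])
# set.seed(123); fit1 <- kmeans(dat, centers = 3)
# set.seed(123); fit2 <- kmeans(dat, centers = 3)
# identical(fit1$cluster, fit2$cluster)  # TRUE - same seed, same assignments
# fit3 <- kmeans(dat, centers = 3)       # no seed: initial centres differ, so labels may differ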

optimal_k_description <- "When clustering we often do not know, prior to starting the analysis, how many groups we should create, and we want to avoid creating more groups than are necessary to describe the data. We can sometimes clearly identify the number of groups by plotting the data; when this is unclear or impractical we instead look for ways to quantify the suitability of a given number of groups. These approaches consider how well a solution explains the patterns within the data and how much alternative solutions improve that description, whilst favouring the lowest possible number of groups."

optimal_k_detailed <- "As clustering is an unsupervised machine learning problem, determining the optimal number of clusters does not necessarily have a single solution. As we add more clusters the percentage of variability explained by the solution will increase, but we begin to overfit the solution to the data. The number of clusters should be chosen so that adding another cluster does not substantially improve the modelling of the data, and we can use metrics that quantify the model fit to help determine a suitable number of clusters."

elbow_method_description <- "One way in which we can do this is the \"elbow\" method, which plots how well each solution describes the data and looks for the value of k beyond which increasing the number of clusters does not provide a noticeable improvement. With a small number of clusters each additional cluster provides a large improvement, and this improvement shrinks as the number of clusters grows. Within the plot we look for the value where there is an abrupt change or flattening of the curve, forming an \"elbow\". With some data sets this abrupt change does not occur and the plot appears as a smooth line; in these cases a different approach to determining the optimum number of clusters, such as the \"silhouette\" method, should be used."

elbow_method_detailed <- "The \"elbow\" method aids in determining the optimum number of clusters by plotting a metric of the variability within the data that is explained by each solution and looking for the point where this only increases incrementally for a further increase in k. At low values of k each increase tends to produce a large gain in explained variability, while at larger values of k the gain diminishes, typically producing a sharp change, or elbow, in the plot; the value of k at this \"elbow\" is the one we should use. If the data is not particularly clustered to begin with, the \"elbow\" may not be clear and the plot will appear as a smooth curve. In this case alternative methods for identifying the optimal value of k, such as the \"silhouette\" method, should be investigated."
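
# Illustrative sketch of the elbow method, assuming the built-in iris data; it mirrors the
# approach used inside k_means() below: total within-cluster sum of squares over a range of k.
# wss <- sapply(2:10, function(k){ set.seed(1); kmeans(scale(iris[, 1:4]), centers = k)$tot.withinss })
# plot(2:10, wss, type = "b", xlab = "k", ylab = "Total within-cluster SS")  # look for the bend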

silhouette_method_description <- "One way in which we can do this is to use the average silhouette value across a range of clustering solutions. The silhouette value is calculated for each data point and compares how similar the point is to the cluster it is a member of versus the next nearest cluster, giving a value between -1 and 1. A silhouette value close to 1 indicates that the data point is similar to the other data points within its cluster, suggesting a suitable assignment; conversely a value close to -1 indicates that the data point is more similar to the data points in another cluster, suggesting a poor assignment to its current cluster. Taking the average silhouette value over all data points gives an indication of the overall suitability of a clustering solution, with higher values indicating a better description of the data. We can therefore identify the optimum number of clusters by calculating the average silhouette value for a range of possible solutions and choosing the solution with the highest average silhouette value."

silhouette_method_detailed <- "The silhouette coefficient compares, for each data point, its similarity to the cluster it is a member of with its similarity to the next nearest cluster (cohesion and separation respectively). Cohesion is measured as the average distance from the data point to the other points in the same cluster, and separation as the average distance from the data point to the points in the nearest neighbouring cluster. The silhouette coefficient lies in the range [-1,1]: values closer to 1 indicate that the data point sits well within its cluster (the within-cluster distance is small relative to the distance to the neighbouring cluster), while values closer to -1 indicate that the data point is more similar to the nearest neighbouring cluster and is therefore likely misassigned to its current cluster. Averaging the silhouette coefficient within a cluster measures how tightly grouped that cluster is, and averaging across all data points provides a metric for the suitability of the overall clustering solution. We can therefore determine the optimal value of k by choosing the value that gives the highest average silhouette coefficient across all data points."
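
# Illustrative sketch of the average-silhouette method, assuming the built-in iris data and
# cluster::silhouette(); column 3 of the silhouette object holds the per-point silhouette widths.
# dat <- scale(iris[, 1:4])
# avg_sil <- sapply(2:10, function(k){
#   set.seed(1); cl <- kmeans(dat, centers = k)$cluster
#   mean(cluster::silhouette(cl, dist(dat))[, 3])
# })
# (2:10)[which.max(avg_sil)]  # k with the highest average silhouette width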


# vat_description
#
# vat_detailed
#
# hartigans_dip_description
#
# hartigans_dip_detailed
#
# hopkins_description
#
# hopkins_detailed

scale_continuous <- function(data){
  column_types <- classify_columns(data)
  continuous <- column_types %>%
    filter(grepl('continuous',classification))%>%
    .[['column']]
  data <- data %>%
    mutate_at(continuous,function(x)as.numeric(scale(x)))%>% # drop the matrix attributes scale() adds
    rename_at(continuous,function(x)paste0(x,'.Scaled'))
  return(data)
}

k_means <- function(x,k=NULL,seed=sample(1:1e6,1),scale=TRUE,optimal_k_method='silhouette',alpha=0.05){

  # Check for continuous data
  non_numeric_check <- any(!unlist(lapply(x,is.numeric)))
  if(non_numeric_check){
    stop('Data must be numeric')
  }

  potential_discrete <- names(which(
    unlist(lapply(x,function(y)length(unique(y))))/nrow(x)<0.1|
    unlist(lapply(x,function(y)(all(y==round(y)))))
    ))

  if(length(potential_discrete)!=0){
    warning('k means should not be used for discrete/categorical data.')
    if(scale){
      warning('Discrete/categorical variables will not be scaled.')
    }
  }

  naCheck <- x %>%
    filter_all(any_vars(is.na(.)|is.nan(.)|is.infinite(.)))
  if(nrow(naCheck)>0){
    stop('Remove NA/NaN/Inf and missing values prior to clustering')
  }

  clusterability <- assess_clusterability(x,seed=seed,scale=scale,tests_caption = TRUE,alpha = alpha)

  init <- x
  # Scale the data if set as an option to standardise variances.
  if(scale){
    x <- as.data.frame(scale_continuous(x))
    }

  # If k is null then find k using elbow method
  if(is.null(k)){
    if(optimal_k_method=='elbow'){
      elbow <- tibble('WSS'=numeric(),'k'=numeric())

      for(i in 2:max(6,ceiling(sqrt(nrow(x))))){
        set.seed(seed);temp<-kmeans(x,i)$tot.withinss
        elbow <- bind_rows(elbow,tibble('WSS'=temp,'k'=i))
      }

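      # Approximate the curvature of the WSS curve with the central second difference
      # WSS[k+1] + WSS[k-1] - 2*WSS[k]; the k with the largest value marks the sharpest
      # bend, i.e. the "elbow".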
      elbow <- elbow %>%
        mutate('SDeriv'=c(WSS[-1],NA)+c(NA,WSS[-max(row_number())])-(2*WSS))

      optimal_k <- elbow %>%
        filter(!is.na(SDeriv))%>%
        filter(SDeriv==max(SDeriv))

      optimal_k_plot <- ggplot(elbow)+
        geom_line(aes(x=k,y=WSS,group=1))+
        geom_point(aes(x=k,y=WSS))+
        geom_vline(aes(xintercept=k),data=optimal_k,lty='dashed')+
        theme_bw()+
        ggtitle('Elbow Method')

      k <- optimal_k$k

    }else{
      sil <- tibble('average silhouette'=numeric(),'k'=numeric())

      for(i in 2:max(6,ceiling(sqrt(nrow(x))))){
        set.seed(seed);temp<-kmeans(x,i)
        silVal <- mean(silhouette(temp$cluster,dist(x))[,3])
        sil <- bind_rows(sil,tibble('average silhouette'=silVal,'k'=i))
      }

      optimal_k <- sil %>%
        filter(`average silhouette`==max(`average silhouette`))

      optimal_k_plot <- ggplot(sil)+
        geom_line(aes(x=k,y=`average silhouette`,group=1))+
        geom_point(aes(x=k,y=`average silhouette`))+
        geom_vline(aes(xintercept=k),data=optimal_k,lty='dashed')+
        theme_bw()+
        labs(y='average silhouette')+
        ggtitle('Silhouette Method')

      k <- optimal_k$k
    }
  }else{
    optimal_k_method <- NULL
    optimal_k_plot <- NULL
  }

  # Set seed, and perform clustering
  set.seed(seed);clustering <- kmeans(x,k)
  out <- clustering
  attr(out,'k_selection') <- optimal_k_plot

  clusters <- clustering$cluster

  # Renaming because letters are potentially more readily interpretable.
  if(k<=26){
    x <- x %>%
      mutate('Cluster'=LETTERS[clusters])
  }else{
    # If k>26 create AA...ZA prefixes - There should never be more than 26*27 clusters
    clusterMatch <- data.frame('ClusterInit'=sort(unique(clusters)))%>%
      mutate('Prefix'=ifelse(ClusterInit>26,LETTERS[floor((row_number()-1)/26)],''),
             'Suffix'=ifelse(ClusterInit>26,LETTERS[(row_number()-1)%%26+1],LETTERS[ClusterInit]))%>%
      mutate('Cluster'=sprintf('%s%s',Prefix,Suffix)) %>%
      select(ClusterInit,Cluster)

    clusterLevs <- clusterMatch$Cluster

    x <- x %>%
      mutate('ClusterInit'=clusters)%>%
      left_join(clusterMatch,by='ClusterInit')%>%
      mutate('Cluster'=factor(Cluster,levels=clusterLevs))%>%
      select(-ClusterInit)
  }


  # Create outputs
  if(scale){
    init$Cluster <- x$Cluster
    # attr(out,'analysed_data') <- x
    attr(out,'data') <- init
  }else{
    attr(out,'data') <- x
  }

  if(ncol(clustering$centers)==2){
    # If there are only two centers create a plot with Voronoi boundaries.
    centers <- x %>% group_by(Cluster)%>%summarise_all(mean)%>%select(colnames(x))%>%as.data.frame
    voronoi <- deldir::deldir(centers[,1],centers[,2],
                      rw=c(
                        range(x[,1])+(abs(range(x[,1])*0.3)*c(-1,1)),
                        range(x[,2])+(abs(range(x[,2])*0.3)*c(-1,1))
                      )
    )

    voronoi_x <-c(
      min(c(voronoi$dirsgs$x1,voronoi$dirsgs$x2)),
      max(c(voronoi$dirsgs$x1,voronoi$dirsgs$x2))
    )

    voronoi_y <-c(
      min(c(voronoi$dirsgs$y1,voronoi$dirsgs$y2)),
      max(c(voronoi$dirsgs$y1,voronoi$dirsgs$y2))
    )

    data_x <- c(min(x[[colnames(x)[1]]]),max(x[[colnames(x)[1]]]))%>%
      (function(x){
        (1+((sign(x)*c(-1,1))*0.05))*x
      })
    data_y <- c(min(x[[colnames(x)[2]]]),max(x[[colnames(x)[2]]]))%>%
      (function(x){
        (1+((sign(x)*c(-1,1))*0.05))*x
      })

    x_limits=c(min(voronoi_x[1],data_x[1]),max(voronoi_x[2],data_x[2]))
    y_limits=c(min(voronoi_y[1],data_y[1]),max(voronoi_y[2],data_y[2]))

    raw_plot <- ggplot(x)+
      geom_point(aes_string(x=colnames(x)[1],y=colnames(x)[2]))+
      theme_bw()+
      ggtitle('Cluster Visualisation')+
      geom_segment(
        aes(x = x1, y = y1, xend = x2, yend = y2),
        data = voronoi$dirsgs,colour=NA
      )+
      scale_x_continuous(expand=c(0,0.0),limits=x_limits)+
      scale_y_continuous(expand=c(0,0.0),limits=y_limits)

    plot <- ggplot(x)+
      geom_text(data=centers,
                aes_string(x=colnames(x)[1],y=colnames(x)[2],colour='Cluster'),label='x',
                alpha=0.5,size=10,show.legend=FALSE)+
      geom_point(aes_string(x=colnames(x)[1],y=colnames(x)[2],colour='Cluster'))+
      geom_segment(
        aes(x = x1, y = y1, xend = x2, yend = y2),
        data = voronoi$dirsgs
      )+
      theme_bw()+
      ggtitle('Data')+
      scale_x_continuous(expand=c(0,0.0),limits=x_limits)+
      scale_y_continuous(expand=c(0,0.0),limits=y_limits)

    attr(out,'raw_plot') <- raw_plot
    attr(out,'plot') <- plot
  }else if(ncol(clustering$centers)>2){

    # Otherwise use t-SNE to reduce dimensionality to create the visualisation
    perplexity_use <- ifelse(nrow(x)>ncol(x),
      floor(nrow(x)/(ncol(x))),floor(nrow(x)/4))
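    # N.B. Rtsne() errors when the perplexity is too large for the sample size (roughly
    # 3 * perplexity must not exceed nrow(x) - 1); the heuristic above is assumed to stay
    # within that bound for typical inputs, but may need lowering for very wide data.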

    set.seed(seed);tsne <- Rtsne(x[,-ncol(x)],perplexity=perplexity_use,dims=2,max_iter=500,check_duplicates=FALSE)
    plotting <- tsne$Y %>%
      as_tibble %>%
      set_colnames(c('X','Y'))%>%
      mutate('Cluster'=x$Cluster)

    raw_plot <- ggplot(plotting)+
      geom_point(aes_string(x='X',y='Y'))+
      scale_x_continuous(expand=c(0.1,0))+
      scale_y_continuous(expand=c(0.1,0))+
      theme_bw()+
      ggtitle('Data',subtitle='Dimensionally Reduced')

    plot <- ggplot(plotting)+
      geom_point(aes_string(x='X',y='Y',colour='Cluster'))+
      scale_x_continuous(expand=c(0.1,0))+
      scale_y_continuous(expand=c(0.1,0))+
      theme_bw()+
      ggtitle('Cluster Visualisation',subtitle='Dimensionally Reduced')

    attr(out,'raw_plot') <- raw_plot
    attr(out,'plot') <- plot
  }else{

    raw_plot <- ggplot(x%>%mutate('AllData'=' '))+
      geom_violin(aes_string(x='AllData',y=colnames(x)[1]))+
      theme_bw()+
      ggtitle('Cluster Visualisation')+
      xlab('All Data')

      plot <- ggplot(x)+
        geom_violin(aes_string(x='Cluster',y=colnames(x)[1],fill='Cluster'))+
        theme_bw()+
        ggtitle('Cluster Visualisation')
    #
    # }else{
    #   plot <- ggplot(x)+
    #     geom_boxplot(aes_string(y=colnames(x)[1],fill='Cluster'))+
    #     theme_bw()+
    #     ggtitle('Cluster Visualisation')
    #
    # }

      attr(out,'raw_plot') <- raw_plot
      attr(out,'plot') <- plot
  }

  attr(out,'clusterability') <- clusterability
  attr(out,'args') <- list('function'='k_means',
                              'x'=init,
                              'k'=k,
                              'seed'=seed,
                              'scale'=scale,
                              'optimal_k_method'=optimal_k_method)
  attr(out,'variable_plots') <- variable_cluster_plots(x,'Cluster')
  return(out)
}


assess_clusterability <- function(data,seed=sample(1:1000,1),scale=TRUE,n=NULL,tests_caption=FALSE,alpha=0.05){
  if(is.null(n)){
    n <- max(1,floor(nrow(data)/4))
  }
  if(scale){
    data <- scale_continuous(data)
  }
  # http://www.mayaackerman.info/pub/clusterability2017.pdf
  # "The Beta quantile [is] defined as the value such that, assuming the data was generated
  # without clusters, the chance of concluding that the data is clustered, i.e. P(H < q_alpha(n, n)),
  # is 100*alpha%. We use a one-sided test because if the data is more spatially random than
  # expected by chance, it would still be unclusterable."
  # "Note that the Hopkins statistic approaches a Gaussian distribution for large samples
  # (e.g. n > 50). In this case, one could instead use the threshold 0.5 - z_(1-alpha)/(2*sqrt(2n+1)),
  # where n is the number of points sampled."
  # 0.5-qnorm(p = 0.95,0,1)/(2*sqrt(50+1))
  # qbeta(0.05,50,50) - Do we want to do this?
  tendency <- get_clust_tendency(data,n,seed=seed,gradient = list(low='blue',mid='white',high='red'))
  out <- tendency$plot+
    theme_bw()+
    labs(fill='Dissimilarity',title='Ordered Dissimilarity Image',subtitle='Image showing the dissimilarity between data points')+
    theme(axis.text=element_blank(),axis.title = element_blank())+
    scale_x_discrete(expand=c(0,0))+
    scale_y_discrete(expand=c(0,0))

  if(tests_caption){
    out <- out + labs(caption=sprintf("Hopkins' Statistic: %0.3f",
                                     1-tendency$hopkins_stat))
  }

  attr(out,'metrics') <- list('hopkins'=1-tendency$hopkins_stat)
  return(out)
}

variable_cluster_plots <- function(data,cluster,continuous_plot='box',non_continuous_plot='heatmap'){
  if(length(cluster)==1&&is.character(cluster)){
    if(cluster!='Cluster'){
      data <- data %>%
        rename_at(cluster,function(x)'Cluster')
    }

  }else{
    data$Cluster <- cluster
  }
  column_types <- classify_columns(data%>%select(-Cluster))
  noncontinuous <- column_types %>% filter(!grepl('continuous',classification)) %>% .[['column']]
  continuous <- column_types %>% filter(grepl('continuous',classification)) %>% .[['column']]

  cont_plots <- lapply(continuous,function(x,data,continuous_plot='density'){
    plotting <- data %>% select(x,'Cluster')
    if(continuous_plot=='violin'){
      plot <- ggplot(plotting)+
        geom_violin(aes_string(x='Cluster',y=x,fill='Cluster'))+
        ggtitle(sprintf('Breakdown of "%s" by Cluster',x))
    }
    if(continuous_plot=='density'){
      plot0 <- ggplot(plotting)+
        geom_density(aes_string(fill='Cluster',x=x),alpha=1/(length(unique(plotting$Cluster))),trim=TRUE)

      y_limits <- c(0,1.1*max(ggplot_build(plot0)$data[[1]]$y))
      plot <- plot0+
        scale_x_continuous(expand=c(0,0))+
        scale_y_continuous(limits=y_limits,expand=c(0,0))+
        ggtitle(sprintf('Breakdown of "%s" by Cluster',x))
    }
    if(continuous_plot=='box'){
      plot <- ggplot(plotting)+
        geom_boxplot(aes_string(x='Cluster',y=x,fill='Cluster'))+
        ggtitle(sprintf('Breakdown of "%s" by Cluster',x))
    }
    return(plot+theme_bw())
  },data=data,continuous_plot=continuous_plot)
  names(cont_plots) <- continuous
  non_cont_plots <- lapply(noncontinuous,function(x,data,non_continuous_plot='heatmap'){
    plotting <- data %>% select(x,'Cluster') %>%
      group_by_at(c(x,'Cluster'))%>%
      count()%>%
      spread(x,n,fill=0)%>%
      gather(key=x,value='n',-Cluster)%>%
      set_colnames(c('Cluster',x,'n'))

    if(is.numeric(data[[x]])){
      plotting <- plotting %>%
        mutate_at(.vars=x,function(y){
          levelsUse <- as.character(sort(unique(as.numeric(y)),decreasing = TRUE))
          return(factor(y,levels=levelsUse))
        })
    }
    if(non_continuous_plot=='bar'){
      plot <- ggplot(plotting)+
        geom_col(aes_string(x='Cluster',y='n',fill=x),position='dodge')+
        ggtitle(sprintf('Breakdown of "%s" by Cluster',x))
    }
    if(non_continuous_plot=='heatmap'){
      plot <- ggplot(plotting)+
        geom_tile(aes_string(x='Cluster',y=x,fill='n'),colour='black')+
        ggtitle(sprintf('Breakdown of "%s" by Cluster',x))+
        scale_fill_gradient(low='white',high='blue')+
        scale_x_discrete(expand=c(0,0))+
        scale_y_discrete(expand=c(0,0))
    }

    return(plot+theme_bw())
  },data=data,non_continuous_plot=non_continuous_plot)
  names(non_cont_plots) <- noncontinuous

  allPlots <- c(cont_plots,non_cont_plots)[colnames(data %>% select(-Cluster))]
  return(allPlots)
}

#
# assess <- assess_clusterability(mtcars[,1:4])
#
# attributes(assess)
# visualise <- function(x){
#   attr(x,'plot')
# }
# res <- k_means(iris[,-5],seed=1)
# visualise(res)
# attr(res,'k_selection')
# attr(res,'data')
#
# res <- k_means(mtcars[,c(1:4)])
# visualise(res)
# attr(res,'k_selection')
# attr(res,'data')