# R/unsupervised_segment.R

#' k-clusters model
#'
#' k-clusters method for segmentation. It handles purely numerical data using the k-means algorithm, and mixed data types (numerical and categorical) using the k-prototypes algorithm.
#' @param data data.frame, the data to segment
#' @param hyperparameters list of hyperparameters to pass. They include: \cr
#' centers: number of clusters, a set of initial (distinct) cluster centers, or 'auto'. When 'auto' is chosen, the number of clusters is optimised; \cr
#' iter_max: the maximum number of iterations allowed; \cr
#' nstart: how many random sets of cluster centers should be tried; \cr
#' max_centers: maximum number of clusters when the 'auto' option is selected for centers; \cr
#' segmentation_variables: the columns to segment on; \cr
#' standardize: whether to standardize numeric columns.
#' @importFrom stats kmeans ave sd dist var
#' @importFrom clustMixType lambdaest kproto
#' @importFrom dplyr n_distinct %>% mutate_if
#' @importFrom utils capture.output
#' @importFrom rlang .data
#' @importFrom ggplot2 ggplot aes geom_line geom_point geom_text scale_x_continuous xlab ylab
#' @param verbose logical whether information about the clustering procedure should be given.
#' @return An object of class "k_clusters" containing a list with the model definition, the hyperparameters,
#' a table of outliers, the elbow plot (a ggplot object) used to determine the optimal number of clusters, and
#' a lookup table containing segment predictions for customers.
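#' @examples
#' \dontrun{
#' # Minimal usage sketch (hypothetical data and column names):
#' customers <- data.frame(id = 1:100,
#'                         spend = rgamma(100, shape = 2, rate = 0.01),
#'                         visits = rpois(100, lambda = 5))
#' model <- k_clusters(customers,
#'                     hyperparameters = list(centers = "auto",
#'                                            iter_max = 50,
#'                                            nstart = 5,
#'                                            max_centers = 8,
#'                                            standardize = TRUE,
#'                                            segmentation_variables = c("spend", "visits")))
#' head(model$predicted_values) # id-to-segment lookup table
#' model$elbow_plot             # ggplot of the elbow search
#' }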
#' @export
k_clusters <- function(data, hyperparameters, verbose = TRUE){
  
  
  if(is.null(hyperparameters$segmentation_variables)){
    # default: segment on every column except the id
    segmentation_variables <- colnames(data)
    segmentation_variables <- segmentation_variables[segmentation_variables != "id"]
    
  }else{
    # keep only the id and the requested segmentation columns
    variables <- c("id", hyperparameters$segmentation_variables)
    data <- data[, variables]

  }
  input_params <- list(centers = hyperparameters$centers,
                       iter_max = hyperparameters$iter_max,
                       nstart = hyperparameters$nstart,
                       standardize = hyperparameters$standardize,
                       segmentation_variables = hyperparameters$segmentation_variables)
  
  # treatment of missings
  if(verbose == TRUE) {message("Checking for NAs in variables")}
  NAcount <- apply(data, 2, function(z) sum(is.na(z)))
  
  if(any(NAcount == nrow(data))) stop(paste("These variables contain only NAs; please remove them:",
                                            paste(names(NAcount)[NAcount == nrow(data)], collapse = ", ")))
  miss <- apply(data, 1, function(z) any(is.na(z)))
  
  if(sum(miss) > 0) {warning("# NAs in variables:\n", paste(capture.output(print(NAcount)), collapse = "\n"), "\n", 
          paste0(sum(miss), " observation(s) with NAs.\n", 
          "Observations with NAs are removed.\n")) }
  else{ 
    if(verbose == TRUE) message("Variables have 0 NAs")}
  
  # remove missings
  data <- data[!miss,]
  
  ids <- data[,'id']
  data <- data[,(names(data) != 'id')]
  
  # Standardize numeric columns to z-scores
  # (note: a zero-variance column would produce NaNs here)
  zscore <- function(x){
    return((x - mean(x))/sd(x))
  }
  if(hyperparameters$standardize == TRUE){
    data <- data %>% mutate_if(is.numeric, ~ zscore(.x))
  }
  
  # k-means++ algorithm: distance-weighted seeding followed by standard kmeans
  kmeanspp <- function(data, centers = 2,
                       start = "random", iter.max = 100,
                       nstart = 10, ...) {
    
    kk <- centers
    
    if (length(dim(data)) == 0) {
      data <- matrix(data, ncol = 1)
    } else {
      data <- cbind(data)
    }
    
    
    if(length(centers) != 1L){
      out <- kmeans(data, centers = centers, iter.max = iter.max)
    }else{
      num.samples <- nrow(data)
      ndim <- ncol(data)
      out <- list()
      unique <- which(!duplicated(data))
      out$tot.withinss <- Inf
      for (restart in seq_len(nstart)) {
        center_ids <- rep(0, length = kk)
        if (start == "random"){
          center_ids[1] <- sample(unique, 1)
          for (ii in seq_len(kk)[-1]) { # the plus-plus step in kmeans
            # squared distance from every point to each centre chosen so far
            # (zero placeholders in center_ids are dropped by the subset;
            # drop = FALSE keeps the subset a matrix even for a single centre)
            dists <- apply(data[center_ids, , drop = FALSE], 1,
                           function(center) {rowSums((data - center)^2)})
            # each point is weighted by its squared distance to the nearest centre
            probs <- apply(dists, 1, min)
            probs[center_ids] <- 0
            
            center_ids[ii] <- sample(unique, 1, prob = probs[unique], replace = FALSE)
          }
        } else{
          center_ids <- start
        }
        
        tmp.out <- kmeans(data, centers = data[center_ids, , drop = FALSE], iter.max = iter.max, ...)
        tmp.out$initial.centers <- data[center_ids, , drop = FALSE]
        if (tmp.out$tot.withinss < out$tot.withinss){
          out <- tmp.out
        }
      }
    }
    return(out)
  }
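  # Worked sketch of the plus-plus seeding (hypothetical points): with rows
  # (0,0), (0,1) and (10,10) and the first centre drawn at (0,0), the minimum
  # squared distances 0, 1 and 200 become the sampling weights, so (10,10) is
  # chosen as the second centre with probability 200/201.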
  
  outliers_kproto <- function(data, km) {
    
    # per-cluster mean distance (within-cluster sum of distances / cluster size)
    withinss <- km$withinss
    size <- km$size
    cmd <- data.frame(withinss/size)
    dists <- data.frame(d = km$dists, cluster = km$cluster)
    cl_d <- c()
    cl_md <- c()
    for(i in 1:length(km$cluster)) {
      # distance of observation i to its own cluster prototype
      cl_d <- append(cl_d, dists[i, dists$cluster[i]])
      # mean distance within observation i's cluster (cmd has a single column)
      cl_md <- append(cl_md, cmd[dists$cluster[i], 1])
    }
    cluster_dist <- data.frame(distance = cl_d, cluster = km$cluster)
    cl_sd <- ave(cluster_dist$distance, cluster_dist$cluster, FUN = function(x) sd(x, na.rm = TRUE))
    cluster_z_score <- (cl_d - cl_md)/cl_sd
    data_outliers <- data.frame(data, cluster = km$cluster, c_dist = cl_d, cluster_z_score,
                                cluster_outlier = ifelse(cluster_z_score > 2, TRUE, FALSE))
    # return table only for outliers (z-score above 2 within their cluster)
    return(data_outliers[data_outliers$cluster_z_score > 2,])
  }
  
  outliers_kmeans <- function(data, km) {
    
    # Euclidean distance of each observation to its assigned centroid
    centers <- km$centers[km$cluster, ]
    distances <- sqrt(rowSums((data - centers)^2))
    # per-cluster mean and standard deviation of those distances
    cm <- ave(distances, km$cluster, FUN = function(x) mean(x, na.rm = TRUE))
    csd <- ave(distances, km$cluster, FUN = function(x) sd(x, na.rm = TRUE))
    cluster_z_score <- (distances - cm)/csd
    data_outliers <- data.frame(data, cluster = km$cluster, c_dist = distances, cluster_z_score,
                                cluster_outlier = ifelse(cluster_z_score > 2, TRUE, FALSE))
    # return table only for outliers (z-score above 2 within their cluster)
    return(data_outliers[data_outliers$cluster_z_score > 2,])
  }
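  # Worked sketch of the outlier rule (hypothetical distances): within one
  # cluster with distances c(1.0, 1.2, 0.9, 4.1), the mean is 1.8 and the sd
  # is about 1.54, so the fourth point's z-score is (4.1 - 1.8)/1.54 ~ 1.5 and
  # it is kept; only points more than 2 within-cluster standard deviations
  # above the mean are flagged.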
  
  # Get optimal k using an automated elbow method
  koptimize <- function(data, max_centers, type){
    scores <- data.frame()
    for (i in 1:max_centers){
      scores[i,"k"] <- i
      if (type == "kmeans"){
        kk <- kmeanspp(data, centers = i, iter.max = hyperparameters$iter_max, nstart = hyperparameters$nstart)
        scores[i,"withinss"] <- kk$tot.withinss
      }else{
        # lambda is taken from the enclosing k_clusters environment,
        # where it is estimated before koptimize is called
        kk <- kproto(data, k = i, iter.max = hyperparameters$iter_max, nstart = hyperparameters$nstart, lambda = lambda, verbose = verbose)
        scores[i,"withinss"] <- kk$tot.withinss
      }
    }
    # first and second differences of the within-cluster sum of squares
    for(i in 1:(max_centers-1)){
      scores[i+1,"delta"] <- scores[i,"withinss"] - scores[i+1,"withinss"]
      scores[i+1,"delta2"] <- scores[i,"delta"] - scores[i+1,"delta"]
    }
    # elbow strength: how sharply the curve bends at k, penalised by k
    for(i in 1:max_centers){
      scores[i,"strength"] <- (scores[i+1,"delta2"] - scores[i+1,"delta"]) / scores[i,"k"]
    }
    return(scores)
  }
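  # Worked sketch of the strength heuristic (hypothetical withinss values):
  # for withinss = c(100, 50, 30, 25) at k = 1:4, the first differences are
  # delta = (NA, 50, 20, 5) and the second differences are
  # delta2 = (NA, NA, 30, 15), so strength = (NA, (30-20)/2, (15-5)/3, NA)
  # = (NA, 5, 3.33, NA) and which.max() selects k = 2 as the elbow.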
  

  
  if(all(grepl('factor|character', sapply(data,class)))) {
    stop("Data contain only categorical variables: cannot run the k-clusters model")
  }
  
  if(any(grepl('factor|character', sapply(data,class)))) {
    # data contain at least one categorical column
    if(verbose == TRUE) {message("Data contain both categorical and numerical features: using k-prototypes algorithm")}
    if(any(grepl('character', sapply(data,class)))) {
      # transform character columns into factors
      cols.to.factor <- sapply(data, function(col) class(col) == "character")
      data[cols.to.factor] <- lapply(data[cols.to.factor], factor)
      data <- data.frame(data)
    }
    
    if(identical(hyperparameters$centers, 'auto')) {
      if(verbose == TRUE) {message("Optimising number of clusters")}
      if(verbose == TRUE) {message("Estimating lambda for categorical variables")}
      lambda <- lambdaestimation(data, verbose = verbose)
      k_opt_scores <- koptimize(data,max_centers = hyperparameters$max_centers,type = "kproto")
      k_opt <- k_opt_scores[which.max(k_opt_scores$strength),"k"]
      if(verbose == TRUE) {message(paste0("Number of optimal clusters: ", k_opt))}
      km <- kproto(data, k=k_opt, iter.max=hyperparameters$iter_max, nstart= hyperparameters$nstart, lambda = lambda, verbose = FALSE)
      
      #outliers detection
      data_outliers <- outliers_kproto(data, km)
      
    } else {
      if(verbose == TRUE) {message("Estimating lambda for categorical variables")}
      lambda <- lambdaestimation(data, verbose = verbose)
      km <- kproto(data, k=hyperparameters$centers, iter.max=hyperparameters$iter_max, nstart= hyperparameters$nstart, lambda = lambda, verbose = FALSE)
      if(verbose == TRUE) {
        message(paste0("Number of clusters: ", max(km$cluster)))
      }
      #outliers detection
      data_outliers <- outliers_kproto(data, km)
    }
    
  } else {
    #data contain only numeric values
    if(verbose == TRUE) {message("Data contain only numeric feautures: using kmeans algorithm")}
    
    if(identical(hyperparameters$centers, 'auto')){
      if(verbose == TRUE) {message("Optimising number of clusters")}
      
      k_opt_scores <- koptimize(data,max_centers = hyperparameters$max_centers,type = "kmeans")
      k_opt <- k_opt_scores[which.max(k_opt_scores$strength),"k"]
      if(verbose == TRUE) {message(paste0("Number of optimal clusters: ", k_opt))}
      
      km <- kmeanspp(data, centers = k_opt,iter.max=hyperparameters$iter_max, nstart = hyperparameters$nstart)
      #outliers detection
      data_outliers <- outliers_kmeans(data, km)
      
    } else {
      
      km <- kmeans(x = data, centers = hyperparameters$centers, iter.max = hyperparameters$iter_max, nstart = hyperparameters$nstart)
      #outliers detection
      data_outliers <- outliers_kmeans(data, km)
    }
  }
  if(identical(hyperparameters$centers, 'auto')){
    elbow_plot <- ggplot(k_opt_scores, aes(x = .data$k, y = .data$withinss)) +
      geom_line() + geom_point() +
      scale_x_continuous(breaks = seq_len(hyperparameters$max_centers)) +
      xlab("Clusters") + 
      ylab("Total Within Cluster Sum of Squares") +
      geom_text(data = k_opt_scores[which.max(k_opt_scores$strength),], aes(x = .data$k + 0.75, y = .data$withinss*1.01, label = "Optimal clusters")) +
      geom_point(data = k_opt_scores[which.max(k_opt_scores$strength),], 
                 aes(x = .data$k, y = .data$withinss), 
                 color = 'magenta3',
                 size = 4)
  }else{
    elbow_plot <-  NULL
  }
  if(verbose == TRUE) { message(paste0("Number of rows: ", nrow(data)))}
  out <- list(segment_model = km,
              model_hyperparameters = input_params,
              outliers_table = data_outliers,
              elbow_plot = elbow_plot,
              predicted_values = data.frame("id" = ids,"segment" = km$cluster)
  )
  
  # class string must match predict.k_clusters for S3 method dispatch
  class(out) <- 'k_clusters'
  
  return(out)
  
}

#' @importFrom stats predict
#' @importFrom methods is
predict.k_clusters <- function(object, newdata, ...){
  object <-  object$segment_model
  if (is(object, "kmeans")) {
    list(cluster = apply(newdata, 1, function(r) which.min(colSums((t(object$centers) - r)^2))),
         dists = t(apply(newdata, 1, function(r) colSums((t(object$centers) - r)^2))))
  }else if (is(object, "kproto")) {
    if(any(grepl('factor|character', sapply(newdata,class)))) {
      # data contain at least one categorical column
      if(any(grepl('character', sapply(newdata,class)))) {
        # transform character columns into factors
        cols.to.factor <- sapply(newdata, function(col) class(col) == "character")
        newdata[cols.to.factor] <- lapply(newdata[cols.to.factor], factor)
      }
      
    }
    predict(object, data.frame(newdata))
  }
}
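# A minimal usage sketch (hypothetical objects): assuming `model` was returned
# by k_clusters() above and `new_customers` contains the same segmentation
# columns used in training (without the id column):
#   preds <- predict.k_clusters(model, new_customers)
#   preds$cluster  # hard segment assignment per row
#   preds$dists    # distances to each cluster centre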

#' @importFrom rlang .data
lambdaestimation <- function(x, num.method = 1, fac.method = 1, outtype = "numeric", verbose = FALSE){
  # initial error checks
  if(!is.data.frame(x)) stop("x should be a data frame!")
  if(!num.method %in% 1:2) stop("Argument 'num.method' must be either 1 or 2!")
  if(!fac.method %in% 1:2) stop("Argument 'fac.method' must be either 1 or 2!")
  if(!outtype %in% c("numeric","vector","variation")) stop("Wrong specification of argument 'outtype'!")
  
  # check for numeric and factor variables
  numvars <- sapply(x, is.numeric)
  anynum <- any(numvars)
  catvars <- sapply(x, is.factor)
  anyfact <- any(catvars)
  if(!anynum) cat("\n No numeric variables in x! \n\n")
  if(!anyfact) cat("\n No factor variables in x! \n\n")
  
  if(anynum & num.method == 1) vnum <- sapply(x[,numvars, drop = FALSE], var, na.rm =TRUE)
  if(anynum & num.method == 2) vnum <- sapply(x[,numvars, drop = FALSE], sd, na.rm = TRUE)
  
  if(anyfact & fac.method == 1) vcat <- sapply(x[,catvars, drop = FALSE], function(z) return(1-sum((table(z)/sum(!is.na(z)))^2)))
  if(anyfact & fac.method == 2) vcat <- sapply(x[,catvars, drop = FALSE], function(z) return(1-max(table(z)/sum(!is.na(z)))))
  # guard on anynum/anyfact so vnum/vcat are only touched when they exist
  if (anynum && mean(vnum, na.rm = TRUE) == 0){
    warning("All numerical variables have zero variance.\n
            No meaningful estimation for lambda.\n
            Rather use kmodes{klaR} instead of kproto().")
    anynum <- FALSE
  } 
  if (anyfact && mean(vcat, na.rm = TRUE) == 0){
    warning("All categorical variables have zero variance.\n
            No meaningful estimation for lambda!\n
            Rather use kmeans() instead of kproto().")
    anyfact <- FALSE
  } 
  # if(num.method == 1 & verbose == TRUE) cat("Numeric variances:\n")
  # if(num.method == 2 & verbose == TRUE) cat("Numeric standard deviations:\n")
  if(num.method == 1 & verbose == TRUE) cat("Average numeric variance:", mean(vnum), "\n\n")
  if(num.method == 2& verbose == TRUE) cat("Average numeric standard deviation:", mean(vnum), "\n\n")
  
  if(verbose == TRUE & anyfact) {
    message(paste0("Heuristic for categorical variables: (method = ", fac.method, ") \n"))
    message(paste(capture.output(print(vcat)), collapse = "\n"))
    message(paste("Average categorical variation:", mean(vcat), "\n"))
  }
  
  if(anynum & anyfact) {
    if(outtype == "numeric") {lambda <- mean(vnum)/mean(vcat); if(verbose == TRUE) cat("Estimated lambda:", lambda, "\n\n")}
    if(outtype != "numeric") {
      lambda <- rep(0,ncol(x))
      names(lambda) <- names(x)
      lambda[numvars] <- vnum
      lambda[catvars] <- vcat
    }
    if(outtype == "vector") lambda <- 1/lambda
    return(lambda)
  }
  if(!(anynum & anyfact)) invisible()
}
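# Worked sketch of the lambda heuristic (hypothetical data): for one numeric
# variable with variance 4 and one two-level factor split 50/50, the factor's
# variation is 1 - (0.5^2 + 0.5^2) = 0.5, so lambda = mean(vnum)/mean(vcat)
# = 4/0.5 = 8, i.e. kproto() weights categorical mismatches 8 times as heavily
# as a unit of squared numeric distance.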
