R/clustSimFunc.R

Defines functions clustSimFunc

Documented in clustSimFunc

#' Choose the number of clusters by average silhouette width
#'
#' @description
#' \code{clustSimFunc} computes a Gower dissimilarity matrix for \code{dfData}
#' with \code{cluster::daisy}, runs \code{cluster::pam} on it for every number
#' of clusters k from 2 to \code{nClust}, and records the average silhouette
#' width of each partition. The k with the largest average silhouette width is
#' reported as the silhouette-optimal number of clusters.
#'
#' Two files are written below the current working directory:
#' \itemize{
#'   \item \code{output/[subDir/]ind_<name>.txt}: the optimal k, its average
#'     silhouette width, and the average silhouette widths for k = 2 to 10
#'     (entries for k larger than \code{nClust} are written as \code{NA}).
#'   \item \code{plot/[subDir/]clustSim_<name>.png}: a plot of the average
#'     silhouette width against the number of clusters (1 to \code{nClust}),
#'     with the optimal k highlighted in red.
#' }
#' Here \code{<name>} is the name of the data object. The directories
#' "output" and "plot", and the subdirectory \code{subDir} inside each of them,
#' are created if they do not exist yet.
#'
#' @param dfData a data.frame or matrix, or a character string giving the name
#'   of such an object.
#' @param nClust a number indicating the largest number of clusters to test;
#'   clusterings are tested for 2 up to \code{nClust} clusters. It must be at
#'   least 2 and smaller than the number of rows of \code{dfData}. If missing,
#'   \code{nrow(dfData) - 1} is used.
#' @param envir the environment in which \code{dfData} is looked up when it is
#'   given as a character string (default: \code{.GlobalEnv}).
#' @param ylimPlot a numeric vector containing two values indicating the lower
#'   and upper limits of the y-axis (default: \code{NULL}, limits are chosen by
#'   \code{plot}).
#' @param subDir a character string indicating the name of the subdirectory
#'   within the "output" and "plot" directories in which the .txt file and the
#'   .png file are saved. If not specified, the files are saved directly in
#'   output/ and plot/.
#' @param main a character string indicating an overall title for the plot. If
#'   the argument is not supplied, the title is
#'   "Choice of the number of clusters" followed by the name of the data
#'   object.
#' @param width a number (default: 1200) indicating the width of the output
#'   plot.
#' @param height a number (default: 600) indicating the height of the output
#'   plot.
#' @param res a number (default: 125) indicating the resolution of the output
#'   plot.
#' @param ... graphical parameters passed to \code{par} before the plot is
#'   drawn. They are not reset afterwards.
#' @return \code{NULL}, invisibly. The function is called for its side
#'   effects: the .txt file, the plot on the current graphics device and its
#'   .png copy.
#' @examples
#' \dontrun{
#' tab1 <- xlsx::read.xlsx("./sample-data.xlsx", sheetName = "data")
#' tab1Vars <- c("i..id", "age", "area", "paddArea", "paddyFld", "date")
#' tab1Var <- selectExclude(tab1, tab1Vars)
#' clustSimFunc(tab1Var, 4)
#' clustSimFunc(tab1Var, 4, , c(0, 0.5))
#' }
#' @export
clustSimFunc <- function(dfData, nClust, envir = .GlobalEnv, ylimPlot = NULL,
                         subDir, main = NULL, width = 1200, height = 600,
                         res = 125, ...) {
  # Resolve the data object and the label used in the plot title and in the
  # output file names.
  if (is.character(dfData) && is.null(dim(dfData)) &&
      length(dfData) == 1L && !is.na(dfData)) {
    name <- dfData
    dfData <- get(name, envir = envir)
  } else if (is.data.frame(dfData) || is.matrix(dfData)) {
    # A long expression deparses to several strings; collapse them so that
    # exactly one file name is built from the label.
    name <- paste(deparse(substitute(dfData)), collapse = "")
  } else {
    stop("'dfData' must be a data.frame, a matrix, or the name of such an ",
         "object", call. = FALSE)
  }
  if (!is.data.frame(dfData) && !is.matrix(dfData)) {
    stop("'dfData' must refer to a data.frame or a matrix", call. = FALSE)
  }

  # pam() needs fewer clusters than observations, and the search starts at 2.
  maxClust <- nrow(dfData) - 1L
  if (missing(nClust)) {
    nClust <- maxClust
  }
  if (!is.numeric(nClust) || length(nClust) != 1L || is.na(nClust) ||
      nClust < 2 || nClust > maxClust) {
    stop("'nClust' must be a single number between 2 and nrow(dfData) - 1",
         call. = FALSE)
  }
  nClust <- as.integer(nClust)

  # Gower dissimilarity matrix of the data.
  gowVars <- cluster::daisy(dfData, metric = "gower")

  # Average silhouette width per number of clusters. Index 1 stays 0 as a
  # placeholder because the silhouette is not defined for a single cluster.
  kValues <- 2:nClust
  silVector <- numeric(nClust)
  silVector[kValues] <- vapply(
    kValues,
    function(k) cluster::pam(gowVars, k, diss = TRUE)$silinfo$avg.width,
    numeric(1)
  )

  # Pick the optimum among the tested k only, so that the placeholder at
  # index 1 cannot win when every average silhouette width is negative.
  silIndexMax <- kValues[which.max(silVector[kValues])]
  silMax <- silVector[silIndexMax]

  # Create the output and plot directories if necessary.
  dirOutput <- file.path(getwd(), "output")
  dirPlot <- file.path(getwd(), "plot")
  if (!dir.exists(dirOutput)) {
    print("directory output has been created in the present working directory")
    dir.create(dirOutput, recursive = TRUE)
  }
  if (!dir.exists(dirPlot)) {
    print("directory plot has been created in the present working directory")
    dir.create(dirPlot, recursive = TRUE)
  }

  # Switch to the requested subdirectory; recursive creation also supports a
  # nested subDir such as "a/b".
  if (!missing(subDir)) {
    dirOutput <- file.path(dirOutput, subDir)
    dirPlot <- file.path(dirPlot, subDir)
    for (target in c(dirOutput, dirPlot)) {
      if (!dir.exists(target)) {
        dir.create(target, recursive = TRUE)
      }
    }
  }

  # Text report. The listing always covers k = 2..10; values for k > nClust
  # are printed as NA.
  cat("Silhouette-optimal number of clusters k =", silIndexMax, "\n",
      "with an average silhouette width of", silMax, "\n",
      "silouhette values for n=2 until n=10:", "\n",
      paste("(", 2:10, "); ", silVector[2:10], sep = ""),
      file = file.path(dirOutput, paste0("ind_", name, ".txt")))

  # Plot average silhouette width against the number of clusters. The
  # graphical parameters are deliberately left in place after the call.
  par(...)
  if (missing(main)) {
    main <- paste("Choice of the number of clusters", name)
  }
  plot(seq_len(nClust), silVector, type = "h", main = main,
       xlab = "Number of clusters", ylab = "Average silhouette width",
       ylim = ylimPlot)
  # Mark the optimal k in red: a tick on the axis, its label one line below,
  # and a square at the top of its bar.
  axis(1, silIndexMax, " ", line = 0, col = "red", font = 0.5,
       col.axis = "red")
  axis(1, silIndexMax, silIndexMax, line = 1, col = "red", font = 0.5,
       col.axis = "red")
  points(silIndexMax, silMax, pch = 15, col = "red", cex = 1)

  # Copy the plot from the current device into a PNG file and close that
  # PNG device again.
  dev.copy(png, filename = file.path(dirPlot, paste0("clustSim_", name, ".png")),
           width = width, height = height, res = res)
  dev.off()

  invisible(NULL)
}
lwTools/agriTrf documentation built on March 26, 2020, 12:09 a.m.