#' clustSimFunc
#'
#' @description clustSimFunc computes a Gower dissimilarity matrix for
#'   \code{dfData} with \code{cluster::daisy}, partitions it with
#'   \code{cluster::pam} for every number of clusters k from 2 to
#'   \code{nClust} (both inclusive) and records the average silhouette width
#'   of each partition. The silhouette-optimal k and the silhouette widths
#'   are written to a .txt file in the "output" directory, and a plot of the
#'   average silhouette width against the number of clusters is drawn on the
#'   active graphics device and copied to a .png file in the "plot"
#'   directory. Both directories are located in the current working
#'   directory and are created if they do not exist. If \code{subDir} is
#'   given, the files are written to output/\code{subDir} and
#'   plot/\code{subDir} instead, which are created if necessary.
#' @param dfData a data.frame (or matrix) object, or a character string
#'   giving the name of such an object.
#' @param nClust a number (at least 2 and at most \code{nrow(dfData) - 1})
#'   indicating the number of clusters up to which the clustering is tested,
#'   starting from 2 clusters. Defaults to \code{nrow(dfData) - 1}.
#' @param envir the environment in which \code{dfData} is looked up when it
#'   is given as a character string (default: the global environment).
#' @param ylimPlot a numeric vector containing two values indicating the
#'   lower and upper limits of the y-axis (default: NULL, chosen by
#'   \code{plot}).
#' @param subDir a character string indicating the name of the subdirectory
#'   within the "output" and "plot" directories in which the .txt file and
#'   the .png file are saved. If not specified, the files are saved directly
#'   in output/ and plot/.
#' @param main a character string indicating an overall title for the plot.
#'   If not supplied, a title containing the name of the data is used.
#' @param width a number (default: 1200) indicating the width of the output
#'   plot.
#' @param height a number (default: 600) indicating the height of the output
#'   plot.
#' @param res a number (default: 125) indicating the resolution of the output
#'   plot.
#' @param ... graphical parameters passed on to \code{par} before plotting.
#'   They are reset to their previous values when the function exits.
#' @return \code{NULL}, invisibly. The function is called for its side
#'   effects: the file output/[\code{subDir}/]ind_<name>.txt and the file
#'   plot/[\code{subDir}/]clustSim_<name>.png, where <name> is the name of
#'   the data object.
#' @examples
#' tab1 <- xlsx::read.xlsx("./sample-data.xlsx", sheetName = "data")
#' tab1Vars <- c("i..id", "age", "area", "paddArea", "paddyFld", "date")
#' tab1Var <- selectExclude(tab1, tab1Vars)
#' clustSimFunc(tab1Var, 4)
#' clustSimFunc(tab1Var, 4, ylimPlot = c(0, 0.5))
#' @export
clustSimFunc <- function(dfData, nClust, envir = .GlobalEnv, ylimPlot = NULL,
                         subDir, main = NULL, width = 1200, height = 600,
                         res = 125, ...) {
  # Resolve the data and the label used in the plot title and file names.
  if (is.character(dfData) && length(dfData) == 1L) {
    name <- dfData
    dfData <- get(name, envir = envir)
  } else {
    name <- paste(deparse(substitute(dfData)), collapse = "")
  }
  if (!is.matrix(dfData) && !is.data.frame(dfData)) {
    stop("'dfData' must be a data.frame, a matrix, or the name of such an ",
         "object", call. = FALSE)
  }

  # pam() needs between 1 and n - 1 clusters; silhouettes need at least 2.
  nObs <- nrow(dfData)
  if (missing(nClust)) {
    nClust <- nObs - 1
  }
  if (!is.numeric(nClust) || length(nClust) != 1L) {
    stop("'nClust' must be a single number", call. = FALSE)
  }
  nClust <- as.integer(nClust)
  if (is.na(nClust) || nClust < 2L || nClust > nObs - 1L) {
    stop("'nClust' must be between 2 and nrow(dfData) - 1", call. = FALSE)
  }

  # Gower dissimilarity matrix of the data.
  gowerDiss <- cluster::daisy(dfData, metric = "gower")

  # Average silhouette width for k = 2..nClust. Position 1 stays 0 so that
  # the plot keeps an (empty) entry for k = 1.
  kRange <- seq.int(2L, nClust)
  silVector <- numeric(nClust)
  silVector[kRange] <- vapply(
    kRange,
    function(k) cluster::pam(gowerDiss, k, diss = TRUE)$silinfo$avg.width,
    numeric(1)
  )
  # Search only k >= 2: the placeholder 0 at k = 1 must never win, which it
  # would if every silhouette width were negative.
  silIndexMax <- kRange[which.max(silVector[kRange])]
  silMax <- silVector[silIndexMax]

  # Create the output and plot directories (and subdirectories) as needed.
  dirOutput <- file.path(getwd(), "output")
  dirPlot <- file.path(getwd(), "plot")
  if (!dir.exists(dirOutput)) {
    dir.create(dirOutput, recursive = TRUE)
    message("directory output has been created in the present working directory")
  }
  if (!dir.exists(dirPlot)) {
    dir.create(dirPlot, recursive = TRUE)
    message("directory plot has been created in the present working directory")
  }
  if (!missing(subDir)) {
    dirOutput <- file.path(dirOutput, subDir)
    dirPlot <- file.path(dirPlot, subDir)
    for (target in c(dirOutput, dirPlot)) {
      if (!dir.exists(target)) {
        # recursive = TRUE so that a nested subDir such as "a/b" works.
        dir.create(target, recursive = TRUE)
      }
    }
  }

  # Text report: optimum and the silhouette widths actually computed.
  reportFile <- file.path(dirOutput, paste0("ind_", name, ".txt"))
  cat("Silhouette-optimal number of clusters k =", silIndexMax, "\n",
      "with an average silhouette width of", silMax, "\n",
      paste0("silouhette values for n=2 until n=", nClust, ":"), "\n",
      paste0("(", kRange, "); ", silVector[kRange]),
      file = reportFile)

  # Apply user-supplied graphical parameters and undo them on exit. Skipped
  # when none are given, because par() would then return read-only values.
  if (length(list(...)) > 0L) {
    oldPar <- graphics::par(...)
    on.exit(graphics::par(oldPar), add = TRUE)
  }

  # Plot the average silhouette width against the number of clusters and
  # mark the optimum in red.
  if (missing(main)) {
    main <- paste("Choice of the number of clusters", name)
  }
  graphics::plot(seq_len(nClust), silVector, type = "h", main = main,
                 xlab = "Number of clusters",
                 ylab = "Average silhouette width", ylim = ylimPlot)
  graphics::axis(1, silIndexMax, " ", line = 0, col = "red", font = 0.5,
                 col.axis = "red")
  graphics::axis(1, silIndexMax, silIndexMax, line = 1, col = "red",
                 font = 0.5, col.axis = "red")
  graphics::points(silIndexMax, silMax, pch = 15, col = "red", cex = 1)

  # Copy the plot to a .png file, then make the original device current
  # again (dev.off() alone selects the next device in the list, which need
  # not be the one the plot was drawn on).
  plotFile <- file.path(dirPlot, paste0("clustSim_", name, ".png"))
  plotDev <- grDevices::dev.cur()
  grDevices::dev.copy(grDevices::png, filename = plotFile, width = width,
                      height = height, res = res)
  grDevices::dev.off()
  grDevices::dev.set(plotDev)

  invisible(NULL)
}
# Add the following code to your website.
# For more information on customizing the embed code, read Embedding Snippets.