# R/clustFunc.R

# Defines functions clustFunc

# Documented in clustFunc

#' Cluster rows with Gower dissimilarity and PAM, save table and silhouette plot
#'
#' @description \code{clustFunc} computes a Gower dissimilarity matrix for
#'   \code{dfSel} with \code{cluster::daisy}, partitions the rows into \code{n}
#'   clusters with \code{cluster::pam}, and appends the resulting cluster
#'   assignment to \code{dfOriginal} as two new columns: \code{clustN} (the
#'   cluster number) and \code{clustF} (the same assignment as a factor).
#'
#'   The augmented data.frame is
#'   \itemize{
#'     \item assigned in \code{.GlobalEnv} under the name
#'       \code{<name of dfSel><addName><n>},
#'     \item saved under that same name as a .rda file in \code{output/}
#'       (or \code{output/<subDir>/}) inside the current working directory.
#'   }
#'   The plot of the PAM result is drawn on the current graphics device and
#'   copied to a .png file named \code{<name of dfSel>_<n>_clusters.png} in
#'   \code{plot/} (or \code{plot/<subDir>/}). Missing directories are created.
#'
#' @param dfSel a data.frame object or a character string indicating the name
#'   of the data.frame object. This is the subset of the columns of
#'   \code{dfOriginal} that the clustering is based on. It must have the same
#'   number of rows as \code{dfOriginal}.
#' @param dfOriginal a data.frame object or a character string indicating the
#'   name of the data.frame object to which the cluster columns are added.
#' @param addName a string (default: "Clust") to be added to the name of the
#'   output data.frame object and the output .rda file.
#' @param subDir a character string indicating the name of the subdirectory
#'   within the "output" and "plot" directories in which the .rda file and the
#'   .png file are saved. It is created if it does not exist. If not
#'   specified, the outputs are saved directly in output/ and plot/.
#' @param n a number indicating the number of clusters to be formed.
#' @param ... graphical parameters passed on to \code{graphics::par} before
#'   plotting. The previous settings are restored when the function exits.
#' @param width,height,res width, height and resolution of the saved .png
#'   file, passed on to \code{grDevices::png} (defaults: 2000, 2000, 150).
#'   These must be given by name because they follow \code{...}.
#' @return Invisibly, \code{dfOriginal} with the additional columns
#'   \code{clustN} and \code{clustF}.
#' @examples
#' \dontrun{
#' tab1 <- xlsx::read.xlsx("./inst/extdata/sample-data.xlsx", sheetName = "data")
#' tab1Vars <- c("i..id", "age", "area", "paddArea", "paddyFld", "date")
#' tab1Var <- selectExclude(tab1, tab1Vars)
#' clustFunc(tab1Var, tab1, n = 3)
#' }
#' @export
clustFunc <- function(dfSel, dfOriginal, addName = "Clust", subDir, n, ...,
                      width = 2000, height = 2000, res = 150) {
  # Tables passed by name are looked up starting in the caller's environment.
  # Looking them up from this function's own frame would let local variables
  # and arguments (e.g. a table called "dfSel" or "n") shadow the user's data.
  callerEnv <- parent.frame()

  # Capture the expression before dfSel is reassigned; collapse because
  # deparse() may split long expressions into several strings.
  nameDfSel <- paste(deparse(substitute(dfSel)), collapse = "")
  if (is.character(dfSel)) {
    nameDfSel <- dfSel
    dfSel <- get(dfSel, envir = callerEnv)
  }
  if (is.character(dfOriginal)) {
    dfOriginal <- get(dfOriginal, envir = callerEnv)
  }

  # cbind() would silently recycle the cluster vector when one row count is a
  # multiple of the other, attaching wrong labels; reject the mismatch early.
  if (NROW(dfSel) != NROW(dfOriginal)) {
    stop("dfSel and dfOriginal must have the same number of rows (",
         NROW(dfSel), " vs ", NROW(dfOriginal), ")", call. = FALSE)
  }

  dfSel <- droplevels(dfSel)

  # Dissimilarity matrix for the selected variables, then PAM with n groups.
  gowerDiss <- cluster::daisy(dfSel, metric = "gower")
  clustTab <- cluster::pam(gowerDiss, n, diss = TRUE)

  # Cluster assignment once as a plain vector and once as a factor.
  clusterIds <- unname(clustTab$clustering)
  clustF <- as.factor(as.character(clusterIds))

  tabClust <- cbind(dfOriginal, clustN = clusterIds, clustF)
  nameTabClust <- paste0(nameDfSel, addName, n)

  # save() stores objects by name, so keep a copy under the output name in a
  # private environment; the copy in .GlobalEnv is part of the documented
  # behaviour of this function.
  saveEnv <- new.env()
  assign(nameTabClust, tabClust, envir = saveEnv)
  assign(nameTabClust, tabClust, envir = .GlobalEnv)

  # Create a directory (and any missing parents) if it does not exist yet,
  # optionally announcing the creation first.
  ensureDir <- function(path, notice = NULL) {
    if (!dir.exists(path)) {
      if (!is.null(notice)) {
        print(notice)
      }
      dir.create(path, recursive = TRUE)
    }
    path
  }

  dirPlot <- ensureDir(
    file.path(getwd(), "plot"),
    "directory plot has been created in the present working directory"
  )
  dirOutput <- ensureDir(
    file.path(getwd(), "output"),
    "directory output has been created in the present working directory"
  )
  if (!missing(subDir)) {
    dirOutput <- ensureDir(file.path(dirOutput, subDir))
    dirPlot <- ensureDir(file.path(dirPlot, subDir))
  }

  # Apply the caller's graphical parameters for this plot only: remember the
  # device and its settings and put both back on exit (also on error).
  oldPar <- graphics::par(no.readonly = TRUE)
  plotDev <- grDevices::dev.cur()
  on.exit({
    if (plotDev %in% grDevices::dev.list()) {
      grDevices::dev.set(plotDev)
      graphics::par(oldPar)
    }
  }, add = TRUE)
  graphics::par(...)

  plot(clustTab,
       main = paste("Silhouette plot for table", nameDfSel,
                    "with", n, "clusters", sep = " "))

  # Copy what was just drawn into a png file; dev.copy() makes the png device
  # current, so the following dev.off() closes the copy, not the source.
  pngFile <- file.path(dirPlot, paste0(nameDfSel, "_", n, "_clusters.png"))
  grDevices::dev.copy(grDevices::png, filename = pngFile,
                      width = width, height = height, res = res)
  grDevices::dev.off()

  save(list = nameTabClust, envir = saveEnv,
       file = file.path(dirOutput, paste0(nameTabClust, ".rda")))
  ###PLEASE ADD TWO BOOLEAN PARAMETERS FOR THIS TWO LINES
  # write.xlsx(data.frame(clustTab$silinfo[1]),file=paste(dirOutput,"/sil_",nameTabClust,".xlsx",sep=""))
  # write.xlsx(tabClust,file=paste(dirOutput,"/",nameTabClust,".xlsx",sep=""),row.names=F)

  invisible(tabClust)
}
# lwTools/agriTrf documentation built on March 26, 2020, 12:09 a.m.