#' Class providing objects with methods for drawing IPPs and FINs
#'
#' The class provides objects with methods for drawing impact pattern plots (IPPs) and feature interaction networks (FINs).
#' @aliases IPP FIN
#' @docType class
#'
#' @export
#' @keywords data
#'
#' @return Object of \code{R6Class} with methods for drawing IPPs and FINs.
#'
#' @format \code{R6Class} object.
#'
#' @field \link{X.Data} a data.frame, the dataset of input features.
#' @field \link{Pred.Fun} an object, the prediction function. It can be any
#' model created by packages such as "nnet", "randomForest", and "kernlab".
#' @field \link{Model.Package} a string, the package name of the interpreted
#' machine learning model, such as "nnet" and "randomForest".
#' @field \link{Pred.Type} a string, the type of prediction.
#' @field \link{Pred.Dimension} an integer, indicating which class is predicted.
#' This field is used only for classification models.
#' @field \link{XB.Size} an integer, the size of \link{XB.Sample}.
#' @field \link{XB.SamplingMethod} a string, the sampling method of
#' \link{XB.Sample}, "joint" or "independent". "joint" means that all features
#' are sampled from \link{X.Data} jointly. "independent" means that each
#' feature is sampled independently; then all features are combined randomly.
#' @field \link{ParaTable} a data.frame, the parameter table. It is generated by
#' method \link{GenerateParaTable}.
#' @field \link{XA.Sample} a list, the sample of X_A extracted from
#' \link{X.Data}. It is generated by method \link{SamplingXA}.
#' @field \link{XB.Sample} a list, the sample of X_B extracted from
#' \link{X.Data}. It is generated by method \link{SamplingXB}.
#' @field \link{Pred.Res} a list, the prediction results of f(X_A,X_B), which
#' are generated by method \link{PredictData}.
#' @field \link{Clustering.Res} a list, the clustering results, which are
#' generated by method \link{ClusterImpactPlots}.
#' @field \link{TreeRules} a list, the decision tree rules, which are generated
#' by method \link{BuildTree}.
#' @field \link{FIN.Data} a data.frame, the feature interaction network, which
#' is generated by method \link{BuildTree}.
#' @field \link{ColorList} a list, the curve colors used for drawing IPPs.
#' @field \link{TaskFinishTime} a list, the finishing time of tasks.
#' @section Methods: \describe{ \item{\link{initialize}}{initialize the fields
#' of the object and execute the methods \link{CheckInitialization} and
#' \link{GenerateParaTable}.} \item{\link{CheckInitialization}}{validate the
#' initialization information.} \item{\link{GenerateParaTable}}{generate the
#' parameter table \link{ParaTable}.} \item{\link{CheckParaTable}}{validate
#' the information in \link{ParaTable}.} \item{\link{SamplingXA}}{sample
#' \link{XA.Sample} from \link{X.Data}.} \item{\link{SamplingXB}}{sample
#' \link{XB.Sample} from \link{X.Data}.} \item{\link{PredictData}}{predict
#' data using \link{Pred.Fun} based on \link{XA.Sample} and \link{XB.Sample}.}
#' \item{\link{ClusterImpactPlots}}{cluster the impact curves of each feature
#' based on the prediction results \link{Pred.Res}.}
#' \item{\link{BuildTree}}{build decision trees based on the clustering results
#' \link{Clustering.Res}.} \item{\link{DrawIPP}}{draw the impact pattern
#' plots.} \item{\link{DrawFIN}}{draw the feature interaction network.}
#' \item{\link{WriteToExcel}}{write the results to an Excel file.}
#' \item{\link{ExecuteAll}}{execute the methods \link{SamplingXA},
#' \link{SamplingXB}, \link{PredictData}, \link{ClusterImpactPlots} and
#' \link{BuildTree} in sequence.}
#'
#' }
#' @examples
#' library(IPPModel)
#' library(igraph)
#'
#' #------ FIRST EXAMPLE ------
#' library(nnet)
#' data("bank")
#' # build model
#' bank.NN <- nnet(y ~ ., data = bank, size = 5, maxit = 1000)
#' # remove the output variable
#' bank.ds = bank[-17]
#' # create IPPModel object
#' IPP.bank = IPPModel$new(XDS=bank.ds, PredFun=bank.NN,
#' ModelPackage="nnet", PredType="raw", PredDim=1,
#' XB.Size=1000, XB.SamplingMethod="joint")
#' # modify the clustering method to "kmedoids"
#' IPP.bank$ParaTable$clusteringMethod = "kmedoids"
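#' # other ParaTable columns can be tuned the same way before executing;
#' # e.g. (hypothetical tweaks, assuming bank.ds has a 'duration' column):
#' # IPP.bank$ParaTable["duration", "X_A"] = FALSE
#' # IPP.bank$ParaTable["duration", "samplingMethod"] = "percentile"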
#' # execute all tasks
#' IPP.bank$ExecuteAll()
#' # draw impact pattern plots (IPP)
#' IPP.bank$DrawIPP(centralized = TRUE, nc = 4)
#' # draw feature interaction network (FIN)
#' IPP.bank$DrawFIN(threshold = 0.2, lay.out = igraph::layout.auto)
#' # write the results into an Excel file
#' IPP.bank$WriteToExcel("output.xlsx")
#'
#' #------ SECOND EXAMPLE ------
#' library(randomForest)
#' data("whitewine")
#' # build model
#' WW.RF <- randomForest(quality ~ ., data = whitewine, mtry = 4, importance = TRUE, na.action = na.omit)
#' # remove the output variable
#' WW.ds = whitewine[-12]
#' # create IPPModel object
#' IPP.WW = IPPModel$new(XDS=WW.ds, PredFun=WW.RF,
#' ModelPackage="randomForest", PredType="response", PredDim=1,
#' XB.Size=1000, XB.SamplingMethod="joint")
#' # set the maximum depth of trees to be 5
#' IPP.WW$ParaTable$treeDepth = 5
#' # execute all tasks
#' IPP.WW$ExecuteAll()
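#' # each completed task is timestamped in the TaskFinishTime field,
#' # which is a quick way to verify that all steps have run:
#' print(IPP.WW$TaskFinishTime)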
#' # draw impact pattern plots (IPP)
#' IPP.WW$DrawIPP(centralized = TRUE, nc = 4)
#' # draw feature interaction network (FIN)
#' IPP.WW$DrawFIN(threshold = 0.1, lay.out = igraph::layout.circle)
#'
#' #------ THIRD EXAMPLE ------
#' library(kernlab)
#' data("iris")
#' iris.SVM <- ksvm(Species ~ ., data = iris, kernel = "rbfdot", kpar = "automatic", C = 0.1, prob.model = TRUE)
#' # remove the output variable
#' iris.ds = iris[-5]
#' # create IPPModel object
#' IPP.iris = IPPModel$new(XDS=iris.ds, PredFun=iris.SVM,
#' ModelPackage="kernlab", PredType="prob", PredDim=1,
#' XB.Size=200, XB.SamplingMethod="independent")
#' # execute the tasks step by step
#' IPP.iris$SamplingXA() # sampling XA
#' IPP.iris$SamplingXB() # sampling XB
#' IPP.iris$PredictData() # predict
#' IPP.iris$ClusterImpactPlots() # clustering impact plots
#' IPP.iris$BuildTree() # build tree
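#' # intermediate results can be inspected between steps, e.g. the decision
#' # tree rules of the first target feature (a sketch; the output depends
#' # on the fitted model and data):
#' # IPP.iris$TreeRules[[1]]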
#' # draw impact pattern plots (IPP)
#' IPP.iris$DrawIPP(centralized = TRUE, nc = 4)
#' # draw feature interaction network (FIN)
#' IPP.iris$DrawFIN(threshold = 0.3, lay.out = igraph::layout.auto)
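#' # the interaction strengths behind the FIN plot are stored in FIN.Data
#' # (rows: influencing features, columns: target features)
#' print(IPP.iris$FIN.Data)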
#' # write the results into an Excel file
#' IPP.iris$WriteToExcel("output.xlsx")
#'
#' @references Xiaohang Zhang, Ji Zhu, SuBang Choe, Yi Lu and Jing Liu.
#' Exploring black box of supervised learning models: Visualizing the impact
#' of features on prediction. Working paper.
#'
IPPModel <- R6::R6Class(
"IPPModel",
public = list(
################# ATTRIBUTES ################
X.Data = NA, # the dataset of input features
Pred.Fun = NA, # the prediction function
Pred.Type = NA, # the type of prediction
Pred.Dimension = NA, # which class is predicted
Model.Package = NA, # the package name of the interpreted machine learning model
XB.Size = NA, # the size of XB.Sample
XB.SamplingMethod = NA, # sampling method of XB.Sample:
# - "joint": all features are sampled jointly from X.Data
# - "independent": each feature is sampled independently from X.Data
ParaTable = NA, # the parameter table, which is generated by GenerateParaTable()
XA.Sample = NA, # the sample of X_A extracted from X.Data, which is generated by SamplingXA()
XB.Sample = NA, # the sample of X_B extracted from X.Data, which is generated by SamplingXB()
Pred.Res = NA, # prediction results of f(X_A,X_B), which are generated by PredictData()
Clustering.Res = NA, # clustering results, which are generated by ClusterImpactPlots()
TreeRules = NA, # decision tree rules, which are generated by BuildTree()
FIN.Data = NA, # the feature interaction network, which is generated by BuildTree()
TaskFinishTime = NA, # the finish time of tasks
ColorList = c("red", "darkgreen", "blue", "orange", "black", "purple", "pink",
"cyan", "cadetblue"), # the curve colors used for drawing IPPs
################ METHODS ################
#------------------------------------
# initialization
#------------------------------------
initialize = function(XDS, PredFun, PredType, PredDim=1, ModelPackage, XB.Size=200,
XB.SamplingMethod="joint"){
cat("...Initializing\n")
if(!self$CheckInitialization(XDS, PredType, PredDim, ModelPackage, XB.Size, XB.SamplingMethod)){
stop("Parameter initialization error")
}
require(ModelPackage, character.only = TRUE)
x = predict(PredFun, XDS[1,], type = PredType)
if(!is.numeric(x)){
stop("PredFun, XDS and PredType do not match.")
}
self$X.Data = XDS
self$XB.Size = XB.Size
self$XB.SamplingMethod = XB.SamplingMethod
self$Pred.Fun = PredFun
self$Pred.Type = PredType
self$Pred.Dimension = PredDim
self$Model.Package = ModelPackage
self$GenerateParaTable()
self$TaskFinishTime[['Initialize']] = Sys.time()
cat(" ...Finished\n")
cat("...Please check the parameters by accessing the attribute 'ParaTable' before continuing the other tasks.\n")
},
#------------------------------------
# check initialization information
#------------------------------------
CheckInitialization = function(XDS, PredType, PredDim, ModelPackage, XB.Size, XB.SamplingMethod){
if(!is.data.frame(XDS)){
message("...XDS must be a data.frame.")
return (FALSE)
}
check.integer = function(x) x == round(x)
if(!check.integer(PredDim)){
message("...PreDim must be an integer.")
return (FALSE)
}else if(PredDim < 1){
message("...PredDim must be an integer greater than 0.")
return (FALSE)
}
if(!check.integer(XB.Size)){
message("...XB.Size must be an integer.")
return (FALSE)
}else if(XB.Size < 1){
message("...XB.Size must be an integer greater than 0.")
return (FALSE)
}
if(! XB.SamplingMethod %in% c("joint","independent")){
message("...XB.SamplingMethod must be 'joint' or 'independent'.")
return (FALSE)
}
if(!is.character(PredType)){
message("...PredType must a string.")
return (FALSE)
}
if(!is.character(ModelPackage)){
message("...ModelPackage must a string.")
return (FALSE)
}
return (TRUE)
},
#------------------------------------
# generate parameter information
#------------------------------------
GenerateParaTable = function(){
varNum = ncol(self$X.Data)
varNames = names(self$X.Data)
uniqValue = rep(NA,varNum) # the number of unique values
dataType = rep(NA,varNum) # the data type
X_A = rep(TRUE,varNum) # a target feature or not
samplingMethod = rep("all",varNum) # sampling method
L_A = rep(NA,varNum) # the number of levels
centralized = rep(TRUE,varNum) # centralized or not
distMeasure = rep(NA,varNum) # the distance measure
autoK = rep(TRUE,varNum) # whether the number of clusters is determined automatically
numK = rep(5,varNum) # the number of clusters if autoK is FALSE, or the maximum number of clusters if autoK is TRUE
clusteringMethod = rep("kmedoids", varNum) # clustering method
treeDepth = rep(3,varNum) # the maximum depth of decision tree
minSplit = rep(60,varNum) # the minimum number of observations for splitting
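# heuristic typing: factor features keep their ordered/binary/nominal type;
# numeric features with more than 10 unique values are treated as interval
# (to be sampled at 50 equally spaced points), otherwise as ordinal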
for(i in 1:varNum){
uniqValue[i] = length(unique(self$X.Data[,i]))
if(is.factor(self$X.Data[,i])){
if(is.ordered(self$X.Data[,i])){
dataType[i] = "ordinal"
L_A[i] = 0
distMeasure[i] = "euclidean"
}else if(uniqValue[i] == 2){
dataType[i] = "binary"
L_A[i] = 0
distMeasure[i] = "euclidean"
}else{
dataType[i] = "nominal"
L_A[i] = 0
distMeasure[i] = "euclidean"
}
}else if(uniqValue[i] > 10){
dataType[i] = "interval"
L_A[i] = 50
samplingMethod[i] = "equal"
distMeasure[i] = "cosine"
}else{
dataType[i] = "ordinal"
L_A[i] = 0
distMeasure[i] = "euclidean"
}
}
XInfo = data.frame(dataType,uniqValue,
X_A,L_A,samplingMethod,
clusteringMethod, centralized,distMeasure,autoK,numK,
treeDepth,minSplit)
rownames(XInfo) = varNames
# set levels
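# (extending the levels ensures every allowed value can later be assigned to
# a ParaTable cell; this only matters when the character columns above were
# converted to factors, e.g. under stringsAsFactors = TRUE in older R)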
levels(XInfo$dataType) = c(levels(XInfo$dataType), setdiff(c('interval', 'binary', 'nominal', 'ordinal'), levels(XInfo$dataType)))
levels(XInfo$X_A) = c(levels(XInfo$X_A), setdiff(c(TRUE, FALSE), levels(XInfo$X_A)))
levels(XInfo$samplingMethod) = c(levels(XInfo$samplingMethod), setdiff(c('all', 'equal', 'percentile', 'random'), levels(XInfo$samplingMethod)))
levels(XInfo$distMeasure) = c(levels(XInfo$distMeasure), setdiff(c('cosine','euclidean'), levels(XInfo$distMeasure)))
levels(XInfo$centralized) = c(levels(XInfo$centralized), setdiff(c(TRUE, FALSE), levels(XInfo$centralized)))
levels(XInfo$autoK) = c(levels(XInfo$autoK), setdiff(c(TRUE, FALSE), levels(XInfo$autoK)))
levels(XInfo$clusteringMethod) = c(levels(XInfo$clusteringMethod), setdiff(c("kmeans", "kmedoids"), levels(XInfo$clusteringMethod)))
self$ParaTable = XInfo
self$TaskFinishTime[['ParaTable']] = Sys.time()
},
#------------------------------------
# check parameter information
#------------------------------------
CheckParaTable = function(){
if(!all(row.names(self$ParaTable) %in% names(self$X.Data))){
message("The row names of ParaTable are not consistent with the column names of X.Data.")
return (FALSE)
}
if(!all(unique(self$ParaTable[,"dataType"]) %in%
c('interval', 'binary', 'nominal', 'ordinal'))){
message("The 'dataType' in ParaTable must be 'interval', 'binary', 'nominal' or 'ordinal'.")
return (FALSE)
}
if(!all(unique(self$ParaTable[,"X_A"]) %in%
c(TRUE, FALSE))){
message("The 'X_A' in ParaTable must be TRUE or FALSE.")
return (FALSE)
}
if(!all(unique(self$ParaTable[,"centralized"]) %in% c(TRUE, FALSE))){
message("The 'centralized' in ParaTable must be TRUE or FALSE.")
return (FALSE)
}
if(!all(unique(self$ParaTable[,"autoK"]) %in% c(TRUE, FALSE))){
message("The 'autoK' in ParaTable must be TRUE or FALSE.")
return (FALSE)
}
if(!all(unique(self$ParaTable[,"samplingMethod"]) %in% c('all', 'equal', 'percentile', 'random'))){
message("The 'samplingMethod' in ParaTable must be 'all', 'equal', 'percentile' or 'random'.")
return (FALSE)
}
if(!all(unique(self$ParaTable[,"clusteringMethod"]) %in% c("kmeans", "kmedoids"))){
message("The 'clusteringMethod' in ParaTable must be 'kmeans' or 'kmedoids'.")
return (FALSE)
}
if(!all(unique(self$ParaTable[,"distMeasure"]) %in% c("cosine", "euclidean"))){
message("The 'distMeasure' in ParaTable must be 'cosine' or 'euclidean'.")
return (FALSE)
}
check.integer = function(x) x == round(x)
if(!all(check.integer(self$ParaTable[,"L_A"]))){
message("The 'L_A' in ParaTable must be integers.")
return (FALSE)
}else if(!all(self$ParaTable[,"L_A"] >= 0)){
message("The 'L_A' in ParaTable must be non-negative integers.")
return (FALSE)
}
if(!all(check.integer(self$ParaTable[,"numK"]))){
message("The 'numK' in ParaTable must be integers.")
return (FALSE)
}else if(!all(self$ParaTable[,"numK"] >= 1)){
message("The 'numK' in ParaTable must be integers greater than 0.")
return (FALSE)
}
if(!all(check.integer(self$ParaTable[,"treeDepth"]))){
message("The 'treeDepth' in ParaTable must be integers.")
return (FALSE)
}else if(!all(self$ParaTable[,"treeDepth"] >= 1)){
message("The 'treeDepth' in ParaTable must be integers greater than 0.")
return (FALSE)
}
if(!all(check.integer(self$ParaTable[,"minSplit"]))){
message("The 'minSplit' in ParaTable must be integers.")
return (FALSE)
}else if(!all(self$ParaTable[,"minSplit"] >= 1)){
message("The 'minSplit' in ParaTable must be integers greater than 0.")
return (FALSE)
}
return (TRUE)
},
#------------------------------------
# sampling X_A
#------------------------------------
SamplingXA = function(){
cat("...Sampling X_A\n")
if(!self$CheckInitialization(self$X.Data, self$Pred.Type, self$Pred.Dimension, self$Model.Package, self$XB.Size, self$XB.SamplingMethod)){
stop("Parameter initialization error")
}
if(!self$CheckParaTable()){
stop("The parameters in ParaTable are wrong.")
}
X_A = list()
AList = rownames(self$ParaTable)[self$ParaTable["X_A"] == TRUE]
xnames = names(self$X.Data)
for(i in 1:length(AList)){
var = AList[i]
LA = self$ParaTable[var, "L_A"]
if(self$ParaTable[var, "dataType"] == "interval"){
sM = self$ParaTable[var, "samplingMethod"]
# X_A values for an interval feature: LA == 0 uses all unique observed
# values; otherwise LA points are drawn by the chosen sampling method
if(LA == 0){
X_A[[var]] = sort(unique(self$X.Data[,var]))
}else if(sM == 'equal'){
minV = min(self$X.Data[,var],na.rm = TRUE)
maxV = max(self$X.Data[,var],na.rm = TRUE)
X_A[[var]] = seq(minV,maxV,(maxV - minV) / (LA - 1))
}else if(sM == 'percentile'){
X_A[[var]] = quantile(self$X.Data[,var], probs = seq(0, 1, 1 / (LA - 1)))
}else{ # random
X_A[[var]] = sort(sample(unique(self$X.Data[,var]), LA))
}
}else{
# X_A is sampled from dataset
temp = unique(self$X.Data[,var])
if(LA == 0 || length(temp) <= LA)
X_A[[var]] = sort(temp)
else
X_A[[var]] = sort(sample(temp, LA))
}
}
self$XA.Sample = X_A
self$TaskFinishTime[['XA.Sample']] = Sys.time()
cat(" ...Finished\n")
},
#------------------------------------
# sampling X_B
#------------------------------------
SamplingXB = function(){
cat("...Sampling X_B\n")
if(!self$CheckInitialization(self$X.Data, self$Pred.Type, self$Pred.Dimension, self$Model.Package, self$XB.Size, self$XB.SamplingMethod)){
stop("Parameter initialization error")
}
if(!self$CheckParaTable()){
stop("The parameters in ParaTable are wrong.")
}
if(self$XB.SamplingMethod == "joint"){
# All features in X_B are sampled together as whole rows from the dataset,
# so the joint distribution of X_B is preserved.
if(self$XB.Size >= nrow(self$X.Data)){
self$XB.Sample = self$X.Data
}else{
self$XB.Sample = self$X.Data[sample(1:nrow(self$X.Data), self$XB.Size),]
}
}else{
# Each feature in X_B is sampled independently.
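# (bootstrap resampling of each column separately; correlations between
# features are deliberately broken, per XB.SamplingMethod = "independent")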
X_B = data.frame(matrix(NA, nrow = self$XB.Size, ncol = ncol(self$X.Data)))
names(X_B) = names(self$X.Data)
for(fea in names(self$X.Data)){
X_B[[fea]] = sample(self$X.Data[[fea]], self$XB.Size, replace = TRUE)
}
self$XB.Sample = X_B
}
self$TaskFinishTime[['XB.Sample']] = Sys.time()
cat(" ...Finished\n")
},
#------------------------------------
# prediction of X_A and X_B
#------------------------------------
PredictData = function(){
cat("...Predicting\n")
if(!self$CheckInitialization(self$X.Data, self$Pred.Type, self$Pred.Dimension, self$Model.Package, self$XB.Size, self$XB.SamplingMethod)){
stop("Parameter initialization error")
}
if(!self$CheckParaTable()){
stop("The parameters in ParaTable are wrong.")
}
if(!is.list(self$XA.Sample)){
stop("XA.Sample data is not correct.")
}else if(!all(names(self$XA.Sample) %in% row.names(self$ParaTable))){
stop("XA.Sample data is not correct.")
}
if(!is.data.frame(self$XB.Sample)){
stop("XB.Sample data is not correct.")
}else if(!all(names(self$XB.Sample) %in% row.names(self$ParaTable))){
stop("XB.Sample data is not correct.")
}
AList = rownames(self$ParaTable)[self$ParaTable["X_A"] == TRUE]
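# one parallel worker per target feature; each worker calls
# DataPredictionSingle, which evaluates f(X_A, X_B) for its feature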
cluster <- parallel::makeCluster(length(AList))
parallel::clusterExport(cl=cluster, varlist=c("predict"), envir = environment())
res = parallel::parLapply(cl = cluster,X = 1:length(AList), fun = private$DataPredictionSingle)
parallel::stopCluster(cluster)
names(res) = AList
self$Pred.Res = res
self$TaskFinishTime[['Pred.Res']] = Sys.time()
cat(" ...Finished\n")
},
#------------------------------------
# cluster the impact curves (k-means or k-medoids)
#------------------------------------
ClusterImpactPlots = function(){
cat("...Clustering\n")
if(!self$CheckInitialization(self$X.Data, self$Pred.Type, self$Pred.Dimension, self$Model.Package, self$XB.Size, self$XB.SamplingMethod)){
stop("Parameter initialization error")
}
if(!self$CheckParaTable()){
stop("The parameters in ParaTable are wrong.")
}
AList = rownames(self$ParaTable)[self$ParaTable["X_A"] == TRUE]
if(!is.list(self$Pred.Res)){
stop("Pred.Res data is not correct. You need to predict first before clustering.")
}else if(!all(names(self$Pred.Res) %in% AList)){
stop("Pred.Res data is not correct. You need to predict first before clsutering.")
}
# parallel clustering
cluster <- parallel::makeCluster(length(AList))
parallel::clusterExport(cl=cluster, varlist=c(), envir = environment())
res = parallel::parLapply(cl = cluster,X = 1:length(AList), fun = private$ClusteringSingle)
parallel::stopCluster(cluster)
names(res) = AList
self$Clustering.Res = res
self$TaskFinishTime[['Clustering.Res']] = Sys.time()
cat(" ...Finished\n")
},
#------------------------------------
# build decision tree
#------------------------------------
BuildTree = function(){
cat("...Building Decision Trees\n")
if(!self$CheckInitialization(self$X.Data, self$Pred.Type, self$Pred.Dimension, self$Model.Package, self$XB.Size, self$XB.SamplingMethod)){
stop("Parameter initialization error")
}
if(!self$CheckParaTable()){
stop("The parameters in ParaTable are wrong.")
}
AList = rownames(self$ParaTable)[self$ParaTable["X_A"] == TRUE]
if(!is.list(self$Clustering.Res)){
stop("Clustering.Res data is not correct. You need to cluster first before building trees.")
}else if(!all(names(self$Clustering.Res) %in% AList)){
stop("Clustering.Res data is not correct. You need to cluster first before building trees.")
}
cluster <- parallel::makeCluster(length(AList))
parallel::clusterExport(cl=cluster, varlist=c(), envir = environment())
res = parallel::parLapply(cl = cluster,X = 1:length(AList), fun = private$BuildingTreeSingle)
parallel::stopCluster(cluster)
names(res) = AList
self$TreeRules = res
# Build feature interaction network (FIN)
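# rows of FIN.Data are the influencing features ("from"), columns are the
# target features ("to"); cell values are the normalized variable
# importances from each target feature's cluster-membership tree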
self$FIN.Data = data.frame(matrix(NA, nrow = ncol(self$X.Data), ncol = length(AList)),
row.names = names(self$X.Data))
colnames(self$FIN.Data) = AList
for(fea in AList){
varImp = res[[fea]][['varImp']]
self$FIN.Data[names(varImp), fea] = varImp
}
self$TaskFinishTime[['TreeRules']] = Sys.time()
self$TaskFinishTime[['FIN.Data']] = Sys.time()
cat(" ...Finished\n")
},
#------------------------------------
# draw impact pattern plots (IPP)
#------------------------------------
DrawIPP = function(centralized=TRUE, nc=4){
AList = rownames(self$ParaTable)[self$ParaTable["X_A"] == TRUE]
if(!is.list(self$Clustering.Res)){
stop("Clustering.Res data is not correct. You need to cluster first before drawing IPPs.")
}else if(!all(names(self$Clustering.Res) %in% AList)){
stop("Clustering.Res data is not correct. You need to cluster first before drawing IPPs.")
}
nr = ceiling(length(AList) / nc)
par(mfcol = c(nr, nc), mai = c(.7,.5,.3,.5))
for(fea in AList){
IPPs = self$Clustering.Res[[fea]][["medoids"]]
if(centralized){
if(is.vector(IPPs))
IPPs = IPPs - mean(IPPs)
else
IPPs = IPPs - rowMeans(IPPs)
}
bestK = self$Clustering.Res[[fea]][["bestK"]]
XA = self$XA.Sample[[fea]]
ymax = max(IPPs)
ymin = min(IPPs)
y = IPPs[1,]
if(self$ParaTable[fea,"dataType"] == "interval"){
plot(x = XA, y = y, type = "l", xlab = fea,
ylab = "prediction", ylim = c(ymin, ymax), col = self$ColorList[1])
}else{
plot(x = XA, y = y, type = "l", xlab = fea, xaxt = "n",
ylab = "prediction", ylim = c(ymin, ymax), col = self$ColorList[1])
axis(1, labels = as.character(XA), at = as.numeric(XA))
}
if(bestK > 1){
for(i in 2:bestK){
if(i > length(self$ColorList))
coll = "black"
else
coll = self$ColorList[i]
lines(x = XA, y = IPPs[i,], type = "l", col = coll)
}
}
}
#dev.off()
par(mfcol = c(1, 1))
},
#------------------------------------
# write results to excel file
#------------------------------------
WriteToExcel = function(excelName){
cat("...Write Information\n")
require("xlsx",character.only = TRUE)
wb <- xlsx::createWorkbook()
cs1 <- xlsx::CellStyle(wb) + xlsx::Font(wb, isBold = TRUE, color = "red")
cs2 <- xlsx::CellStyle(wb) + xlsx::Font(wb, isItalic = TRUE, color = "blue")
# write the parameter information
sheet <- xlsx::createSheet(wb, sheetName="Parameters")
xlsx::addDataFrame(x = self$ParaTable, sheet = sheet,
row.names = TRUE, col.names = TRUE,
startColumn = 1, startRow = 1,
rownamesStyle = cs2, colnamesStyle = cs1)
paraDs = data.frame(matrix(NA, nrow = 3, ncol = 1))
colnames(paraDs) = "name"
row.names(paraDs) = c("Model.Package", "XB.Size", "XB.SamplingMethod")
paraDs["Model.Package","name"] = self$Model.Package
paraDs["XB.Size","name"] = self$XB.Size
paraDs["XB.SamplingMethod","name"] = self$XB.SamplingMethod
nr = nrow(self$ParaTable) + 3
xlsx::addDataFrame(x = paraDs, sheet = sheet,
row.names = TRUE, col.names = FALSE,
startColumn = 1, startRow = nr,
rownamesStyle = cs2)
AList = rownames(self$ParaTable)[self$ParaTable["X_A"] == TRUE]
for(fea in AList){
clusRes = self$Clustering.Res[[fea]]
treeRules = self$TreeRules[[fea]]
# import parameters
bestK = clusRes$bestK
BVar = as.character(treeRules$depenVar)
AData = self$XA.Sample[[fea]]
clusterData = self$Pred.Res[[fea]]
BData = self$XB.Sample
groups = clusRes$groups
IPPs = clusRes$medoids
if(!is.factor(AData)){
xlabel = AData
}else{
xlabel = as.character(AData)
}
#-----create worksheet-----
sheet <- xlsx::createSheet(wb, sheetName=fea)
row.n = 1
#-----write impact pattern plots-----
xlsx::addDataFrame(x = data.frame("IMPACT PATTERNS"), sheet = sheet,
row.names = FALSE, col.names = FALSE,
startColumn = 1, startRow = row.n, colStyle = list('1'=cs1))
row.n = row.n + 1
data = data.frame(t(xlabel))
rownames(data) = fea
xlsx::addDataFrame(x = data, sheet = sheet,
row.names = TRUE, col.names = FALSE,
startColumn = 1, startRow = row.n)
row.n = row.n + 1
row.names(IPPs) = paste("cluster", 1:bestK, sep = "-")
xlsx::addDataFrame(x = IPPs, sheet = sheet,
row.names = TRUE, col.names = FALSE,
startColumn = 1, startRow = row.n)
row.n = row.n + nrow(IPPs) + 2
#-----write cluster information-----
xlsx::addDataFrame(x = data.frame("CLUSTER INFORMATION"), sheet = sheet,
row.names = FALSE, col.names = FALSE,
startColumn = 1, startRow = row.n, colStyle = list('1'=cs1))
row.n = row.n + 1
for(j in 1:bestK){
xlsx::addDataFrame(x = data.frame(paste("cluster",j,sep = '-')), sheet = sheet,
row.names = FALSE, col.names = FALSE,
startColumn = 1, startRow = row.n,colStyle = list('1'=cs2))
row.n = row.n + 1
data = data.frame(t(xlabel))
rownames(data) = fea
xlsx::addDataFrame(x = data, sheet = sheet,
row.names = TRUE, col.names = FALSE,
startColumn = 1, startRow = row.n)
if(length(which(groups == j)) > 1){
minL = apply(clusterData[groups == j,],2,min)
maxL = apply(clusterData[groups == j,],2,max)
avgL = apply(clusterData[groups == j,],2,mean)
}else{
minL = clusterData[groups == j,]
maxL = minL
avgL = minL
}
data = rbind(minL,avgL,maxL)
rownames(data) = c("min","mean","max")
row.n = row.n + 1
xlsx::addDataFrame(x = data, sheet = sheet,
row.names = TRUE, col.names = FALSE,
startColumn = 1, startRow = row.n)
row.n = row.n + 4
}
#-----write tree rules-----
if(bestK > 1){
row.n = row.n + 1
xlsx::addDataFrame(x = data.frame("TREE RULES"), sheet = sheet,
row.names = FALSE, col.names = FALSE,
startColumn = 1, startRow = row.n, colStyle = list('1'=cs1))
row.n = row.n + 1
xlsx::addDataFrame(x = treeRules$rules, sheet = sheet,
row.names = FALSE, col.names = TRUE,
startColumn = 1, startRow = row.n)
if(length(BVar) > 0){ # tree rules exist
row.n = row.n + nrow(treeRules$rules) + 1
xlsx::addDataFrame(x = paste("maxDepth = ", treeRules$maxDepth),
sheet = sheet,
row.names = FALSE, col.names = FALSE,
startColumn = 1, startRow = row.n)
}
}
else{ # only one cluster
row.n = row.n + 1
xlsx::addDataFrame(x = data.frame("NO RULES"), sheet = sheet,
row.names = FALSE, col.names = FALSE,
startColumn = 1, startRow = row.n, colStyle = list('1'=cs1))
}
}
# write feature interaction network (FIN)
sheet <- xlsx::createSheet(wb, sheetName="FIN")
xlsx::addDataFrame(x = data.frame("FROM"), sheet = sheet,
row.names = FALSE, col.names = FALSE,
startColumn = 1, startRow = 3, colStyle = list('1'=cs1))
xlsx::addDataFrame(x = data.frame("TO"), sheet = sheet,
row.names = FALSE, col.names = FALSE,
startColumn = 3, startRow = 1, colStyle = list('1'=cs1))
xlsx::addDataFrame(x = self$FIN.Data, sheet = sheet,
row.names = TRUE, col.names = TRUE,
startColumn = 2, startRow = 2)
# save data
xlsx::saveWorkbook(wb, excelName)
cat(" ...Finished\n")
},
#------------------------------------
# draw feature interaction network (FIN)
#------------------------------------
DrawFIN = function(threshold = 0, lay.out = igraph::layout.auto){
if(!is.data.frame(self$FIN.Data)){
stop("FIN data is not correct. You need to build decision tree before drawing FIN.")
}
# create graph adjacency matrix
netMatrix = as.matrix(self$FIN.Data)
netMatrix[is.na(netMatrix)] <- 0
netMatrix[netMatrix < threshold] <- 0
# draw network
g = igraph::graph_from_adjacency_matrix(netMatrix, mode = "directed", weighted = TRUE)
igraph::plot.igraph(g, layout=lay.out)
},
#------------------------------------
# Sampling, Predicting, Clustering and Building Trees
#------------------------------------
ExecuteAll = function(){
self$SamplingXA()
self$SamplingXB()
self$PredictData()
self$ClusterImpactPlots()
self$BuildTree()
}
),
private = list(
#------------------------------------
# prediction of X_A and X_B for single feature
#------------------------------------
DataPredictionSingle = function(X){
require(self$Model.Package,character.only = TRUE)
AList = rownames(self$ParaTable)[self$ParaTable["X_A"] == TRUE]
fea = AList[X]
res = list()
X.ds = self$XB.Sample
for(i in 1:length(self$XA.Sample[[fea]])){ # Cartesian product of XB.Sample and XA.Sample
x.a = self$XA.Sample[[fea]][i]
X.ds[fea] = x.a
predRes = predict(self$Pred.Fun, X.ds, type=self$Pred.Type) # predict
if(is.vector(predRes)){ # regression problem
res[[as.character(x.a)]] = predRes
}else{ # classification problem
if(ncol(predRes) < self$Pred.Dimension)
self$Pred.Dimension = 1
res[[as.character(x.a)]] = predRes[,self$Pred.Dimension]
}
}
return (as.data.frame(res))
},
#------------------------------------
# calculate Cosine similarity
#------------------------------------
CosineSim = function(X){
n = ncol(X)
sim = matrix(NA,n,n)
for(i in 1:(n-1)){
sim[i,i] = 1
for(j in (i+1):n){
sim[i,j] = sum(X[,i] * X[,j]) /
(sqrt(sum(X[,i] * X[,i])) * sqrt(sum(X[,j] * X[,j])))
sim[j,i] = sim[i,j]
}
}
sim[n,n] = 1
return (sim)
},
#------------------------------------
# clustering for single feature
#------------------------------------
ClusteringSingle = function(X){
AList = rownames(self$ParaTable)[self$ParaTable["X_A"] == TRUE]
fea = AList[X]
clusData = self$Pred.Res[[fea]]
original.clusData = clusData
centralized = self$ParaTable[fea, "centralized"]
distMeasure = self$ParaTable[fea, "distMeasure"]
autoK = self$ParaTable[fea, "autoK"]
numK = self$ParaTable[fea, "numK"]
clusterMethod = self$ParaTable[fea, "clusteringMethod"]
# -----Centralize the dataset-----
if(centralized){
clusData = clusData - rowMeans(clusData)
}
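# flat impact curves become zero vectors after centralizing; replace them
# with a small constant so the cosine similarity below is defined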
if(distMeasure == "cosine"){
for(i in 1:nrow(clusData)){
if(sum(clusData[i,]* clusData[i,]) == 0)
clusData[i,] = 0.001
}
}
# -----Calculate the distance matrix-----
if(autoK || clusterMethod == 'kmedoids'){
if(distMeasure == "euclidean")
distMatrix = dist(clusData)
if(distMeasure == "cosine")
distMatrix = 1 - private$CosineSim(t(clusData))
if(all(distMatrix < 0.00001)){
if(clusterMethod == 'kmeans')
return (list(groups = rep(1, self$XB.Size), medoids = t(as.data.frame(colMeans(original.clusData))), bestK = 1))
else
return (list(groups = rep(1, self$XB.Size), medoids = original.clusData[1,], bestK = 1))
}
}
# -----only one cluster-----
if (numK == 1){
if(clusterMethod == 'kmeans'){
return (list(groups = rep(1, self$XB.Size), medoids = t(as.data.frame(colMeans(original.clusData))), bestK = 1))
}else{
distSum = rowSums(as.matrix(distMatrix))
ind = (1:length(distSum))[distSum == min(distSum)]
if(length(ind) > 1) ind = ind[1]
return (list(groups = rep(1, self$XB.Size), medoids = original.clusData[ind,], bestK = 1))
}
}
# -----Clustering by k-medoids or k-means method-----
if(autoK){ # the number of clusters is determined automatically via the Dunn index
clusRes = list()
bestDunn = 0
bestK = 1
for(k in 2:numK){
# clustering
if(clusterMethod == 'kmeans'){ #k-means
if(distMeasure == "euclidean"){
temp = akmeans::akmeans(x = clusData, min.k = k, max.k = k, d.metric = 1, ths1 = 1e10)
clusRes[[k]] = list(clustering = temp$cluster,medoids = temp$centers)
}else{ # cosine
temp = akmeans::akmeans(x = clusData, min.k = k, max.k = k, d.metric = 2, ths1 = 1e10)
clusRes[[k]] = list(clustering = temp$cluster,medoids = temp$centers)
}
}else{ # k-medoid
temp = cluster::pam(x = distMatrix,k = k, diss = TRUE)
clusRes[[k]] = list(clustering = temp$clustering,medoids = temp$medoids)
}
# calculate dunn to determine the number of clusters
clusterIndex = fpc::cluster.stats(d = distMatrix, clustering = clusRes[[k]]$clustering,
silhouette = FALSE)$dunn
if(is.infinite(clusterIndex)){
if(clusterMethod == 'kmedoids')
return(list(groups = rep(1,self$XB.Size), medoids = original.clusData[1,], bestK = 1))
else
return(list(groups = rep(1,self$XB.Size), medoids = t(as.data.frame(colMeans(original.clusData))), bestK = 1))
}
if(clusterIndex > bestDunn){
bestDunn = clusterIndex
bestK = k
}
groups = clusRes[[bestK]]$clustering
if(clusterMethod == 'kmeans')
medoids = clusRes[[bestK]]$medoids
else
medoids = original.clusData[clusRes[[bestK]]$medoids,]
}
}else{ # the number of clusters is specified by the user
bestK = numK
if(clusterMethod == 'kmeans'){
if(distMeasure == "euclidean"){
temp = akmeans::akmeans(x = clusData, min.k = bestK, max.k = bestK, d.metric = 1, ths1 = 1e10)
}else{ # cosine
temp = akmeans::akmeans(x = clusData, min.k = bestK, max.k = bestK, d.metric = 2, ths1 = 1e10)
}
groups = temp$cluster
medoids = temp$centers
}else{
temp = cluster::pam(x = distMatrix,k = bestK, diss = TRUE)
groups = temp$clustering
medoids = original.clusData[temp$medoids,]
}
}
if(bestK == 1){ # only one cluster
groups = rep(1,self$XB.Size)
if(clusterMethod == 'kmeans')
medoids = t(as.data.frame(colMeans(original.clusData)))
else
medoids = original.clusData[1,]
}
return (list(groups = groups, medoids = as.data.frame(medoids), bestK = bestK))
},
#------------------------------------
# build decision tree for single feature
#------------------------------------
BuildingTreeSingle = function(X){
AList = rownames(self$ParaTable)[self$ParaTable["X_A"] == TRUE]
feaList = names(self$X.Data)
fea = AList[X]
maxDepth = self$ParaTable[fea, "treeDepth"]
minSplit = self$ParaTable[fea, "minSplit"]
groups = self$Clustering.Res[[fea]][["groups"]]
bestK = self$Clustering.Res[[fea]][["bestK"]]
BData = self$XB.Sample[feaList[! fea == feaList]]
# there is only one cluster
if(bestK == 1){
return (list(rules = NA,depenVar = NA,maxDepth = NA,varImp = NA))
}
#-----build tree model-----
md = maxDepth
continue = TRUE
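# if rpart returns a root-only tree (a single leaf), retry with a larger
# maximum depth, up to twice the configured treeDepth, so that at least
# one split is attempted before giving up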
while(continue){
treeModel = rpart::rpart(groups ~ .,data = BData, method = "class",
control = list(maxdepth = md, minsplit = minSplit))
temp = treeModel$frame
leafNum = rownames(temp)[temp["var"] == "<leaf>"]
if(length(leafNum) == 1 & md < 2 * maxDepth){
md = md + 1
}else{
continue = FALSE
}
}
# if(length(leafNum) == 1){
# return (list(rules = NA,depenVar = NA,maxDepth = NA,varImp = NA))
# }
#-----add links to FIN-----
varImp = treeModel$variable.importance
varImp = varImp / sum(varImp)
#-----extract rules-----
temp = treeModel$frame
leafNum = rownames(temp)[temp["var"] == "<leaf>"]
depenVar = unique(temp[temp["var"] != "<leaf>","var"])
rules = rpart::path.rpart(treeModel,leafNum,print.it = FALSE)
rules.d = rep(NA,length(rules))
for(i in 1:length(rules))
rules.d[i] = paste(rules[[i]][-1],collapse = " & ")
#-----extract the distribution of class in each leaf node (rule)-----
unNum = sort(unique(treeModel$where))
seqNum = data.frame('num' = 1:length(unNum))
row.names(seqNum) = unNum
xyz = data.frame("class" = seqNum[as.character(treeModel$where),"num"],groups)
xx = as.data.frame.matrix(table(xyz))
colnames(xx) = paste("clu", names(xx),sep = '-')
return (list(rules = data.frame("rules" = rules.d,xx),
depenVar = depenVar,maxDepth = md,
varImp = varImp))
}
)
)