R/IPPModel.R

#' Class providing object with methods for drawing IPPs and FIN
#'
#' The class provides objects with methods for drawing impact pattern plots (IPPs) and feature interaction network (FIN).
#' @aliases IPP FIN
#' @docType class
#'
#' @export
#' @keywords data
#'
#' @return Object of \code{R6Class} with methods for drawing IPPs and FIN.
#'
#' @format \code{R6Class} object.
#'
#' @field \link{X.Data} a data.frame, the dataset of input features.
#' @field \link{Pred.Fun} an object, the prediction function. It can be any
#'   model created by "nnet", "randomforest" and "kernlab" etc.
#' @field \link{Model.Package} a string, the package name of the interpreted
#'   machine learning model, such as "nnet" and "randomforest".
#' @field \link{Pred.Type} a string, the type of prediction.
#' @field \link{Pred.Dimension} an integer, indicating which class is predicted.
#'   This field is used only for classification model.
#' @field \link{XB.Size} an integer, the size of \link{XB.Sample}.
#' @field \link{XB.SamplingMethod} a string, the sampling method of
#'   \link{XB.Sample}, "joint" or "independent". "joint" means that all features
#'   are sampled from \link{X.Data} jointly. "independent" means that each
#'   feature is sampled independently; then all features are combined randomly.
#' @field \link{ParaTable} a data.frame, the parameter table. It is generated by
#'   method \link{GenerateParaTable}.
#' @field \link{XA.Sample} a list, the sample of X_A extracted from
#'   \link{X.Data}. It is generated by method \link{SamplingXA}.
#' @field \link{XB.Sample} a list, the sample of X_B extracted from
#'   \link{X.Data}. It is generated by method \link{SamplingXB}.
#' @field \link{Pred.Res} a list, the prediction results of f(X_A,X_B), which is
#'   generated by method \link{PredictData}.
#' @field \link{Clustering.Res} a list, the clustering results, which is
#'   generated by method \link{ClusterImpactPlots}.
#' @field \link{TreeRules} a list, the decision tree rules, which is generated
#'   by method \link{BuildTree}.
#' @field \link{FIN.Data} a data.frame, the feature interaction network, which
#'   is generated by method \link{BuildTree}.
#' @field \link{ColorList} a list, the curve colors used for drawing IPPs.
#' @field \link{TaskFinishTime} a list, the finishing time of tasks.
#' @section Methods: \describe{ \item{\link{initialize}}{initialize some fields
#'   of object and excute the method \link{CheckInitialization} and
#'   \link{GenerateParaTable}.} \item{\link{CheckInitialization}}{validate the
#'   initialization information.} \item{\link{GenerateParaTable}}{Generate the
#'   parameter table \link{ParaTable}. } \item{\link{CheckParaTable}}{validate
#'   the information in \link{ParaTable}.} \item{\link{SamplingXA}}{sampling
#'   \link{XA.Sample} from \link{X.Data}.} \item{\link{SamplingXB}}{sampling
#'   \link{XB.Sample} from \link{X.Data}.} \item{\link{PredictData}}{predict
#'   data using \link{Pred.Fun} based on \link{XA.Sample} and \link{XB.Sample}.}
#'   \item{\link{ClusterImpactPlots}}{cluster the impact curves of each feature
#'   based on the predicting results \link{Pred.Res}.}
#'   \item{\link{BuildTree}}{build decision tree based on the clustering results
#'   \link{Clustering.Res}.} \item{\link{DrawIPP}}{draw the impact pattern
#'   plots.} \item{\link{DrawFIN}}{draw the feature interaction network.}
#'   \item{\link{WriteToExcel}}{write the results to an excel file.}
#'   \item{\link{ExecuteAll}}{execute the methods \link{SamplingXA},
#'   \link{SamplingXB}, \link{PredictData}, \link{ClusterImpactPlots} and
#'   \link{BuildTree} in sequence.}
#'
#'   }
#' @examples
#' library(IPPModel)
#' library(igraph)
#'
#' #------ FIRST EXAMPLE ------
#' library(nnet)
#' data("bank")
#' # build model
#' bank.NN <- nnet(y ~ ., data = bank, size = 5, maxit = 1000)
#' # remove the output variable
#' bank.ds = bank[-17]
#' # create IPPModel object
#' IPP.bank = IPPModel$new(XDS=bank.ds, PredFun=bank.NN,
#'                         ModelPackage="nnet", PredType="raw", PredDim=1,
#'                         XB.Size=1000, XB.SamplingMethod="joint")
#' # modify the clustering method to "kmedoids"
#' IPP.bank$ParaTable$clusteringMethod = "kmedoids"
#' # execute all tasks
#' IPP.bank$ExecuteAll()
#' # draw impact pattern plots (IPP)
#' IPP.bank$DrawIPP(centralized = TRUE, nc = 4)
#' # draw feature interaction network (FIN)
#' IPP.bank$DrawFIN(threshold = 0.2, lay.out = igraph::layout.auto)
#' # write the results into an excel file
#' IPP.bank$WriteToExcel("output.xlsx")
#'
#' #------ SECOND EXAMPLE ------
#' library(randomForest)
#' data("whitewine")
#' # build model
#' WW.RF <- randomForest(quality ~ ., data = whitewine, mtry = 4,importance=TRUE, na.action=na.omit)
#' # remove the output variable
#' WW.ds = whitewine[-12]
#' # create IPPModel object
#' IPP.WW = IPPModel$new(XDS=WW.ds, PredFun=WW.RF,
#'                       ModelPackage="randomForest", PredType="response", PredDim=1,
#'                       XB.Size=1000, XB.SamplingMethod="joint")
#' # set the maximum depth of trees to be 5
#' IPP.WW$ParaTable$treeDepth = 5
#' # execute all tasks
#' IPP.WW$ExecuteAll()
#' # draw impact pattern plots (IPP)
#' IPP.WW$DrawIPP(centralized = TRUE, nc = 4)
#' # draw feature interaction network (FIN)
#' IPP.WW$DrawFIN(threshold = 0.1, lay.out = igraph::layout.circle)
#'
#' #------ THIRD EXAMPLE ------
#' library(kernlab)
#' data("iris")
#' iris.SVM <- ksvm(Species ~ ., data = iris,kernel="rbfdot", kpar="automatic",C=0.1, prob.model = TRUE)
#' # remove the output variable
#' iris.ds = iris[-5]
#' # create IPPModel object
#' IPP.iris = IPPModel$new(XDS=iris.ds, PredFun=iris.SVM,
#'                         ModelPackage="kernlab", PredType="prob", PredDim=1,
#'                         XB.Size=200, XB.SamplingMethod="independent")
#' # execute the tasks step by step
#' IPP.iris$SamplingXA()  # sampling XA
#' IPP.iris$SamplingXB()  # sampling XB
#' IPP.iris$PredictData()  # predict
#' IPP.iris$ClusterImpactPlots() # clustering impact plots
#' IPP.iris$BuildTree()  # build tree
#' # draw impact pattern plots (IPP)
#' IPP.iris$DrawIPP(centralized = TRUE, nc = 4)
#' # draw feature interaction network (FIN)
#' IPP.iris$DrawFIN(threshold = 0.3, lay.out = igraph::layout.auto)
#' # write the results into an excel file
#' IPP.iris$WriteToExcel("output.xlsx")
#'
#' @references Xiaohang Zhang, Ji Zhu, SuBang Choe, Yi Lu and Jing Liu.
#'   Exploring black box of supervised learning models: Visualizing the impact
#'   of features on prediction. Working paper.
#'

IPPModel <- R6::R6Class(
    "IPPModel",
    public = list(
        ################# ATTRIBUTES ################

        X.Data = NA, # the dataset of input features
        Pred.Fun = NA, # the prediction function
        Pred.Type = NA, # the type of prediction
        Pred.Dimension = NA, # which class is predicted
        Model.Package = NA, # the package name of the interpreted machine learnign model
        XB.Size = NA, # the size of XB.Sample
        XB.SamplingMethod = NA, # sampling method of XB.Sample:
            #  - "joint": all features are sampled jointly from X.Data
            #  - "indepedent": all features are sampled independely from X.Data

        ParaTable = NA, # the information of X.Data, which is generated by GenerateParaInfo()
        XA.Sample = NA, # the sample of X_A extracted from X.Data, which is generate by SamplingXA()
        XB.Sample = NA, # the sample of X_B extracted from X.Data, which is generate by SamplingXB()

        Pred.Res = NA, # prediction results of f(X_A,X_B), which is generate by DataPrediction()
        Clustering.Res = NA, # clustering results, which is generate by Clustering()
        TreeRules = NA, # decision tree rules, which is generated by BuildingTree()
        FIN.Data = NA, # feature interaction networks, which is generated by BuildingTree()

        TaskFinishTime = NA, # the finish time of tasks
        ColorList = c("red", "darkgreen", "blue", "orange", "black", "purple", "pink",
                      "cyan", "cadeblue"), # the line colors used for drawing IPPs

        ################ METHODS ################

        #------------------------------------
        # initialization
        #------------------------------------

        initialize = function(XDS, PredFun, PredType, PredDim=1, ModelPackage, XB.Size=200,
                              XB.SamplingMethod="joint"){
            cat("...Initializing\n")

            if(!self$CheckInitialization(XDS, PredType, PredDim, ModelPackage, XB.Size, XB.SamplingMethod)){
                stop("Parameter initialization error")
            }

            require(ModelPackage, character.only = TRUE)
            x = predict(PredFun, XDS[1,], type = PredType)
            if(!is.numeric(x)){
                stop("PredFun, XDS and PredType do not match.")
            }

            self$X.Data = XDS
            self$XB.Size = XB.Size
            self$XB.SamplingMethod = XB.SamplingMethod
            self$Pred.Fun = PredFun
            self$Pred.Type = PredType
            self$Pred.Dimension = PredDim
            self$Model.Package = ModelPackage



            self$GenerateParaTable()
            self$TaskFinishTime[['Initializ']] = Sys.time()
            cat("   ...Finished\n")
            cat("...Please check the parameters by accessing the attribute 'ParaTable' before continuing the other tasks.\n")
        },

        #------------------------------------
        # check initialization information
        #------------------------------------
        CheckInitialization = function(XDS, PredType, PredDim, ModelPackage, XB.Size, XB.SamplingMethod){
            if(!is.data.frame(XDS)){
                message("...XDS must be a data.frame.")
                return (FALSE)
            }

            check.integer = function(x) x == round(x)

            if(!check.integer(PredDim)){
                message("...PreDim must be an integer.")
                return (FALSE)
            }else if(PredDim < 1){
                message("...PredDim must be an integer greater than 0.")
                return (FALSE)
            }

            if(!check.integer(XB.Size)){
                message("...XB.Size must be an integer.")
                return (FALSE)
            }else if(XB.Size < 1){
                message("...XB.Size must be an integer greater than 0.")
                return (FALSE)
            }

            if(! XB.SamplingMethod %in% c("joint","independent")){
                message("...XB.SamplingMethod must be 'joint' or 'independent'.")
                return (FALSE)
            }

            if(!is.character(PredType)){
                message("...PredType must a string.")
                return (FALSE)
            }

            if(!is.character(ModelPackage)){
                message("...ModelPackage must a string.")
                return (FALSE)
            }

            return (TRUE)
        },

        #------------------------------------
        # generate parameter information
        #------------------------------------
        GenerateParaTable = function(){
            varNum = ncol(self$X.Data)

            varNames = names(self$X.Data)
            uniqValue = rep(NA,varNum) # the number of unique values
            dataType = rep(NA,varNum) # the data type

            X_A = rep(TRUE,varNum); # a target feature or not
            samplingMethod = rep("all",varNum); # sampling method
            L_A = rep(NA,varNum) # the number of levels

            centralized = rep(TRUE,varNum); # centralized or not
            distMeasure = rep(NA,varNum) # the distance measure
            autoK = rep(TRUE,varNum); # if the number of cluster is determined automatically
            numK = rep(5,varNum); # the number of clusters if autoK is FALSE or the max number of cluster if autoK is TRUE
            clusteringMethod = rep("kmedoids", varNum) # clustering method

            treeDepth = rep(3,varNum) # the maximum depth of decision tree
            minSplit = rep(60,varNum) # the minimum number of observations for splitting

            for(i in 1:varNum){
                uniqValue[i] = length(unique(self$X.Data[,i]))
                if(is.factor(self$X.Data[,i])){
                    if(is.ordered(self$X.Data[,i])){
                        dataType[i] = "ordinal"
                        L_A[i] = 0
                        distMeasure[i] = "euclidean"
                    }else if(uniqValue[i] == 2){
                        dataType[i] = "binary"
                        L_A[i] = 0
                        distMeasure[i] = "euclidean"
                    }else{
                        dataType[i] = "nominal"
                        L_A[i] = 0
                        distMeasure[i] = "euclidean"
                    }
                }else if(uniqValue[i] > 10){
                    dataType[i] = "interval"
                    L_A[i] = 50
                    samplingMethod[i] = "equal"
                    distMeasure[i] = "cosine"
                }else{
                    dataType[i] = "ordinal"
                    L_A[i] = 0
                    distMeasure[i] = "euclidean"
                }
            }
            XInfo = data.frame(dataType,uniqValue,
                               X_A,L_A,samplingMethod,
                               clusteringMethod, centralized,distMeasure,autoK,numK,
                               treeDepth,minSplit)
            rownames(XInfo) = varNames
            # set levels
            levels(XInfo$dataType) = c(levels(XInfo$dataType), setdiff(c('interval', 'binary', 'nominal', 'ordinal'), levels(XInfo$dataType)))
            levels(XInfo$X_A) = c(levels(XInfo$X_A), setdiff(c(TRUE, FALSE), levels(XInfo$X_A)))
            levels(XInfo$samplingMethod) = c(levels(XInfo$samplingMethod), setdiff(c('all', 'equal', 'percentile', 'random'), levels(XInfo$samplingMethod)))
            levels(XInfo$distMeasure) = c(levels(XInfo$distMeasure), setdiff(c('cosine','euclidean'), levels(XInfo$distMeasure)))
            levels(XInfo$centralized) = c(levels(XInfo$centralized), setdiff(c(TRUE, FALSE), levels(XInfo$centralized)))
            levels(XInfo$autoK) = c(levels(XInfo$autoK), setdiff(c(TRUE, FALSE), levels(XInfo$autoK)))
            levels(XInfo$clusteringMethod) = c(levels(XInfo$clusteringMethod), setdiff(c("kmeans", "kmedoids"), levels(XInfo$clusteringMethod)))

            self$ParaTable = XInfo
            self$TaskFinishTime[['ParaTable']] = Sys.time()
        },

        #------------------------------------
        # check parameter information
        #------------------------------------
        CheckParaTable = function(){
            if(!all(row.names(self$ParaTable) %in% names(self$X.Data))){
                message("The row names of ParaTable are not consistent with the column names of X.Data.")
                return (FALSE)
            }
            if(!all(unique(self$ParaTable[,"dataType"]) %in%
                    c('interval', 'binary', 'nominal', 'ordinal'))){
                message("The 'dataType' in ParaTable must be 'interval', 'binary', 'nominal' or 'ordinal'.")
                return (FALSE)
            }

            if(!all(unique(self$ParaTable[,"X_A"]) %in%
                    c(TRUE, FALSE))){
                message("The 'X_A' in ParaTable must be TRUE or FALSE.")
                return (FALSE)
            }

            if(!all(unique(self$ParaTable[,"centralized"]) %in% c(TRUE, FALSE))){
                message("The 'centralized' in ParaTable must be TRUE or FALSE.")
                return (FALSE)
            }

            if(!all(unique(self$ParaTable[,"autoK"]) %in% c(TRUE, FALSE))){
                message("The 'autoK' in ParaTable must be TRUE or FALSE.")
                return (FALSE)
            }

            if(!all(unique(self$ParaTable[,"samplingMethod"]) %in% c('all', 'equal', 'percentile', 'random'))){
                message("The 'samplingMethod' in ParaTable must be 'all', 'equal', 'percentile' or 'random'.")
                return (FALSE)
            }

            if(!all(unique(self$ParaTable[,"clusteringMethod"]) %in% c("kmeans", "kmedoids"))){
                message("The 'clusteringMethod' in ParaTable must be 'kmeans' or 'kmedoids'.")
                return (FALSE)
            }

            if(!all(unique(self$ParaTable[,"distMeasure"]) %in% c("cosine", "euclidean"))){
                message("The 'distMeasure' in ParaTable must be 'cosine' or 'euclidean'.")
                return (FALSE)
            }

            check.integer = function(x) x == round(x)

            if(!all(check.integer(self$ParaTable[,"L_A"]))){
                message("The 'L_A' in ParaTable must be integers.")
                return (FALSE)
            }else if(!all(self$ParaTable[,"L_A"] >= 0)){
                message("The 'L_A' in ParaTable must be non-negative integers.")
                return (FALSE)
            }

            if(!all(check.integer(self$ParaTable[,"numK"]))){
                message("The 'numK' in ParaTable must be integers.")
                return (FALSE)
            }else if(!all(self$ParaTable[,"numK"] >= 1)){
                message("The 'numK' in ParaTable must be integers greater than 0.")
                return (FALSE)
            }

            if(!all(check.integer(self$ParaTable[,"treeDepth"]))){
                message("The 'treeDepth' in ParaTable must be integers.")
                return (FALSE)
            }else if(!all(self$ParaTable[,"treeDepth"] >= 1)){
                message("The 'treeDepth' in ParaTable must be integers greater than 0.")
                return (FALSE)
            }

            if(!all(check.integer(self$ParaTable[,"minSplit"]))){
                message("The 'minSplit' in ParaTable must be integers.")
                return (FALSE)
            }else if(!all(self$ParaTable[,"minSplit"] >= 1)){
                message("The 'minSplit' in ParaTable must be integers greater than 0.")
                return (FALSE)
            }

            return (TRUE)
        },

        #------------------------------------
        # sampling X_A
        #------------------------------------
        SamplingXA = function(){
            cat("...Sampling X_A\n")
            if(!self$CheckInitialization(self$X.Data, self$Pred.Type, self$Pred.Dimension, self$Model.Package, self$XB.Size, self$XB.SamplingMethod)){
                stop("Parameter initialization error")
            }
            if(!self$CheckParaTable()){
                stop("The parameters in ParaTable are wrong.")
            }
            X_A = list()
            AList = rownames(self$ParaTable)[self$ParaTable["X_A"] == TRUE]
            xnames = names(self$X.Data)
            for(i in 1:length(AList)){
                var = AList[i]
                LA = self$ParaTable[var, "L_A"]
                if(self$ParaTable[var, "dataType"] == "interval"){
                    sM = self$ParaTable[var, "samplingMethod"]
                    # X_A is sampled uniformly
                    if(LA == 0){
                        X_A[[var]] = sort(unique(self$X.Data[,var]))
                    }else if(sM == 'equal'){
                        minV = min(self$X.Data[,var],na.rm = TRUE)
                        maxV = max(self$X.Data[,var],na.rm = TRUE)
                        X_A[[var]] = seq(minV,maxV,(maxV - minV) / (LA - 1))
                    }else if(sM == 'percentile'){
                        X_A[[var]] = quantile(self$X.Data[,var], probs = seq(0, 1, 1 / (LA - 1)))
                    }else{ # random
                        X_A[[var]] = sort(sample(unique(self$X.Data[,var]), LA))
                    }
                }else{
                    # X_A is sampled from dataset
                    temp = unique(self$X.Data[,var])
                    if(LA == 0 | length(temp) <= LA)
                        X_A[[var]] = sort(temp)
                    else
                        X_A[[var]] = sort(sample(temp, LA))
                }
            }
            self$XA.Sample = X_A
            self$TaskFinishTime[['XA.Sample']] = Sys.time()
            cat("   ...Finished\n")
        },

        #------------------------------------
        # sampling X_B
        #------------------------------------
        SamplingXB = function(){
            cat("...Sampling X_B\n")

            if(!self$CheckInitialization(self$X.Data, self$Pred.Type, self$Pred.Dimension, self$Model.Package, self$XB.Size, self$XB.SamplingMethod)){
                stop("Parameter initialization error")
            }
            if(!self$CheckParaTable()){
                stop("The parameters in ParaTable are wrong.")
            }

            if(self$XB.SamplingMethod == "joint"){
                # All features in X_B are sampled directly as a whole from dataset
                # while the joint distribution X_B keeps unchanged.
                if(self$XB.Size >= nrow(self$X.Data)){
                    self$XB.Sample = self$X.Data
                }else{
                    self$XB.Sample = self$X.Data[sample(1:nrow(self$X.Data), self$XB.Size),]
                }
            }else{
                # Each feature in X_B is sampled independently.
                X_B = data.frame(matrix(NA, nrow = self$XB.Size, ncol = ncol(self$X.Data)))
                names(X_B) = names(self$X.Data)
                for(fea in names(self$X.Data)){
                    X_B[[fea]] = sample(self$X.Data[[fea]], self$XB.Size, replace = TRUE)
                }
                self$XB.Sample = X_B
            }
            self$TaskFinishTime[['XB.Sample']] = Sys.time()
            cat("   ...Finished\n")
        },

        #------------------------------------
        # prediction of X_A and X_B
        #------------------------------------
        PredictData = function(){
            cat("...Predicting\n")

            if(!self$CheckInitialization(self$X.Data, self$Pred.Type, self$Pred.Dimension, self$Model.Package, self$XB.Size, self$XB.SamplingMethod)){
                stop("Parameter initialization error")
            }
            if(!self$CheckParaTable()){
                stop("The parameters in ParaTable are wrong.")
            }
            if(!is.list(self$XA.Sample)){
                stop("XA.Sample data is not correct.")
            }else if(!all(names(self$XA.Sample) %in% row.names(self$ParaTable))){
                stop("XA.Sample data is not correct.")
            }
            if(!is.data.frame(self$XB.Sample)){
                stop("XB.Sample data is not correct.")
            }else if(!all(names(self$XB.Sample) %in% row.names(self$ParaTable))){
                stop("XB.Sample data is not correct.")
            }

            AList = rownames(self$ParaTable)[self$ParaTable["X_A"] == TRUE]

            cluster <- parallel::makeCluster(length(AList))
            parallel::clusterExport(cl=cluster, varlist=c("predict"), envir = environment())
            res = parallel::parLapply(cl = cluster,X = 1:length(AList), fun = private$DataPredictionSingle)
            parallel::stopCluster(cluster)
            names(res) = AList
            self$Pred.Res = res
            self$TaskFinishTime[['Pred.Res']] = Sys.time()
            cat("   ...Finished\n")
        },

        #------------------------------------
        # clustering based on K-medoid
        #------------------------------------
        ClusterImpactPlots = function(){
            cat("...Clustering\n")

            if(!self$CheckInitialization(self$X.Data, self$Pred.Type, self$Pred.Dimension, self$Model.Package, self$XB.Size, self$XB.SamplingMethod)){
                stop("Parameter initialization error")
            }
            if(!self$CheckParaTable()){
                stop("The parameters in ParaTable are wrong.")
            }

            AList = rownames(self$ParaTable)[self$ParaTable["X_A"] == TRUE]

            if(!is.list(self$Pred.Res)){
                stop("Pred.Res data is not correct. You need to predict first before clustering.")
            }else if(!all(names(self$Pred.Res) %in% AList)){
                stop("Pred.Res data is not correct. You need to predict first before clsutering.")
            }

            # parallel clustering
            cluster <- parallel::makeCluster(length(AList))
            parallel::clusterExport(cl=cluster, varlist=c(), envir = environment())
            res = parallel::parLapply(cl = cluster,X = 1:length(AList), fun = private$ClusteringSingle)
            parallel::stopCluster(cluster)

            names(res) = AList
            self$Clustering.Res = res
            self$TaskFinishTime[['Clustering.Res']] = Sys.time()
            cat("   ...Finished\n")
        },

        #------------------------------------
        # build decision tree
        #------------------------------------
        BuildTree = function(){
            cat("...Building Decision Trees\n")

            if(!self$CheckInitialization(self$X.Data, self$Pred.Type, self$Pred.Dimension, self$Model.Package, self$XB.Size, self$XB.SamplingMethod)){
                stop("Parameter initialization error")
            }
            if(!self$CheckParaTable()){
                stop("The parameters in ParaTable are wrong.")
            }

            AList = rownames(self$ParaTable)[self$ParaTable["X_A"] == TRUE]

            if(!is.list(self$Clustering.Res)){
                stop("Clustering.Res data is not correct. You need to cluster first before building trees.")
            }else if(!all(names(self$Clustering.Res) %in% AList)){
                stop("Clustering.Res data is not correct. You need to cluster first before building trees.")
            }


            cluster <- parallel::makeCluster(length(AList))
            parallel::clusterExport(cl=cluster, varlist=c(), envir = environment())
            res = parallel::parLapply(cl = cluster,X = 1:length(AList), fun = private$BuildingTreeSingle)
            parallel::stopCluster(cluster)
            names(res) = AList
            self$TreeRules = res

            # Build feature interaction network (FIN)
            self$FIN.Data = data.frame(matrix(NA, nrow = ncol(self$X.Data), ncol = length(AList)),
                                  row.names = names(self$X.Data))
            colnames(self$FIN.Data) = AList
            for(fea in AList){
                varImp = res[[fea]][['varImp']]
                self$FIN.Data[names(varImp), fea] = varImp
            }

            self$TaskFinishTime[['TreeRules']] = Sys.time()
            self$TaskFinishTime[['FIN.Data']] = Sys.time()
            cat("   ...Finished\n")
        },

        #------------------------------------
        # draw impact pattern plots (IPP)
        #------------------------------------
        DrawIPP = function(centralized=TRUE, nc=4){
            AList = rownames(self$ParaTable)[self$ParaTable["X_A"] == TRUE]

            if(!is.list(self$Clustering.Res)){
                stop("Clustering.Res data is not correct. You need to cluster first before drawing IPPs.")
            }else if(!all(names(self$Clustering.Res) %in% AList)){
                stop("Clustering.Res data is not correct. You need to cluster first before drawing IPPs.")
            }

            nr = ceiling(length(AList) / nc)
            par(mfcol = c(nr, nc), mai = c(.7,.5,.3,.5))
            for(fea in AList){
                IPPs = self$Clustering.Res[[fea]][["medoids"]]
                if(centralized){
                    if(is.vector(IPPs))
                        IPPs = IPPs - mean(IPPs)
                    else
                        IPPs = IPPs - rowMeans(IPPs)
                }
                bestK = self$Clustering.Res[[fea]][["bestK"]]
                XA = self$XA.Sample[[fea]]

                ymax = max(IPPs)
                ymin = min(IPPs)

                y = IPPs[1,]
                if(self$ParaTable[fea,"dataType"] == "interval"){

                    plot(x = XA, y = y, type = "l", xlab = fea,
                         ylab = "prediction", ylim = c(ymin, ymax), col = self$ColorList[1])
                }else{
                    plot(x = XA, y = y, type = "l", xlab = fea, xaxt = "n",
                         ylab = "prediction", ylim = c(ymin, ymax), col = self$ColorList[1])
                    axis(1, labels = as.character(XA), at = as.numeric(XA))
                }

                if(bestK > 1){
                    for(i in 2:bestK){
                        if(i > length(self$ColorList))
                            coll = "black"
                        else
                            coll = self$ColorList[i]
                        lines(x = XA, y = IPPs[i,], type = "l", col = coll)
                    }
                }
            }
            #dev.off()
            par(mfcol = c(1, 1))
        },

        #------------------------------------
        # write results to excel file
        #------------------------------------
        WriteToExcel = function(excelName){
            cat("...Write Information\n")
            require("xlsx",character.only = TRUE)
            wb <- xlsx::createWorkbook()

            cs1 <- xlsx::CellStyle(wb) + xlsx::Font(wb, isBold = TRUE, color = "red")
            cs2 <- xlsx::CellStyle(wb) + xlsx::Font(wb, isItalic = TRUE, color = "blue")

            # write the parameter information
            sheet  <- xlsx::createSheet(wb, sheetName="Parameters")
            xlsx::addDataFrame(x = self$ParaTable, sheet = sheet,
                               row.names = TRUE, col.names = TRUE,
                               startColumn = 1, startRow = 1,
                               rownamesStyle = cs2, colnamesStyle = cs1)

            paraDs = data.frame(matrix(NA, nrow = 3, ncol = 1))
            colnames(paraDs) = "name"
            row.names(paraDs) = c("Model.Package", "XB.Size", "XB.SamplingMethod")
            paraDs["Model.Package","name"] = self$Model.Package
            paraDs["XB.Size","name"] = self$XB.Size
            paraDs["XB.SamplingMethod","name"] = self$XB.SamplingMethod
            nr = nrow(self$ParaTable) + 3
            xlsx::addDataFrame(x = paraDs, sheet = sheet,
                               row.names = TRUE, col.names = FALSE,
                               startColumn = 1, startRow = nr,
                               rownamesStyle = cs2)

            AList = rownames(self$ParaTable)[self$ParaTable["X_A"] == TRUE]
            for(fea in AList){
                clusRes = self$Clustering.Res[[fea]]
                treeRules = self$TreeRules[[fea]]
                # import parameters
                bestK = clusRes$bestK
                BVar = as.character(treeRules$depenVar)

                AData = self$XA.Sample[[fea]]
                clusterData = self$Pred.Res[[fea]]
                BData = self$XB.Sample
                groups = clusRes$groups
                IPPs = clusRes$medoids
                if(!is.factor(AData)){
                    xlabel = AData
                }else{
                    xlabel = as.character(AData)
                }

                #-----create worksheet-----
                sheet  <- xlsx::createSheet(wb, sheetName=fea)

                row.n = 1
                #-----write impact pattern plots-----
                xlsx::addDataFrame(x = data.frame("IMPACT PATTERNS"), sheet = sheet,
                                   row.names = FALSE, col.names = FALSE,
                                   startColumn = 1, startRow = row.n, colStyle = list('1'=cs1))
                row.n = row.n + 1

                data = data.frame(t(xlabel))
                rownames(data) = fea
                xlsx::addDataFrame(x = data, sheet = sheet,
                                   row.names = TRUE, col.names = FALSE,
                                   startColumn = 1, startRow = row.n)
                row.n = row.n + 1

                row.names(IPPs) = paste("cluster", 1:bestK, sep = "-")
                xlsx::addDataFrame(x = IPPs, sheet = sheet,
                                   row.names = TRUE, col.names = FALSE,
                                   startColumn = 1, startRow = row.n)
                row.n = row.n + nrow(IPPs) + 2

                #-----write cluster information-----
                xlsx::addDataFrame(x = data.frame("CLUSTER INFORMATION"), sheet = sheet,
                                   row.names = FALSE, col.names = FALSE,
                                   startColumn = 1, startRow = row.n, colStyle = list('1'=cs1))
                row.n = row.n + 1
                for(j in 1:bestK){
                    xlsx::addDataFrame(x = data.frame(paste("cluster",j,sep = '-')), sheet = sheet,
                                       row.names = FALSE, col.names = FALSE,
                                       startColumn = 1, startRow = row.n,colStyle = list('1'=cs2))

                    row.n = row.n + 1
                    data = data.frame(t(xlabel))
                    rownames(data) = fea
                    xlsx::addDataFrame(x = data, sheet = sheet,
                                       row.names = TRUE, col.names = FALSE,
                                       startColumn = 1, startRow = row.n)

                    if(length(which(groups == j)) > 1){
                        minL = apply(clusterData[groups == j,],2,min)
                        maxL = apply(clusterData[groups == j,],2,max)
                        avgL = apply(clusterData[groups == j,],2,mean)
                    }else{
                        minL = clusterData[groups == j,]
                        maxL = minL
                        avgL = minL
                    }

                    data = rbind(minL,avgL,maxL)
                    rownames(data) = c("min","mean","max")

                    row.n = row.n + 1
                    xlsx::addDataFrame(x = data, sheet = sheet,
                                       row.names = TRUE, col.names = FALSE,
                                       startColumn = 1, startRow = row.n)
                    row.n = row.n + 4
                }

                #-----write tree rules-----
                if(bestK > 1){
                    row.n = row.n + 1
                    xlsx::addDataFrame(x = data.frame("TREE RULES"), sheet = sheet,
                                       row.names = FALSE, col.names = FALSE,
                                       startColumn = 1, startRow = row.n, colStyle = list('1'=cs1))
                    row.n = row.n + 1
                    xlsx::addDataFrame(x = treeRules$rules, sheet = sheet,
                                       row.names = FALSE, col.names = TRUE,
                                       startColumn = 1, startRow = row.n)

                    if(length(BVar) > 0){ # tree rules exist
                        row.n = row.n + nrow(treeRules$rules) + 1
                        xlsx::addDataFrame(x = paste("maxDepth = ", treeRules$maxDepth),
                                           sheet = sheet,
                                           row.names = FALSE, col.names = FALSE,
                                           startColumn = 1, startRow = row.n)
                    }

                }
                else{ # only one cluster
                    row.n = row.n + 1
                    xlsx::addDataFrame(x = data.frame("NO RULES"), sheet = sheet,
                                       row.names = FALSE, col.names = FALSE,
                                       startColumn = 1, startRow = row.n, colStyle = list('1'=cs1))
                }
            }

            # write feature interaction network (FIN)
            sheet  <- xlsx::createSheet(wb, sheetName="FIN")
            xlsx::addDataFrame(x = data.frame("FROM"), sheet = sheet,
                               row.names = FALSE, col.names = FALSE,
                               startColumn = 1, startRow = 3, colStyle = list('1'=cs1))
            xlsx::addDataFrame(x = data.frame("TO"), sheet = sheet,
                               row.names = FALSE, col.names = FALSE,
                               startColumn = 3, startRow = 1, colStyle = list('1'=cs1))
            xlsx::addDataFrame(x = self$FIN.Data, sheet = sheet,
                               row.names = TRUE, col.names = TRUE,
                               startColumn = 2, startRow = 2)

            # save data
            xlsx::saveWorkbook(wb, excelName)

            cat("   ...Finished\n")
        },

        #------------------------------------
        # draw feature interaction network (FIN)
        #------------------------------------
        DrawFIN = function(threshold = 0, lay.out = igraph::layout.auto){
            if(!is.data.frame(self$FIN.Data)){
                stop("FIN data is not correct. You need to build decision tree before drawing FIN.")
            }
            # create grpah matrix
            netMatrix = as.matrix(self$FIN.Data)
            netMatrix[is.na(netMatrix)] <- 0
            netMatrix[netMatrix < threshold] <- 0
            # draw network
            g = igraph::graph_from_adjacency_matrix(netMatrix, mode = "directed", weighted = TRUE)
            igraph::plot.igraph(g, layout=lay.out)
        },

        #------------------------------------
        # Sampling, Prediting, Clustering and Building Tree
        #------------------------------------
        ExecuteAll = function(){
            self$SamplingXA()

            self$SamplingXB()

            self$PredictData()

            self$ClusterImpactPlots()

            self$BuildTree()
        }
    ),

    private = list(
        #------------------------------------
        # prediction of X_A and X_B for single feature
        #------------------------------------
        DataPredictionSingle = function(X){
            require(self$Model.Package,character.only = TRUE)
            AList = rownames(self$ParaTable)[self$ParaTable["X_A"] == TRUE]
            fea = AList[X]
            res = list()
            X.ds = self$XB.Sample
            for(i in 1:length(self$XA.Sample[[fea]])){ # Cardian product of XB.Sample and XA.Sample
                x.a = self$XA.Sample[[fea]][i]
                X.ds[fea] = x.a

                predRes = predict(self$Pred.Fun, X.ds, type=self$Pred.Type) # predict
                if(is.vector(predRes)){ # regression problem
                    res[[as.character(x.a)]] =  predRes
                }else{ # classification problem
                    if(ncol(predRes) < self$Pred.Dimension)
                        self$Pred.Dimension = 1
                    res[[as.character(x.a)]] =  predRes[,self$Pred.Dimension]
                }
            }
            return (as.data.frame(res))
        },

        #------------------------------------
        # calculate Cosine similarity
        #------------------------------------
        CosineSim = function(X){
            n = ncol(X)
            sim = matrix(NA,n,n)
            for(i in 1:(n-1)){
                sim[i,i] = 1
                for(j in (i+1):n){
                    sim[i,j] = sum(X[,i] * X[,j]) /
                        (sqrt(sum(X[,i] * X[,i])) * sqrt(sum(X[,j] * X[,j])))
                    sim[j,i] = sim[i,j]
                }
            }
            sim[n,n] = 1
            return (sim)
        },

        #------------------------------------
        # clustering for single feature
        #------------------------------------
        ClusteringSingle = function(X){
            AList = rownames(self$ParaTable)[self$ParaTable["X_A"] == TRUE]
            fea = AList[X]

            clusData = self$Pred.Res[[fea]]
            original.clusData = clusData
            centralized = self$ParaTable[fea, "centralized"]
            distMeasure = self$ParaTable[fea, "distMeasure"]
            autoK = self$ParaTable[fea, "autoK"]
            numK = self$ParaTable[fea, "numK"]
            clusterMethod = self$ParaTable[fea, "clusteringMethod"]

            # -----Centralize the dataset-----
            if(centralized){
                clusData = clusData - rowMeans(clusData)
            }
            if(distMeasure == "cosine"){
                for(i in 1:nrow(clusData)){
                    if(sum(clusData[i,]* clusData[i,]) == 0)
                        clusData[i,] = 0.001
                }
            }

            # -----Calculate the distance matrix-----
            if(autoK | clusterMethod == 'kmedoids'){
                if(distMeasure == "euclidean")
                    distMatrix = dist(clusData)
                if(distMeasure == "cosine")
                    distMatrix = 1 - private$CosineSim(t(clusData))

                if(all(distMatrix < 0.00001)){
                    if(clusterMethod == 'kmeans')
                        return (list(groups = rep(1, self$XB.Size), medoids = t(as.data.frame(colMeans(original.clusData))), bestK = 1))
                    else
                        return (list(groups = rep(1, self$XB.Size), medoids = original.clusData[1,], bestK = 1))
                }
            }

            # -----only one cluster-----
            if (numK == 1){
                if(clusterMethod == 'kmeans'){
                    return (list(groups = rep(1, self$XB.Size), medoids = t(as.data.frame(colMeans(original.clusData))), bestK = 1))
                }else{
                    distSum = rowSums(as.matrix(distMatrix))
                    ind = (1:length(distSum))[distSum == min(distSum)]
                    if(length(ind) > 1) ind = ind[1]
                    return (list(groups = rep(1, self$XB.Size), medoids = original.clusData[ind,], bestK = 1))
                }
            }

            # -----Clustering by k-medoids or k-means method-----
            if(autoK){ # the number of clusters are determined through Dunn index
                clusRes = list()
                bestDunn = 0
                bestK = 1
                for(k in 2:numK){
                    # clustering
                    if(clusterMethod == 'kmeans'){ #k-means
                        if(distMeasure == "euclidean"){
                            temp = akmeans::akmeans(x = clusData, min.k = k, max.k = k, d.metric = 1, ths1 = 1e10)
                            clusRes[[k]] = list(clustering = temp$cluster,medoids = temp$centers)
                        }else{ # cosin
                            temp = akmeans::akmeans(x = clusData, min.k = k, max.k = k, d.metric = 2, ths1 = 1e10)
                            clusRes[[k]] = list(clustering = temp$cluster,medoids = temp$centers)
                        }
                    }else{ # k-medoid
                        temp = cluster::pam(x = distMatrix,k = k, diss = TRUE)
                        clusRes[[k]] = list(clustering = temp$clustering,medoids = temp$medoids)
                    }

                    # calculate dunn to determine the number of clusters
                    clusterIndex = fpc::cluster.stats(d = distMatrix, clustering = clusRes[[k]]$clustering,
                                                      silhouette = FALSE)$dunn

                    if(is.infinite(clusterIndex)){
                        if(clusterMethod == 'kmedoids')
                            return(list(groups = rep(1,self$XB.Size), medoids = original.clusData[1,], bestK = 1))
                        else
                            return(list(groups = rep(1,self$XB.Size), medoids = t(as.data.frame(colMeans(original.clusData))), bestK = 1))
                    }

                    if(clusterIndex > bestDunn){
                        bestDunn = clusterIndex
                        bestK = k
                    }
                    groups = clusRes[[bestK]]$clustering
                    if(clusterMethod == 'kmeans')
                        medoids = clusRes[[bestK]]$medoids
                    else
                        medoids = original.clusData[clusRes[[bestK]]$medoids,]
                }
            }else{ # the numbe of clusters are specified
                bestK = numK
                if(clusterMethod == 'kmeans'){
                    if(distMeasure == "euclidean"){
                        temp = akmeans::akmeans(x = clusData, min.k = bestK, max.k = bestK, d.metric = 1, ths1 = 1e10)
                    }else{ # cosin
                        temp = akmeans::akmeans(x = clusData, min.k = bestK, max.k = bestK, d.metric = 2, ths1 = 1e10)
                    }
                    groups = temp$cluster
                    medoids = temp$centers
                }else{
                    temp = cluster::pam(x = distMatrix,k = bestK, diss = TRUE)
                    groups = temp$clustering
                    medoids = original.clusData[temp$medoids,]
                }
            }
            if(bestK == 1){ # only one cluster
                groups = rep(1,self$XB.Size)
                if(clusterMethod == 'kmeans')
                    medoids = t(as.data.frame(colMeans(original.clusData)))
                else
                    medoids = original.clusData[1,]
            }
            return (list(groups = groups, medoids = as.data.frame(medoids), bestK = bestK))

        },

        #------------------------------------
        # build decision tree for single feature
        #------------------------------------
        BuildingTreeSingle = function(X){
            AList = rownames(self$ParaTable)[self$ParaTable["X_A"] == TRUE]
            feaList = names(self$X.Data)

            fea = AList[X]
            maxDepth = self$ParaTable[fea, "treeDepth"]
            minSplit = self$ParaTable[fea, "minSplit"]
            groups = self$Clustering.Res[[fea]][["groups"]]
            bestK = self$Clustering.Res[[fea]][["bestK"]]
            BData = self$XB.Sample[feaList[! fea == feaList]]

            # there is only one cluster
            if(bestK == 1){
                return (list(rules = NA,depenVar = NA,maxDepth = NA,varImp = NA))
            }

            #-----build tree model-----
            md = maxDepth
            continue = TRUE
            while(continue){
                treeModel = rpart::rpart(groups ~ .,data = BData, method = "class",
                                         control = list(maxdepth = md, minsplit = minSplit))
                temp = treeModel$frame
                leafNum = rownames(temp)[temp["var"] == "<leaf>"]

                if(length(leafNum) == 1 & md < 2 * maxDepth){
                    md = md + 1
                }else{
                    continue = FALSE
                }
            }

            # if(length(leafNum) == 1){
            #     return (list(rules = NA,depenVar = NA,maxDepth = NA,varImp = NA))
            # }

            #-----add links to FIN-----
            varImp = treeModel$variable.importance
            varImp = varImp / sum(varImp)

            #-----extract rules-----
            temp = treeModel$frame
            leafNum = rownames(temp)[temp["var"] == "<leaf>"]
            depenVar = unique(temp[temp["var"] != "<leaf>","var"])
            rules = rpart::path.rpart(treeModel,leafNum,print.it = FALSE)
            rules.d = rep(NA,length(rules))
            for(i in 1:length(rules))
                rules.d[i] = paste(rules[[i]][-1],collapse = " & ")

            #-----extract the distribution of class in each leaf node (rule)-----
            unNum = sort(unique(treeModel$where))
            seqNum = data.frame('num' = 1:length(unNum))
            row.names(seqNum) = unNum
            xyz = data.frame("class" = seqNum[as.character(treeModel$where),"num"],groups)
            xx = as.data.frame.matrix(table(xyz))
            colnames(xx) = paste("clu", names(xx),sep = '-')

            return (list(rules = data.frame("rules" = rules.d,xx),
                         depenVar = depenVar,maxDepth = md,
                         varImp = varImp))
        }
    )
)
XZPackage/IPPModel documentation built on May 17, 2019, 6:36 p.m.