R/adaptive_refinement_help.R


#Assisting functions in alphabetical order

#calculate weighted average cross entropy
#pred: predictions, obs: observations (0,1)
#weights are the group proportions
#note: could both if branches be combined by using only the second part?
cross.en<-function(pred, obs, sdpred=NULL,weighted=T){
  if(nlevels(factor(obs))<2|nlevels(factor(obs))>4){
    stop('implemented only for a categorical target variable with 2-4 levels')
  }else if(nlevels(factor(obs))>2){
    k<-nlevels(factor(obs))
    pred[pred==0]<-0.000001
    pred[pred==1]<-0.999999
    weight<-pred[!is.na(pred[,1]),1]
    if(weighted==T){
      for (i in 1:length(pred[!is.na(pred[,1]),1])){
        weight[i]<-length(which(obs[!is.na(pred[,1])]!=obs[!is.na(pred[,1])][i]))/length(obs[!is.na(pred[,1])])
      }
    }else{
      weight<-rep(1, length(weight))
    }
    loss<-0
    for (i in 1:length(pred[!is.na(pred[,1]),1])){
      for (j in 1:k){
        if(obs[i]==levels(obs)[j]&!is.na(pred[i,1])){
          loss<-loss-weight[i]*log2(pred[i,j])
        }
      }
    }
  }else{
    pred[pred==0]<-0.000001
    pred[pred==1]<-0.999999
    if(weighted==T){
      weight<-pred[!is.na(pred)]
      weight[as.numeric(obs[!is.na(pred)])==2]<-length(which(as.numeric(obs[!is.na(pred)])==1))/length(obs[!is.na(pred)])
      weight[as.numeric(obs[!is.na(pred)])==1]<-length(which(as.numeric(obs[!is.na(pred)])==2))/length(obs[!is.na(pred)])
    } else {
      weight<-rep(1, length(pred[!is.na(pred)]))#/length(pred[!is.na(pred)])
    }
    if(!is.null(sdpred)){
      loss<-(-sum(weight*(as.numeric(obs[!is.na(pred)])-1)*log2(1-pred[!is.na(pred)]+sdpred[!is.na(pred)]))-sum(weight*(2-as.numeric(obs[!is.na(pred)]))*log2(pred[!is.na(pred)]+sdpred[!is.na(pred)])))
    } else {
      loss<-(-sum(weight*(as.numeric(obs[!is.na(pred)])-1)*log2(1-pred[!is.na(pred)]))-sum(weight*(2-as.numeric(obs[!is.na(pred)]))*log2(pred[!is.na(pred)])))
    }
  }
  return(loss)
}
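
#Example (illustrative sketch, not run; toy values made up for this comment):
#for a binary target, `pred` holds the predicted probability of the first factor level of `obs`
# obs  <- factor(c("0", "0", "1", "1", "0"))
# pred <- c(0.9, 0.8, 0.2, 0.4, 0.7)
# cross.en(pred, obs)                    # weighted by the opposite class proportions
# cross.en(pred, obs, weighted = FALSE)  # plain cross entropy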

#discretize and factorize scores of a groupbn object
disc.scores<-function(res, discretize, seed=NULL){
  k<-max(res$grouping)
  param<-res$disc.param
  data.scores<-res$group.data
  colnames(data.scores)<-paste("cl", 1:k, sep="")
  data.scores<-as.data.frame(data.scores)
  if(discretize){
  data.scores.dist<-data.scores
  for (i in 1:dim(data.scores.dist)[2]){
    title<-colnames(data.scores)[i]
    #print(title)
    disc<-discretize.dens(data.scores.dist[,i], cluster=T, title=title, seed=seed)
    data.scores.dist[,i]<-as.factor(disc$discretized)
    param[[i]]<-disc$levels
  }
  res$group.data<-data.scores.dist
  res$disc.param<-param
  } else {
    res$group.data<-data.scores
  }
  return(res)
}

#density approximative discretization
#graph: Boolean, if TRUE plots are produced
#rename.level: Boolean, if TRUE levels are renamed to integers (1,2,3,...)
#cluster: Boolean, if TRUE it is checked if a cluster is already a discrete variable

discretize.dens<-function(data, graph=F, title="Density-approximative Discretization", rename.level=F, return.all=T, cluster=F, seed=NULL){
  if (!is.null(seed)) {set.seed(seed)}
  if (is.factor(data)){
    plot<-NULL
    if(graph){
      plot<-plot(data,
            col="lightblue4", # column color
            border="black",
            xlab="Value",
            ylab="Frequency",
            main = title)
    }
    message("Data are already discrete.")
    result<-list(plot, discretized=data, levels=NULL)
    #return early, factor data need no further discretization
    if(return.all){
      return(invisible(result))
    } else {
      return(invisible(data))
    }
  }
  #check if a cluster already has a limited number of distinct values (2-4 values occurring more than once)
  if (cluster==T&length(table(as.factor(data))[table(as.factor(data))>1])<5&length(table(as.factor(data))[table(as.factor(data))>1])>1){
    m<-nlevels(as.factor(data))
    #all levels exist more than once in the data (then keep all levels)
    if(all(table(as.factor(data))>1)){
      d<-as.factor(data)
      #set cutpoints between levels
      lev<-rep(0, length(levels(d))-1)
      for(l in 1:length(levels(d))){
        lev[l]<-as.numeric(levels(d))[l]+(as.numeric(levels(d))[l+1]-as.numeric(levels(d))[l])/2
      }
      #set outer boundaries to Inf
      lev[1]<--Inf
      lev[length(lev)]<-Inf
      d<-arules::discretize(data, method="fixed", breaks=lev)
      if(rename.level){
        levels(d)<-1:m
      }
      if(graph){
        plot<-plot(stats::density(data, na.rm=T), col="black", lwd=3,main = paste("Density Approximative Discretization: ", title, sep=""))
        plot<-graphics::hist(data, # histogram
                   col="lightblue4", # column color
                   border="black",
                   breaks=lev,
                   xlab="Value",
                   prob = TRUE, # show densities instead of frequencies
                   main = title,
                   add=TRUE)
        plot<-graphics::lines(stats::density(data, na.rm=T), col="black", lwd=3, main = paste("Density Approximative Discretization: ", title, sep=""))}
      #plot<-abline(v=lev, col="lightblue4", lty="dashed", lwd=3)
      #plot<-hist(data, col="red", breaks=lev)

      #set outer boundaries to Inf
      lev[1]<--Inf
      lev[length(lev)]<-Inf
      result<-list(plot, discretized=d, levels=lev)

      if(return.all){
        invisible(result)
      } else {
        invisible(d)
      }
    } else{
      cent<-as.numeric(names(table(as.factor(data))[table(as.factor(data))>1]))
      d<-arules::discretize(data, method="cluster", centers=cent)
      lev<-attr(d, "discretized:breaks")
      #set outer boundaries to Inf
      lev[1]<--Inf
      lev[length(lev)]<-Inf
      d<-arules::discretize(data, method="fixed", breaks=lev)
      if(rename.level){
        levels(d)<-1:length(cent)
      }
      result<-list(discretized=d)
      xy=0
      max<-length(cent)
      while(min(table(d))<20&xy<max-2){
        xy=xy+1
        cent<-as.numeric(names(sort(table(as.factor(data))[table(as.factor(data))>1], decreasing=T)[1:(max-xy)]))
        d<-arules::discretize(data, method="cluster", centers=cent)
        lev<-attr(d, "discretized:breaks")
        lev[1]<--Inf
        lev[length(lev)]<-Inf
        d<-arules::discretize(data, method="fixed", breaks=lev)
        if(rename.level){
          levels(d)<-1:length(cent)
        }
      }
      if(graph){
        plot<-plot(stats::density(data, na.rm=T), col="black", lwd=3,main = paste("Density Approximative Discretization: ", title, sep=""))
        plot<-graphics::hist(data, # histogram
                   col="lightblue4", # column color
                   border="black",
                   breaks=lev,
                   xlab="Value",
                   prob = TRUE, # show densities instead of frequencies
                   main = title,
                   add=TRUE)
        plot<-graphics::lines(stats::density(data, na.rm=T), col="black", lwd=3, main = paste("Density Approximative Discretization: ", title, sep=""))}
      #plot<-abline(v=lev, col="lightblue4", lty="dashed", lwd=3)
      #plot<-hist(data, col="red", breaks=lev)

      #set outer boundaries to Inf
      lev[1]<--Inf
      lev[length(lev)]<-Inf

      result<-list(plot, discretized=d, levels=lev)

      if(return.all){
        invisible(result)
      } else {
        invisible(d)
      }
    }
  }
  else{
    #get local maxima of the density
    max<-get.maxima(data)
    max.nr<-dim(max)[2]
    #if more than one local maximum: discretize around cluster means
    if (max.nr>1){
      int<-max.nr
      dist.max<-max(max[1,])-min(max[1,])
      range<-abs(stats::quantile(data, prob=0.05, na.rm=T)-stats::quantile(data, prob=0.95, na.rm=T))
      if (dist.max<range/5){
        int<-int+1
        d<-try(arules::discretize(data, method="cluster", centers=c(max[1,],mean(data, na.rm=T)+0.3)), silent = T)
        if(inherits(d,"try-error")){
          d<-arules::discretize(data, method="cluster", centers=length(max[1,])+1)
        }
      }
      else{
        d<-arules::discretize(data, method="cluster", centers=max[1,])
      }

      lev<-attr(d, "discretized:breaks")
      #set outer boundaries to Inf
      lev[1]<--Inf
      lev[length(lev)]<-Inf
      d<-arules::discretize(data, method="fixed", breaks=lev)
      if(rename.level){
        levels(d)<-1:int
      }
    }
    #if only one local maximum and quartiles are unique: discretize to quartiles
    else if (max.nr==1){
      #unique quartiles?
      if (length(unique(stats::quantile(data, probs=c(0,0.25,0.5,0.75,1), na.rm=TRUE))) == 5){
        #rem<-which.min(abs(quantile(data, na.rm = T)[2:4]-max[1,]))
        #if (rem==2){
        int=4
        br<-stats::quantile(data, na.rm=T)
        d<-arules::discretize(data, method="fixed", breaks=br)
        lev<-attr(d, "discretized:breaks")
        #set outer boundaries to Inf
        lev[1]<--Inf
        lev[length(lev)]<-Inf
        d<-arules::discretize(data, method="fixed", breaks=lev)
        if(rename.level){
          levels(d)<-1:int
        }
        #}
        #else if (rem==1){
        #  int=3
        #  br<-quantile(data, na.rm=T)[c(1,3,4,5)]
        #  d<-arules::discretize(data, method="fixed", breaks=br)
        #  lev<-attr(d, "discretized:breaks")
        #   levels(d)<-1:int
        #}
        # else {
        #  int=3
        #  br<-quantile(data, na.rm=T)[c(1,2,3,5)]
        #  d<-arules::discretize(data, method="fixed", breaks=br)
        # lev<-attr(d, "discretized:breaks")
        #  levels(d)<-1:int
        #}
      } else {
        x=0.05
        lev<-unique(stats::quantile(data, prob=c(0,x,1-x,1), na.rm = T))
        while(length(lev)<=2&x>=0){
          x=x-0.01
          lev<-unique(stats::quantile(data, prob=c(0,x,1-x,1), na.rm = T))
        }
        #set outer boundaries to Inf
        lev[1]<--Inf
        lev[length(lev)]<-Inf
        d<-arules::discretize(data, method="fixed", breaks=lev)
        if(rename.level){
          levels(d)<-1:length(unique(stats::quantile(data, na.rm=TRUE, prob=c(0,0.05,0.95,1))))
        }
        #int<-3
        #d<-arules::discretize(data, method="cluster", breaks=int)
        #lev<-attr(d, "discretized:breaks")
        #levels(d)<-1:int
      }
    }

    #plot histograms with intervals to check
    if(graph){
      plot<-plot(stats::density(data, na.rm=T), col="black", lwd=3,main = paste("Density Approximative Discretization: ", title, sep=""))
      plot<-graphics::hist(data, # histogram
                 col="lightblue4", # column color
                 border="black",
                 breaks=lev,
                 xlab="Value",
                 prob = TRUE, # show densities instead of frequencies
                 main = title,
                 add=TRUE)
      plot<-graphics::lines(stats::density(data, na.rm=T), col="black", lwd=3, main = paste("Density Approximative Discretization: ", title, sep=""))
      plot<-graphics::lines(max[1,], max[2,], type="p",pch=8, col="lightblue")}
    #plot<-abline(v=lev, col="lightblue4", lty="dashed", lwd=3)
    #plot<-hist(data, col="red", breaks=lev)
    #set outer boundaries to Inf
    lev[1]<--Inf
    lev[length(lev)]<-Inf
    result<-list(plot, discretized=d, levels=lev, optima=max)
    if(return.all){
      invisible(result)
    } else {
      invisible(d)
    }
  }
}
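
#Example (illustrative sketch, not run; synthetic data made up for this comment):
#a bimodal sample is cut around its two density peaks; $levels holds the cut points with outer boundaries set to +/-Inf
# set.seed(1)
# x <- c(rnorm(200, mean = 0), rnorm(200, mean = 5))
# d <- discretize.dens(x, graph = TRUE, title = "bimodal example")
# table(d$discretized)
# d$levels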
###############

#get all variables belonging to specific cluster from merge matrix
get.cluster<-function(cl, ME){
  idx<-cl
  while(any(idx>0)){
    new<-c()
    for (i in 1:length(idx)){
      if (idx[i]>0){
        new<-c(new, ME[idx[i],])
      }
    }
    idx<-c(idx[which(idx<0)], new)
  }
  return(-1*idx)
}
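
#Example (illustrative sketch, not run): ME follows the hclust $merge convention
#(e.g. ClustOfVar::hclustvar(...)$merge): negative entries are original variables,
#positive entries refer to earlier merge steps (rows)
# ME <- rbind(c(-1, -2),   # step 1 merges variables 1 and 2
#             c(-3,  1),   # step 2 merges variable 3 with the step-1 cluster
#             c(-4,  2))   # step 3 merges variable 4 with the step-2 cluster
# get.cluster(2, ME)       # variables of the cluster formed in step 2: 3 1 2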

#find local optima in data (used for discretization function)
get.maxima<-function(data1){
  dens<-stats::density(data1, na.rm = T)
  vec<-dens$y
  #ordered observations
  vec.zoo<- zoo::as.zoo(vec)
  #wrap zeros around to take start and end also into account
  vec.zoo<-c(rep(0,10), vec.zoo, rep(0,10))
  #find maximum within a range of 21 data points
  top <- zoo::rollapply(vec.zoo, 21, function(x) which.max(x)==11)
  #get indices
  top.ind<-zoo::index(top)[zoo::coredata(top)]

  #only keep peaks higher than 5% of the highest peak (at most 4 peaks)
  #if (length(which(dens$y[top.ind]>mean(vec)))>1){
  #  top.ind<-top.ind[which(dens$y[top.ind]>mean(vec))]
  # }
  #else{
  top.y<-dens$y[top.ind]
  sub<-sort(top.y, decreasing = TRUE)[which(sort(top.y, decreasing = T)>0.05*sort(top.y, decreasing = T)[1])]
  if(length(sub)>4){
    sub<-sort(top.y, decreasing = T)[1:4]
  }
  top.ind<-top.ind[which(top.ind%in%which(dens$y%in%sub))]
  #}
  top.x<-dens$x[top.ind]
  top.y<-dens$y[top.ind]

  return(rbind(top.x, top.y))
}
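
#Example (illustrative sketch, not run; synthetic data made up for this comment):
#returns a 2 x m matrix with the x-positions of the density peaks in row 1 and their heights in row 2
# set.seed(1)
# get.maxima(c(rnorm(200, mean = 0), rnorm(200, mean = 5)))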

#get row/step in which a variable or cluster is merged
#returns vector of merging step (row of merge matrix) and column of the merged cluster/variable
get.merge.step<-function(j.var=0, j.cl=0, ME){
  if(j.var>0){
    idx<-which(ME[,]==-j.var)
    if (idx>=dim(ME)[1]+1){
      idx<-idx-dim(ME)[1]
      idx.2<-1
    }
    else{
      idx.2<-2
    }
    return(c(idx,idx.2))
  }
  if (j.cl>0){
    if (j.cl==dim(ME)[1] | j.cl==2*dim(ME)[1]){
      idx=402
    }
    else{
      idx<-which(ME==j.cl)
    }
    if (idx>=dim(ME)[1]+1){
      idx<-idx-dim(ME)[1]
      idx.2<-1
    }
    else{
      idx.2<-2
    }
    return(c(idx,idx.2))
  }
}

#get cluster number from merge matrix of a set of variables
get.number<-function(vars, ME){
  list<-sapply(vars, function(x){get.merge.step(j.var=x, ME=ME)})[1,]
  vars.n<-unique(list)
  while(length(vars.n)>1){
    list<-c(list[-which(list == min(list, na.rm = TRUE))], get.merge.step(j.cl=min(vars.n), ME=ME)[1])
    #list<-sapply(vars.n, function(x){get.merge.step(j.cl=x, ME=ME)})
    vars.n<-unique(list)
    length(vars.n)
  }
  return(vars.n)
}

#plot *layer*th-order neighbourhood of a variable in a large network
#return the neighbourhood subgraph and the neighbour nodes as a vector
graphviz.plot.neighbourhood<-function(net, target=NULL, layer=1){
  if(is.groupbn(net)){
    target<-net$target
    net<-net$bn
  }
  if(layer=="all"){
    x<-bnlearn::graphviz.plot(bnlearn::cpdag(net),shape="ellipse", highlight = list(nodes=target, fill="gray", col="black"))
    invisible(list(plot=x, subnetwork=net, neighbours=bnlearn::nodes(net)))
  }else{
    nodes1=target
    for (i in 1:layer){
      nodes1=c(nodes1, unique(unlist(lapply(nodes1,FUN=function(x) bnlearn::mb(net, x)))))
    }
    if (!target%in%nodes1){
      nodes1<-c(nodes1, target)
    }
    nodes1<-unique(nodes1)
    sub=bnlearn::subgraph(net, nodes1)
    x<-bnlearn::graphviz.plot(bnlearn::cpdag(sub),shape="ellipse", highlight = list(nodes=target, fill="gray", col="black"))
    invisible(list(plot=x, subnetwork=sub, neighbours=nodes1))
  }
}
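
#Example (illustrative sketch, not run; assumes bnlearn's learning.test data and an installed Rgraphviz):
# data(learning.test, package = "bnlearn")
# net <- bnlearn::hc(learning.test)
# nb  <- graphviz.plot.neighbourhood(net, target = "A", layer = 1)
# nb$neighbours   # Markov blanket of "A" plus "A" itself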


#group data, discretize and factorize and separate target etc.
#res: groupbn object with data, target, separate, grouping
group.data.preproc<-function(res, discretize, seed=NULL){
  if (!is.null(seed)) {set.seed(seed)}
  #combine data
  X.quanti=res$X.quanti
  X.quali=res$X.quali
  target=res$target
  separate=res$separate

  if(dim(X.quanti)[1]!=dim(X.quali)[1]){
    stop("wrong data dimensions")
  }
  data<-cbind(X.quanti, X.quali)

  #discretize aggregation variables
  res<-disc.scores(res, discretize=discretize, seed=seed)
  cluster<-res$grouping
  k<-max(cluster)
  n<-max(cluster)
  data.scores<-res$group.data

  #separate target and other variables that need to be separated
  if(!is.null(target)){
    if(!(target%in%colnames(data))){
      stop("Could not find target variable")
    } else {
      if (target%in%colnames(X.quanti)){
        target.data<-X.quanti[target]
      } else {
        target.data<-X.quali[target]
      }
        target.number<-which(colnames(data)==target)
        cl.target<-cluster[target]
        cl.target.names<-names(which(cluster==cl.target))
        cl.target.names<-cl.target.names[-which(cl.target.names==target)]
        if(length(cl.target.names)==0){
        colnames(data.scores)[cl.target]<-target
        data.scores[,cl.target]<-target.data
        } else {
        #do pca
        pc<-PCAmix.groupbn(X.quanti, X.quali, cl.target.names, seed=seed)
        res$pca.param[[cl.target]]<-pc
        #discretize score
        if(discretize){
          disc<-discretize.dens(pc$scores[,1], cluster=T, seed=seed)
          data.scores[,cl.target]<-as.factor(disc$discretized)
          res$disc.param[[cl.target]]<-disc$levels
          data.scores<-cbind(data.scores, target.data)
          colnames(data.scores)[length(colnames(data.scores))]<-target
          cluster[target]<-n+1
          res$disc.param<-lappend(res$disc.param, NULL)
          res$pca.param<-lappend(res$pca.param, NULL)
          n<-n+1
        } else {
          data.scores[,cl.target]<-pc$scores[,1]
          res$disc.param[[cl.target]]<-NULL
          data.scores<-cbind(data.scores, target.data)
          colnames(data.scores)[length(colnames(data.scores))]<-target
          cluster[target]<-n+1
          res$disc.param<-lappend(res$disc.param, NULL)
          res$pca.param<-lappend(res$pca.param, NULL)
          n<-n+1
        }
        }
    }
  }
  if(!is.null(separate)){
    for (sep in 1:length(separate)){
      variable<-separate[sep]
      if(!(variable%in%colnames(data))){
        stop("Could not find variable")
      } else {
        variable.number<-which(colnames(data)==variable)
        cl.variable<-cluster[variable.number]
        cl.variable.names<-names(which(cluster==cl.variable))
        cl.variable.names<-cl.variable.names[-which(cl.variable.names==variable)]
        if(length(cl.variable.names)==0){
          colnames(data.scores)[cl.variable]<-variable
          if (variable%in%colnames(X.quanti)){
            #discretize data
            if(discretize){
            disc<-discretize.dens(X.quanti[,variable], seed=seed)
            variable.data<-as.factor(disc$discretized)
            #res$pca.param[[cl.variable]]<-NULL
            res$disc.param[[cl.variable]]<-disc$levels
            } else {
              variable.data<-X.quanti[,variable]
            }
          } else if (variable%in%colnames(X.quali)){
            variable.data<-X.quali[variable]
          }
          data.scores[,cl.variable]<-variable.data
        } else {
        res$disc.param<-lappend(res$disc.param, NULL)
        res$pca.param<-lappend(res$pca.param, NULL)
        if (variable%in%colnames(X.quanti)){
          #discretize data
          if(discretize){
            disc<-discretize.dens(X.quanti[,variable], seed=seed)
            variable.data<-as.factor(disc$discretized)
            #res$pca.param[[n+1]]<-NULL
            res$disc.param[[n+1]]<-disc$levels
          }else {
            variable.data<-X.quanti[,variable]
          }
        } else if (variable%in%colnames(X.quali)){
          variable.data<-X.quali[variable]
        }
        #do pca
        pc<-PCAmix.groupbn(X.quanti, X.quali, cl.variable.names, seed=seed)
        res$pca.param[[cl.variable]]<-pc
        #discretize score
        if(discretize){
          disc<-discretize.dens(pc$scores[,1], cluster=T, seed=seed)
          data.scores[,cl.variable]<-as.factor(disc$discretized)
          res$disc.param[[cl.variable]]<-disc$levels
        } else {
          data.scores[,cl.variable]<-pc$scores[,1]
        }
        data.scores<-cbind(data.scores, variable.data)
        colnames(data.scores)[length(colnames(data.scores))]<-variable
        cluster[variable]<-n+1
        n<-n+1
        }
      }
    }
  }

  for (i in 1:k){
    if(is.null(res$pca.param[[i]])){
      cl.variable.names<-names(which(cluster==i))
      pc<-PCAmix.groupbn(X.quanti, X.quali, cl.variable.names, seed=seed)
      res$pca.param[[i]]<-pc
      #discretize score
      if(discretize){
      disc<-discretize.dens(pc$scores[,1], cluster=T, seed=seed)
      #data.scores[,cl.variable]<-as.factor(disc$discretized)
      res$disc.param[[i]]<-disc$levels
      }
    }
  }
  res$group.data<-data.scores
  res$grouping<-cluster
  res$k<-k
  return(res)
}

#append an element to a list (as a single new list entry)
lappend <- function (lst, x){
  lst <- c(lst, list(x))
  return(lst)
}

#add elements of list of lists into separate lists
#rearrange.listoflists <- function (lst){
#  outcome<-vector(mode = "list", length = length(lst[[1]]))
#for (i in 1:length(lst[[1]])){
#    outcome[[i]]<-vector(mode = "list", length = length(lst))
#    for(j in 1:length(lst)){
#      outcome[[i]][[j]]<-lst[[j]][[i]]
#    }
#  }
#  return(outcome)
#}

#get nth element of several lists as a list
comb.output.scores<-function(x,...,n=13){
  sapply(seq_along(x), FUN=function(i) c(rlist::list.last(x[[i]][[n]][[1]])))
}

#Learn Network
#data: Dataframe without missing values
network<-function(data, start=NULL, R=200, restart=5, perturb=max(1,ifelse(is.null(start), round(0.1*dim(data)[2]), round(0.1*bnlearn::narcs(start)))), blacklist=NULL, arc.thresh=NULL, struct.alg=NULL, boot=TRUE, debug=F, seed=NULL){
  if(debug){message("learning arcs")}
  if(!is.null(seed)){set.seed(seed)}
  if(is.null(struct.alg)){stop("Please specify a structure learning algorithm.")}
  data<-as.data.frame(data)
  #print(head(data)) #debugging
  #arcs<-suppressWarnings(bnlearn::boot.strength(data, R=R, algorithm = "hc", algorithm.args = list(restart=restart, perturb=perturb, start=start, blacklist=blacklist)))
  if(boot){
    if(struct.alg=="tabu"){
      arcs<-bnlearn::boot.strength(data, R=R, algorithm = struct.alg, algorithm.args = list(start=start, blacklist=blacklist))
      net<-bnlearn::averaged.network(arcs)
      if(!is.null(arc.thresh)){
        net<-bnlearn::averaged.network(arcs, threshold = arc.thresh)
      }
      res <- try(bnlearn::cextend(net), silent = T)
      #try if extension is possible
      count<-1
      while(inherits(res, "try-error")&count<=10){
        message("try-error")
        count<-count+1
        arcs<-bnlearn::boot.strength(data, R=R, algorithm = struct.alg, algorithm.args = list(blacklist=blacklist))
        net<-bnlearn::averaged.network(arcs)
        if(!is.null(arc.thresh)){
          net<-bnlearn::averaged.network(arcs, threshold = arc.thresh)
        }
        res <- try(bnlearn::cextend(net))
      }
      if(count>10){stop("No extension of the net is possible.")}
    } else if(struct.alg=="hc"){
      arcs<-bnlearn::boot.strength(data, R=R, algorithm = struct.alg, algorithm.args = list(restart=restart, perturb=perturb, start=start, blacklist=blacklist))
      net<-bnlearn::averaged.network(arcs)
      if(!is.null(arc.thresh)){
        net<-bnlearn::averaged.network(arcs, threshold = arc.thresh)
      }
      res <- try(bnlearn::cextend(net), silent = T)
      #try if extension is possible
      count<-1
      while(inherits(res, "try-error")&count<=10){
        message("try-error")
        count<-count+1
        arcs<-bnlearn::boot.strength(data, R=R, algorithm = struct.alg, algorithm.args = list(restart=restart, perturb=perturb,  blacklist=blacklist))
        net<-bnlearn::averaged.network(arcs)
        if(!is.null(arc.thresh)){
          net<-bnlearn::averaged.network(arcs, threshold = arc.thresh)
        }
        res <- try(bnlearn::cextend(net))
      }
      if(count>10){stop("No extension of the net is possible.")}
    } else {
      stop("bootstrapping is only implemented in combination with score-based algorithms.")
    }
  } else {
    bnlearn.struct.alg<-match.fun(struct.alg)
    net<-bnlearn.struct.alg(data, blacklist=blacklist)
    res <- try(bnlearn::cextend(net), silent = T)
    count<-1
    while(inherits(res, "try-error")&count<=10){
      message("try-error")
      count<-count+1
      net<-bnlearn.struct.alg(data, blacklist=blacklist)
      res <- try(bnlearn::cextend(net))
    }
    if(count>10){stop("No extension of the net is possible.")}
    arcs<-NULL
  }
  return(list(net=net, arc.confid=arcs))
}
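
#Example (illustrative sketch, not run; assumes bnlearn's learning.test data):
# data(learning.test, package = "bnlearn")
# res <- network(learning.test, R = 50, struct.alg = "hc", boot = TRUE, seed = 1)
# res$net                # averaged network structure
# head(res$arc.confid)   # bootstrapped arc strengths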

#create a table listing the variables of each group, sorted by similarity to the group representative
#res: groupbn object
#with.scores: Boolean, if TRUE the similarity scores are appended to the variable names
groupbn.output.table<-function(res, with.scores=TRUE){
  net<-res$bn
  X.quali<-res$X.quali
  X.quanti<-res$X.quanti
  data.scores<-res$group.data
  cluster<-res$grouping

  data<-cbind(X.quanti, X.quali)
  #Calculate similarity scores of synthetic variables and original variables
  #calculate similarity score to the cluster representative
  scores<-cluster
  names(scores)<-colnames(data)
  for (i in 1:length(scores)){
    if(nlevels(factor( data[stats::complete.cases(data.scores)&stats::complete.cases(data),i]))==1|nlevels(factor(data.scores[stats::complete.cases(data.scores)&stats::complete.cases(data),cluster[i]]))==1){
      scores[i]<-NA
    } else if(is.factor(data[stats::complete.cases(data.scores)&stats::complete.cases(data),i])){
      scores[i]<-ClustOfVar::mixedVarSim(factor(data.scores[stats::complete.cases(data.scores)&stats::complete.cases(data),cluster[i]]), factor(data[stats::complete.cases(data.scores)&stats::complete.cases(data),i]))
    } else {
      scores[i]<-ClustOfVar::mixedVarSim(factor(data.scores[stats::complete.cases(data.scores)&stats::complete.cases(data),cluster[i]]), data[stats::complete.cases(data.scores)&stats::complete.cases(data),i])
    }
  }

  #create data frame with information about cluster and similarity
  names<-paste("cluster",1:max(cluster), sep="")
  max.len<-max(summary(factor(cluster)))
  #first column
  df<-data.frame(c(names(sort(scores[names(which(cluster==1))], decreasing=T, na.last=TRUE)), rep("", max.len - length(names(which(cluster==1))))),stringsAsFactors =FALSE)
  colnames(df)<-names[1]
  #other columns
  for (i in 2:max(cluster)){
    x<-names(sort(scores[names(which(cluster==i))], decreasing=T, na.last=TRUE))
    df[,i]<-c(x, rep("", max.len - length(x)))
    colnames(df)[i]<-names[i]
  }

  #add scores
  if(with.scores){
  for (i in 1:max.len){
    for (j in 1:max(cluster)){
      if (df[i,j]!=""){
        if(is.na(scores[df[i,j]])){
          df[i,j]<-paste(df[i,j], ": NA", sep="")
        }
        else if (round(scores[df[i,j]],2)==0){
          df[i,j]<-paste(df[i,j], ": <0.01", sep="")
        } else{
          df[i,j]<-paste(df[i,j], ": ",round(scores[df[i,j]],2), sep="")
        }
      }
    }
  }
  }
  invisible(df)
}

#run PCAmix for a given vector of variable names (splits them into quantitative and qualitative parts and calls PCAmix with the right configuration)
#X.quanti, X.quali: data
#names: variable names for which a PCA should be done
#graph: Boolean, if TRUE PCA plots are produced
PCAmix.groupbn<-function(X.quanti, X.quali, names, graph=FALSE, seed=NULL){
  #split to quali and quantitative variables
  if (!is.null(seed)) {set.seed(seed)}
  quali.names<-names[which(names%in%colnames(X.quali))]
  quanti.names<-names[which(names%in%colnames(X.quanti))]
  if (length(quanti.names)==1&length(quali.names)>0){
    #exactly one quantitative and at least one qualitative variable: discretize the quantitative one and treat all as qualitative
    tempd<-discretize.dens(X.quanti[,quanti.names], seed=seed)
    quan2qual<-tempd$discretized
    quan2qual<-as.data.frame(quan2qual)
    colnames(quan2qual)<-quanti.names
    pc<-PCAmixdata::PCAmix(X.quali=cbind(X.quali[,quali.names, drop=F], quan2qual), rename.level = T, graph=graph, ndim=2)
    attr(pc, "breaks")<-tempd$levels
  } else if (length(quanti.names)>0&length(quali.names)==0){
    #Only Quanti
    pc<-PCAmixdata::PCAmix(X.quanti=X.quanti[,quanti.names, drop=F], graph=graph, rename.level=TRUE)
  } else if (length(quanti.names)==0&length(quali.names)>0){
    #Only Quali
    pc<-PCAmixdata::PCAmix(X.quali=X.quali[,quali.names, drop=F], graph=graph, rename.level=TRUE)
  } else if (length(quanti.names)>0&length(quali.names)>0){
    #both
    df1<-X.quanti[,quanti.names, drop=F]
    df2<-X.quali[,quali.names, drop=F]

    pc<-PCAmixdata::PCAmix(X.quanti=df1, X.quali=df2, graph=graph, rename.level=TRUE)
  }
  return(pc)
}
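
#Example (illustrative sketch, not run; synthetic mixed data made up for this comment):
# set.seed(1)
# X.quanti <- data.frame(a = rnorm(50), b = rnorm(50), c = rnorm(50))
# X.quali  <- data.frame(g = factor(sample(c("x", "y", "z"), 50, replace = TRUE)))
# pc <- PCAmix.groupbn(X.quanti, X.quali, names = c("a", "b", "c", "g"))
# head(pc$scores[, 1])   # first principal component used as group representative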

plot.groupbn<-function(x,...){
  plot(bnlearn::cpdag(x$bn), highlight=c(x$target), color="coral1",...)
  print(x)
}

#identify qualitative variables that are effectively constant
#(all observations fall into a single category)
rem.id.col<-function(X.quali){
  t<-lapply(X.quali, table)
  vars.rem<-c()
  for (i in 1:length(t)){
    if(sum(t[[i]]==0)>=length(t[[i]])-1){
      vars.rem<-c(vars.rem, names(t)[i])
    }
  }
  return(vars.rem)
}
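
#Example (illustrative sketch, not run): "g2" has all observations in a single category and is flagged
# X.quali <- data.frame(g1 = factor(c("a", "b", "a", "b")),
#                       g2 = factor(c("a", "a", "a", "a"), levels = c("a", "b")))
# rem.id.col(X.quali)   # "g2"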

#get the division of a cluster into its two child clusters; vars = indices of the cluster's variables
split.cluster<-function(vars, ME){
  nr<-get.number(vars, ME)
  split.nr<-ME[nr,]
  cl1<-get.cluster(split.nr[1], ME)
  cl2<-get.cluster(split.nr[2], ME)
  return(list(cl1, cl2))
}
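
#Example (illustrative sketch, not run): with an hclust-style merge matrix as in the get.cluster example,
#the step-2 cluster {3, 1, 2} splits into {3} and {1, 2}
# ME <- rbind(c(-1, -2), c(-3, 1), c(-4, 2))
# split.cluster(c(3, 1, 2), ME)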

#Parameter fitting and several validation scores
#if do.fit=TRUE, fitting is done within the function
#if do.fit=FALSE, net must be an already fitted object of class bn.fit; method %in% c("parents", "bayes-lw")
#n: number of samples used for probability estimation
validation<-function(net, data, target, thresh=0.5, do.fit=TRUE, n=2000 , from=NULL, method="bayes-lw", debug=F, seed=NULL){
  if (!is.null(seed)) {set.seed(seed)}
  if(do.fit){
    if(debug){message("fitting")}
    if (is.factor(data[[target]])&all(sapply(data, is.factor))){
      fit<-bnlearn::bn.fit(bnlearn::cextend(net), data, method="bayes")
    } else {
      fit<-bnlearn::bn.fit(bnlearn::cextend(net), data)
    }
 } else {
    fit=net
    for (clus in 1:bnlearn::nnodes(fit)){
      if (any(levels(data[,clus])!=rownames(fit[[clus]][[4]]))){
        message("Attention: level names of cluster ", clus, " were changed")
        levels(data[,clus])<-rownames(fit[[clus]][[4]])
      }
    }
 }
  if(debug){message("validation")}
  #save actual values
  obs<-data[[target]]
  if(!is.factor(obs)){
    temp<-sapply(1:dim(data)[1], function(j){
      if(sum(is.na(data[j, -which(colnames(data)==target)]))==0){
        prob<-replicate(10, predict(object=fit, node=target, data=data[j, -which(colnames(data)==target), drop=F], method = method, n=n))
        return(c(mean(prob), stats::sd(prob)))
      }
      else{
        return(c(NA, NA))
      }
    })
    p<-temp[1,]
    sdprob<-temp[2,]
    rm(temp)

    #transform probabilities to predictions
    result<-MLmetrics::MSE(p, obs)
    attr(result, "pscores")<-p
    psd<-p
    psd[p>obs]<-psd[p>obs]-sdprob[p>obs]
    psd[p<=obs]<-psd[p<=obs]+sdprob[p<=obs]
    result.error<-MLmetrics::MSE(psd, obs)
    attr(result, "error.th")<-result.error+(abs(result-result.error))/2
    x<-paste("cor: ",round(cor(p, obs),2),"; R-sq: ", round(cor(p, obs)^2,2))
    if(debug){message(x)}
    attr(result, "scores") <- x
    return(result)
  } else if(nlevels(factor(obs))<2|nlevels(factor(obs))>4){
    stop('implemented only for a categorical target variable with 2-4 levels')
  } else if(nlevels(factor(obs))>2&all(sapply(data, is.factor))){
    k<-nlevels(obs)
    levels(obs)<-0:(k-1)
    p<-matrix(0, dim(data)[1], k)
    for (j in 1:dim(data)[1]){
      if(sum(is.na(data[j, -which(colnames(data)==target)]))==0){
        p[j,]<-attr(predict(object=fit, node=target, data=data[j, -which(colnames(data)==target), drop=F], method = method, n=n, prob = TRUE), "prob")
      }
      else{
        p[j,]<-rep(NA,k)
      }
    }
    #transform probabilities to predictions
    predictions<-apply(p,1, which.max)-1
    predictions<-factor(predictions, levels=c(0:(k-1)))

    #calculate results
    result<-cross.en(p, obs, weighted=T)
    x<-paste("cross-entr.: ", round(result/sum(!is.na(p[,1])),2))
    attr(result, "scores") <- x
    attr(result, "auc")<- NA
    attr(result, "error.th")<- result/sum(!is.na(p[,1]))
    attr(result, "confusion") <- table(obs, predictions)
    attr(result, "pscores")<-p
    result<-result/sum(!is.na(p[,1]))
    if(debug){print(table(predictions,obs))}
    return(result)
  } else if (nlevels(factor(obs))==2&all(sapply(data, is.factor))) {
  levels(obs)<-c(0,1)
  temp<-sapply(1:dim(data)[1], function(j){
    if(sum(is.na(data[j, -which(colnames(data)==target)]))==0){
      if(is.null(from)){
        prob<-replicate(10, attr(predict(object=fit, node=target, data=data[j, -which(colnames(data)==target), drop=F], method = method, n=n, prob = TRUE), "prob")[1])
      } else {
        prob<-replicate(10, attr(predict(object=fit, node=target, data=data[j, -which(colnames(data)==target), drop=F], method = method, n=n, prob = TRUE, from=from), "prob")[1])
      }
      return(c(mean(prob), stats::sd(prob)))
    }
    else{
      return(c(NA, NA))
    }
  })
  p<-temp[1,]
  sdprob<-temp[2,]
  rm(temp)

  #transform probabilities to predictions
  predictions<-p
  predictions[p>=thresh]<-0
  predictions[p<thresh]<-1
  predictions<-factor(predictions, levels=c(0,1))

  #calculate different scores
  f11<-MLmetrics::F1_Score(y_true=obs[!is.na(predictions)], y_pred = predictions[!is.na(predictions)], positive=1)
  pre1<-MLmetrics::Precision(y_true=obs[!is.na(predictions)], y_pred = predictions[!is.na(predictions)], positive=1)
  re1<-MLmetrics::Recall(y_true=obs[!is.na(predictions)], y_pred = predictions[!is.na(predictions)], positive=1)
  pr.cu<-PRROC::pr.curve(scores.class0 = 1-p[!is.na(p)], weights.class0 = as.numeric(levels(obs[!is.na(p)]))[obs[!is.na(p)]], curve=T, rand.compute = T, min.compute = T, max.compute = T)
  #plot(pr.cu,max.plot = TRUE, min.plot = TRUE, rand.plot = TRUE, fill.area = TRUE, auc.main=TRUE)
  pr1<-pr.cu$auc.integral
  roc.cu<-PRROC::roc.curve(scores.class0 = 1-p[!is.na(p)], weights.class0 = as.numeric(levels(obs[!is.na(p)]))[obs[!is.na(p)]], curve=T, rand.compute = T, min.compute = T, max.compute = T)
  #plot(roc.cu,max.plot = TRUE, min.plot = TRUE, rand.plot = TRUE, fill.area = TRUE, auc.main=TRUE)
  pr2<-roc.cu$auc
  result<-cross.en(p, obs)
  result.error<-cross.en(p, obs, weighted=T,sdpred=sdprob)
  x<-paste("F1: ",round(f11,2),"; Precision: ", round(pre1,2), "; Recall: ", round(re1,2), "; AUC-PR: ", round(pr1,3), "; AUC-ROC: ", round(pr2,3), "; cross-entr.: ", round(result/sum(!is.na(p)),2))
  if(debug){message(x)}
  attr(result, "scores") <- x
  attr(result, "auc")<-roc.cu$auc
  attr(result, "error.th")<-(result.error+(result-result.error)*3/4)/sum(!is.na(p))
  attr(result, "confusion")<-table(obs, predictions)
  attr(result, "pscores")<-1-p
  if(debug){print(table(pred=predictions,obs=obs))}
  result<-result/sum(!is.na(p))
  return(result)
  } else if (nlevels(factor(obs))==2&!all(sapply(data, is.factor))) {
    levels(obs)<-c(0,1)
    p<-c()
    for(j in 1:dim(data)[1]){
      if(sum(is.na(data[j, -which(colnames(data)==target)]))==0){
        if(is.null(from)){
          p<-c(p,predict(object=fit, node=target, data=data[j, -which(colnames(data)==target), drop=F], method = method, n=n))
        } else {
          p<-c(p,predict(object=fit, node=target, data=data[j, -which(colnames(data)==target), drop=F],from=from, method = method, n=n))
        }
      } else{
        p<-c(p,NA)
      }
    }
    #transform probabilities to predictions
    predictions<-p
    #calculate different scores
    f11<-MLmetrics::F1_Score(y_true=obs[!is.na(predictions)], y_pred = predictions[!is.na(predictions)], positive=1)
    pre1<-MLmetrics::Precision(y_true=obs[!is.na(predictions)], y_pred = predictions[!is.na(predictions)], positive=1)
    re1<-MLmetrics::Recall(y_true=obs[!is.na(predictions)], y_pred = predictions[!is.na(predictions)], positive=1)
    x<-paste("F1: ",round(f11,2),"; Precision: ", round(pre1,2), "; Recall: ", round(re1,2))
    result<-f11
    if(debug){message(x)}
    attr(result, "scores") <- x
    attr(result, "error.th")<-result
    attr(result, "confusion")<-table(obs, predictions)
    attr(result, "pscores")<-p
    if(debug){print(table(pred=predictions,obs=obs))}
    result<-result
    return(result)
  }
}
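
#Example (illustrative sketch, not run; assumes bnlearn's learning.test data, all-discrete, target with 3 levels):
# data(learning.test, package = "bnlearn")
# net <- bnlearn::hc(learning.test)
# val <- validation(net, learning.test[1:200, ], target = "A", n = 500, seed = 1)
# val                    # normalized weighted cross entropy
# attr(val, "scores")    # formatted score summary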


#predict method for groupbn objects: map new data onto the learned groups and predict the target
predict.groupbn<-function(object, X.quanti, X.quali, rename.level=FALSE, return.data=FALSE, new.fit=FALSE, debug=FALSE,...){
  if(length(setdiff(colnames(X.quali),colnames(object$X.quali)))>0 | length(setdiff(colnames(object$X.quali),colnames(X.quali)))>0){
    stop("The variables in X.quali must be the same in the model and in the testdata.")
  }
  if(length(setdiff(colnames(X.quanti),colnames(object$X.quanti)))>0 | length(setdiff(colnames(object$X.quanti),colnames(X.quanti)))>0){
    stop("The variables in X.quanti must be the same in the model and in the testdata.")
  }
  if(debug){message("Data checked.")}
  #initialize a data.frame
  df <- data.frame(matrix(ncol = max(object$grouping), nrow = nrow(X.quanti)))
  colnames(df)<-paste("cl", levels(as.factor(object$grouping)), sep="")
  #initialize a score vector
  scores<-object$grouping

  #Predict Principal Components
  if(debug){message("Predict PCA:")}
  for (clus in 1:max(object$grouping)){
    if(debug){message(clus, " of ", max(object$grouping))}
    #split quali and quanti variables of a cluster
    quanti.names<-names(which(object$grouping==clus))[which(names(which(object$grouping==clus))%in%colnames(X.quanti))]
    quali.names<-names(which(object$grouping==clus))[which(names(which(object$grouping==clus))%in%colnames(X.quali))]
    if(!is.null(object$pca.param[[clus]])){
      if(!is.null(object$pca.param[[clus]]$quanti)){
        quanti.names<-rownames(object$pca.param[[clus]]$quanti$coord)[which(rownames(object$pca.param[[clus]]$quanti$coord)%in%quanti.names)]
      }
      if(!is.null(object$pca.param[[clus]]$quali)){
        quali.names<-rownames(object$pca.param[[clus]]$quali$contrib)[which(rownames(object$pca.param[[clus]]$quali$contrib)%in%quali.names)]
      }
      nr.vars<-length(quanti.names)+length(quali.names)

      #test for all identical factor levels
      uni<-apply(X.quali[,quali.names, drop=F],2, FUN=function(x){length(stats::na.omit(unique(x)))})
      if (any(uni==1)){
        #add hypothetical sample with missing levels
        nr.added<-length(which(uni==1))
        lvl.miss<-rep(0,length(which(uni==1)))
        for (unix in 1:nr.added){
          lvl.miss[unix]<-length(names(which(table(X.quali[,names(which(uni==1))[unix]])==0)))
        }
        X.quanti1<-X.quanti
        X.quali1<-X.quali
        for (unix in 1:nr.added){
          lvl.nr<-names(which(table(X.quali[,names(which(uni==1))[unix]])==0))
          for(lvl in 1:length(lvl.nr)){
            X.quanti1<-rbind(X.quanti1, X.quanti[1,])
            X.quali1<-rbind(X.quali1, X.quali[1,])
            X.quali1[dim(X.quali1)[1],names(which(uni==1))[unix]]<-lvl.nr[lvl]
            if(lvl>1){
              nr.added<-nr.added+1
            }
          }
        }
        #predict and save without hypothetical sample
        if (length(quanti.names)==1&length(quali.names)>1){
          quan2qual<-arules::discretize(X.quanti1[,quanti.names], method="fixed", breaks=attr(object$pca.param[[clus]], "breaks"))
          quan2qual<-as.data.frame(quan2qual)
          colnames(quan2qual)<-quanti.names
          df[,clus]<-predict(object$pca.param[[clus]], X.quali=cbind(X.quali1[,quali.names, drop=F], quan2qual), rename.level = T, graph=FALSE, ndim=2)[-c((nrow(X.quali1)-nr.added+1):nrow(X.quali1)),1]
          message(quanti.names, "removed")
        } else if (length(quanti.names)==0&length(quali.names)==1){
          df[,clus]<-factor(X.quali1[,quali.names])[1:(length(X.quali1[,quali.names])-1)]
          colnames(df)[clus]<-quali.names
        } else if (length(quanti.names)>0&length(quali.names)==1){
          df[,clus]<-predict(object$pca.param[[clus]], X.quanti=X.quanti[,quanti.names], X.quali=X.quali1[,quali.names, drop=F])[-c((nrow(X.quali1)-nr.added+1):nrow(X.quali1)),1]
        } else if(length(quanti.names)==1&length(quali.names)==0){
          df[,clus]<-X.quanti[,quanti.names, drop=F][1:(length(X.quali1[,quali.names])-1)]
        } else if (length(quanti.names)>0&length(quali.names)>0){
          df[,clus]<-predict(object$pca.param[[clus]], X.quanti=X.quanti1[,quanti.names], X.quali=X.quali1[,quali.names])[-c((nrow(X.quali1)-nr.added+1):nrow(X.quali1)),1]
        } else if(length(quanti.names)>0){
          df[,clus]<-predict(object$pca.param[[clus]], X.quanti=X.quanti1[,quanti.names])[-c((nrow(X.quali1)-nr.added+1):nrow(X.quali1)),1]
        } else {
          df[,clus]<-predict(object$pca.param[[clus]], X.quali=X.quali1[,quali.names])[-c((nrow(X.quali1)-nr.added+1):nrow(X.quali1)),1]
        }
      }else{
        if (length(quanti.names)==1&length(quali.names)>1){
          quan2qual<-arules::discretize(X.quanti[,quanti.names], method="fixed", breaks=attr(object$pca.param[[clus]], "breaks"))
          quan2qual<-as.data.frame(quan2qual)
          colnames(quan2qual)<-quanti.names
          df[,clus]<-predict(object$pca.param[[clus]], X.quali=cbind(X.quali[,quali.names, drop=F], quan2qual), rename.level = T, graph=FALSE, ndim=2)[,1]
        } else if (length(quanti.names)==0&length(quali.names)==1){
          df[,clus]<-factor(X.quali[,quali.names])
          colnames(df)[clus]<-quali.names
        } else if (length(quanti.names)==1&length(quali.names)==0){
          df[,clus]<-X.quanti[,quanti.names]
          colnames(df)[clus]<-quanti.names
        } else if (length(quanti.names)>0&length(quali.names)==1){
          df[,clus]<-predict(object$pca.param[[clus]], X.quanti=X.quanti[,quanti.names], X.quali=X.quali[,quali.names, drop=F])[,1]
        } else if(length(quanti.names)==1&length(quali.names)==0){
          df[,clus]<-X.quanti[,quanti.names, drop=F]
        } else if (length(quanti.names)>0&length(quali.names)>0){
          df[,clus]<-predict(object$pca.param[[clus]], X.quanti=X.quanti[,quanti.names], X.quali=X.quali[,quali.names])[,1]
        } else if(length(quanti.names)>0){
          df[,clus]<-predict(object$pca.param[[clus]], X.quanti=X.quanti[,quanti.names])[,1]
        } else {
          df[,clus]<-predict(object$pca.param[[clus]], X.quali=X.quali[,quali.names])[,1]
        }
      }
    } else if (length(quanti.names)==1&length(quali.names)==0){
      df[,clus]<-X.quanti[,quanti.names, drop=F]
    } else if (length(quanti.names)==0&length(quali.names)==1){
      df[,clus]<-X.quali[,quali.names, drop=F]
    }
  }
  #Discretize with given intervals
  if(debug){message("Discretization:")}
  for (clus in 1:max(object$grouping)){
    if(debug){message(clus, " of ", max(object$grouping))}
    if(all(!is.null(object$disc.param[[clus]]))){
      d<-arules::discretize(df[,clus], method="fixed", breaks=object$disc.param[[clus]])
      if(rename.level){
        levels(d)<-1:(length(object$disc.param[[clus]])-1)
      }
      df[,clus]<-d
    }
  }

  if(new.fit){
    if(debug){message("New fit.")}
    object$fit<-bnlearn::bn.fit(object$bn, df, method="bayes")
  }

  if(debug){message("Prediction.")}
  data<-cbind(X.quanti, X.quali)
  obs<-data[[object$target]]
  if(!is.factor(obs)){
    #continuous target
    temp<-sapply(1:dim(data)[1], function(j){
      if(sum(is.na(df[j, -which(colnames(df)==object$target)]))==0){
        prob<-replicate(10, predict(object=object$fit, node=object$target, data=df[j,], method = "bayes-lw"))
        c(mean(prob))
      }
      else{
        c(NA)
      }
    })
    temp<-as.data.frame(temp)
    temp$"target"<-obs
    colnames(temp)<-c("pred","target")
    if(return.data){
      return(list(pred=temp, data=df))
    } else {
      return(temp)
    }
  } else {
    obs<-factor(obs, levels=levels(object$group.data[[object$target]]))
    if(nlevels(obs)<2|nlevels(obs)>4){
      stop('implemented only for a categorical target variable with 2-4 levels')
    } else if(nlevels(obs)>2){
      k<-nlevels(obs)
      levels(obs)<-0:(k-1)
      p<-matrix(0, dim(data)[1], k)
      for (j in 1:dim(data)[1]){
        if(sum(is.na(df[j, -which(colnames(df)==object$target)]))==0){
          p[j,]<-attr( predict(object=object$fit, node=object$target, data=df[j, , drop=FALSE], method = "bayes-lw", prob = TRUE), "prob")
        }
        else{
          p[j,]<-rep(NA,k)
        }
      }
      return(p)
    } else {
      #discrete target
      levels(obs)<-c(0,1)
      data<-cbind(X.quanti, X.quali)
      temp<-sapply(1:dim(data)[1], function(j){
        if(sum(is.na(df[j, -which(colnames(df)==object$target)]))==0){
          prob<-replicate(10, attr(predict(object=object$fit, node=object$target, data=df[j, ,drop=FALSE], method = "bayes-lw", prob = TRUE), "prob")[2,])
          c(mean(prob))
        }
        else{
          c(NA)
        }
      })
      temp<-as.data.frame(temp)
      temp$"target"<-obs
      colnames(temp)<-c("pred", "target")
      if(return.data){
        return(list(pred=temp, data=df))
      } else {
        return(temp)
      }
    }
  }
}




#Create an interactive html network object with visNetwork (displaying similarity scores and the number of variables per group)
#res: groupbn object
#df: data frame from groupbn.output.table
#save.name: Name for file
groupbn.vis.html.plot<-function(res, df=NULL, save.file=TRUE, save.name=NULL, hierarchical=FALSE, nodecolor.all="#E0F3F8", nodecolor.special="cornflowerblue", main=NULL){
  if (is.null(df)){
    df<-groupbn.output.table(res)
  }
  if(is.null(main)){
    main=paste("Network model of ",res$target)
  }
  net<-res$bn
  cluster<-res$grouping
  arrows<-rep("to", dim(bnlearn::arcs(net))[1])
  undir<-bnlearn::undirected.arcs(bnlearn::cpdag(net))
  for (i in 1:dim(bnlearn::arcs(net))[1]){
    x<-bnlearn::arcs(net)[i,]
    set<-intersect(which(x[1]==undir[,1]),which(x[2]==undir[,2]))
    if(length(set)>0){
      arrows[i]<-"enabled"
    }
  }
  #node titles
  title<-paste("<b>Cluster", 1:max(cluster), "</b>")
  for (i in 1:dim(df)[2]){
    for (j in 1:dim(df)[1])
      if(df[j,i]!=""){
        title[i]<-paste(title[i], "<br>", df[j,i])
      }
  }
  #use most representative variables as names
  names_P2<-df[1,]
  for (i in 1:length(names_P2)){
    names_P2[i]<-paste("[",i, "] ", stringr::str_split(names_P2[i], stringr::fixed(":"))[[1]][1], " (",length(which(cluster==i)),")", sep="")
  }
  #shorter cluster names with number of variables
  names_P3<-names_P2
  for (i in 1:length(names_P2)){
    names_P3[i]<-paste("Cl", i, " (", length(which(cluster==i)), ")", sep="")
  }
  names_P3[which(bnlearn::nodes(net)==res$target)]<-toupper(res$target)
  #nodecolors
  #all blue
  nodecolors<-rep(nodecolor.all, length(bnlearn::nodes(net)))
  #special nodes: other blue
  nodecolors[which(bnlearn::nodes(net)%in%c(res$target, res$separate))]<-nodecolor.special
  #node settings
  nodes <- data.frame(id = bnlearn::nodes(net),
                      shape = c(rep("ellipse", length(nodes(net)))),#shape of nodes
                      title = as.character(title),                #node title
                      color = nodecolors,                         #colors
                      label = as.character(names_P2),             #node labels
                      font=list(color="black"))                   #font color
                       #shadow = c(FALSE, TRUE)                   #shadow
  strength<-round(res$arc.confid$strength[as.numeric(rownames(plyr::match_df(as.data.frame(res$arc.confid[,1:2]), data.frame(sapply(as.data.frame(bnlearn::arcs(res$bn)), as.character), stringsAsFactors=FALSE), on=c("from", "to"))))],2)
  #edge settings
  edges <- data.frame(from = bnlearn::arcs(net)[,1],
                      to = bnlearn::arcs(net)[,2],
                      arrows=arrows,
                      color=rep("darkgray", dim(bnlearn::arcs(net))[1]),
                      font.color = "gray",
                      title=strength,
                      value=strength,
                      scaling=list(min=5, max=10))
  #build network
  if(hierarchical){
    graph<-visNetwork::visNetwork(nodes, edges, width = "100%", height="200%", main = main,
                      footer=paste("Group Bayesian Network.\n The thickness of an edge represents its confidence.", sep="")) %>%
      visNetwork::visHierarchicalLayout(blockShifting=TRUE, edgeMinimization=FALSE)%>%
      visNetwork::visOptions(highlightNearest = list(enabled = T, hover = T),nodesIdSelection = T)%>%
      visNetwork::visInteraction(navigationButtons = TRUE)

  } else {
    graph<-visNetwork::visNetwork(nodes, edges, main = main,
                      footer=paste("Group Bayesian Network.\n The thickness of an edge represents its confidence. Clusters are named by its most central variable. The number in brackets denotes the number of variables in the cluster.", sep="")) %>%
      #visNetwork::visIgraphLayout(type = "square")%>%
      visNetwork::visPhysics(solver = "forceAtlas2Based",
                 forceAtlas2Based = list(gravitationalConstant = -100))%>%
      visNetwork::visOptions(highlightNearest = list(enabled = T, hover = T),nodesIdSelection = T)%>%
      visNetwork::visInteraction(navigationButtons = TRUE)
  }
  if(save.file){
    #save as html object
    if(is.null(save.name)){
      save.name=paste(format(Sys.Date(), format="%Y%m%d"),"_GroupBN", sep="")
    }
    visNetwork::visSave(graph, file=paste(save.name, ".html", sep=""), selfcontained = TRUE, background = "white")
    message("html written. (", save.name, ".html)")
    invisible(graph)
   } else {
     #optionally: output in RStudio Viewer
     print(graph)
     invisible(graph)
   }
}

#is.groupbn
is.groupbn <- function(x){
  inherits(x, "groupbn")
}

#print.groupbn
print.groupbn<-function(x,...){
  if (!inherits(x, "groupbn"))
    stop("use only with \"groupbn\" objects")
  gnr <- dim(x$group.data)[2]
  separate<-x$separate
  gsz<- mean(summary(as.factor(x$grouping)))
  nbrsz<- mean(sapply(x$bn$nodes, function(y){length(y[[2]])}))
  cat("group Bayesian network (class 'groupbn') \n\n")
  cat("name of target variable: ", x$target, "\n",sep = "")
  if(length(bnlearn::mb(x$bn, x$target))==0){
    cat("Warning: Target variable is disconnected!\n")
  }
  if(!is.null(separate)){
    cat("separated: ", paste(separate, collapse=" & "), "\n",sep = "")
  }
  if (is.numeric(gnr)){
    cat("number of groups: ", gnr,"\n", sep = " ")
  }
  if(is.numeric(gsz)&is.numeric(nbrsz)){
    cat("average group size: ", round(gsz,2), " \naverage neighbourhood size: ", round(nbrsz,2), "\n")
  }
  cat("achieved scoring: \n", attr(x$score, "scores"), "; BIC (netw.): ", round(stats::BIC(x$fit, stats::na.omit(x$group.data)),2), "\n",sep = "")
  cat("\n")
  res <- matrix("", 13, 2)
  colnames(res) <- c("name", "description")
  res[1, ] <- c("$bn", "Bayesian network structure")
  res[2, ] <- c("$fit", "fitted Bayesian network (multinomial)")
  res[3, ] <- c("$arc.confid", "arc confidence")
  res[4, ] <- c("$X.quali", "qualitative variables in a data.frame")
  res[5, ] <- c("$X.quanti", "quantitative variables in a data.frame")
  res[6, ] <- c("$grouping", "group memberships")
  res[7, ] <- c("$k", "number of groups of initial grouping")
  res[8, ] <- c("$group.data", "group representatives used for network inference")
  res[9, ] <- c("$target", "name of target variable")
  res[10,] <- c("$separate", "name of any other separated variables")
  res[11, ] <- c("$pca.param", "pca parameters of each group")
  res[12, ] <- c("$disc.param", "discretization intervals of each group")
  res[13, ] <- c("$score", "cross entropy and additional scoring information")
  row.names(res) <- rep("", 13)
  print(res)
}

groupbn.build.blacklist<-function(data, separate){
  blacklist<-as.data.frame(matrix(0, (dim(data)[2])*length(separate), 2))
  colnames(blacklist)=c("From", "To")
  sep<-c()
  for(i in 1:length(separate)){
    sep<-c(sep, rep(separate[i],dim(data)[2]))
  }
  blacklist[,2]<-sep
  blacklist[,1]<-rep(colnames(data),length(separate))

  blacklist<-blacklist[-which(blacklist[,1]==blacklist[,2]),]
  return(blacklist)
}
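
#Example (illustrative sketch, not run): forbid all arcs pointing into the separated variables "y" and "z"
# data <- data.frame(x1 = 1, x2 = 1, y = 1, z = 1)
# groupbn.build.blacklist(data, separate = c("y", "z"))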
