R/gtclust.R

Defines functions compute_height collapse front plot.gtclust gtmethod_bayes_dirichlet gtmethod_bayes_dgmm gtmethod_bayes_mom gtmethod_chisq gtmethod_median gtmethod_centroid gtmethod_ward build_geotree order_tree order_tree_rec geocutree gtclust_graph gtclust_poly_dcsbm gtclust_lines gtclust_poly gtclust_knn gtclust_dist gtclust_delaunay gtclust_temp

Documented in front geocutree gtclust_delaunay gtclust_dist gtclust_graph gtclust_knn gtclust_lines gtclust_poly gtclust_poly_dcsbm gtclust_temp gtmethod_bayes_dgmm gtmethod_bayes_dirichlet gtmethod_bayes_mom gtmethod_centroid gtmethod_chisq gtmethod_median gtmethod_ward plot.gtclust

#' gtclust: A package for fast clustering of spatial or temporal data with contiguity constrained hierarchical clustering
#'
#' The mypackage package provides three categories of important functions:
#' gtclust_graph, gtclust_temp, gt_poly.
#' 
#' @section gtclust functions:
#' The mypackage functions ...
#'
#' @docType package
#' @name gtclust
#' @import Rcpp
#' @importFrom Rcpp evalCpp
#' @useDynLib gtclust, .registration=TRUE
NULL
#> NULL


#' @title Hierarchical clustering with contiguity constraints for temporal data
#'
#' @description This function take a data.frame and performs hierarchical clustering with contiguity constraints.
#' @param df an \code{\link[sf]{sf}} data.frame with polygons like features
#' @param method linkage criterion in ward (default) or average, median
#' @param scaling default scaling of the features in zscore (default) or raw (i.e. no scaling)
#' @param display_progress boolean to set progression bar
#' @return an \code{\link[stats]{hclust}} like object with additional slots
#' \describe{
#'   \item{data}{The numeric data (eventually scaled) used for the clustering}
#'   \item{centers}{The protoypes of each tree nodes}
#' }
#' @export
gtclust_temp=function(df,method="ward",scaling="raw",display_progress=FALSE){
  nT = nrow(df)
  nb = lapply(1:nT,\(it){
    nei = c(it-1,it+1)
    nei[nei>0 & nei<=nT]
  })
  hc_res=gtclust_graph(nb,df,method,scaling,display_progress)
  hc_res$call=sys.call()
  hc_res
}



#' @title Hierarchical clustering with contiguity constraints for point data with delaunay links 
#'
#' @description This function take a data.frame and performs hierarchical clustering with contiguity constraints.
#' @param df an \code{\link[sf]{sf}} data.frame with polygons like features
#' @param method linkage criterion in ward (default) or average, median
#' @param scaling default scaling of the features in zscore (default) or raw (i.e. no scaling)
#' @param display_progress boolean to set progression bar
#' @return an \code{\link[stats]{hclust}} like object with additional slots
#' \describe{
#'   \item{data}{The numeric data (eventually scaled) used for the clustering}
#'   \item{centers}{The protoypes of each tree nodes}
#'   \item{leafs_geometry}{geometries of the dendrogram leafs as an sfc list}
#'   \item{geotree}{geometries of the dendrogram no-leafs node as an sfc list}
#' }
#' @export
gtclust_delaunay=function(df,method="ward",scaling="raw",display_progress=FALSE){
  if(!methods::is(df,"sf")){
    stop("The dataset must be an sf data.frame.",call. = FALSE)
  }
  
  if(!all(sapply(sf::st_geometry(df),function(u){sf::st_is(u,"POINT")}))){
    stop("The dataset must contains only POINTS.",call. = FALSE)
  }
  df_nogeo=sf::st_drop_geometry(df)
  
  xy = sf::st_coordinates(df)[,1:2]
  delaunay = RTriangle::triangulate(RTriangle::pslg(xy))
  nb=rep(list(c()),nrow(df))
  for (il in 1:nrow(delaunay$E)){
    r = delaunay$E[il,]
    nb[[r[1]]]=c(nb[[r[1]]],r[2])
    nb[[r[2]]]=c(nb[[r[2]]],r[1])
  }

  hc_res=gtclust_graph(nb,df_nogeo,method,scaling,display_progress)
  hc_res$call=sys.call()
  # add geographical data
  hc_res$leafs_geometry = sf::st_geometry(df)
  hc_res$geotree = build_geotree(hc_res$merge,df)
  class(hc_res)=c(class(hc_res),"geoclust")
  hc_res
}


#' @title Hierarchical clustering with contiguity constraints for point data with distance threshold links 
#'
#' @description This function take a data.frame and performs hierarchical clustering with contiguity constraints.
#' @param df an \code{\link[sf]{sf}} data.frame with polygons like features
#' @param epsilon maximum distance allowed
#' @param method linkage criterion in ward (default) or average, median
#' @param scaling default scaling of the features in zscore (default) or raw (i.e. no scaling)
#' @param display_progress boolean to set progression bar
#' @return an \code{\link[stats]{hclust}} like object with additional slots
#' \describe{
#'   \item{data}{The numeric data (eventually scaled) used for the clustering}
#'   \item{centers}{The protoypes of each tree nodes}
#'   \item{leafs_geometry}{geometries of the dendrogram leafs as an sfc list}
#'   \item{geotree}{geometries of the dendrogram no-leafs node as an sfc list}
#' }
#' @export
gtclust_dist=function(df,epsilon,method="ward",scaling="raw",display_progress=FALSE){
  if(!methods::is(df,"sf")){
    stop("The dataset must be an sf data.frame.",call. = FALSE)
  }
  
  if(!all(sapply(sf::st_geometry(df),function(u){sf::st_is(u,"POINT")}))){
    stop("The dataset must contains only POINTS.",call. = FALSE)
  }
  df_nogeo=sf::st_drop_geometry(df)
  
  # buid graph
  buf = sf::st_buffer(df,epsilon)
  nb = sf::st_intersects(df,buf)
  class(nb)="list"
  
  hc_res=gtclust_graph(nb,df_nogeo,method,scaling,display_progress)
  hc_res$call=sys.call()
  # add geographical data
  hc_res$leafs_geometry = sf::st_geometry(df)
  hc_res$geotree = build_geotree(hc_res$merge,df)
  class(hc_res)=c(class(hc_res),"geoclust")
  hc_res
}

#' @title Hierarchical clustering with contiguity constraints for point data with knn links 
#'
#' @description This function take a data.frame and performs hierarchical clustering with contiguity constraints.
#' @param df an \code{\link[sf]{sf}} data.frame with polygons like features
#' @param k number of nearest neighbors to take for building the graph (the graph will be symmetric so some points may have in fine more neighbors)
#' @param method linkage criterion in ward (default) or average, median
#' @param scaling default scaling of the features in zscore (default) or raw (i.e. no scaling)
#' @param display_progress boolean to set progression bar
#' @return an \code{\link[stats]{hclust}} like object with additional slots
#' \describe{
#'   \item{data}{The numeric data (eventually scaled) used for the clustering}
#'   \item{centers}{The protoypes of each tree nodes}
#'   \item{leafs_geometry}{geometries of the dendrogram leafs as an sfc list}
#'   \item{geotree}{geometries of the dendrogram no-leafs node as an sfc list}
#' }
#' @export
gtclust_knn=function(df,k=3,method="ward",scaling="raw",display_progress=FALSE){
  if(!methods::is(df,"sf")){
    stop("The dataset must be an sf data.frame.",call. = FALSE)
  }
  
  if(!all(sapply(sf::st_geometry(df),function(u){sf::st_is(u,"POINT")}))){
    stop("The dataset must contains only POINTS.",call. = FALSE)
  }
  df_nogeo=sf::st_drop_geometry(df)
  
  # build graph
  xy = sf::st_coordinates(df)[,1:2]  
  knn = RANN::nn2(xy,k=k)
  # ensure symmetry and extract adjacency list from results
  nb = rep(list(c()),nrow(df))
  for (i in 1:nrow(xy)){
    knei = setdiff(knn$nn.idx[i,],i)
    nb[[i]]=unique(c(nb[[i]],knei))
    for(j in knn$nn.idx[i,]){
      nb[[j]]=unique(c(nb[[j]],i))
    }
  }

  
  hc_res=gtclust_graph(nb,df_nogeo,method,scaling,display_progress)
  hc_res$call=sys.call()
  # add geographical data
  hc_res$leafs_geometry = sf::st_geometry(df)
  hc_res$geotree = build_geotree(hc_res$merge,df)
  class(hc_res)=c(class(hc_res),"geoclust")
  hc_res
}



#' @title Hierarchical clustering with contiguity constraints between polygons
#'
#' @description This function take an \code{\link[sf]{sf}} data.frame and performs hierarchical clustering with contiguity constraints.
#' @param df an \code{\link[sf]{sf}} data.frame with polygons like features
#' @param method linkage criterion in ward (default) or average, median
#' @param scaling default scaling of the features in zscore (default) or raw (i.e. no scaling)
#' @param adjacency adjacency type to use  "rook" (default) or queen
#' @param display_progress boolean to set progression bar
#' @return an \code{\link[stats]{hclust}} like object with additional slots
#' \describe{
#'   \item{leafs_geometry}{geometries of the dendrogram leafs as an sfc list}
#'   \item{geotree}{geometries of the dendrogram no-leafs node as an sfc list}
#'   \item{data}{The numeric data (eventually scaled) used for the clustering}
#'   \item{centers}{The protoypes of each tree nodes}
#' }
#' @export
gtclust_poly=function(df,method="ward",adjacency="rook",scaling="raw",display_progress=FALSE){
  
  if(!methods::is(df,"sf")){
    stop("The dataset must be an sf data.frame.",call. = FALSE)
  }
  
  if(!all(sapply(sf::st_geometry(df),function(u){sf::st_is(u,"MULTIPOLYGON")}) | 
          sapply(sf::st_geometry(df),function(u){sf::st_is(u,"POLYGON")}))){
    stop("The dataset must contains only POLYGONS or MULTIPOLYGONS.",call. = FALSE)
  }
  df_nogeo=sf::st_drop_geometry(df)

  
  # build graph
  # see https://github.com/r-spatial/sf/issues/234#issuecomment-300511129 and ?st_relate
  if(adjacency=="rook"){
    nb = sf::st_relate(df, df, pattern = "F***1****")
  }else{
    nb = sf::st_relate(df,df, pattern = "F***T****")
  }
  class(nb)="list"
  hc_res=gtclust_graph(nb,df_nogeo,method,scaling,display_progress)
  hc_res$call=sys.call()
  # add geographical data
  hc_res$leafs_geometry = sf::st_geometry(df)
  hc_res$geotree = build_geotree(hc_res$merge,df)
  class(hc_res)=c(class(hc_res),"geoclust")
  hc_res
}


#' @title Hierarchical clustering with contiguity constraints between lines
#'
#' @description This function take an \code{\link[sf]{sf}} data.frame and performs hierarchical clustering with contiguity constraints.
#' @param df an \code{\link[sf]{sf}} data.frame with polygons like features
#' @param method linkage criterion in ward (default) or average, median
#' @param scaling default scaling of the features in zscore (default) or raw (i.e. no scaling)
#' @param display_progress boolean to set progression bar
#' @return an \code{\link[stats]{hclust}} like object with additional slots
#' \describe{
#'   \item{leafs_geometry}{geometries of the dendrogram leafs as an sfc list}
#'   \item{geotree}{geometries of the dendrogram no-leafs node as an sfc list}
#'   \item{data}{The numeric data (eventually scaled) used for the clustering}
#'   \item{centers}{The protoypes of each tree nodes}
#' }
#' @export
gtclust_lines=function(df,method="ward",scaling="raw",display_progress=FALSE){
  
  if(!methods::is(df,"sf")){
    stop("The dataset must be an sf data.frame.",call. = FALSE)
  }
  
  if(!all(sapply(sf::st_geometry(df),function(u){sf::st_is(u,"MULTILINESTRING")}) | 
          sapply(sf::st_geometry(df),function(u){sf::st_is(u,"LINESTRING")}))){
    stop("The dataset must contains only LINESTRINGS or MULTILINESTRINGS.",call. = FALSE)
  }
  df_nogeo=sf::st_drop_geometry(df)
  
  
  # build graph
  # see https://github.com/r-spatial/sf/issues/234#issuecomment-300511129 and ?st_relate
  
  nb = sf::st_relate(df,df, pattern = "F***T****")
  class(nb)="list"
  hc_res=gtclust_graph(nb,df_nogeo,method,scaling,display_progress)
  hc_res$call=sys.call()
  # add geographical data
  hc_res$leafs_geometry = sf::st_geometry(df)
  hc_res$geotree = build_geotree(hc_res$merge,df)
  class(hc_res)=c(class(hc_res),"geoclust")
  hc_res
}


#' @title Hierarchical clustering with contiguity constraints
#'
#' @description This function take an data.frame and performs hierarchical clustering with contiguity 
#' constraints using a graph describing the contiguity (provided )
#' @param adjacencies_list graph describing the contiguity between the rows of df as a list of adjacencies 
#' @param triplet graph to fit with dcsbm in triplet fromat
#' @param display_progress boolean to set progression bar
#' @return an \code{\link[stats]{hclust}} like object with additional slots
#' \describe{
#'   \item{data}{The numeric data (eventually scaled) used for the clustering}
#'   \item{centers}{The protoypes of each tree nodes}
#' }
#' @export
gtclust_poly_dcsbm = function(poly.sf,ods,join_ori,join_dest,vol,lambda_in=sum(ods[vol])/(nrow(poly.sf)^2)*100,lambda_ext=sum(ods[vol])/(nrow(poly.sf)^2)/100,adjacency="rook",display_progress=FALSE){
  poly.sf$i = 0:(nrow(poly.sf)-1)
  poly.df= poly.sf |> st_drop_geometry() 
  ods_i = ods |> 
    inner_join(poly.df|>rename(io=i),by=join_ori) |> 
    inner_join(poly.df|>rename(id=i),by=join_dest) 
  triplet = as.matrix(ods_i[,c("io","id",vol)])
  if(adjacency=="rook"){
    nb = sf::st_relate(zones,zones, pattern = "F***1****")
  }else{
    nb = sf::st_relate(zones,zones, pattern = "F***T****")
  }
  nb_c = lapply(nb,\(nei){nei-1})
  res = bayesian_hclustcc_sbm_cpp(nb_c,triplet,TRUE,FALSE,lambda_in,lambda_ext)
  # complete inter prior with linear slope if needed
  miss_prior = is.na(res$PriorInter)
  if(sum(miss_prior)>0){
    nbmiss = max(which(miss_prior))
    res$PriorInter[miss_prior]=seq(res$PriorIntra[length(res$PriorIntra)],res$PriorInter[nbmiss+1],length.out=nbmiss)
  }
  # compute the spanning tree prior term
  ptree = (res$PriorInter+res$PriorIntra-res$PriorInter[1])
  Llf = res$Ll + ptree +res$PriorK;
  # convert merge mat in hclust format
  V = nrow(zones);
  merge_mat = apply(res$merge,2,function(col){ifelse(col<V,-(col+1),col-V+1)})
  # format the results in hclust form
  hc_res = list(merge=merge_mat,
                Ll = -Llf,
                Kunif = length(Llf)-which.max(Llf)+1,
                height=compute_height(-Llf),
                order=order_tree(merge_mat,nrow(res$merge)),
                PriorIntra = res$PriorIntra,
                PriorInter = res$PriorInter,
                PriorK = res$PriorK,
                data=poly.sf,
                method="bayes_dcsbm")
  
  hc_res$call=sys.call()
  # add geographical data
  hc_res$leafs_geometry = sf::st_geometry(poly.sf)
  hc_res$geotree = build_geotree(hc_res$merge,poly.sf)
  class(hc_res)=   c("gtclust","hclust","geoclust")
  hc_res
}

#' @title Hierarchical clustering with contiguity constraints
#'
#' @description This function take an data.frame and performs hierarchical clustering with contiguity 
#' constraints using a graph describing the contiguity (provided )
#' @param adjacencies_list graph describing the contiguity between the rows of df as a list of adjacencies 
#' @param df a data.frame with numeric columns
#' @param method linkage criterion in ward (default) or average, median
#' @param scaling default scaling of the features in zscore or raw (i.e. no scaling, the default)
#' @param display_progress boolean to set progression bar
#' @return an \code{\link[stats]{hclust}} like object with additional slots
#' \describe{
#'   \item{data}{The numeric data (eventually scaled) used for the clustering}
#'   \item{centers}{The protoypes of each tree nodes}
#' }
#' @export
gtclust_graph = function(adjacencies_list,df,method="ward",scaling="raw",display_progress=FALSE){
  
  
  if(is.character(method) && !(method %in% c("ward","centroid","median","chisq","bayes_mom","bayes_dgmm","bayes_dirichlet"))){
    stop("'method' not recognized")
  }
  
  
  if (is.character(method)) 
    method <- get(paste0("gtmethod_",method), mode = "function", envir = parent.frame())
  if (is.function(method)) 
    method <- method()
  if (is.null(method$method)) {
    print(method)
    stop("'method' not recognized")
  }
  
  if(!(scaling %in% c("zscore","raw"))){
    stop("The scaling argument must be zscore or raw.")
  }
  if(!(methods::is(df,"data.frame")|methods::is(df,"matrix"))){
    stop("df must be a data.frame or a matrix")
  }
  if(methods::is(df,"matrix") & !is.numeric(df)){
    stop("df must be numeric.")
  }
  
  if(methods::is(df,"data.frame")){
    # remove geo in case
    if(methods::is(df,"sf")){
      df= sf::st_drop_geometry(df)
    }
    # select only numeric features
    num_feats = unlist(lapply(df,is.numeric))
    if(sum(num_feats)!=ncol(df)){
      warning("Some features were not numeric and have been removed from the clustering.",call. = FALSE)
      df=df[,num_feats]
    }
  }

  # check for missing values
  if(sum(is.na(df))>0){
    stop("Some regions have missing values and missing values are not allowed.",call. = FALSE)
  }
  
  # scales
  if(scaling=="zscore"){
    df_scaled = apply(df,2,\(col){(col-mean(col))/stats::sd(col)})
  }else{
    df_scaled = as.matrix(df)
  }
  
  nb_c = lapply(adjacencies_list,\(nei){nei-1})
  
  
  # check for muliple components
  G=igraph::graph_from_adj_list(adjacencies_list)
  comp=igraph::components(G)
  if(comp$no>1){
    stop("The graph is not connected, several components found.",call. = FALSE)
  }
  # run the algorithm
  if(method$method %in% c("ward","centroid","median","chisq")){
    res=hclustcc_cpp(nb_c,df_scaled,method,display_progress)
    # convert merge mat in hclust format
    V = nrow(res$data);
    merge_mat = apply(res$merge,2,function(col){ifelse(col<V,-(col+1),col-V+1)})
    # format the results in hclust form
    hc_res = list(merge=merge_mat,
                  Ll = res$Ll,
                  height=compute_height(res$Ll),
                  order=order_tree(merge_mat,nrow(res$merge)),
                  labels=(rownames(df)),
                  call=sys.call(),
                  method=method$method,
                  dist.method="euclidean",
                  data=res$data,
                  adjacencies_list=adjacencies_list,
                  centers=res$centers)
  }else{
    res=bayesian_hclustcc_cpp(nb_c,df_scaled,method,display_progress,method$approx)
    
    # complete inter prior with linear slope if needed
    miss_prior = is.na(res$PriorInter)
    if(sum(miss_prior)>0){
      nbmiss = max(which(miss_prior))
      res$PriorInter[miss_prior]=seq(res$PriorIntra[length(res$PriorIntra)],res$PriorInter[nbmiss+1],length.out=nbmiss)
    }
    # compute the spanning tree prior term
    ptree = (res$PriorInter+res$PriorIntra-res$PriorInter[1])
    Llf = res$Ll + ptree +res$PriorK;
    # convert merge mat in hclust format
    V = nrow(res$data);
    merge_mat = apply(res$merge,2,function(col){ifelse(col<V,-(col+1),col-V+1)})
    # format the results in hclust form
    hc_res = list(merge=merge_mat,
                  Ll = -Llf,
                  Kunif = length(Llf)-which.max(Llf)+1,
                  height=compute_height(-Llf),
                  PriorIntra = res$PriorIntra,
                  PriorInter = res$PriorInter,
                  PriorK = res$PriorK,
                  order=order_tree(merge_mat,nrow(res$merge)),
                  labels=(rownames(df)),
                  call=sys.call(),
                  method=method$method,
                  dist.method="euclidean",
                  data=res$data,
                  adjacencies_list=adjacencies_list,
                  centers=res$centers)
  }
  

  class(hc_res)  <- c("gtclust","hclust")

  hc_res
}


#' @title Cut a Geograpĥic Tree into Groups of Data and return an sf data.frame 
#'
#' @description Cuts a tree, e.g., as resulting from geohclust_poly, into several groups either by specifying the desired number(s) of groups or the cut height(s).
#' @param tree a tree as produced by gtclust. cutree() only expects a list with components merge, height, and labels, of appropriate content each.
#' @param k an integer scalar or vector with the desired number of groups
#' @param h numeric scalar or vector with heights where the tree should be cut.
#' At least one of k or h must be specified, k overrides h if both are given.
#' @return an \code{\link[sf]{sf}} like object
#' @export
geocutree=function(tree,k = NULL, h= NULL){
  if(!methods::is(tree,"geoclust")){
    stop("geocutree only accepts gtclust objects.",call. = FALSE)
  }
  if(is.null(k) && is.null(h)){
    stop("At least one of k or h must be specified, k overrides h if both are given.",call. = FALSE)
  }
  N=nrow(tree$merge)+1
  if(!is.null(h) & is.null(k)){
    k <- N + 1L - apply(outer(c(tree$height, Inf), h, `>`),2, which.max)
  }
  cl = stats::cutree(tree,k=k)

  istart = which(!duplicated(cl))
  clust_geo = list()
  clust_x = matrix(0,nrow=k,ncol=ncol(tree$data))
  ck=1
  for (i in istart){
    f = -i
    cnode = -i
    while(length(f)!=0){
      f = which(tree$merge[1:(N-k),1]==f | tree$merge[1:(N-k),2]==f)
      if(length(f)>0){
        cnode = f
      }
    }
    if(cnode>0){
      clust_geo[[ck]]=tree$geotree[[cnode]]
      clust_x[ck,]=tree$centers[cnode,]
    }else{
      clust_geo[[ck]]=tree$leafs_geometry[[-cnode]]
      clust_x[ck,]=tree$data[-cnode,]
    }
    ck=ck+1
  }
  colnames(clust_x)=colnames(tree$centers)
  sf::st_sf(cl=1:k,n=as.vector(table(cl)),clust_x,geometry=sf::st_as_sfc(clust_geo,crs=sf::st_crs(tree$leafs_geometry)))
}



# ecrire une version non récursive p)our éviter les pbr de stack
order_tree_rec=function(merge,i){
  if(merge[i,1]<0){
    left = -merge[i,1];
  }else{
    left = order_tree(merge,merge[i,1])
  }
  if(merge[i,2]<0){
    right = -merge[i,2];
  }else{
    right = order_tree(merge,merge[i,2])
  }
  c(left,right)
}

order_tree=function(merge,i){
  toprocess = c(i)
  result = c(i)
  while(length(toprocess)>0){
    newnodes = c()
    for (cn in toprocess){
      children = merge[cn,]
      ic     = which(result==cn)
      leftr  = result[(1:length(result))<ic]
      rightr = result[(1:length(result))>ic]
      result = c(leftr,children,rightr)
      newnodes = c(newnodes,children[children>0])
    }
    toprocess=newnodes
  }
  -result
}




build_geotree=function(merge,df){
  geoms= sf::st_geometry(df)
  geotree = list()
  for (i in 1:nrow(merge)){
    if(merge[i,1]<0){
      left = geoms[[-merge[i,1]]];
    }else{
      left = geotree[[merge[i,1]]]
    }
    if(merge[i,2]<0){
      right = geoms[[-merge[i,2]]];
    }else{
      right = geotree[[merge[i,2]]]
    }
    geotree[[i]] = sf::st_union(left,right)
  }
  geotree
}



#' @title aggregation methods for gtclust
#' @return An object of class gtmethod
#' @describeIn gtmethod classical ward method
#' @export
gtmethod_ward = function(){
  structure(list(method = "ward"), 
            class = "gtmethod")
}

#' @describeIn gtmethod classical centroid method
#' @export
gtmethod_centroid = function(){
  structure(list(method = "centroid"), 
            class = "gtmethod")
}


#' @describeIn gtmethod classical median method
#' @export
gtmethod_median = function(){
  structure(list(method = "median"), 
            class = "gtmethod")
}

#' @describeIn gtmethod chi-square method
#' @export
gtmethod_chisq = function(){
  structure(list(method = "chisq"), 
            class = "gtmethod")
}


#' @param beta prior parameters for the dirichlet distribution 
#' @describeIn gtmethod bayesian mixture of multinomials
#' @export
gtmethod_bayes_mom = function(beta = 1,approx=FALSE){
  structure(list(method = "bayes_mom",beta = beta,approx=approx), 
            class = c("gtmethod","bayesian_gtmethod"))
}

#' @param tau Prior parameter (inverse variance), (default 0.01)
#' @param kappa Prior parameter (gamma shape), (default to 1)
#' @param beta Prior parameter (gamma rate), (default to NaN, in this case beta will be estimated from data as 0.1 time the mean of X columns variances)
#' @param mu Prior for the means (vector of size D), (default to NaN, in this case mu will be estimated from data as the mean of X)
#' @describeIn gtmethod bayesian diagonal gaussian mixture model
#' @export
gtmethod_bayes_dgmm = function(tau = 0.01, kappa = 1, beta = NaN, mu = as.matrix(NaN),approx=FALSE){
  structure(list(method = "bayes_dgmm",tau=tau,kappa=kappa,beta = beta,mu=mu,approx=approx), 
            class = c("gtmethod","bayesian_gtmethod"))
}

#' @param lambda Prior parameter (inverse variance), (default 0.01)
#' @describeIn gtmethod bayesian diagonal gaussian mixture model
#' @export
gtmethod_bayes_dirichlet = function(lambda = as.matrix(NaN),approx=FALSE){
  structure(list(method = "bayes_dirichlet",lambda=lambda,approx=approx), 
            class = c("gtmethod","bayesian_gtmethod"))
}

#' @title plot gtclust dendrogram
#'
#' @param x a tree as produced by a gtclust variant. 
#' @param nb_max_leafs number of leafs to keep 
#' @return an \code{\link[ggplot2]{ggplot}} object
#' @export
plot.gtclust=function(x,y=NULL,nb_max_leafs=500,...){
  
  tree = x 
  rownames(tree$data)=1:nrow(tree$data)
  if(substr(tree$method,1,5)=="bayes"){
    im = which.min(tree$Ll)-1
    nb_max_leafs=(length(tree$height)+1)-im
  }else{
    im=(length(tree$height)+1)-nb_max_leafs
  }
  small_tree = collapse(tree, nb_max_leafs)
  dend_data = ggdendro::dendro_data(as.dendrogram(small_tree))
  cluster_sizes = sapply(small_tree$members,length)
  dend_data$labels$size=cluster_sizes[small_tree$order]
  


  segs_constr = dend_data$segments
  
  ggplot2::ggplot() + 
    ggplot2::geom_segment(data=segs_constr,ggplot2::aes_(x =~ x, y  =~ y, xend =~ xend, yend =~ yend),size=0.3)+
    ggplot2::geom_point(data = dend_data$labels, ggplot2::aes_(x=~x, y=~y-0.03*max(small_tree$height), size=~size))+
    ggplot2::theme_bw()+
    ggplot2::scale_x_continuous("",breaks=c())+
    ggplot2::scale_y_continuous(expression(-log(alpha)),n.breaks = 8)+
    ggplot2::guides(size=ggplot2::guide_legend("Branch size:"))+
    ggplot2::scale_size(breaks=round(seq(max(cluster_sizes)/3,max(cluster_sizes),length.out=3)/10)*10,limits=c(0,max(cluster_sizes)),range=c(0,4))+
    ggplot2::ggtitle(paste0(tree$method,": ",nb_max_leafs," clusters"))+
    ggplot2::theme(
      panel.grid.major = ggplot2::element_blank(),
      panel.grid.minor = ggplot2::element_blank(),
      panel.border = ggplot2::element_blank(),
      legend.position="bottom"
    )
    
  
  
}

#' @title plot gtclust pareto front
#'
#' @param x a tree as produced by a gtclust variant. 
#' @return an \code{\link[ggplot2]{ggplot}} object
#' @export
front = function(tree,mlogalpha=NULL){
  df.line=data.frame(Ll=-tree$Ll[which.min(tree$Ll):length(tree$Ll)],k=(length(tree$Ll)-which.min(tree$Ll)+1):1)
  df.front = df.line
  if(which.min(tree$Ll)>1){
    df.front$xend = tree$height[(which.min(tree$Ll)-1):length(tree$height)]
  }else{
    df.front$xend = c(0,tree$height)
  }

  df.front = df.front[nrow(df.front):1,]
  df.front = df.front[!duplicated(df.front$x),]
  if(!is.null(mlogalpha)){
    if(sum(df.front$xend<mlogalpha)>2){
      df.front = df.front[df.front$xend<mlogalpha,]
    }else{
      error("mlogalpha too small, no front to show.",call. = FALSE)
    }
  }
  xmax = max(df.front$xend)*1.05
  ny  = diff(range(df.front$Ll))*0.025
  df.front$x=c(xmax,df.front$xend[1:(nrow(df.front)-1)])
  df.front$lf = (df.front$x-df.front$xend)/df.front$x[1]

  ggplot2::ggplot(df.front)+
    ggplot2::geom_abline(data=df.line,ggplot2::aes_(intercept=~Ll,slope=~(k-1)),color="#cacaca")+
    ggplot2::geom_segment(data=df.front,ggplot2::aes_(x=~-x,xend=~-xend,y=~Ll-x*(k-1),yend=~Ll-xend*(k-1)))+
    ggplot2::geom_point(data=df.front,ggplot2::aes_(x=~-xend,y=~Ll-xend*(k-1)))+
    ggplot2::geom_text(data=df.front[df.front$lf>0.03,],ggplot2::aes_(x=~-xend,y=~Ll-xend*(k-1),label=~k),vjust="bottom",hjust = "right")+
    ggplot2::xlab(expression(log(alpha)))+
    ggplot2::ylab("ICL")+
    ggplot2::theme_bw()
}


collapse = function(tree,nb_leafs=500){
  if(length(tree$order)<=nb_leafs){
    tree$members=as.list(1:length(tree$order))
    return(tree);
  }
  L=length(tree$height)

  itokeep = seq(L-nb_leafs+2,L)
  merge=tree$merge[itokeep,]
  nodes=as.vector(merge)
  leafs = !(nodes %in% itokeep) | nodes<0
  new_nodes=rep(0,length(nodes))
  new_nodes[leafs]=-seq(1,nb_leafs)
  new_nodes[!leafs]=nodes[!leafs]-min(nodes[!leafs])+1
  new_merge=matrix(new_nodes,ncol=2)
  new_tree = tree
  new_tree$merge=new_merge
  new_tree$order=gtclust:::order_tree(new_merge,nrow(new_merge))
  new_tree$height=tree$height[itokeep]
  

  new_tree$members=lapply(seq_len(nb_leafs),function(k){
    if(nodes[leafs][k]<0){
      chilren = -nodes[leafs][k]
    }else{
      children = gtclust:::order_tree(tree$merge,nodes[leafs][k])
    }
  })
  #class(new_tree)="hclust"
  #plot(new_tree)
  new_tree
}




# extract the pareto front
compute_height <- function(heights) {
  # vector of icls value from root to leaves
  icl <- -heights[length(heights):1]
  # K
  K <- 1:length(icl)
  
  # init H
  H <- rep(0, length(icl))
  
  # current merge position
  cdi <- Inf
  # current best line
  bestline <- 1
  # vector with indexes of solutions that belong to the pareto front
  Front <- c(1)
  
  # from root to leaves
  for (l in 2:length(icl)) {
    
    # merge value with current bestline
    di <- (icl[l] - icl[bestline])
    din <- di / (l - bestline)
    
    # is their a potential merge ?
    if (di > 0) {
      
      # if this merge did not occurs after the current one update the front
      while (din > cdi & length(Front) > 1) {
        
        # remove the last solution from the front
        Front <- Front[-length(Front)]
        H[bestline] <- -1
        
        # update bestline
        bestline <- Front[length(Front)]
        
        # update merge position
        di <- (icl[l] - icl[bestline])
        din <- di / (l - bestline)
        # update previous merge position
        if (length(Front) > 1) {
          cdi <- (icl[bestline] - icl[Front[length(Front) - 1]]) / (bestline - Front[length(Front) - 1])
        } else {
          cdi <- H[1]
        }
      }
      
      # add the extracted solution to the front
      H[Front[length(Front)]] <- din
      cdi <- din
      bestline <- l
      Front <- c(Front, l)
    } else {
      # if solution not in front
      H[l] <- -1
    }
  }
  
  # copy from left previous value
  for (l in 2:length(icl)) {
    if (H[l] == -1) {
      H[l] <- H[l - 1]
    }
  }
  H[(length(H)-1):1]
}
comeetie/gtclust documentation built on Sept. 17, 2024, 12:45 a.m.