greed: Clustering and Model Selection with the Integrated Classification Likelihood

#' @useDynLib greed
#' @importFrom Rcpp sourceCpp
#' @importFrom future %<-%
#' @name %<-%
NULL

#' @importFrom future %globals%
#' @name %globals%
NULL

#' @importFrom future %seed%
#' @name %seed%
NULL

#' @include models_classes.R fit_classes.R tools_cleanpath.R
#' @import Matrix
NULL

hybrid <- function(model, alg, data, K, verbose = FALSE) {
  fimerge <- function(ncl, merge_graph) {
    merge_cstr(model, data, ncl, merge_graph, verbose)
  }
  fiswap <- function(ncl, move_mat) {
    swap_cstr(model, data, ncl, move_mat, verbose = verbose)
  }

  train.hist <- data.frame(generation = c(), icl = c(), K = c())

  # multi-start in //
  # future::plan(future::multiprocess)

  solutions <- listenv::listenv()
  # first generation of solutions
  pop_size <- alg@pop_size

  sb <- cli::cli_status("{cli::symbol$info} Initializing a population of {pop_size} solutions.")
  
  for (i in 1:pop_size) {
    cli <- sample_cl(model, data, K)
    cli <- as.numeric(factor(cli))
    Ko <- max(cli)
    moves <- as.sparse(matrix(1, Ko, Ko))
    solutions[[i]] %<-% fiswap(cli, moves) %globals% c("model", "data", "cli", "verbose", "fiswap", "moves") %seed% TRUE
  }
  solutions <- as.list(solutions)
  icls <- sapply(solutions, function(s) {
    s@icl
  })
  # check for errors
  solutions <- solutions[!is.nan(icls)]
  icls <- icls[!is.nan(icls)]
  old_best <- -Inf
  best_icl <- max(icls)
  cK <- solutions[[order(icls, decreasing = TRUE)[1]]]@K
  nbgen <- 1
  # while maximum number of generation // all solutions are equals // no improvements
  pmut <- alg@prob_mutation
  Kmax <- alg@Kmax

  
  cli::cli_status_update(sb,"{cli::symbol$info} Generation {nbgen} : best solution with an ICL of {round(solutions[[which.max(icls)]]@icl)} and {solutions[[which.max(icls)]]@K} clusters.")

  while ((max(icls) - min(icls)) > 1 & nbgen < alg@nb_max_gen & best_icl > old_best & cK < Kmax) {
    train.hist <- rbind(train.hist, data.frame(generation = nbgen, icl = icls, K = sapply(solutions, function(s) {
      max(s@cl)
    })))
    # selection keep the top half solutions
    ii <- order(icls)
    Nsel <- round(alg@pop_size * 0.4)
    ii <- ii[(length(ii) - Nsel):length(ii)]
    icls <- icls[ii]
    solutions <- solutions[ii]
    bres <- solutions[[order(icls, decreasing = TRUE)[1]]]
    new_solutions <- listenv::listenv()
    for (i in 1:(alg@pop_size - 1)) {
      ip <- 1:min(c(Nsel, length(solutions)))
      i1 <- sample(ip, 1, prob = ip)
      i2 <- sample(ip[-i1], 1, prob = ip[-i1])
      s1 <- solutions[[i1]]
      s2 <- solutions[[i2]]
      new_solutions[[i]] %<-% full_cross_over(s1, s2, fimerge, fiswap, pmut, Kmax) %globals% c("s1", "s2", "fimerge", "fiswap", "pmut", "full_cross_over", "Kmax") %seed% TRUE
    }
    solutions <- c(bres, as.list(new_solutions))
    icls <- sapply(solutions, function(s) {
      s@icl
    })
    if (sum(is.na(icls)) > 0 | sum(is.nan(icls)) > 0) {
      message("NAN in objective function returning problematic solution")
      return(solutions[[which(is.nan(icls))[1]]])
    }
    solutions <- solutions[!is.nan(icls)]

    icls <- icls[!is.nan(icls)]
    old_best <- best_icl
    best_icl <- max(icls)
    cK <- solutions[[order(icls, decreasing = TRUE)[1]]]@K
    nbgen <- nbgen + 1

    cli::cli_status_update(sb, "{cli::symbol$info} Generation {nbgen} : best solution with an ICL of {round(solutions[[which.max(icls)]]@icl)} and {solutions[[which.max(icls)]]@K} clusters.")
    
  }
  if (cK > Kmax) {
    warning("The number of clusters has reached the upper limit.\n Increase Kmax (see ?Hybrid-class) if you want to explore clusterings with more clusters.")
  }

  train.hist <- rbind(train.hist, data.frame(generation = nbgen, icl = icls, K = sapply(solutions, function(s) {
    max(s@cl)
  })))

  # best solution
  res <- solutions[[order(icls, decreasing = TRUE)[1]]]

  # compute merge path
  path <- fit_greed_path(data, res)

  # clean the resuts (compute, merge tree,...)
  path <- cleanpathopt(path)
  # store train history
  path@train_hist <- train.hist
  path
}


full_cross_over <- function(sol1, sol2, fimerge, fiswap, pmutation, Kmax) {
  if (sol1@K == 1) {
    return(sol2)
  }
  if (sol2@K == 1) {
    return(sol1)
  }


  # cartesian product on the z of the two solution
  # ncl = unclass(factor(paste(sol1@cl,sol2@cl)))
  sol <- sol1
  perm <- sample(1:sol2@K, sol2@K)
  cl2 <- perm[sol2@cl]

  K2 <- sol2@K

  Ta <- table(sol@cl, cl2)
  nbclust <- sol@K - colSums(t(apply(Ta, 1, function(r) {
    cumsum(r > 0)
  })) == matrix(rowSums(Ta > 0), nrow = nrow(Ta), ncol = ncol(Ta))) + cumsum(colSums(Ta > 0))
  #  if(max(nbclust)>Kmax){
  #    K2 = which(nbclust>Kmax)[1]
  #    cl2[cl2>K2]=K2
  #  }
  ij <- which(table(sol@cl, cl2) > 0, arr.ind = TRUE)
  ncl <- as.numeric(factor(paste(sol@cl, "_", cl2, sep = ""), levels = paste(ij[, 1], "_", ij[, 2], sep = "")))


  M <- matrix(0, nrow(ij), nrow(ij))
  for (k in 1:sol1@K) {
    M[ij[, 1] == k, ij[, 1] == k] <- 1
  }
  for (k in 1:K2) {
    M[ij[, 2] == k, ij[, 2] == k] <- 1
  }
  ijAm <- which(M > 0, arr.ind = TRUE)
  ijAm <- ijAm[ijAm[, 1] != ijAm[, 2], ]
  Am <- Matrix::sparseMatrix(ijAm[, 1], ijAm[, 2], x = rep(1, nrow(ijAm)), dims = c(max(ncl), max(ncl)))
  move_mat <- Am
  if (nrow(ijAm) > 0) {
    sol <- fimerge(ncl, Matrix::tril(Am))
    move_mat <- sol@move_mat + Matrix::t(sol@move_mat)
    ncl <- sol@cl
    for (r in seq_len(nrow(move_mat))) {
      if (sum(move_mat[r, ] != 0) > 10) {
        merges <- which(move_mat[r, ] != 0)
        best_merges_row <- order(move_mat[r, merges], decreasing = TRUE)[1:10]
        move_mat[r, setdiff(merges, merges[best_merges_row])] <- 0
      }
    }
  }

  if (stats::runif(1) < pmutation) {
    sp_cl <- sample(max(ncl), 1)
    nclold <- ncl
    ncl[ncl == sp_cl] <- sample(c(sp_cl, max(ncl) + 1), sum(ncl == sp_cl), replace = TRUE)

    if (max(ncl) > nrow(move_mat) & sum(ncl == sp_cl) > 0) {
      move_mat <- cbind(move_mat, move_mat[, sp_cl])
      move_mat <- rbind(move_mat, move_mat[sp_cl, ])
      move_mat[sp_cl, max(ncl)] <- 1
      move_mat[max(ncl), sp_cl] <- 1
    } else {
      ncl <- nclold
    }
  }
  if (nrow(ijAm) > 0) {
    sol <- fiswap(ncl, move_mat)
  }
  sol
}


incremental_cross_over <- function(sol1, sol2, fimerge, fiswap, pmutation, Kmax) {
  # cartesian product on the z of the two solution
  # ncl = unclass(factor(paste(sol1@cl,sol2@cl)))
  sol <- sol1
  ncl <- sol@cl
  ncl_old <- sol@cl
  K2 <- sol2@K
  icl <- sol@icl
  for (k2 in 1:K2) {
    cl2 <- ifelse(sol2@cl == k2, 1, 2)
    ncl_old <- ncl
    ij <- which(table(ncl, cl2) > 0, arr.ind = TRUE)
    ncl <- as.numeric(factor(paste(ncl, "_", cl2, sep = ""), levels = paste(ij[, 1], "_", ij[, 2], sep = "")))
    M <- matrix(0, max(ncl), max(ncl))
    M[ij[, 2] == 1, ij[, 2] == 1] <- 1
    diag(M) <- 0
    ijm <- which(M == 1, arr.ind = TRUE)
    move_mat <- sparseMatrix(i = ijm[, 1], j = ijm[, 2], x = rep(1, nrow(ijm)), dims = c(max(ncl), max(ncl)))

    if (sum(move_mat) > 0) {
      sol <- fimerge(ncl, Matrix::tril(move_mat))
      move_mat <- sol@move_mat + Matrix::t(sol@move_mat)
      if (sol@icl > icl) {
        icl <- sol@icl
        ncl <- sol@cl
      } else {
        ncl <- ncl_old
      }
    }
  }
  M <- matrix(0, max(ncl), max(ncl))
  Tac <- table(ncl, sol1@cl)
  for (k in 1:sol1@K) {
    ibrothers <- which(Tac[, k] != 0)
    M[ibrothers, ibrothers] <- 1
  }
  Tac <- table(ncl, sol2@cl)
  for (k in 1:K2) {
    ibrothers <- which(Tac[, k] != 0)
    M[ibrothers, ibrothers] <- 1
  }
  diag(M) <- 0
  ijm <- which(M == 1, arr.ind = TRUE)
  move_mat <- sparseMatrix(i = ijm[, 1], j = ijm[, 2], x = rep(1, nrow(ijm)), dims = c(max(ncl), max(ncl)))

  sol <- fimerge(ncl, Matrix::tril(move_mat))
  move_mat <- sol@move_mat
  ncl <- sol@cl
  for (r in seq_len(nrow(move_mat))) {
    if (sum(move_mat[r, ] != 0) > 10) {
      merges <- which(move_mat[r, ] != 0)
      best_merges_row <- order(move_mat[r, merges], decreasing = TRUE)[1:10]
      move_mat[r, setdiff(merges, merges[best_merges_row])] <- 0
    }
  }

  if (stats::runif(1) < pmutation) {
    sp_cl <- sample(max(ncl), 1)
    nclold <- ncl
    ncl[ncl == sp_cl] <- sample(c(sp_cl, max(ncl) + 1), sum(ncl == sp_cl), replace = TRUE)

    if (max(ncl) > nrow(move_mat) & sum(ncl == sp_cl) > 0) {
      move_mat <- cbind(move_mat, move_mat[, sp_cl])
      move_mat <- rbind(move_mat, move_mat[sp_cl, ])
      move_mat[sp_cl, max(ncl)] <- 1
      move_mat[max(ncl), sp_cl] <- 1
    } else {
      ncl <- nclold
    }
  }

  sol <- fiswap(ncl, move_mat)

  sol
}

comeetie/greed documentation built on Oct. 10, 2022, 5:37 p.m.

rdrr.io home R language documentation Run R code online

CRAN packages Bioconductor packages R-Forge packages GitHub packages

Note that we can't provide technical support on individual packages. You should contact the package authors for that.

comeetie/greed
Clustering and Model Selection with the Integrated Classification Likelihood

R/alg_hybrid.R
In comeetie/greed: Clustering and Model Selection with the Integrated Classification Likelihood

Defines functions incremental_cross_over full_cross_over hybrid

R Package Documentation

Browse R Packages

We want your feedback!

comeetie/greed Clustering and Model Selection with the Integrated Classification Likelihood

R/alg_hybrid.R In comeetie/greed: Clustering and Model Selection with the Integrated Classification Likelihood

Defines functions incremental_cross_over full_cross_over hybrid

R Package Documentation

Browse R Packages

We want your feedback!

comeetie/greed
Clustering and Model Selection with the Integrated Classification Likelihood

R/alg_hybrid.R
In comeetie/greed: Clustering and Model Selection with the Integrated Classification Likelihood