R/hidalgo.R

Defines functions print.summary.Hidalgo summary.Hidalgo plot.Hidalgo print.Hidalgo Hidalgo

Documented in Hidalgo plot.Hidalgo print.Hidalgo print.summary.Hidalgo summary.Hidalgo

#' Fit the \code{Hidalgo} model
#'
#' The function fits the Heterogeneous intrinsic dimension algorithm, developed
#' in Allegra et al., 2020. The model is a Bayesian mixture of Pareto
#' distribution with modified likelihood to induce homogeneity across
#' neighboring observations. The model can segment the observations into
#' multiple clusters characterized by different
#' intrinsic dimensions. This permits to capture hidden patterns in the data.
#' For more details on the algorithm, refer to
#' \href{https://www.nature.com/articles/s41598-020-72222-0}{Allegra et al., 2020}.
#' For an example of application to basketball data, see
#' \href{https://imstat.org/journals-and-publications/annals-of-applied-statistics/annals-of-applied-statistics-next-issues/}{Santos-Fernandez et al., 2021}.
#'
#' @param X data matrix with \code{n} observations and \code{D} variables.
#' @param dist_mat distance matrix computed between the \code{n} observations.
#' @param K integer, number of mixture components.
#' @param nsim number of MCMC iterations to run.
#' @param burn_in number of MCMC iterations to discard as burn-in period.
#' @param thinning integer indicating the thinning interval.
#' @param verbose logical, should the progress of the sampler be printed?
#' @param q integer, first local homogeneity parameter. Default is 3.
#' @param xi real number between 0 and 1, second local homogeneity parameter.
#' Default is 0.75.
#' @param alpha_Dirichlet parameter of the symmetric Dirichlet prior
#' on the mixture weights. Default is 0.05, inducing a sparse mixture.
#' Values that are too small (i.e., lower than 0.005) may cause underflow.
#' @param a0_d shape parameter of the Gamma prior on \code{d}.
#' @param b0_d rate parameter of the Gamma prior on \code{d}.
#' @param prior_type character, type of Gamma prior on \code{d}, can be
#' \describe{
#'    \item{\code{"Conjugate"}}{a conjugate Gamma distribution is elicited;}
#'    \item{\code{"Truncated"}}{the conjugate Gamma prior is truncated over the
#'    interval \code{(0,D)};}
#'    \item{\code{"Truncated_PointMass"}}{same as \code{"Truncated"}, but a
#'    point mass is placed on \code{D}, to allow the \code{id} to be
#'    identically equal to the nominal dimension.}
#' }
#' @param D integer, the maximal dimension of the dataset.
#' @param pi_mass probability placed a priori on \code{D} when
#' \code{Truncated_PointMass} is chosen.
#'
#'
#' @name Hidalgo
#'
#' @seealso \code{\link{id_by_class}} and \code{\link{clustering}}
#' to understand how to further postprocess the results.
#'
#' @return object of class \code{Hidalgo}, which is a list containing
#'\describe{
#'     \item{\code{cluster_prob}}{chains of the posterior mixture weights;}
#'     \item{\code{membership_labels}}{chains of the membership labels for all
#'     the observations;}
#'     \item{\code{id_raw}}{chains of the \code{K} intrinsic dimensions
#'     parameters, one per mixture component;}
#'     \item{\code{id_postpr}}{a chain for each observation, corrected for
#'     label switching;}
#'     \item{\code{id_summary}}{a matrix containing, for each observation, the
#'     value of posterior mean and the 5\%, 25\%, 50\%, 75\%, 95\% quantiles;}
#'     \item{\code{recap}}{a list with the objects and specifications passed to
#'     the function used in the estimation.}
#'  }
#' @export
#'
#' @references
#' Allegra M, Facco E, Denti F, Laio A, Mira A (2020).
#' “Data segmentation based on the local intrinsic dimension.”
#' Scientific Reports, 10(1), 1–27. ISSN 20452322,
#' \doi{10.1038/s41598-020-72222-0},
#'
#' Santos-Fernandez E, Denti F, Mengersen K, Mira A (2021).
#' “The role of intrinsic dimension in high-resolution player tracking data –
#' Insights in basketball.” Annals of Applied Statistics - Forthcoming, –
#' ISSN 2331-8422, 2002.04148, \doi{10.1038/s41598-022-20991-1}
#'
#' @examples
#' \donttest{
#' set.seed(1234)
#' X            <- replicate(5,rnorm(500))
#' X[1:250,1:2] <- 0
#' X[1:250,]    <- X[1:250,] + 4
#' oracle       <- rep(1:2,rep(250,2))
#' # this is just a short example
#' # increase the number of iterations to improve mixing and convergence
#' h_out        <- Hidalgo(X, nsim = 500, burn_in = 500)
#' plot(h_out, type =  "B")
#' id_by_class(h_out, oracle)
#' }
#'
#'
Hidalgo <- function(X  = NULL,
                    dist_mat = NULL,
                    K  = 10,
                    nsim     = 5000,
                    burn_in  = 5000,
                    thinning = 1,
                    verbose  = TRUE,
                    q   = 3,
                    xi = .75,
                    alpha_Dirichlet = .05,
                    a0_d = 1,
                    b0_d = 1,
                    prior_type = c("Conjugate",
                                   "Truncated",
                                   "Truncated_PointMass"),
                    D = NULL,
                    pi_mass = .5) {

  prior_type <- match.arg(prior_type)

  if (verbose)
    cat("Computing ratios and Nq...\n")

  if (!is.null(X)) {
    if(is.null(D)) D  <- ncol(X)
    InputList <-
      compute_mus(
        X = X,
        q = q,
        dist_mat = NULL,
        Nq = TRUE
      )
    mus       <- InputList$mus
    Nq        <- InputList$NQ

  } else if (!is.null(dist_mat)) {
    if (is.null(D) & prior_type != "Conjugate") {
      cat("Please provide the maximal dimension of the dataset")
    }

    InputList <-
      compute_mus(
        X = NULL,
        q = q,
        dist_mat = dist_mat,
        Nq = TRUE
      )
    mus       <- InputList$mus
    Nq        <- InputList$NQ

  } else if (is.null(X) & is.null(dist_mat)) {
    stop("Please provide either a dataset or a distance matrix", call. = FALSE)
  }


  if (verbose)
    cat("Done! \n")
  if (verbose)
    cat("Initializing the algorithm... \n")
  n  <-  length(mus)
  QQ <- (log(xi) - log(1 - xi))
  #  Initialization of Output:
  ALL_D  <- matrix(NA_real_, K, nsim)
  ALL_Pi <- matrix(NA_real_, K, nsim)
  ALL_Ci <- matrix(NA_real_, n, nsim)
  lev    <- seq_len(K)

  IRC.list <-  index_row_col(Nq, q, n)
  rm(Nq)

  pl  <- rdir_cpp(rep(alpha_Dirichlet,K))
  # in case alpha_Dirichlet is too low, initialize as equiprobable
  if( any(is.na(pl)) ){

    warning(paste("alpha_Dirichlet is too low and caused underflows when initializing.
Consider increasing its value in the next run."),
            call. = FALSE)

    pl <-  rep(alpha_Dirichlet,K)
  }

  Ci    <- sample(K, n, TRUE, pl)
  d     <- stats::rgamma(K, a0_d, 1 / b0_d)

  if (verbose)
    cat("Done! \n")


  if (verbose)
    cat("Computing recurring quantities to save you time... \n")
  ###################### Ricontrolla!!!!
  log_Zeta_extra <- log_Zeta_maker(n+1,xi,q)
  log_Zeta <- log_Zeta_extra[-(n+1)]
  ### problema prima: non pre-calcolavo 500esimo valore, che si basa con 501 vicini
  log_corr <-
    (log_Zeta_extra[seq_len(n)] - log_Zeta_extra[seq(2, n+1)]) *  (seq_len(n) - 1)
  if (verbose)
    cat("Done! \n")


  if (verbose) {
    cat("MCMC progress:\n")
    utils::flush.console()
    pbar <- utils::txtProgressBar(min = 1,
                                  max = nsim * thinning + burn_in,
                                  style = 3)
    ipbar <- 0
  }

  time1 <-  Sys.time()

  # Gibbs sampler loop

  for (sim in seq_len(nsim * thinning + burn_in)) {
    #cat(1)
    # STEP 1 - sample  indicators

    Ci <- Update_memberships_faster(
      mu_obser = mus,
      dl =  d,
      pl = pl,
      K  = K,
      N  = n,
      q  = q,
      Ci = Ci,
      QQ = QQ,
      possible_label = lev,
      index_row =  IRC.list$IR,
      index_col =  IRC.list$IC,
      log_Precomp_Z = log_Zeta,
      log_Precomp_ratios = log_corr
    )

    # STEP 2, updating Pi^*
    #cat(2)

    Ci     <- factor(Ci, levels = seq_len(K))
    N_Slog <- Groups_quantities(mu_obser = mus,
                                Ci = Ci,
                                K = K)
    n_l    <- N_Slog[, 1]
    sLog   <- N_Slog[, 2]
    pl     <- rdir_cpp(alpha_Dirichlet + n_l)

    # Step 3  --  Update d
    #cat(3)


    if (prior_type == "Conjugate") {
      d <- stats::rgamma(K, n_l + a0_d, b0_d + sLog)
    } else if (prior_type == "Truncated") {
      d <- gam_trunc(D,K,a0_d,b0_d,n_l,sLog)
    } else if (prior_type == "Truncated_PointMass") {
      d <- gam_trunc_pmass(D,K,a0_d,b0_d,n_l,sLog,pi_mass)
    }


    # Step 4  --  Store simulation post burn-in

    if (sim > burn_in && ((sim - burn_in) %% thinning == 0L)) {
      ind_rr            <- floor((sim - burn_in) / thinning)
      ALL_Ci[, ind_rr]   <-  Ci
      ALL_Pi[, ind_rr]   <-  pl
      ALL_D[, ind_rr]    <-  d
    }
    if (verbose) {
      ipbar <- ipbar + 1
      utils::setTxtProgressBar(pbar, ipbar)
    }
  } # End of Gibbs loop
  time2 <-  Sys.time()

  Recap <- list(
    X = X,
    dist_mat = dist_mat,
    mus = mus,
    K = K,
    nsim = nsim,
    burn_in = burn_in,
    thinning = thinning,
    q = q,
    xi = xi,
    alpha_Dirichlet = alpha_Dirichlet,
    a0_d = a0_d,
    b0_d = b0_d,
    prior_type = prior_type,
    D = D,
    pi_mass = pi_mass,
    elapsed = time2 - time1
  )

  # Step 5  --  Postprocessing of the MCMC

  if (verbose)
    cat("\nPosterior sampling complete. Postprocessing the chains...\n")
  intrinsic_dimension  <- t(ALL_D)
  cluster_prob         <- t(ALL_Pi)
  membership_labels    <- t(ALL_Ci)


  postpro_chains <-
    t(sapply(seq_len(nsim), function(i)
      intrinsic_dimension[i, membership_labels[i, ]]))

  avg     <- colMeans(postpro_chains)
  med     <- apply(postpro_chains, 2, stats::median)
  sd1     <-
    apply(postpro_chains, 2, function(z)
      stats::quantile(z, .05))
  sd2     <-
    apply(postpro_chains, 2, function(z)
      stats::quantile(z, .25))
  sd3     <-
    apply(postpro_chains, 2, function(z)
      stats::quantile(z, .75))
  sd4     <-
    apply(postpro_chains, 2, function(z)
      stats::quantile(z, .95))
  summa   <- data.frame(
    Q.05 = sd1,
    Q.25 = sd2,
    MEAN = avg,
    MEDIAN = med,
    Q.75 = sd3,
    Q.95 = sd4,
    OBS = seq_len(n)
  )
  if (verbose){
    cat("Done! \n")
    close(pbar)
  }
  output <- list(
    cluster_prob         = cluster_prob,
    membership_labels    = membership_labels,
    id_raw               = intrinsic_dimension,
    id_postpr            = postpro_chains,
    id_summary           = summa,
    recap                = Recap
  )
  structure(output, class = c("Hidalgo", class(output)))

}



#' @name Hidalgo
#'
#' @param x an object of class \code{Hidalgo}, obtained from the function
#' \code{Hidalgo()}.
#' @param ... other arguments passed to specific methods.
#'
#'
#' @export
print.Hidalgo <- function(x, ...) {
  cat("Model: Hidalgo\n")
  cat("Method: Bayesian Estimation\n")
  cat(paste0(
    "Prior d ~ Gamma(",
    x$recap[["a0_d"]],
    ", ",
    x$recap[["b0_d"]],
    "), type = ",
    x$recap[["prior_type"]],
    "\n"
  ))
  cat(
    paste0(
      "Prior on mixture weights: Dirichlet(",
      x$recap[["alpha_Dirichlet"]],
      ") with ",
      x$recap[["K"]],
      " mixture components\n"
    )
  )
  cat(
    paste0(
      "MCMC details:\nTotal iterations: ",
      x$recap[["nsim"]] + x$recap[["burn_in"]],
      ", Burn in: ",
      x$recap[["burn_in"]],
      ", Thinning: ",
      x$recap[["thinning"]],
      "\nUsed iterations: ",
      x$recap[["nsim"]],
      "\nElapsed time: ",
      round(x$recap[["elapsed"]], 4),
      " ",
      attr(x$recap[["elapsed"]], "units"),"\n"
    )
  )
  invisible(x)
}


#' @name Hidalgo
#'
#' @param x object of class \code{Hidalgo}, the output of the
#' \code{Hidalgo()} function.
#' @param type character that indicates the type of plot that is requested.
#' It can be:
#' \describe{
#'  \item{\code{"A"}}{plot the MCMC and the ergodic means NOT corrected
#'  for label switching;}
#'  \item{\code{"B"}}{plot the posterior mean and median of the id
#'  for each observation, after the chains are processed for label switching;}
#'  \item{\code{"C"}}{plot the estimated id distributions stratified by
#'  the groups specified in the class vector;}
#'  }
#' @param class factor variable used to stratify observations according to
#' their the \code{id} estimates.
#' @param ... other arguments passed to specific methods.
#'
#' @importFrom graphics boxplot plot segments matplot
#'
#'
#' @export
plot.Hidalgo <- function(x,
                         type = c("A","B","C"),
                         class = NULL,
                         ...) {
  type <- match.arg(type)

  if (type == "C" & is.null(class)){
      stop("Please provide the factor variable to stratify the id estimates")
  }


  if (type == "A") {
    ID  <- x$id_raw
    cmm <- (apply(ID, 2, function(x)
      cumsum(x) / seq_along(x)))
    matplot(
      ts(x$id_raw),
      type = "l",
      col = "lightgray",
      lty = 1,
      xlab = "MCMC Iteration",
      ylab = "Raw MCMC - Intrinsic Dimension"
    )
    matplot(
      cmm,
      type = "l",
      lty = 1,
      lwd = 2,
      add = T
    )
  } else if (type == "B") {

    on.exit({par(my_par)}, add = TRUE, after = TRUE)
    my_par <- par(mfrow = c(2, 1))

    ID  <- x$id_summary
    plot(
      ID$MEAN,
      ylim = c(min(ID$Q.05), max(ID$Q.95)),
      type = "n",
      main = "Posterior Means",
      ylab = "Intrinsic Dimension",
      xlab = "Observation"
    )
    segments(
      x0 = ID$OBS,
      x1 = ID$OBS,
      y0 = ID$Q.05,
      ID$Q.95,
      col = "lightgray"
    )
    points(ID$MEAN, ylim = c(min(ID$Q.05), max(ID$Q.95)), col = "darkblue")



    plot(
      ID$MEDIAN,
      ylim = c(min(ID$Q.05), max(ID$Q.95)),
      type = "n",
      main = "Posterior Medians",
      ylab = "Intrinsic Dimension",
      xlab = "Observation"
    )
    segments(
      x0 = ID$OBS,
      x1 = ID$OBS,
      y0 = ID$Q.05,
      ID$Q.95,
      col = "lightgray"
    )
    points(ID$MEDIAN, ylim = c(min(ID$Q.05), max(ID$Q.95)), col = "darkblue")

  }else if(type == "C"){

    on.exit({par(my_par)}, add = TRUE, after = TRUE)
    my_par <- par(mfrow = c(2, 1))

    ID  <- x$id_summary
    boxplot(
      ID$MEAN ~ class,
      col  = as.numeric(class)+1,
      ylab = ("ID posterior estimate") ,
      xlab = "Class",
      main = "Posterior Means"
    )
    boxplot(
      ID$MEDIAN ~ class,
      col = as.numeric(class)+1,
      ylab = ("ID posterior estimate"),
      xlab = "Class",
      main = "Posterior Medians"
    )
  }
  invisible()

}

#' @name Hidalgo
#'
#' @param object object of class \code{Hidalgo}, the output of the
#' \code{Hidalgo()} function.
#' @param ... other arguments passed to specific methods.
#'
#' @export
summary.Hidalgo <- function(object, ... ){

  out <- object$id_summary[,1:6]
  structure(out, class = c("summary.Hidalgo",class(out)))

}

#' @name Hidalgo
#'
#' @param x object of class \code{Hidalgo}, the output of the
#' \code{Hidalgo()} function.
#' @param ... other arguments passed to specific methods.
#'
#' @export
print.summary.Hidalgo <- function(x, ...){
  cat("Hidalgo - Posterior id estimates\n")
  print(summary(as.data.frame(x)))
  cat("\n")
  invisible(x)
}
Fradenti/intRinsic documentation built on Sept. 15, 2024, 4:37 a.m.