R/utils.R

Defines functions checkpoint_clean_up_ checkpoint_ make_chunks_ rm_collinear_ rm_constant_ log_sum_exp_ inv_mills_ratio_ inv_mills_ratio_matrix_ log_det log_sigmoid_ log_one_plus_exp_ get_annealing_ladder_ create_named_list_ check_structure_ check_zero_one_ check_positive_ check_natural_

# This file is part of the `locus` R package:
#     https://github.com/hruffieux/locus
#

# Diverse utility functions implementing sanity checks, basic preprocessing,
# and ticks to prevent overflow/underflow.
#

check_natural_ <- function(x, eps = .Machine$double.eps^0.75){
  if (any(x < eps | abs(x - round(x)) > eps)) {
    stop(paste0(deparse(substitute(x)),
               " must be natural."))
  }
}

check_positive_ <- function(x, eps = .Machine$double.eps^0.75){
  if (any(x < eps)) {
    err_mess <- paste0(deparse(substitute(x)), " must be positive, greater than ",
                      format(eps, digits = 3), ".")
    if (length(x) > 1) err_mess <- paste0("All entries of ", err_mess)
    stop(err_mess)
  }
}

check_zero_one_ <- function(x){
  if (any(x < 0) | any(x > 1)) {
    err_mess <- paste0(deparse(substitute(x)), " must lie between 0 and 1.")
    if (length(x) > 1) err_mess <- paste0("All entries of ", err_mess)
    stop(err_mess)
  }
}

check_structure_ <- function(x, struct, type, size = NULL,
                             null_ok = FALSE,  inf_ok = FALSE, na_ok = FALSE) {
  if (type == "double") {
    bool_type <-  is.double(x)
    type_mess <- "a double-precision "
  } else if (type == "integer") {
    bool_type <- is.integer(x)
    type_mess <- "an integer "
  } else if (type == "numeric") {
    bool_type <- is.numeric(x)
    type_mess <- "a numeric "
  } else if (type == "logical") {
    bool_type <- is.logical(x)
    type_mess <- "a boolean "
  } else if (type == "string") {
    bool_type <- is.character(x)
    type_mess <- "string "
  }

  bool_size <- TRUE # for case size = NULL (no assertion on the size/dimension)
  size_mess <- ""
  if (struct == "vector") {
    bool_struct <- is.vector(x) & (length(x) > 0) # not an empty vector
    if (!is.null(size)) {
      bool_size <- length(x) %in% size
      size_mess <- paste0(" of length ", paste0(size, collapse=" or "))
    }
  } else if (struct == "matrix") {
    bool_struct <- is.matrix(x) & (length(x) > 0) # not an empty matrix
    if (!is.null(size)) {
      bool_size <- all(dim(x) == size)
      size_mess <- paste0(" of dimension ", size[1], " x ", size[2])
    }
  }

  correct_obj <- bool_struct & bool_type & bool_size

  bool_null <- is.null(x)

  if (!is.list(x) & type != "string") {
    na_mess <- ""
    if (!na_ok) {
      if (!bool_null) correct_obj <- correct_obj & !any(is.na(x))
      na_mess <- " without missing value"
    }

    inf_mess <- ""
    if (!inf_ok) {
      if (!bool_null) correct_obj <- correct_obj & all(is.finite(x[!is.na(x)]))
      inf_mess <- ", finite"
    }
  } else {
    na_mess <- ""
    inf_mess <- ""
  }

  null_mess <- ""
  if (null_ok) {
    correct_obj <- correct_obj | bool_null
    null_mess <- " or must be NULL"
  }

  if(!(correct_obj)) {
    stop(paste0(deparse(substitute(x)), " must be a non-empty ", type_mess, struct,
               size_mess, inf_mess, na_mess, null_mess, "."))
  }
}


create_named_list_ <- function(...) {
  setNames(list(...), as.character(match.call()[-1]))
}



get_annealing_ladder_ <- function(anneal, verbose) {

  # ladder set following:
  # Importance Tempering, Robert B. Gramacy & Richard J. Samworth, pp.9-10, arxiv v4

  k_m <- 1 / anneal[2]
  m <- anneal[3]

  if(anneal[1] == 1) {

    type <- "geometric"

    delta_k <- k_m^(1 / (1 - m)) - 1

    ladder <- (1 + delta_k)^(1 - m:1)

  } else if (anneal[1] == 2) { # harmonic spacing

    type <- "harmonic"

    delta_k <- ( 1 / k_m - 1) / (m - 1)

    ladder <- 1 / (1 + delta_k * (m:1 - 1))

  } else { # linear spacing

    type <- "linear"

    delta_k <- (1 - k_m) / (m - 1)

    ladder <- k_m + delta_k * (1:m - 1)
  }

  if (verbose)
    cat(paste0("** Annealing with ", type," spacing ** \n\n"))

  ladder

}


log_one_plus_exp_ <- function(x) { # computes log(1 + exp(x)) avoiding
                                   # numerical overflow
  m <- x
  m[x < 0] <- 0

  log(exp(x - m) + exp(- m)) + m
}


log_sigmoid_ <- function(chi) {

  - log(1 + exp(- chi)) # chi is always positive so no overflow possible (underflow neither, thanks to the "+1")

}

log_det <- function(list_mat) {

  if (is.list(list_mat)) {
    sapply(list_mat, function(mat) {
      log_det <- determinant(mat, logarithm = TRUE)
      log_det$modulus * log_det$sign
    })
  } else {
    log_det <- determinant(list_mat, logarithm = TRUE)
    log_det$modulus * log_det$sign
  }

}

inv_mills_ratio_matrix_ <- function(Y, U) {

  if (is.matrix(U)) m <- matrix(NA, nrow = nrow(U), ncol = ncol(U))
  else m <- rep(NA, length(U))

  U_1 <- U[Y==1]
  m_1 <- exp(dnorm(U_1, log = TRUE) - pnorm(U_1, log.p = TRUE))
  m_1[m_1 < -U_1] <- -U_1

  m[Y==1] <- m_1


  U_0 <- U[Y==0]
  m_0 <- - exp(dnorm(U_0, log = TRUE) - pnorm(U_0, lower.tail = FALSE, log.p = TRUE))
  m_0[m_0 > -U_0] <- -U_0

  m[Y==0] <- m_0

  m

}


inv_mills_ratio_ <- function(y, U, log_1_pnorm_U, log_pnorm_U) {
  
  stopifnot(y %in% c(0, 1))
  
  # writing explicitely the formula for pnorm(, log = TRUE) is faster...
  if (y == 1) {
    
    m <- exp(-U^2/2 - log(sqrt(2*pi)) - log_pnorm_U)
    m[m < -U] <- -U
    
  } else {
    
    m <- - exp(-U^2/2 - log(sqrt(2*pi)) - log_1_pnorm_U)
    m[m > -U] <- -U
    
  }
  
  m
  
}


log_sum_exp_ <- function(x) {
  # Computes log(sum(exp(x))

  if ( max(abs(x)) > max(x) )
    offset <- min(x)
  else
    offset <- max(x)
  log(sum(exp(x - offset))) + offset

}


# entropy_ <- function(Y, U) {
#
#   log((2 * pi * exp(1))^(1/2) *
#            exp(Y * pnorm(U, log.p = TRUE) +
#                  (1-Y) * pnorm(U, lower.tail = FALSE, log.p = TRUE))) -
#   U * inv_mills_ratio_matrix_(Y, U) / 2
#
# }

rm_constant_ <- function(mat, verbose) {

  bool_cst <- is.nan(colSums(mat))

  if (any(bool_cst)) {

    rmvd_cst <- colnames(mat)[bool_cst]

    if (verbose) {
      if (sum(bool_cst) < 50) {
        cat(paste0("Variable(s) ", paste0(rmvd_cst, collapse=", "),
                  " constant across subjects. \n",
                  "Removing corresponding column(s) and saving its/their id(s) ",
                  "in the function output ... \n\n"))
      } else {
        cat(paste0(sum(bool_cst), " variables constant across subjects. \n",
                  "Removing corresponding column(s) and saving their ids ",
                  "in the function output ... \n\n"))
      }
    }

    mat <- mat[, !bool_cst, drop = FALSE]
  } else {
    rmvd_cst <- NULL
  }

  create_named_list_(mat, bool_cst, rmvd_cst)
}

rm_collinear_ <- function(mat, verbose) {

  bool_coll <- duplicated(mat, MARGIN = 2)

  if (any(bool_coll)) {

    mat_coll <- mat[, bool_coll, drop = FALSE]
    rmvd_coll <- colnames(mat_coll)

    if (verbose) {
      if (length(rmvd_coll) < 50) {
        cat(paste0("Presence of collinear variable(s). ",
                  paste0(rmvd_coll, collapse=", "), " redundant. \n",
                  "Removing corresponding column(s) and saving its/their id(s) ",
                  "in the function output ... \n"))
      } else {
        cat(paste0("Presence of collinear variables. ", length(rmvd_coll),
                  " redundant.\n", "Removing corresponding columns and saving ",
                  "their ids in the function output ... \n"))
      }
    }

    # associate to each removed replicate the name of the covariate with which
    # it is duplicated and that is kept in the dataset
    bool_with_coll <- duplicated(mat, MARGIN = 2, fromLast = TRUE) & !bool_coll
    mat_with_coll <- mat[, bool_with_coll, drop = FALSE]

    assoc_coll <- colnames(mat_with_coll)[match(data.frame(mat_coll),
                                                data.frame(mat_with_coll))]
    names(rmvd_coll) <- assoc_coll

    mat <- mat[, !bool_coll, drop = FALSE]

  } else {
    rmvd_coll <- NULL
  }

  create_named_list_(mat, bool_coll, rmvd_coll)
}

make_chunks_ <- function(x, n_g) split(x, factor(sort(rank(x) %% n_g)))


checkpoint_ <- function(it, checkpoint_path, gam_vb, converged, lb_new, lb_old, 
                        om_vb = NULL, rate = 100) {
  
  if (!is.null(checkpoint_path) && it %% rate == 0) {
    
    diff_lb <- abs(lb_new - lb_old)
    
    tmp_vb <- create_named_list_(gam_vb, converged, it, lb_new, diff_lb, om_vb)
    
    file_save <- paste0(checkpoint_path, "tmp_output_it_", it, ".RData")
    
    save(tmp_vb, file = file_save)
    
    old_file_clean_up <- paste0(checkpoint_path, "tmp_output_it_", it - 2 * rate, ".RData") # keep only the last two for comparison
    
    if (file.exists(old_file_clean_up)) 
      file.remove(old_file_clean_up)

  }
    
}


checkpoint_clean_up_ <- function(checkpoint_path) {
  
  if (!is.null(checkpoint_path)) {
    
    old_files_clean_up <- list.files(path = checkpoint_path, pattern = "tmp_output_it_")
    
    sapply(old_files_clean_up, function(ff) {
      if (file.exists(file.path(checkpoint_path, ff))) 
        file.remove(file.path(checkpoint_path, ff))
    })
    
  } 
  
}
 
hruffieux/locus documentation built on Jan. 10, 2024, 10:07 p.m.