R/prepare_locus.R

Defines functions set_struct prepare_struct_ set_groups prepare_groups_ prepare_ind_bin_ set_blocks prepare_blocks_ prepare_cv_ prepare_list_init_ prepare_list_hyper_ check_annealing_ convert_p0_av_ prepare_data_

Documented in set_blocks set_groups set_struct

# This file is part of the `locus` R package:
#     https://github.com/hruffieux/locus
#

# Internal function implementing sanity checks and needed preprocessing before
# the application of the different `locus_*_core` algorithms.
#
prepare_data_ <- function(Y, X, Z, link, ind_bin, user_seed, tol, maxit, 
                          verbose, checkpoint_path) {
  
  stopifnot(link %in% c("identity", "logit", "probit", "mix"))
  
  check_structure_(user_seed, "vector", "numeric", 1, null_ok = TRUE)
  
  check_structure_(tol, "vector", "numeric", 1)
  check_positive_(tol, eps=.Machine$double.eps)
  
  check_structure_(maxit, "vector", "numeric", 1)
  check_natural_(maxit)
  
  check_structure_(X, "matrix", "numeric")
  
  if (!is.null(checkpoint_path)) {
    
    if (!dir.exists(checkpoint_path)) {
      stop(paste0("The directory specified in checkpoint_path doesn't exist. ", 
                  "Please make sure to provide a valid path."))
    }
    
    if (!is.null(Z) | link != "identity")
      warning(paste0("Checkpointing only implemented for Z NULL and link = identity. ", 
                     "Path specified in checkpoint_path is ignored."))
    
  }
  
  n <- nrow(X)
  p <- ncol(X)
  
  check_structure_(Y, "matrix", "numeric")
  d <- ncol(Y)
  
  if (link == "mix") {
    
    ind_bin <- prepare_ind_bin_(d, ind_bin, link)
    
    if(!all(as.vector(Y[, ind_bin]) == as.numeric(as.logical(Y[, ind_bin]))))
      stop("The responses in Y corresponding to indices ind_bin must be a binary.")
    
  } else if (link != "identity"){
    
    if(!all(as.vector(Y) == as.numeric(as.logical(Y))))
      stop("Y must be a binary matrix for logistic/probit regression.")
    
  }
  
  if (nrow(Y) != n) stop("X and Y must have the same number of samples.")
  
  if (is.null(rownames(X)) & is.null(rownames(Y)))
    rownames(X) <- rownames(Y) <- paste0("Ind_", 1:n)
  else if (is.null(rownames(X))) rownames(X) <- rownames(Y)
  else if (is.null(rownames(Y))) rownames(Y) <- rownames(X)
  else if (any(rownames(X) != rownames(Y)))
    stop("The provided rownames of X and Y must be the same.")
  
  if (is.null(colnames(X))) colnames(X) <- paste0("Cov_x_", 1:p)
  if (is.null(colnames(Y))) colnames(Y) <- paste0("Resp_", 1:d)
  
  if (!is.null(Z)) {
    
    if (verbose) cat(paste0("Each factor variables must be provided by adding ",
                            "to Z (nb levels - 1) variables representing their ",
                            "levels. \n"))
    
    check_structure_(Z, "matrix", "numeric")
    
    q <- ncol(Z)
    
    if (nrow(Z) != n) stop("Z must have the same number of samples as Y and X.")
    
    if (is.null(rownames(Z))) rownames(Z) <- rownames(X)
    else if(any(rownames(Z) != rownames(X)))
      stop("The provided rownames of Z must be the same than those of X and Y or NULL.")
    
    if (is.null(colnames(Z))) colnames(Z) <- paste0("Cov_z_", 1:q)
    
    Z <- scale(Z)
    
    list_Z_cst <- rm_constant_(Z, verbose)
    Z <- list_Z_cst$mat
    bool_cst_z <- list_Z_cst$bool_cst
    rmvd_cst_z <- list_Z_cst$rmvd_cst
    
    list_Z_coll <- rm_collinear_(Z, verbose)
    Z <- list_Z_coll$mat
    q <- ncol(Z)
    bool_coll_z <- list_Z_coll$bool_coll
    rmvd_coll_z <- list_Z_coll$rmvd_coll
    
    bool_rmvd_z <- bool_cst_z
    bool_rmvd_z[!bool_cst_z] <- bool_coll_z
    
  } else {
    q <- NULL
    bool_rmvd_z <- NULL
    rmvd_cst_z <- NULL
    rmvd_coll_z <- NULL
  }
  
  X <- scale(X)
  
  list_X_cst <- rm_constant_(X, verbose)
  X <- list_X_cst$mat
  bool_cst_x <- list_X_cst$bool_cst
  rmvd_cst_x <- list_X_cst$rmvd_cst
  
  list_X_coll <- rm_collinear_(X, verbose)
  X <- list_X_coll$mat
  
  bool_coll_x <- list_X_coll$bool_coll
  rmvd_coll_x <- list_X_coll$rmvd_coll
  
  bool_rmvd_x <- bool_cst_x
  bool_rmvd_x[!bool_cst_x] <- bool_coll_x
  
  p <- ncol(X)
  if (p < 1) stop(paste0("There must be at least 1 non-constant candidate predictor ",
                         " stored in X."))
  
  if (link == "identity") {
    
    Y <- scale(Y, center = TRUE, scale = FALSE)
    
  } else if (link == "mix") {
    
    Y[, -ind_bin] <- scale(Y[, -ind_bin], center = TRUE, scale = FALSE)
    
  } else if (link == "logit") {
    
    Y <- Y - 1 / 2
    
  }
  
  
  if (is.null(q) || q < 1) Z <- NULL
  
  create_named_list_(Y, X, Z, bool_rmvd_x, bool_rmvd_z, rmvd_cst_x, rmvd_cst_z, 
                     rmvd_coll_x, rmvd_coll_z)
  
}


# Internal function implementing sanity checks and needed preprocessing for
# argument p0_av before the application of the different `locus_*_core` algorithms.
#
convert_p0_av_ <- function(p0_av, p, list_blocks, verbose, eps = .Machine$double.eps^0.5) {
  
  check_structure_(p0_av, "vector", "numeric", c(1, p))
  
  if (length(p0_av) == 1) {
    
    if (verbose) cat(paste0("Provided p0_av = ", p0_av, " interpreted as ",
                            "the prior number of predictors associated with at ",
                            "least one response. \n\n"))
    
    if (p0_av / p < eps)
      stop(paste0("p0_av = ", p0_av, ": \n",
                  "invalid provided value of p0_av.\n",
                  "The prior sparsity level, p0_av / p, must be larger than ",
                  "zero. \n",
                  "Please increase p0_av."))
    
    if (p0_av / p > 0.95)
      stop(paste0("p0_av = ", p0_av, ": \n",
                  "invalid provided value of p0_av.\n",
                  "Induces a non-sparse formulation. Please decrease p0_av."))
    
    if (p0_av > ceiling(4 * p / 5))
      warning(paste0("Prior model size p0_av = ", p0_av, ": \n",
                     "p0_av / p is large, so multiplicity control may be weak. ",
                     "You may want to consider a smaller p0_av."))
    
    p_star <- p0_av
    
  } else {
    
    if (verbose) cat(paste0("The sth entry of the provided p0_av ",
                            "interpreted as the prior probability that ",
                            "predictor s is associated with at least one ",
                            "response. \n\n"))
    
    if (any(p0_av < eps) | any(p0_av > 1 - eps))
      stop(paste0("Invalid provided vector of p0_av.\n",
                  "All entries must lie between 0 and 1 (strictly)."))
    
    if (median(p0_av) > 1 / 2)
      warning(paste0("The number of predictors with large prior inclusion ",
                     "probability is large, so multiplicity control may be weak. \n",
                     "You may want to decrease the values of several ",
                     "entries of p0_av."))
    
    p_star <- p0_av * p
    
  }
  
  if (!is.null(list_blocks)) {
    # the sparsity level needs to be adapted when block-wise inference is used
    # otherwise the selected models may be too small (empirical considerations here)
    p_star <- sapply(p_star, function(p_star_j) min(p_star_j * list_blocks$n_bl, 0.975 * p))
    
    if (verbose) cat(paste0("The sparsity level is adapted for block-wise inference ",
                            "to ensure only sufficiently large models are selected.\n\n"))
  }
  
  p_star
}



check_annealing_ <- function(anneal, link, Z, list_groups, list_struct) {
  
  check_structure_(anneal, "vector", "numeric", 3, null_ok = TRUE)
  
  if (!is.null(anneal)) {
    
    if (link != "identity" | !is.null(list_groups))
      stop(paste0("Annealing procedure not yet implemented when link is different ",
                  "from identity, Z, list_groups or list_struct is non-NULL. Exit."))
    
    check_natural_(anneal[c(1, 3)])
    check_positive_(anneal[2])
    
    stopifnot(anneal[1] %in% 1:3)
    
    if (anneal[2] < 1.5)
      stop(paste0("Initial temperature very small. May not be large enough ",
                  "for a successful exploration. Please increase it or select no annealing."))
    
    if (anneal[3] > 1000)
      stop(paste0("Temperature ladder size very large. This may be unnecessarily ",
                  "computationally demanding. Please decrease it."))
    
  }
  
}


# Internal function implementing sanity checks and needed preprocessing for the
# model hyperparameters before the application of the different `locus_*_core`
# algorithms.
#
prepare_list_hyper_ <- function(list_hyper, Y, p, p_star, q, link, ind_bin,
                                vec_fac_gr, vec_fac_st, bool_rmvd_x, bool_rmvd_z,
                                names_x, names_y, names_z, verbose) {
  
  d <- ncol(Y)
  if (!is.null(vec_fac_gr)) {
    G <- length(unique(vec_fac_gr))
  } else {
    G <- NULL
  }
  
  ns <- is.null(vec_fac_st)
  
  if (is.null(list_hyper)) {
    
    if (verbose) cat("list_hyper set automatically. \n")
    
    list_hyper <- auto_set_hyper_(Y, p, p_star, q, link, ind_bin, !ns, vec_fac_gr)
    
  } else {
    
    if (xor(is.null(G), is.null(list_hyper$G_hyper)))
      stop("If group selection was enabled when setting list_hyper, it must be used in locus and vice-versa.")
    
    if (!inherits(list_hyper, c("hyper", "out_hyper")))
      stop(paste0("The provided list_hyper must be an object of class ``hyper'' ",
                  "or ``out_hyper''. \n",
                  "*** you must either use the function set_hyper to ",
                  "set your own hyperparameters or use list_hyper from a ``vb'' ",
                  "object or set the argument list_hyper to NULL for automatic choice. ***"))
    
    if (inherits(list_hyper, "hyper")) {
      p_hyper_match <- length(bool_rmvd_x)
      if (!is.null(G))
        G_hyper_match <- length(levels(vec_fac_gr)) # counts the levels of the factors
      # to know how many groups before removal
      # of constant or collinear predictors
    } else {
      p_hyper_match <- p
      if (!is.null(G))
        G_hyper_match <- G
    }
    
    
    if (list_hyper$d_hyper != d)
      stop(paste0("The dimensions (d) of the provided hyperparameters ",
                  "(list_hyper) are not consistent with that of Y.\n"))
    
    if (!is.null(G) && list_hyper$G_hyper != G_hyper_match)
      stop(paste0("The number of groups (G) provided when setting the hyperparameters ",
                  "(list_hyper) is not consistent with that provided in the locus function.\n"))
    
    if (list_hyper$p_hyper != p_hyper_match)
      stop(paste0("The dimensions (p) of the provided hyperparameters ",
                  "(list_hyper) are not consistent with that of X.\n"))
    
    if (list_hyper$link_hyper != link)
      stop(paste0("The argument link is not consistent with the variable
                 link_hyper in list_hyper"))
    
    if(link == "mix") {
      if (!all(list_hyper$ind_bin_hyper == ind_bin))
        stop(paste0("The argument ind_bin is not consistent with the variable
                   ind_bin_hyper in list_hyper"))
    }
    
    if (ns) {
      
      if (inherits(list_hyper, "hyper")) {
        # remove the entries corresponding to the removed constant predictors in X
        # (if any)
        if (is.null(G)) {
          
          list_hyper$a <- list_hyper$a[!bool_rmvd_x]
          list_hyper$b <- list_hyper$b[!bool_rmvd_x]
          
          if (!is.null(names(list_hyper$a)) && names(list_hyper$a) != names_x)
            stop("Provided names for the entries of a do not match the colnames of X.")
          
          if (!is.null(names(list_hyper$b)) && names(list_hyper$b) != names_x)
            stop("Provided names for the entries of b do not match the colnames of X.")
          
        } else {
          
          list_hyper$a <- list_hyper$a[unique(vec_fac_gr)]
          list_hyper$b <- list_hyper$b[unique(vec_fac_gr)]
          
        }
      }
      
    } else { # list_struct non-NULL 
      
      list_hyper$m0 <- list_hyper$m0[!bool_rmvd_x]
      
      if (!is.null(names(list_hyper$m0)) && names(list_hyper$m0) != names_x)
        stop("Provided names for the entries of m0 do not match the colnames of X.")
      
    }
    
    if (link %in% c("identity", "mix")) {
      
      if (link == "mix") names_y <- names_y[-ind_bin]
      
      if (!is.null(names(list_hyper$eta)) && names(list_hyper$eta) != names_y)
        stop("Provided names for the entries of eta do not match the colnames of the continuous variables in Y")
      
      if (!is.null(names(list_hyper$kappa)) && names(list_hyper$kappa) != names_y)
        stop("Provided names for the entries of kappa do not match the colnames of the continuous variables in Y")
    }
    
    if (!is.null(q)) {
      
      if (inherits(list_hyper, "hyper")) {
        q_hyper_match <- length(bool_rmvd_z)
        # remove the entries corresponding to the removed constant predictors in X
        # (if any)
        list_hyper$phi <- list_hyper$phi[!bool_rmvd_z]
        list_hyper$xi <- list_hyper$xi[!bool_rmvd_z]
        
      } else {
        q_hyper_match <- q
      }
      
      if (list_hyper$q_hyper != q_hyper_match)
        stop(paste0("The dimensions of the provided hyperparameters ",
                    "(list_hyper) are not consistent with that of Z."))
      
      if (!is.null(names(list_hyper$phi)) && names(list_hyper$phi) != names_z)
        stop("Provided names for the entries of phi do not match the colnames of Z.")
      
      if (!is.null(names(list_hyper$xi)) && names(list_hyper$xi) != names_z)
        stop("Provided names for the entries of xi do not match the colnames of Z.")
      
    }
    
  }
  
  class(list_hyper) <- "out_hyper"
  
  list_hyper
}


# Internal function implementing sanity checks and needed preprocessing for the
# starting values before the application of the different `locus_*_core`
# algorithms.
#
prepare_list_init_ <- function(list_init, Y, p, p_star, q, link, ind_bin,
                               vec_fac_gr, bool_rmvd_x, bool_rmvd_z, user_seed, 
                               verbose) {
  
  d <- ncol(Y)
  n <- nrow(Y)
  
  if (!is.null(vec_fac_gr)) {
    G <- length(unique(vec_fac_gr))
  } else {
    G <- NULL
  }
  
  if (is.null(list_init)) {
    
    if (!is.null(user_seed) & verbose) cat(paste0("Seed set to user_seed ",
                                                  user_seed,". \n"))
    
    if (verbose) cat(paste0("list_init set automatically. \n"))
    
    list_init <- auto_set_init_(Y, G, p, p_star, q, user_seed, link, ind_bin)
    
  } else {
    
    if (xor(is.null(G), is.null(list_init$G_init)))
      stop("If group selection was enabled when setting list_init, it must be used in locus and vice-versa.")
    
    
    if (!is.null(user_seed))
      warning("user_seed not used since a non-NULL list_init was provided. \n")
    
    if (!inherits(list_init, c("init", "out_init")))
      stop(paste0("The provided list_init must be an object of class ``init'' or ",
                  " `` out_init''. \n",
                  "*** you must either use the function set_init to ",
                  "set your own initialization or use list_init from a ``vb'' ",
                  "object or  set the argument list_init to NULL for automatic ",
                  "initialization. ***"))
    
    if (inherits(list_init, "init")) {
      p_init_match <- length(bool_rmvd_x)
    } else {
      p_init_match <- p
    }
    
    if (inherits(list_init, "init")) {
      p_init_match <- length(bool_rmvd_x)
      if (!is.null(G))
        G_init_match <- length(levels(vec_fac_gr)) # counts the levels of the factors
      # to know how many groups before removal
      # of constant or collinear predictors
    } else {
      p_init_match <- p
      if (!is.null(G))
        G_init_match <- G
    }
    
    if (list_init$d_init != d)
      stop(paste0("The dimensions (d) of the provided initial parameters ",
                  "(list_init) are not consistent with that of Y.\n"))
    
    if (list_init$p_init != p_init_match)
      stop(paste0("The dimensions (p) of the provided initial parameters ",
                  "(list_init) are not consistent with that of X.\n"))
    
    if (!is.null(G) && list_init$G_init != G_init_match)
      stop(paste0("The number of groups (G) provided when setting the initial parameters ",
                  "(list_init) is not consistent with that provided in the locus function.\n"))
    
    if (list_init$link_init != link)
      stop(paste0("The argument link is not consistent with the variable
                 link_init in list_init"))
    
    if(link == "mix") {
      if (!all(list_init$ind_bin_init == ind_bin))
        stop(paste0("The argument ind_bin is not consistent with the variable
                   ind_bin_init in list_init"))
    }
    
    if (inherits(list_init, "init")) {
      
      if (is.null(G)) {
        # drops the rows corresponding to the removed constant and collinear
        # predictors in X (if any)
        list_init$gam_vb <- list_init$gam_vb[!bool_rmvd_x,, drop = FALSE]
      } else {
        # drops the rows corresponding to the empty groups (if any)
        list_init$gam_vb <- list_init$gam_vb[unique(vec_fac_gr),, drop = FALSE]
      }
      
      list_init$mu_beta_vb <- list_init$mu_beta_vb[!bool_rmvd_x,, drop = FALSE]
      
      if (link == "logit")
        list_init$sig2_beta_vb <- list_init$sig2_beta_vb[!bool_rmvd_x,, drop = FALSE]
      
    }
    
    
    if (!is.null(q)) {
      
      if (inherits(list_init, "init")) {
        q_init_match <- length(bool_rmvd_z)
        # remove the entries corresponding to the removed constant predictors in X
        # (if any)
        list_init$alpha_vb <- list_init$alpha_vb[!bool_rmvd_z,, drop = FALSE]
        
        if (link == "probit"){
          
          list_init$sig2_alpha_vb <- list_init$sig2_alpha_vb[!bool_rmvd_z]
          
        } else {
          
          list_init$sig2_alpha_vb <- list_init$sig2_alpha_vb[!bool_rmvd_z,, drop = FALSE]
          
        }
        
      } else {
        q_init_match <- q
      }
      
      if (list_init$q_init != q_init_match)
        stop(paste0("The dimensions of the provided initial parameters ",
                    "(list_init) are not consistent with that of Z."))
    }
    
  }
  
  
  if (!is.null(G)) { # converts mu_beta_vb to a list of matrices by groups
    list_init$mu_beta_vb <- lapply(unique(vec_fac_gr),
                                   function(g) list_init$mu_beta_vb[vec_fac_gr == g,, drop = FALSE])
  }
  
  class(list_init) <- "out_init"
  
  list_init
}


# Internal function implementing sanity checks and needed preprocessing for the
# model hyperparameters before the application of the cross-validation procedure
# for parameter p0_av.
#
prepare_cv_ <- function(list_cv, n, p, bool_rmvd_x, p0_av, link, list_hyper,
                        list_init, verbose) {
  
  if (!inherits(list_cv, "cv"))
    stop(paste0("The provided list_cv must be an object of class ``cv''. \n",
                "*** you must either use the function set_cv to give the settings ",
                "for the cross-validation or set list_cv to NULL to skip the ",
                "cross-validation step. ***"))
  
  if (link == "logit")
    stop(paste0("Cross-validation not implemented only for logistic regression. ", 
                "Please, set list_cv to NULL or use probit regression."))
  
  if (!is.null(p0_av) | !is.null(list_hyper) | !is.null(list_init))
    stop(paste0("p0_av, list_hyper and list_init must all be NULL if non NULL ",
                "list_cv is provided (cross-validation)."))
  
  if (list_cv$n_cv != n)
    stop(paste0("The number of samples n provided to the function set_cv",
                "is not consistent with those of the data."))
  
  if (list_cv$p_cv != length(bool_rmvd_x))
    stop(paste0("The number of candidate predictor p provided to the function ",
                "set_cv is not consistent with X."))
  
  if (any(list_cv$p0_av_grid > p)) { # p has potentially been reduced because
    # of constant candidate predictors
    
    list_cv$p0_av_grid <- create_grid_(p, list_cv$size_p0_av_grid)
    
    new_size <- length(list_cv$p0_av_grid)
    if (list_cv$size_p0_av_grid > new_size) {
      if (verbose) cat(paste0("Cross-validation p0_av_grid reduced to ", new_size,
                              " elements as p is small.\n"))
      list_cv$size_p0_av_grid <- new_size
    }
    
    message <- paste0("The cross-validation grid has been readjusted because to ",
                      "account for the removal of constant candidate predictors. Grid used: ",
                      list_cv$p0_av_grid, ". \n")
    
    if (verbose) cat(message)
    else warning(message)
  }
  
  list_cv
  
}


# Internal function implementing sanity checks and needed preprocessing to the
# settings provided by the user for block-wise parallel inference.
#
prepare_blocks_ <- function(list_blocks, bool_rmvd_x, list_cv, list_groups, list_struct) {
  
  if (!inherits(list_blocks, "blocks"))
    stop(paste0("The provided list_blocks must be an object of class ``blocks''. \n",
                "*** you must either use the function set_blocks to give the settings ",
                "for parallels applications of locus on blocks of candidate ",
                "predictors or set list_blocks to NULL to apply locus jointly on ",
                "all the candidate predictors (sufficient RAM required). ***"))
  
  if (!is.null(list_cv))
    stop(paste0("list_cv must be NULL if non NULL ",
                "list_blocks is provided (cross-validation not yet implemented)."))
  
  if (!is.null(list_groups))
    stop(paste0("Group selection not implemented for block-wise parallel inference. ",
                "list_blocks and list_groups can't be both non-NULL."))
  
  if (!is.null(list_struct))
    stop(paste0("Structured sparse priors not enabled for block-wise parallel inference. ",
                "list_blocks and list_struct can't be both non-NULL."))
  
  if (list_blocks$p_blocks != length(bool_rmvd_x))
    stop(paste0("The number of candidate predictors p provided to the function set_blocks ",
                "is not consistent with X.\n"))
  
  vec_fac_bl <- list_blocks$vec_fac_bl[!bool_rmvd_x]
  
  tab_bl <- table(vec_fac_bl)
  pres_bl <- tab_bl > 0
  
  # in case a block was removed due to the above because of bool_rmvd_x
  n_bl  <- sum(pres_bl)
  if(list_blocks$n_cpus > n_bl) n_cpus <- n_bl
  else n_cpus <- list_blocks$n_cpus
  
  create_named_list_(n_bl, n_cpus, vec_fac_bl)
  
}

#' Gather settings for parallel inference on partitioned predictor space.
#'
#' Parallel applications of the method on blocks of candidate predictors for
#' large datasets allows faster and less RAM-greedy executions.
#'
#' @param p Number of candidate predictors.
#' @param pos_bl Vector gathering the predictor block positions (first index of
#'   each block).
#' @param n_cpus Number of CPUs to be used. If large, one should ensure that
#'   enough RAM will be available for parallel execution. Set to 1 for serial
#'   execution.
#' @param verbose If \code{TRUE}, messages are displayed when calling
#'   \code{set_blocks}.
#'
#' @return An object of class "\code{blocks}" preparing the settings for parallel
#'   inference in a form that can be passed to the \code{\link{locus}}
#'   function.
#'
#' @examples
#' seed <- 123; set.seed(seed)
#'
#' ###################
#' ## Simulate data ##
#' ###################
#'
#' ## Example using small problem sizes:
#' ##
#' n <- 200; p <- 1200; p0 <- 200; d <- 50; d0 <- 40
#'
#' ## Candidate predictors (subject to selection)
#' ##
#' # Here we simulate common genetic variants (but any type of candidate
#' # predictors can be supplied).
#' # 0 = homozygous, major allele, 1 = heterozygous, 2 = homozygous, minor allele
#' #
#' X_act <- matrix(rbinom(n * p0, size = 2, p = 0.25), nrow = n)
#' X_inact <- matrix(rbinom(n * (p - p0), size = 2, p = 0.25), nrow = n)
#'
#' shuff_x_ind <- sample(p)
#' X <- cbind(X_act, X_inact)[, shuff_x_ind]
#'
#' bool_x_act <- shuff_x_ind <= p0
#'
#' pat_act <- beta <- matrix(0, nrow = p0, ncol = d0)
#' pat_act[sample(p0*d0, floor(p0*d0/5))] <- 1
#' beta[as.logical(pat_act)] <-  rnorm(sum(pat_act))
#'
#' ## Gaussian responses
#' ##
#' Y_act <- matrix(rnorm(n * d0, mean = X_act %*% beta, sd = 0.5), nrow = n)
#' Y_inact <- matrix(rnorm(n * (d - d0), sd = 0.5), nrow = n)
#' shuff_y_ind <- sample(d)
#' Y <- cbind(Y_act, Y_inact)[, shuff_y_ind]
#'
#' ########################
#' ## Infer associations ##
#' ########################
#'
#' n_bl <- 6
#' pos_bl <- seq(1, p, by = ceiling(p/n_bl))
#' list_blocks <- set_blocks(p, pos_bl, n_cpus = 1)
#'
#' vb <- locus(Y = Y, X = X, p0_av = p0, link = "identity",
#'             list_blocks = list_blocks, user_seed = seed)
#'
#' @seealso \code{\link{locus}}
#'
#' @export
set_blocks <- function(p, pos_bl, n_cpus, verbose = TRUE) {
  
  check_structure_(p, "vector", "numeric", 1)
  check_natural_(p)
  
  check_structure_(verbose, "vector", "logical", 1)
  
  check_structure_(pos_bl, "vector", "numeric")
  check_natural_(pos_bl)
  
  if (length(pos_bl) > 25)
    warning(paste0("The provided number of blocks may be too large for accurate ",
                   "inference. If possible, use less blocks."))
  
  if (any(pos_bl < 1) | any(pos_bl > p))
    stop("The positions provided in pos_bl must range between 1 and total number of variables in X, p.")
  
  if (any(duplicated(pos_bl)))
    stop("The positions provided in pos_bl must be unique.")
  
  if (any(pos_bl != cummax(pos_bl)))
    stop("The positions provided in pos_bl must be monotonically increasing.")
  
  vec_fac_bl <- as.factor(cumsum(seq_along(1:p) %in% pos_bl))
  
  n_bl <- length(unique(vec_fac_bl))
  
  check_structure_(n_cpus, "vector", "numeric", 1)
  check_natural_(n_cpus)
  
  
  if (n_cpus > 1) {
    
    n_cpus_avail <- parallel::detectCores()
    if (n_cpus > n_cpus_avail) {
      n_cpus <- n_cpus_avail
      warning(paste0("The number of CPUs specified exceeds the number of CPUs ",
                     "available on the machine. The latter has been used instead."))
    }
    
    if (n_cpus > n_bl){
      message <- paste0("The number of cpus in use is at most equal to the number of blocks.",
                        "n_cpus is therefore set to ", n_bl, ". \n")
      if(verbose) cat(message)
      else warning(message)
      n_cpus <- n_bl
    }
    
    if (verbose) cat(paste0("locus applied in parallel on ", n_bl,
                            " blocks of candidate predictors, using ", n_cpus, " CPUs.\n",
                            "Please make sure that enough RAM is available. \n"))
  }
  
  p_blocks <- p
  
  list_blocks <- create_named_list_(p_blocks, n_bl, n_cpus, vec_fac_bl)
  
  class(list_blocks) <- "blocks"
  
  list_blocks
}


# Internal function implementing sanity checks the index of binary responses in
# case `locus_mix_core` or `locus_mix_info_core` is used.
#
prepare_ind_bin_ <- function(d, ind_bin, link) {
  
  if (link == "mix") {
    
    check_structure_(ind_bin, "vector", "numeric")
    ind_bin <- sort(unique(ind_bin))
    if (!all(ind_bin %in% 1:d))
      stop(paste0("All indices provided in ind_bin must be integers between 1 ",
                  "and the total number of responses, d = ", d, "."))
    
    if (length(ind_bin) == d)
      stop(paste0("Argument ind_bin indicates that all responses are binary. \n",
                  "Please set link to logit or probit, or change ind_bin to ",
                  "the indices of the binary responses only."))
    
  } else if (!is.null(ind_bin)) {
    
    stop("Argument ind_bin must be NULL if link is not set to mix.")
    
  }
  
  ind_bin
}



# Internal function implementing sanity checks and needed preprocessing to the
# settings provided by the user for group selection.
#
prepare_groups_ <- function(list_groups, X, q, bool_rmvd_x, link, list_cv) {
  
  if (!inherits(list_groups, "groups"))
    stop(paste0("The provided list_groups must be an object of class ``groups''. \n",
                "*** you must use the function set_groups to give the settings ",
                "for group selection. ***"))
  
  if (!is.null(list_cv))
    stop(paste0("list_cv must be NULL if non NULL ",
                "list_groups is provided (cross-validation not yet implemented). \n"))
  
  if (list_groups$p_groups != length(bool_rmvd_x))
    stop(paste0("The number of candidate predictors p provided to the function set_groups ",
                "is not consistent with X.\n"))
  
  p <- length(bool_rmvd_x)
  
  if(!(is.factor(list_groups$vec_fac_gr) && length(list_groups$vec_fac_gr) == p))
    stop(paste0("list_groups$vec_fac_gr must be a non-empty a factor of length ", p, "."))
  
  if(link != "identity" | !is.null(q))
    stop("Group selection implemented only for identity link and Z = NULL. Exit.")
  
  vec_fac_gr <- list_groups$vec_fac_gr[!bool_rmvd_x] # some groups may disappear here, but this is not a problem
  X <- lapply(unique(vec_fac_gr), function(g) X[, vec_fac_gr == g, drop = FALSE])
  
  create_named_list_(X, vec_fac_gr) # X is now a list in which the matrix is split across groups
  
}


#' Gather settings for application of the `locus` function with group selection.
#'
#' [FUNCTIONALITY UNDER ACTIVE DEVELOPMENT, PERFORMANCE (CPU TIME) NOT OPTIMIZED].
#' Posterior probabilities of associations are computed for predefined groups of
#' candidate predictors. Within each group, the mean-field inference procedure
#' makes no independence assumptions for the regression coefficients; variables
#' in each group are approximated by a multivariate normal distribution, and
#' they share a single binary latent selection variable.
#'
#' @param n Number of samples.
#' @param p Number of candidate predictors.
#' @param pos_gr Vector gathering the predictor group positions (first index of
#'   each group). The predictors must be ordered by groups.
#' @param verbose If \code{TRUE}, messages are displayed when calling
#'   \code{set_blocks}.
#'
#' @return An object of class "\code{groups}" preparing the settings for group
#'   selection in a form that can be passed to the \code{\link{locus}}
#'   function.
#'
#' @examples
#' seed <- 123; set.seed(seed)
#'
#' ###################
#' ## Simulate data ##
#' ###################
#'
#' ## Example using small problem sizes:
#' ##
#' n <- 200; p <- 250; p0 <- 75; d <- 50; d0 <- 40
#'
#' ## Candidate predictors (subject to selection)
#' ##
#' # Here we simulate common genetic variants (but any type of candidate
#' # predictors can be supplied).
#' # 0 = homozygous, major allele, 1 = heterozygous, 2 = homozygous, minor allele
#' #
#' X_act <- matrix(rbinom(n * p0, size = 2, p = 0.25), nrow = n)
#' X_inact <- matrix(rbinom(n * (p - p0), size = 2, p = 0.25), nrow = n)
#'
#' shuff_x_ind <- sample(p)
#' X <- cbind(X_act, X_inact)[, shuff_x_ind]
#'
#' bool_x_act <- shuff_x_ind <= p0
#'
#' pat_act <- beta <- matrix(0, nrow = p0, ncol = d0)
#' pat_act[sample(p0*d0, floor(p0*d0/5))] <- 1
#' beta[as.logical(pat_act)] <-  rnorm(sum(pat_act))
#'
#' ## Gaussian responses
#' ##
#' Y_act <- matrix(rnorm(n * d0, mean = X_act %*% beta, sd = 0.5), nrow = n)
#' Y_inact <- matrix(rnorm(n * (d - d0), sd = 0.5), nrow = n)
#' shuff_y_ind <- sample(d)
#' Y <- cbind(Y_act, Y_inact)[, shuff_y_ind]
#'
#' ########################
#' ## Infer associations ##
#' ########################
#'
#' n_gr <- 100
#' pos_gr <- seq(1, p, by = ceiling(p/n_gr))
#' list_groups <- set_groups(n, p, pos_gr)
#'
#' g0_av <- 50 # Number of active groups. /!\ Often best to set it large, as a
#'             # too small value may (wrong) result in no group being selected.
#'
#' vb <- locus(Y = Y, X = X, p0_av = g0_av, link = "identity",
#'   list_groups = list_groups, user_seed = seed)
#'
#' @seealso \code{\link{locus}}
#'
#' @export
#'
set_groups <- function(n, p, pos_gr, verbose = TRUE) {
  
  check_structure_(n, "vector", "numeric", 1)
  check_natural_(n)
  
  check_structure_(p, "vector", "numeric", 1)
  check_natural_(p)
  
  check_structure_(verbose, "vector", "logical", 1)
  
  check_structure_(pos_gr, "vector", "numeric")
  check_natural_(pos_gr)
  
  if (p / length(pos_gr) > 500)
    warning(paste0("The provided number of groups may be too small for tractable ",
                   "inference. If possible, use more groups."))
  
  if (any(pos_gr < 1) | any(pos_gr > p))
    stop("The positions provided in pos_gr must range between 1 and total number of variables in X, p.")
  
  if (any(duplicated(pos_gr)))
    stop("The positions provided in pos_gr must be unique.")
  
  if (any(pos_gr != cummax(pos_gr)))
    stop("The positions provided in pos_gr must be monotonically increasing.")
  
  vec_fac_gr <- as.factor(cumsum(seq_along(1:p) %in% pos_gr))
  
  if (length(unique(vec_fac_gr)) == p)
    stop(paste0("All the groups are of size one, no group selection will be performed. ",
                "Set argument list_groups to NULL in the locus function."))
  
  if (max(table(vec_fac_gr)) >= n)
    stop(paste0("One or more group size(s) is greater or equal to n.  ",
                "Corresponding empirical covariances not positive definite. Use smaller group sizes."))
  
  if (verbose) print(paste0("Number of groups: ", length(unique(vec_fac_gr))))
  
  p_groups <- p
  
  list_groups <- create_named_list_(p_groups, vec_fac_gr)
  
  class(list_groups) <- "groups"
  
  list_groups
}






# Internal function implementing sanity checks and needed preprocessing to the
# settings provided by the user for structured sparsity priors.
#
prepare_struct_ <- function(list_struct, n, q, bool_rmvd_x, link, list_cv, list_groups) {
  
  if (!inherits(list_struct, "struct"))
    stop(paste0("The provided list_struct must be an object of class ``struct''. \n",
                "*** you must use the function set_struct to give the settings ",
                "for structured sparse priors. ***"))
  
  if (!is.null(list_cv))
    stop(paste0("list_cv must be NULL if non NULL ",
                "list_struct is provided (cross-validation not yet implemented). \n"))
  
  if (list_struct$n_struct != n)
    stop(paste0("The number of samples n provided to the function set_struct ",
                "is not consistent with X.\n"))
  
  if (list_struct$p_struct != length(bool_rmvd_x))
    stop(paste0("The number of candidate predictors p provided to the function set_struct ",
                "is not consistent with X.\n"))
  
  p <- length(bool_rmvd_x)
  
  if(!(is.factor(list_struct$vec_fac_st) && length(list_struct$vec_fac_st) == p))
    stop(paste0("list_struct$vec_fac_st must be a non-empty a factor of length ", p, "."))
  
  if(link != "identity" | !is.null(q) | !is.null(list_groups))
    stop("Structured sparse priors enabled only for identity link, Z = NULL and with no group selection. Exit.")
  
  vec_fac_st <- list_struct$vec_fac_st[!bool_rmvd_x] # some blocks may disappear here, but this is not a problem
  
  # in case a block was removed due to the above because of bool_rmvd_x
  n_gr <- sum(table(vec_fac_st) > 0)
  if(list_struct$n_cpus > n_gr) n_cpus <- n_gr
  else n_cpus <- list_struct$n_cpus
  
  create_named_list_(n_cpus, vec_fac_st)
  
}




#' Gather settings for application of the `locus` function with structured
#' sparse prior.
#'
#' [FUNCTIONALITY UNDER ACTIVE DEVELOPMENT, PERFORMANCE (CPU TIME) NOT OPTIMIZED].
#' Posterior probabilities of associations are computed using an empirical
#' covariance estimate of the candidate predictors. This estimate has a block
#' structure (which could reflect linkage disequilibrium patterns when
#' considering genome-wide associations). Such a structure is necessary in
#' large problems for tractability both time- and memory-wise. The posterior
#' probablity of inclusion corresponding to a given block are approximated by a
#' multivariate distribution through a Bernoulli-probit link function.
#'
#' @param n Number of samples.
#' @param p Number of candidate predictors.
#' @param pos_st Vector gathering the predictor block positions (first index of
#'   each block). The predictors must be ordered by blocks.
#' @param n_cpus Number of CPUs to be used. Only used if \code{hyper} is 
#'   \code{FALSE}, otherwise set it to 1. If large, one should ensure that 
#'   enough RAM will be available for parallel execution. Set to 1 for serial 
#'   execution.
#' @param verbose If \code{TRUE}, messages are displayed when calling
#'   \code{set_struct}.
#'
#' @return An object of class "\code{struct}" preparing the settings for group
#'   selection in a form that can be passed to the \code{\link{locus}}
#'   function.
#'
#' @examples
#' seed <- 123; set.seed(seed)
#'
#' ###################
#' ## Simulate data ##
#' ###################
#'
#' ## Example using small problem sizes:
#' ##
#' n <- 200; p <- 300; p0 <- 100; d <- 50; d0 <- 40
#'
#' ## Candidate predictors (subject to selection)
#' ##
#' # Here we simulate common genetic variants (but any type of candidate
#' # predictors can be supplied).
#' # 0 = homozygous, major allele, 1 = heterozygous, 2 = homozygous, minor allele
#' #
#' X_act <- matrix(rbinom(n * p0, size = 2, p = 0.25), nrow = n)
#' X_inact <- matrix(rbinom(n * (p - p0), size = 2, p = 0.25), nrow = n)
#'
#' shuff_x_ind <- sample(p)
#' X <- cbind(X_act, X_inact)[, shuff_x_ind]
#'
#' bool_x_act <- shuff_x_ind <= p0
#'
#' pat_act <- beta <- matrix(0, nrow = p0, ncol = d0)
#' pat_act[sample(p0*d0, floor(p0*d0/5))] <- 1
#' beta[as.logical(pat_act)] <-  rnorm(sum(pat_act))
#'
#' ## Gaussian responses
#' ##
#' Y_act <- matrix(rnorm(n * d0, mean = X_act %*% beta, sd = 0.5), nrow = n)
#' Y_inact <- matrix(rnorm(n * (d - d0), sd = 0.5), nrow = n)
#' shuff_y_ind <- sample(d)
#' Y <- cbind(Y_act, Y_inact)[, shuff_y_ind]
#'
#' ########################
#' ## Infer associations ##
#' ########################
#'
#' n_st <- 100
#' pos_st <- seq(1, p, by = ceiling(p/n_st))
#' list_struct <- set_struct(n, p, pos_st, n_cpus = 1)
#'
#' vb <- locus(Y = Y, X = X, p0_av = p0, link = "identity",
#'    list_struct = list_struct, user_seed = seed)
#'
#' @seealso \code{\link{locus}}
#'
#' @export
#'
set_struct <- function(n, p, pos_st, n_cpus, verbose = TRUE) {
  
  check_structure_(n, "vector", "numeric", 1)
  check_natural_(n)
  
  check_structure_(p, "vector", "numeric", 1)
  check_natural_(p)
  
  check_structure_(verbose, "vector", "logical", 1)
  
  check_structure_(n_cpus, "vector", "numeric", 1)
  check_natural_(n_cpus)
  
  check_structure_(pos_st, "vector", "numeric")
  check_natural_(pos_st)
  
  if (any(pos_st < 1) | any(pos_st > p))
    stop("The positions provided in pos_st must range between 1 and total number of variables in X, p.")
  
  if (any(duplicated(pos_st)))
    stop("The positions provided in pos_st must be unique.")
  
  if (any(pos_st != cummax(pos_st)))
    stop("The positions provided in pos_st must be monotonically increasing.")
  
  vec_fac_st <- as.factor(cumsum(seq_along(1:p) %in% pos_st))
  
  n_gr <- length(unique(vec_fac_st))
  
  if (length(unique(vec_fac_st)) == p)
    stop(paste0("All the blocks are of size one, no structured selection or blockwise estimation will be performed. ",
                "Set argument list_struct to NULL in the locus function."))
  
  # Should not be needed as regularization performed anyway.
  # if (max(table(vec_fac_st)) >= n)
  #   stop(paste0("One or more block size(s) is greater or equal to n.  ",
  #                 "Corresponding empirical covariances not positive definite. Use smaller block sizes."))
  
  if (max(table(vec_fac_st)) >= n/2)
    warning(paste0("One or more block size(s) is greater or equal to n/2.  ",
                   "Corresponding empirical covariances may not be positive definite. ",
                   "Regularization will be used but this may affect the quality of inference."))
  
  if (p / length(pos_st) > 500)
    warning(paste0("The provided number of blocks may be too small for tractable ",
                   "inference. If possible, use more blocks."))
  
  if (n_cpus > 1) {
    
    n_cpus_avail <- parallel::detectCores()
    if (n_cpus > n_cpus_avail) {
      n_cpus <- n_cpus_avail
      warning(paste0("The number of CPUs specified exceeds the number of CPUs ",
                     "available on the machine. The latter has been used instead."))
    }
    
    if (n_cpus > n_gr){
      message <- paste0("The number of cpus in use is at most equal to the number of blocks.",
                        "n_cpus is therefore set to ", n_gr, ". \n")
      if(verbose) cat(message)
      else warning(message)
      n_cpus <- n_gr
    }
    
    if (verbose) print(paste0("Number of blocks: ", length(unique(vec_fac_st)),
                              ", number of CPUs: ", n_cpus))
    
  }
  
  n_struct <- n
  p_struct <- p
  
  list_struct <- create_named_list_(n_struct, p_struct, n_cpus, vec_fac_st)
  
  class(list_struct) <- "struct"
  
  list_struct
}
hruffieux/locus documentation built on Jan. 10, 2024, 10:07 p.m.