R/seq_ord_model.R

Defines functions seq_ord_model

Documented in seq_ord_model

#' @title The sequential logistic regression model for multi-classification
#'   problem under the ordinal case.
#'
#' @description \code{seq_ord_model} chooses the subjects sequentially by the
#' logistic regression model for ordinal case
#'
#' @details The \code{\link{seq_ord_model}} function and
#'   \code{\link{seq_cat_model}} function are very similar. seq_ord_model is
#'   also a multinomial logistic regression model but under the ordinal case
#'   that estimate the coefficient variables and determines the samples given
#'   the fixed size confidence set. seq_ord_model selects the sample in the same
#'   way as seq_cat_model: both are two methods. The details about the selecting
#'   methoed in seq_ord_model please refer to the \code{\link{seq_cat_model}}
#'   function.
#' @param labeled_ids A numeric vector for the unique identification of the
#'   labeled dataset
#' @param unlabeled_ids A numeric vector for the unique identification of the
#'   unlabeled dataset
#' @param splitted A list containing the datasets which we will use in the
#'   cordinl case. Note that the element of the data_split is the samples from
#'   Classes k-1and Classes k
#' @param newY A numeric number denotes the value of the labels from 0 to K
#'   which is the number of categories
#' @param train A matrix for the labeled samples. Note that the indices of the
#'   samples in the train dataset is the same as the labeled_ids
#' @param data  A matrix denotes all the data including the labeled samples and
#'   the unlabeled samples. Note that the first column of the dataset is the
#'   response variable, that's the labels and the rest is the explanatory
#'   variables.
#' @param d  A numeric number specifying the length of the fixed size confidence
#'   set for our model. The default value is 0.8.
#' @param adaptive A character string that determines the sample selection
#'   criterion to be used, matching one of 'random' or 'A_optimal'. The default
#'   value is 'random'.
#' @export
#' @return a list containing the following components
#' \item{d}{the length of the fixed size confidence set that we specify}
#' \item{n}{the current sample size when the stopping criterion is satisfied}
#' \item{is_stopped}{the label of sequential iterations stop or not. When the
#' value of is_stopped is TRUE, it means the iteration stops}
#' \item{beta_est}{the estimated coeffificent when the criterion is safisfied}
#' \item{cov}{the covariance matrix between the estimated parameters}
#' \item{adaptive}{the sample selection criterion we used }
#'
#' @references {
#' Li, J., Chen, Z., Wang, Z., & Chang, Y. I. (2020). Active learning in
#' multiple-class classification problems via individualized binary models.
#' \emph{Computational Statistics & Data Analysis}, 145, 106911.
#' doi:10.1016/j.csda.2020.106911
#' }
#'
#' @seealso{
#'    \code{\link{seq_cat_model}} for categorical case
#'
#'    \code{\link{seq_bin_model}} for binary classification case
#'
#'    \code{\link{seq_GEE_model}} for generalized estimating equations case.
#'
#'}
#'
#'
#' @examples
#' # generate the toy example
#' beta <- matrix(c(1,2,1,-1,1,2), ncol=2)
#' res <-  gen_multi_data(beta, N = 10000, type = 'ord', test_ratio = 0.3)
#' train_id <- res$train_id
#' train <- res$train
#' test <- res$test
#' res <- init_multi_data(train_id, train, init_N = 300, type = 'ord')
#' splitted <- res$splitted
#' train <- res$train
#' newY <- res$newY
#' labeled_ids <- res$labeled_ids
#' unlabeled_ids <- res$unlabeled_ids
#' data <- res$data
#'
#' # use seq_ord_model to multi-classification problem under the ordinal case.
#' # You can remove '#' to run the command.
#' # start_time <- Sys.time()
#' # logitA_ord <- seq_ord_model(labeled_ids, unlabeled_ids, splitted, newY,
#' #                             train, data, d = 0.5, adaptive = "A_optimal")
#' # logitA_ord$time <- as.numeric(Sys.time() - start_time, units = "mins")
#' # print(logitA_ord)


seq_ord_model <- function(labeled_ids, unlabeled_ids, splitted, newY, train, data, d = 0.8, adaptive="random") {
  i <- length(labeled_ids)
  model <- function(df) glm(Y ~ . -1, data = df, family = "binomial")
  models <- lapply(splitted, model)
  beta_list <- lapply(models, `[[`, "coefficients")
  p <- length(beta_list[[1]])
  beta_mat <- matrix(unlist(beta_list), nrow = p)

  while (TRUE) {
    beta_mat <- logit_model_ord(splitted, newY, beta_mat)
    cov <- getWH_ord(train, beta_mat)
    eigen_min <- cov$eigen_min
    K <- dim(beta_mat)[2]
    p <- dim(beta_mat)[1]
    Kp <- K * p
    d <- d
    is_stopped <- (eigen_min > qchisq(df=Kp, 0.95) / d^2)
    N <- nrow(data)
    if (!is_stopped && i < N) {
      if (adaptive == "random") {
        ind <- sample(1:length(unlabeled_ids), 1)
      }

      if (adaptive == "A_optimal") {
        ind <- A_optimal_ord(data[,-1], beta_mat,
                             cov$W, unlabeled_ids)
      }

      res <- update_data_ord(ind, splitted, data, train, labeled_ids, unlabeled_ids)
      splitted <- res$splitted
      newY <- res$newY
      train <- res$train
      labeled_ids <- res$labeled_ids
      unlabeled_ids <- res$unlabeled_ids

      i <- i + 1
    } else {
      results <- list(d          =  d,
                      n          =  i,
                      is_stopped = is_stopped,
                      beta_est   = beta_mat,
                      cov        =   cov$inv_sigma,
                      adaptive = adaptive)
      class(results) <- c('seqmulti','list')
      return(results)
    }
  }
}

Try the seqest package in your browser

Any scripts or data that you put into this service are public.

seqest documentation built on July 2, 2020, 2:28 a.m.