R/BinomialModel.R

Defines functions SplitTrainTest CalculatePref PlotAUC BinomialAUC Enet RegressionResult ProcessingData BinomialModel

Documented in BinomialAUC BinomialModel CalculatePref Enet PlotAUC ProcessingData RegressionResult SplitTrainTest

#' Binomial Model Construction
#'
#' @description
#' Constructs and evaluates binomial logistic regression models using Lasso and Ridge
#' regularization. Processes input data, scales features if specified, splits data into
#' training/testing sets, and fits both Lasso and Ridge models. Optionally generates
#' AUC plots for model evaluation.
#'
#' @param x A data frame containing sample ID and features. First column must be sample ID.
#' @param y A data frame where first column is sample ID and second column is outcome
#'   (numeric or factor).
#' @param seed Integer for random seed. Default is `123456`.
#' @param scale Logical indicating whether to scale features. Default is `TRUE`.
#' @param train_ratio Numeric between 0 and 1 for training proportion. Default is `0.7`.
#' @param nfold Integer for cross-validation folds. Default is `10`.
#' @param plot Logical indicating whether to generate AUC plots. Default is `TRUE`.
#' @param cols Optional color vector for ROC curves. Default is `NULL`.
#' @param palette Character string for color palette. Default is `"jama"`.
#'
#' @return List containing:
#' \describe{
#'   \item{lasso_result}{Lasso model results}
#'   \item{ridge_result}{Ridge model results}
#'   \item{train.x}{Training data with IDs}
#' }
#'
#' @export
#' @author Dongqiang Zeng
#'
#' @examples
#' set.seed(123)
#' x <- data.frame(
#'   ID = paste0("Sample", 1:50),
#'   Feature1 = rnorm(50),
#'   Feature2 = rnorm(50),
#'   Feature3 = rnorm(50)
#' )
#' y <- data.frame(
#'   ID = x$ID,
#'   Outcome = factor(rbinom(50, 1, 0.5))
#' )
#' result <- BinomialModel(x = x, y = y, plot = FALSE, nfold = 5)
#' str(result, max.level = 1)
BinomialModel <- function(x, y, seed = 123456, scale = TRUE, train_ratio = 0.7,
                          nfold = 10, plot = TRUE, palette = "jama", cols = NULL) {
  rlang::check_installed("glmnet")

  x <- as.data.frame(x)
  y <- as.data.frame(y)

  cli::cli_alert_info("Processing data")
  processdat <- ProcessingData(x = x, y = y, scale = scale, type = "binomial")
  x_scale <- processdat$x_scale
  y <- processdat$y
  x_ID <- processdat$x_ID

  cli::cli_alert_info("Splitting data into training and test sets")
  train_test <- SplitTrainTest(
    x = x_scale, y = y, train_ratio = train_ratio,
    type = "binomial", seed = seed
  )
  train.x <- train_test$train.x
  train.y <- train_test$train.y
  test.x <- train_test$test.x
  test.y <- train_test$test.y
  train_sample <- train_test$train_sample
  return.x <- data.frame(ID = x_ID[train_sample], train.x)

  cli::cli_alert_info("Running LASSO")
  set.seed(seed)
  lasso_model <- glmnet::cv.glmnet(
    x = train.x, y = train.y, family = "binomial",
    type.measure = "class", alpha = 1, nfolds = nfold
  )
  lasso_result <- RegressionResult(
    train.x = train.x, train.y = train.y,
    test.x = test.x, test.y = test.y, model = lasso_model
  )

  if (plot) {
    p1 <- PlotAUC(
      train.x = train.x, train.y = train.y,
      test.x = test.x, test.y = test.y, model = lasso_model,
      cols = cols, palette = palette, modelname = "lasso_model"
    )
    if (interactive()) print(p1)
  }

  cli::cli_alert_info("Running RIDGE REGRESSION")
  set.seed(seed)
  ridge_model <- glmnet::cv.glmnet(
    x = train.x, y = train.y, family = "binomial",
    type.measure = "class", alpha = 0, nfolds = nfold
  )
  ridge_result <- RegressionResult(
    train.x = train.x, train.y = train.y,
    test.x = test.x, test.y = test.y, model = ridge_model
  )

  if (plot) {
    p2 <- PlotAUC(
      train.x = train.x, train.y = train.y,
      test.x = test.x, test.y = test.y, model = ridge_model,
      cols = cols, palette = palette, modelname = "ridge_model"
    )
    if (interactive()) print(p2)
  }

  cli::cli_alert_success("Model fitting complete")

  list(
    lasso_result = lasso_result, ridge_result = ridge_result, train.x = return.x,
    plots = if (plot) list(lasso = p1, ridge = p2) else NULL
  )
}

#' Process Data for Model Construction
#'
#' @description
#' Preprocesses data for binomial or survival analysis. Aligns and filters data
#' based on sample IDs, optionally scales data, and ensures appropriate data types.
#' Handles missing values by removing columns with NA values.
#'
#' @param x Data frame of predictors with first column as IDs.
#' @param y Data frame of outcomes with first column as IDs. For survival,
#'   expects two additional columns for time and status.
#' @param scale Logical indicating whether to scale predictors.
#' @param type Character string: `"binomial"` or `"survival"`.
#'
#' @return List containing:
#' \describe{
#'   \item{x_scale}{Processed predictor matrix}
#'   \item{y}{Processed outcome variable}
#'   \item{x_ID}{Sample IDs}
#' }
#'
#' @export
#'
#' @examples
#' x <- data.frame(ID = 1:10, predictor1 = rnorm(10), predictor2 = rnorm(10))
#' y <- data.frame(ID = 1:10, outcome = sample(c(0, 1), 10, replace = TRUE))
#' result <- ProcessingData(x, y, scale = TRUE, type = "binomial")
ProcessingData <- function(x, y, scale, type = c("binomial", "survival")) {
  type <- rlang::arg_match(type)

  if (!is.data.frame(x)) {
    cli::cli_abort("{.arg x} must be a data frame")
  }
  if (!is.data.frame(y)) {
    cli::cli_abort("{.arg y} must be a data frame")
  }
  if (ncol(x) < 2) {
    cli::cli_abort("{.arg x} must have at least 2 columns (ID + predictors)")
  }

  colnames(x)[1] <- "ID"
  colnames(y)[1] <- "ID"
  x$ID <- as.character(x$ID)
  y$ID <- as.character(y$ID)

  if (type == "survival") {
    if (ncol(y) < 3) {
      cli::cli_abort("{.arg y} must have at least 3 columns for survival analysis (ID, time, status)")
    }
    colnames(y) <- c("ID", "time", "status")
    y <- dplyr::filter(y, .data$time > 0)
  }

  samples <- intersect(x$ID, y$ID)
  if (length(samples) == 0) {
    cli::cli_abort("No matching sample IDs found between x and y")
  }

  x <- x[match(samples, x$ID), ]
  y <- y[match(samples, y$ID), ]

  if (type == "binomial") {
    colnames(y) <- c("ID", "Group")
    y <- dplyr::pull(y, .data$Group)
    if (!is.factor(y)) {
      cli::cli_alert_info("Converting outcome to factor")
      y <- as.factor(y)
    }
  } else if (type == "survival") {
    y <- y[, c("time", "status")]
  }

  x_scale <- if (scale) {
    scale(x[, -1, drop = FALSE], center = TRUE, scale = TRUE)
  } else {
    as.matrix(x[, -1, drop = FALSE])
  }

  x_ID <- x[, "ID"]
  na_cols <- which(colSums(is.na(x_scale)) > 0)
  if (length(na_cols) > 0) {
    cli::cli_warn("Removing {length(na_cols)} column(s) with NA values")
    x_scale <- x_scale[, -na_cols, drop = FALSE]
  }

  list(x_scale = x_scale, y = y, x_ID = x_ID)
}

#' Regression Result Computation
#'
#' @description
#' Computes regression results with coefficients at lambda.min and lambda.1se,
#' and evaluates AUC for binomial outcomes. Returns a comprehensive summary
#' of model performance on both training and testing datasets.
#'
#' @param train.x Training predictors matrix.
#' @param train.y Training outcomes (binary factor).
#' @param test.x Testing predictors matrix.
#' @param test.y Testing outcomes (binary factor).
#' @param model Fitted cv.glmnet model object.
#'
#' @return List containing:
#' \describe{
#'   \item{model}{The fitted cv.glmnet model}
#'   \item{coefs}{Data frame with feature names and coefficients at lambda.min and lambda.1se}
#'   \item{AUC}{Matrix of AUC values for train/test sets at both lambda values}
#' }
#'
#' @export
#'
#' @examples
#' if (requireNamespace("glmnet", quietly = TRUE)) {
#'   set.seed(123)
#'   train_data <- matrix(rnorm(100 * 10), ncol = 10)
#'   train_outcome <- rbinom(100, 1, 0.5)
#'   test_data <- matrix(rnorm(50 * 10), ncol = 10)
#'   test_outcome <- rbinom(50, 1, 0.5)
#'   fitted_model <- glmnet::cv.glmnet(train_data, train_outcome, family = "binomial", nfolds = 5)
#'   results <- RegressionResult(
#'     train.x = train_data, train.y = train_outcome,
#'     test.x = test_data, test.y = test_outcome, model = fitted_model
#'   )
#' }
RegressionResult <- function(train.x, train.y, test.x, test.y, model) {
  if (!is.matrix(train.x) || !is.matrix(test.x)) {
    cli::cli_abort("train.x and test.x must be matrices")
  }

  coefs <- cbind(
    stats::coef(model, s = model$lambda.min),
    stats::coef(model, s = model$lambda.1se)
  )
  coefs <- data.frame(
    feature = rownames(coefs),
    lambda.min = coefs[, 1],
    lambda.1se = coefs[, 2]
  )

  datasets <- list(
    list(data = train.x, outcome = train.y, lambda = model$lambda.min),
    list(data = train.x, outcome = train.y, lambda = model$lambda.1se),
    list(data = test.x, outcome = test.y, lambda = model$lambda.min),
    list(data = test.x, outcome = test.y, lambda = model$lambda.1se)
  )

  aucs <- vapply(datasets, function(d) {
    BinomialAUC(model, d$data, d$lambda, d$outcome)
  }, numeric(1))

  AUC <- matrix(aucs,
    ncol = 2, byrow = TRUE,
    dimnames = list(c("train", "test"), c("lambda.min", "lambda.1se"))
  )

  list(model = model, coefs = coefs, AUC = AUC)
}

#' Elastic Net Model Fitting
#'
#' @description
#' Fits elastic net model with cross-validation to find optimal alpha and lambda.
#' Searches across a grid of alpha values (0 to 1) and lambda values to minimize
#' cross-validation error.
#'
#' @param train.x Training predictors matrix.
#' @param train.y Training binary outcomes (0/1 or factor).
#' @param lambdamax Maximum lambda value for the grid search.
#' @param nfold Number of CV folds. Default is `10`.
#'
#' @return List containing:
#' \describe{
#'   \item{chose_alpha}{Optimal alpha value (0-1)}
#'   \item{chose_lambda}{Optimal lambda value}
#' }
#'
#' @export
#'
#' @examples
#' if (requireNamespace("glmnet", quietly = TRUE)) {
#'   set.seed(123)
#'   train_data <- matrix(rnorm(50 * 5), ncol = 5)
#'   train_outcome <- rbinom(50, 1, 0.5)
#'   result <- Enet(train.x = train_data, train.y = train_outcome, lambdamax = 1, nfold = 5)
#' }
Enet <- function(train.x, train.y, lambdamax, nfold = 10) {
  train.x <- as.matrix(train.x)
  train.y <- as.numeric(train.y)

  if (nrow(train.x) != length(train.y)) {
    cli::cli_abort("Number of rows in train.x ({nrow(train.x)}) must equal length of train.y ({length(train.y)})")
  }
  if (lambdamax <= 0) {
    cli::cli_abort("{.arg lambdamax} must be greater than 0, got {lambdamax}")
  }
  if (nfold < 2) {
    cli::cli_abort("{.arg nfold} must be at least 2, got {nfold}")
  }

  alpha.grid <- seq(0, 1, by = 0.2)
  lambda.grid <- seq(1e-4, lambdamax, length.out = 10)

  res <- lapply(alpha.grid, function(a) {
    fit <- glmnet::cv.glmnet(
      x = train.x, y = train.y, family = "binomial",
      alpha = a, lambda = lambda.grid, nfolds = nfold, type.measure = "class"
    )
    data.frame(alpha = a, lambda = fit$lambda.min, cv_error = min(fit$cvm))
  })

  res <- do.call(rbind, res)
  best <- res[which.min(res$cv_error), ]

  list(chose_alpha = best$alpha, chose_lambda = best$lambda)
}

#' Calculate AUC for Binomial Model
#'
#' @description
#' Computes Area Under the ROC Curve (AUC) for model predictions using the
#' ROCR package. Handles binary classification models from glmnet.
#'
#' @param model Fitted glmnet model object.
#' @param newx New data matrix for prediction.
#' @param s Lambda value for prediction (e.g., "lambda.min" or numeric).
#' @param acture.y Actual binary outcomes (numeric 0/1 or factor).
#'
#' @return Numeric AUC value between 0 and 1.
#'
#' @export
#'
#' @examples
#' if (requireNamespace("glmnet", quietly = TRUE) &&
#'   requireNamespace("ROCR", quietly = TRUE)) {
#'   set.seed(123)
#'   train_data <- matrix(rnorm(100 * 5), ncol = 5)
#'   train_outcome <- rbinom(100, 1, 0.5)
#'   test_data <- matrix(rnorm(50 * 5), ncol = 5)
#'   test_outcome <- rbinom(50, 1, 0.5)
#'   fitted_model <- glmnet::cv.glmnet(train_data, train_outcome, family = "binomial", nfolds = 5)
#'   auc_value <- BinomialAUC(fitted_model, test_data, fitted_model$lambda.min, test_outcome)
#'   print(auc_value)
#' }
BinomialAUC <- function(model, newx, s, acture.y) {
  rlang::check_installed("ROCR")
  prob <- stats::predict(model, newx = newx, s = s, type = "response")
  pred <- ROCR::prediction(prob, acture.y)
  as.numeric(ROCR::performance(pred, "auc")@y.values)
}

#' Plot AUC ROC Curves
#'
#' @description
#' Generates ROC curves for model evaluation comparing training and testing
#' performance at both lambda.min and lambda.1se. Creates a ggplot visualization
#' with AUC values in the legend.
#'
#' @param train.x Training predictors matrix.
#' @param train.y Training outcomes (binary factor).
#' @param test.x Testing predictors matrix.
#' @param test.y Testing outcomes (binary factor).
#' @param model Fitted cv.glmnet model.
#' @param modelname Character string for plot title.
#' @param cols Optional color vector for ROC curves.
#' @param palette Color palette name from IOBR palettes. Default is `"jama"`.
#'
#' @return ggplot object of ROC curves.
#'
#' @export
#'
#' @examples
#' if (requireNamespace("glmnet", quietly = TRUE)) {
#'   set.seed(123)
#'   train_data <- matrix(rnorm(100 * 5), ncol = 5)
#'   train_outcome <- rbinom(100, 1, 0.5)
#'   test_data <- matrix(rnorm(50 * 5), ncol = 5)
#'   test_outcome <- rbinom(50, 1, 0.5)
#'   fitted_model <- glmnet::cv.glmnet(train_data, train_outcome, family = "binomial", nfolds = 5)
#'   p <- PlotAUC(train_data, train_outcome, test_data, test_outcome, fitted_model, "MyModel")
#'   print(p)
#' }
PlotAUC <- function(train.x, train.y, test.x, test.y, model, modelname,
                    cols = NULL, palette = "jama") {
  cols <- cols %||% palettes(
    category = "box", palette = palette,
    show_message = FALSE, show_col = FALSE
  )

  datasets <- list(
    list(data = train.x, outcome = train.y, lambda = model$lambda.min),
    list(data = train.x, outcome = train.y, lambda = model$lambda.1se),
    list(data = test.x, outcome = test.y, lambda = model$lambda.min),
    list(data = test.x, outcome = test.y, lambda = model$lambda.1se)
  )

  pref <- lapply(datasets, function(d) {
    CalculatePref(model, d$data, d$lambda, d$outcome)
  })

  aucs <- round(vapply(datasets, function(d) {
    BinomialAUC(model, d$data, d$lambda, d$outcome)
  }, numeric(1)), 2)

  legend.name <- paste(
    c("train_lambda.min", "train_lambda.1se", "test_lambda.min", "test_lambda.1se"),
    "AUC", aucs,
    sep = " "
  )
  names(pref) <- c("train_lambda.min", "train_lambda.1se", "test_lambda.min", "test_lambda.1se")

  plotdat <- do.call(rbind, lapply(names(pref), function(nm) {
    data.frame(
      s = nm,
      x = pref[[nm]]@x.values[[1]],
      y = pref[[nm]]@y.values[[1]]
    )
  }))

  plotdat$s <- factor(plotdat$s, levels = names(pref))

  ggplot2::ggplot(plotdat, ggplot2::aes(x = .data$x, y = .data$y, color = .data$s)) +
    ggplot2::geom_path(linewidth = 1) +
    ggplot2::geom_abline(intercept = 0, slope = 1, linetype = "dashed") +
    ggplot2::xlab("False positive rate") +
    ggplot2::ylab("True positive rate") +
    ggplot2::theme_bw() +
    ggplot2::scale_color_manual(values = cols, labels = legend.name) +
    ggplot2::ggtitle(stringr::str_replace(modelname, "_", " ")) +
    ggplot2::theme(
      legend.title = ggplot2::element_blank(),
      plot.title = ggplot2::element_text(size = ggplot2::rel(2), hjust = 0.5),
      axis.text.x = ggplot2::element_text(face = "plain", angle = 0, hjust = 1, color = "black"),
      axis.text.y = ggplot2::element_text(face = "plain", angle = 0, hjust = 1, color = "black")
    )
}

#' Calculate Performance Metrics
#'
#' @description
#' Computes True Positive Rate (TPR) and False Positive Rate (FPR) for ROC
#' analysis using the ROCR package. Used internally for ROC curve generation.
#'
#' @param model Fitted glmnet model.
#' @param newx New data matrix for prediction.
#' @param s Lambda value for prediction.
#' @param acture.y Actual binary outcomes.
#'
#' @return ROCR performance object containing TPR and FPR values.
#'
#' @export
#'
#' @examples
#' if (requireNamespace("glmnet", quietly = TRUE) &&
#'   requireNamespace("ROCR", quietly = TRUE)) {
#'   fitted_model <- glmnet::cv.glmnet(matrix(rnorm(100), ncol = 2), rbinom(50, 1, 0.5), nfolds = 3)
#'   perf <- CalculatePref(fitted_model, matrix(rnorm(20), ncol = 2), "lambda.min", rbinom(10, 1, 0.5))
#' }
CalculatePref <- function(model, newx, s, acture.y) {
  rlang::check_installed("ROCR")
  prob <- stats::predict(model, newx = newx, s = s, type = "response")
  pred <- ROCR::prediction(prob, acture.y)
  ROCR::performance(pred, "tpr", "fpr")
}

#' Split Data into Training and Testing Sets
#'
#' @description
#' Divides dataset into training and testing sets using random sampling.
#' Maintains data integrity for both binomial and survival analysis types.
#'
#' @param x Predictor matrix or data frame.
#' @param y Outcome vector (binomial) or matrix with time/status (survival).
#' @param train_ratio Proportion for training (0-1). Default is `0.7`.
#' @param type Analysis type: `"binomial"` or `"survival"`.
#' @param seed Random seed for reproducibility.
#'
#' @return List containing:
#' \describe{
#'   \item{train.x}{Training predictors matrix}
#'   \item{train.y}{Training outcomes}
#'   \item{test.x}{Testing predictors matrix}
#'   \item{test.y}{Testing outcomes}
#'   \item{train_sample}{Indices of training samples}
#' }
#'
#' @export
#'
#' @examples
#' data_matrix <- matrix(rnorm(200), ncol = 2)
#' outcome_vector <- rbinom(100, 1, 0.5)
#' split_data <- SplitTrainTest(
#'   data_matrix, outcome_vector,
#'   train_ratio = 0.7,
#'   type = "binomial", seed = 123
#' )
SplitTrainTest <- function(x, y, train_ratio, type = c("binomial", "survival"), seed) {
  type <- rlang::arg_match(type)

  if (train_ratio <= 0 || train_ratio >= 1) {
    cli::cli_abort("{.arg train_ratio} must be between 0 and 1, got {train_ratio}")
  }

  sizes <- round(nrow(x) * train_ratio)
  set.seed(seed)
  train_sample <- sample(seq_len(nrow(x)), size = sizes, replace = FALSE)
  test_sample <- setdiff(seq_len(nrow(x)), train_sample)

  train.x <- x[train_sample, , drop = FALSE]
  test.x <- x[test_sample, , drop = FALSE]

  if (type == "binomial") {
    train.y <- y[train_sample]
    test.y <- y[test_sample]
  } else if (type == "survival") {
    train.y <- y[train_sample, , drop = FALSE]
    test.y <- y[test_sample, , drop = FALSE]
  }

  list(
    train.x = train.x, train.y = train.y, test.x = test.x, test.y = test.y,
    train_sample = train_sample
  )
}

Try the IOBR package in your browser

Any scripts or data that you put into this service are public.

IOBR documentation built on May 30, 2026, 5:07 p.m.