R/Wtsmth_Fit.R

Defines functions fit_WTSMTH

Documented in fit_WTSMTH

#' Penalized Regression with Lasso and Weighted Fusion Penalties with Given Parameters
#'
#' Performs penalized regression with Lasso penalty and weighted fusion penalty
#' for a given pair of tuning parameters (lambda1 and lambda2), which is 
#' determined by the user based on prior knowledge or use any number just for 
#' testing purpose.
#' 
#' @param data An object of class "WTsmth.data" as generated by prep()
#' @param lambda1 A scalar numeric. Lambda_1 value to be considered. Provided
#'   value will be transformed to 2^(lambda1).
#' @param lambda2 A scalar numeric Lambda_2 value to be considered. Provided
#'   value will be transformed to 2^(lambda2).
#' @param weight A character. The type of weighting. Must be one of 
#'   eql, keql, wcs, kwcs, wif, kwif indicating
#'   equal weight, K x equal weight, Cosine similarity, K x cosine similarity, 
#'   inverse frequency, and K x inverse frequency, where K is the number of
#'   individuals in each CNV-active region. 
#'   `eql` and `keql` gives equal weight to adjacent CNVs.
#'    `wcs` and `kwcs` allow similar CNV fragments to have more similar effect size. 
#'    `wif` and `kwif` will encourage CNV with lower frequency to borrow 
#'   information from nearby more frequent CNV fragments.
#'   Considering that CNVs usually present in some CNV-active regions and there are
#'    large regions in between with no CNV at all. K will describe the number of individuals 
#'    having any CNV activities in a CNV-active region, and varying the weight according 
#'    to the sample size across regions. 
#' @param family A character. The family of the outcome. Must be one of
#'   "gaussian" (Y is continuous) or "binomial" (Y is binary).
#' @param iter.control A list object. Allows user to control iterative
#'   update procedure. Allowed elements are "max.iter", the maximum number
#'   of iterations; "tol.beta", the difference between consecutive beta
#'   updates below which the procedure is deemed converged; and "tol.loss",
#'   the difference in consecutive loss updates below which the procedure
#'   is deemed converged.
#' @param ... Ignored.
#'
#' 
#' @returns A numeric vector. The estimated model parameters
#' 
#'
#' @include ctnsSolution.R helpful_tests.R rwlsSolution.R utils.R
#'
#' @export
#' 
#' @examples
#' # Note we use here a very small example data set to expedite examples. 
#' 
#' # load toy dataset
#' data("CNVCOVY")
#' 
#' # prepare data format for regression analysis
#' 
#' ## Continuous outcome Y_QT
#' frag_data <- prep(CNV = CNV, Y = Y_QT, Z = Cov, rare.out = 0.05)
#' QT_fit <- fit_WTSMTH(frag_data, 
#'                      lambda1 = -5, 
#'                      lambda2 = 21, 
#'                      weight = "eql", 
#'                      family = "gaussian")
#'                         
#' ## Binary outcome Y_BT
#'
#' # We can directly replace frag_data$Y with Y_BT in the correct format,
#' # ensuring that the ordering matches that of the prepared object.
#' 
#' rownames(Y_BT) <- Y_BT$ID
#' frag_data$Y <- Y_BT[names(frag_data$Y), "Y"] |> drop()
#' names(frag_data$Y) <- rownames(frag_data$Z) 
#'
#' # Or, we can also repeat the prep() call
#' # frag_data <- prep(CNV = CNV, Y = Y_BT, Z = Cov, rare.out = 0.05)
#'
#' BT_fit <- fit_WTSMTH(frag_data, 
#'                         lambda1 = -5, 
#'                         lambda2 = 6, 
#'                         weight = "eql",
#'                         family = "binomial")
fit_WTSMTH <- function(data, lambda1, lambda2, weight = NULL,
                       family = c("gaussian", "binomial"), 
                       iter.control = list(max.iter = 8L, 
                                           tol.beta = 10^(-3), 
                                           tol.loss = 10^(-6)),
                       ...) {
  
  extras <- list(...)
  
  # take the first value as default
  family <- match.arg(family)
  
  stopifnot(
    "`data` must be a 'WTsmth.data' object" = !missing(data) && inherits(data, "WTsmth.data"),
    "`lambda1 must be a numeric vector" = !missing(lambda1) && .isNumericVector(lambda1, 1L),
    "`lambda2 must be a numeric vector" = !missing(lambda2) && .isNumericVector(lambda2, 1L),
    "`weight` must be one of eql, keql, wcs, kwcs, wif, kwif" =
      is.null(weight) || {.isCharacterVector(weight, 1L) &&
          weight %in% c("eql", "keql", "wcs", "kwcs", "wif", "kwif")},
    "`iter.control` must be a list; allowed elements are max.iter, tol.beta, and tol.loss" = 
      .isNamedList(iter.control, c("max.iter", "tol.beta", "tol.loss"))
  )
  

  
  iter.control <- .testIterControl(iter.control)  
  
  #if fit one lmd1 and lmd2 pair, data is not expanded with "weight"
  if (is.null(data$XZ)) {
    if (is.null(weight)) stop("`weight` must be provided", call. = FALSE)
    data <- .expandWTsmth(data, weight = weight)
  } else {
    if (!is.null(weight)) warning("`weight` input ignored; data already expanded", call. = FALSE)
  }
  
 
  if(family == "binomial") {
    data$Y <- .confirmBinary(data$Y)
    #subset for CV
  }
  if (family == "gaussian") {
    data$Y <- .confirmContinuous(data$Y)
  }
  
  ##for updating iteratively -- mainly useful in binory outcomes
  data$XZ_update <- data$XZ
  data$Y_update <- data$Y
  
  ## for crossvalidation
  if (!is.null(extras$subset)) {
    data$Y_update <- data$Y[extras$subset]
    data$XZ_update <- data$XZ[extras$subset, , drop = FALSE]
    data$Y <- data$Y[extras$subset]
    data$XZ <- data$XZ[extras$subset, , drop = FALSE]
  }
  
  
  intercept_name <- "(Intercept)"
  while (intercept_name %in% colnames(data$XZ)) {
    intercept_name <- sample(LETTERS, 10, TRUE)
  }
  XZ_colnames <- c(intercept_name, colnames(data$XZ))
  
  
  Y_app <- rep.int(0L, nrow(data$A))
  names(Y_app) <- rownames(data$A)
  
  if (family == "gaussian") {
    
    # Update XZp1 and send to data$XZ
    data$XZ_update <- cbind(1.0, data$XZ_update)
    colnames(data$XZ_update) <- XZ_colnames
    
    # Update XZ_app
    XZ_app <- cbind(0.0, sqrt(2.0^(lambda2)) * data$A)
    rownames(XZ_app) <- rownames(data$A)
    
    
    b_coef = .ctnsSolution(data = data, X.app = XZ_app, Y.app = Y_app, lambda1 = lambda1)
    #data = data; X.app = XZ_app; Y.app = Y_app; lambda1 = lambda1
  } else {
    
    XN = nrow(data$XZ)
    AN = nrow(data$A)
    XZ_app <- cbind(0.0, sqrt(2*(XN+AN)*(2^(lambda2))) * data$A)
    #dim(XZ_app)
    rownames(XZ_app) <- rownames(data$A)
    
    b_coef = .rwlsSolution(data = data,
                  X.app = XZ_app, Y.app = Y_app, 
                  lambda1 = lambda1,
                  iter.control = iter.control)
  }
 # data = data; X.app = XZ_app; Y.app = Y_app;  lambda1 = lambda1; iter.control = iter.control
  b_intercept <- c("(Intercept)", "", "", "", "",  b_coef[1])
  
  b_cnv <- data.frame(Vnames = names(b_coef[-1]),
                       coef= b_coef[-1])
  cnvr_info <- data$CNVR.info
  cnvr_info$Vnames <- paste0(cnvr_info$deldup, cnvr_info$grid.id)
  b_cnv <- merge(cnvr_info[ ,c("Vnames", "CHR", "CNV.start", "CNV.end", "deldup")],
                 b_cnv, by = "Vnames")
  b_cnv <- b_cnv[order(b_cnv$deldup, b_cnv$CNV.start),]
  
  b_x <- cbind(names(b_coef[(1+ncol(data$design)+1):(1+ncol(data$design)+ncol(data$Z))]), matrix("", nrow=ncol(data$Z), ncol=4), b_coef[(1+ncol(data$design)+1):(1+ncol(data$design)+ncol(data$Z))])
  colnames(b_x) <- c("Vnames", "CHR", "CNV.start", "CNV.end", "deldup", "coef")
  
  b_cnv <- rbind(b_intercept, b_cnv, b_x)
  
  b_cnv <- data.frame("Vnames" = b_cnv$Vnames , 
                      "CHR" = b_cnv$CHR |> as.integer(),
                      "CNV.start" = b_cnv$CNV.start |> as.numeric(),
                      "CNV.end" = b_cnv$CNV.end |> as.numeric(),
                      "deldup" = b_cnv$deldup ,
                      "coef" = b_cnv$coef |> as.numeric())
  

}

Try the CNVreg package in your browser

Any scripts or data that you put into this service are public.

CNVreg documentation built on April 4, 2025, 12:41 a.m.