R/stepreg_240422.R
In glmnetr: Nested Cross Validation for the Relaxed Lasso and Other Machine Learning Models

Documented in best.preds stepreg summary.stepreg

################################################################################
##### stepreg_yymmdd.R #########################################################
################################################################################
#' Fit the steps of a stepwise regression.  
#'
#' @param xs_st predictor input - an n by p matrix, where n (rows) is sample size, and p (columns) 
#' the number of predictors.  Must be in matrix form for complete data, no NA's, no Inf's, etc.,
#' and not a data frame. 
#' @param start_time_st start time, Cox model only - class numeric of length same as number of patients (n)
#' @param y_st output vector: time, or stop time for Cox model, y_st 0 or 1 for binomal (logistic), numeric for gaussian. 
#' Must be a vector of length same as number of sample size. 
#' @param event_st event_st indicator, 1 for event, 0 for census, Cox model only.
#' Must be a numeric vector of length same as sample size.
#' @param steps_n number of steps done in stepwise regression fitting 
#' @param method method for choosing model in stepwise procedure, "loglik" or "concordance".
#' Other procedures use the "loglik".     
#' @param family model family, "cox", "binomial" or "gaussian" 
#' @param track 1 to output stepwise fit program, 0 (default) to suppress 
#'
#' @return does a stepwise regression of depth maximum depth steps_n
#' 
#' @seealso 
#'    \code{\link{summary.stepreg}} , \code{\link{aicreg}} , \code{\link{cv.stepreg}} , \code{\link{nested.glmnetr}}   
#' 
#' @export
#' 
#' @importFrom stats binomial glm.fit 
#'
#' @examples
#' set.seed(18306296)
#' sim.data=glmnetr.simdata(nrows=100, ncols=100, beta=c(0,1,1))
#' # this gives a more intersting case but takes longer to run
#' xs=sim.data$xs           
#' # this will work numerically
#' xs=sim.data$xs[,c(2,3,50:55)] 
#' y_=sim.data$yt  
#' event=sim.data$event
#' # for a Cox model 
#' cox.step.fit = stepreg(xs, NULL, y_, event, family="cox", steps_n=40) 
#' # ... and for a linear model 
#' y_=sim.data$yt  
#' norm.step.fit = stepreg(xs, NULL, y_, NULL, family="gaussian", steps_n=40) 
#' 
stepreg = function(xs_st, start_time_st=NULL, y_st, event_st, steps_n=0, method="loglik", family=NULL, track=0) {
  
#  if  ( is.null(start_time_st)) { print("HERE step 0 : start_time_st=NULL") }
  
#  xx1 = start_time_st[1]
#  xx2 = start_time_st[2]
#  if  (!is.null(start_time_st)) { print("HERE step 1 : start_time_st=", start_time_st[1], start_time_st[2], start_time_st[3], start_time_st[4]) }
#  if  (!is.null(start_time_st)) { print("HERE step 1 : start_time_st=", xx1, xx2) }
  
  ## xs_st or predictors - must be as matrix 
  xsk = dim(xs_st)[2]                                                              ## number of candidate predictor variables 
  nobs = dim(xs_st)[1] 
  one = c( rep(1, nobs) )
  
  if (steps_n==0) { steps_n=xsk }
  steps_n = min(steps_n, xsk)
  
  if ( is.null(family)) {family = NA}
  if ( is.na(family) & (!is.null(start_time_st)) ) { family = "cox" }
  if ( is.na(family) & (!is.null(event_st)) & is.null(y_st)) { 
    y_st = event_st 
    event_st = NULL 
    family = "binomial" 
  } else if ( is.na(family) & (!is.null(event_st     )) ) { 
    family = "cox" 
  }
  if ( is.na(family) & ( length(table(y_st)) == 2 ) ) { family = "binomial" }
  if ( is.na(family) & ( length(table(y_st)) >  2 ) ) { family = "gaussian" }
  if ( family == "poison" ) { family = "poisson" }
  if ( family %in% c("gausian", "gause", "normal") ) { family = "gaussian" }
  if ( family %in% c("cox", "Cox", "coxph") ) { family = "cox" }

  coxcontrol = survival::coxph.control() 
  glmcontrol = glm.control()

  riskvarsp =  c( rep(0, xsk)  )                                                ## iniitialize risk factors for model with no terms - presence indicator 
  
  xsnames = colnames(xs_st)
  dframe   = as.data.frame( cbind(y_st, xs_st) ) 
#  colnames(dframe)
  
  betalast = NULL 
# i_ = 1 ; j_ = 1 ;   ## for testing only 
# i_ = 2 ; j_ = 1 ; 
  ##### begin i_ loop ##########################################################
  if (track>=1) { cat ("Calculating stepwise model step : ") }
  for (i_ in 1:steps_n)  { 
#  for (i_ in 1:10)  { 
    if (track>=1) { cat ( i_, " ") } 
    for (j_ in 1:xsk) {
      if (riskvarsp[j_] == 0) {
        riskvarsnewp = as.numeric( riskvarsp ) ;                                ## initialize risk factor for new model (present indicator) from last best fit
        riskvarsnewp[j_] = 1                                                    ## add current variable to risk factors for new model fit (present indicators)
        riskvarsnewp 

        sum( riskvarsnewp )
        dim(xs_st)
        
        riskm = as.matrix( xs_st[,(riskvarsnewp==1)] )                          ## risk factors for model fit in current step 
        riskm [ 1:20,]
        dim(riskm)

        riskvarsnew = xsnames[riskvarsnewp==1]
        riskvarsnew
      
##        if ( (i_==1) & (j_ %in% c((0:20)*10+1) ) ) cat ("HERE start_time_st = ", start_time_st[1:10], "\n" )
        if (family == "cox") {
          if (is.null(betalast)) { init = rep(0,dim(riskm)[2]) 
          } else { init = betalast[ (riskvarsnewp==1) ] ; length(init) } 
          if (is.null(start_time_st)) { 
            fit1 = survival::coxph.fit(riskm ,Surv(                y_st,  event_st), strata=NULL, init=init,
                                       control=coxcontrol, method="efron", rownames=NULL, resid=FALSE)  
            clist =  concordancefit(Surv(       y_st, event_st), -fit1$linear.predictors)   
          } else {  
            init = c(rep(0,dim(riskm)[2]))
            fit1 = survival::agreg.fit(riskm ,Surv( start_time_st, y_st, event_st), strata=NULL, init=init,  
                                      control=coxcontrol, method="efron", rownames=NULL, resid=FALSE) 
            clist = concordancefit(Surv(start_time_st, y_st, event_st), -fit1$linear.predictors)     ## get concordances  --  allows extension to shose best by C instead of loglik
#           if ( (i_==1) & (j_ %in% c(0:5) ) ) cat ("HERE step 0.2" , "\n" )
          }
          if (rankMatrix(riskm) < 1e-12) { fit1$coefficients = rep(0,dim(riskm)[2]) } 
          fit1$coefficients[is.na(fit1$coefficients)] = 0                       # assign missing coefficients value 0          
          loglik = fit1$loglik 
          names(loglik) = c("loglik.null", "loglik")
          concord = c(as.numeric( clist[1] ), sqrt( as.numeric( clist[4] ) ) )    ## check this XXX
          length( concord )
          if (length( concord ) == 2) { 
            names(concord) = c("concordance", "std") 
          } else {  
            print(" HERE step 2" )  
            print(  names(clist))  
          }
        } else if ( family == "binomial" ) {
          riskm1 = cbind(one, riskm)                                                               ## c(rep(1,dim(y_st)[1]))  ## start=c(0) 
          fit1 = glm.fit(riskm1 , y_st , family = binomial(link = "logit"), control = glmcontrol)  ## , start = beta_val , weights=weights_val_j
          summary(fit1)
#          beta = fit1$coefficients 
#          beta[is.na(beta)] = 0 
#          phat = riskm1 %*% beta # fit1$coefficients
#          table(phat)
          fit1$coefficients[is.na(fit1$coefficients)] = 0                       # assing missing coefficients value 0
#          if (rankMatrix(riskm1) < 1e-12) { fit1$coefficients = rep(0,dim(riskm1)[2]) } 
#          clist = concordancefit( y_st, fit1$linear.predictors)  
#          form1 = formula( paste(" y_st ~ " , paste(riskvarsnew, collapse = " + " ) ) )  
#          fit1 = glm( form1, data=dframe, family = binomial(link = "logit"))   ## , start = beta_val , weights=weights_val_j          
          clist =  concordance( y_st ~ fit1$linear.predictors )                 ## get concordances  
          concord = c(concordance=as.numeric( clist[1] ), se=sqrt( as.numeric( clist[4] ) ) )  
          loglik = c(loglik.null=-fit1$null.deviance/2, loglik=-fit1$deviance/2)
        } else if ( family == "gaussian" ) {
          ##          riskm1 = cbind(1, riskm)                                                    ## c(rep(1,dim(y_st)[1]))  ## start=c(0) 
          ##          fit1 = glm.fit(riskm1 , y_st , family = binomial(link = "logit"), control = glmcontrol)  ## , start = beta_val , weights=weights_val_j
          ##          fit1 = glm(event_st ~ riskm1 , family=family)                              ######## your model statement here #######
          ##          ##        (var(fit1$linear.predictors) + var(fit1$residuals))/var(y_st) 
          ##          rsquare = (var(fit1$linear.predictors)                      )/var(y_st)
          ##          fit1 = glm(xs_st , event_st , family = binomial(link = "logit"), control = glmcontrol)  ## , start = beta_val , weights=weights_val_j
          ##          clist =  concordancefit(Surv(  c(rep(1, length(y_st))) , y_st), -fit1$linear.predictors)     ## get concordances  --  allows extension to shose best by C instead of loglik
          ##          loglik = -fit1$deviance/2
          fit1 = lm(y_st ~ riskm)         ###### and in other calls like this ###### , weights=weights_val
          fit1$coefficients[is.na(fit1$coefficients)] = 0                       # assing missing coefficients value 0
          n = length(y_st) 
          kp1 = length(fit1$coefficients) 
          rsquare = var(fit1$fitted.values)/var(y_st) 
          rsquareadj = 1 - ((1-rsquare)*(n-1)/(n-kp1)) 
          rsquare = c(rsquare=rsquare, rsquareadj=rsquareadj)
##          names(fit1)
##          summary(fit1)      
          S2 = var(fit1$residuals)*(n-1)/n
          loglik      = -(n/2)*log(2*pi) - n*log(sqrt(S2)) -(1/(2*S2))*n*S2 
          S2 = var(y_st)*(n-1)/n
          loglik.null = -(n/2)*log(2*pi) - n*log(sqrt(S2)) -(1/(2*S2))*n*S2 
          loglik = c(loglik.null=loglik.null, loglik=loglik)
##          logLik(fit1)
        }  
    
        if (family == "cox") {
          modsum_ = c(ii=i_, jj=j_, best=0, loglik, concord ) 
          modsumt = data.frame(t(modsum_), t(riskvarsnewp),  t(rep(0, xsk)) )     ## first set up data row with place holders for coefficients 
          colnames(modsumt)[(xsk+8):(2*xsk+7)] = xsnames
        } else if (family == "binomial") { 
          modsum_ = c(ii=i_, jj=j_, best=0, loglik, concord ) 
          modsum_
          modsumt = data.frame(t(modsum_), t(riskvarsnewp),  t(rep(0, xsk+1)) )     ## first set up data row with place holders for coefficients 
          colnames(modsumt)[(xsk+8):(2*xsk+8)] = c("Int",xsnames)
        } else if (family == "gaussian") { 
#          modsum_ = c(ii=i_, jj=j_, best=0, loglik.null=loglik.null, loglik=loglik , rsquare=rsquare, rsquareadj=rsquareadj ) 
#          modsum_ = c(ii=i_, jj=j_, best=0, loglik, rsquare=rsquare, rsquareadj=rsquareadj ) 
          modsum_ = c(ii=i_, jj=j_, best=0, loglik, rsquare ) 
          modsum_
          modsumt = data.frame(t(modsum_), t(riskvarsnewp),  t(rep(0, xsk+1)) )     ## first set up data row with place holders for coefficients 
          colnames(modsumt)[(xsk+8):(2*xsk+8)] = c("Int",xsnames)
        }

        ## store model summary for one model fit in this temporary object

        dim(modsumt)                                                 
        modsumt 
        
#        if (rankMatrix(riskm) < 1e-12) { fit1$coefficients = rep(0,dim(riskm)[2]) } 
        fit1$coefficients 
        
        ## save model coefficients in model summary matrix 
#        riskvarsnew = xsnames [riskvarsnewp==1]
        riskvarsnew        
        coefsp = as.numeric( (xsnames %in% riskvarsnew ) )                      ## present(indicators)
        coefsp                             ## same as riskvarsnewp
        coefsi = c(1:xsk)[ (coefsp==1) ]                                        ## numerical index
        coefsi
        if (family == "cox") {
          coefsw = coefsi + xsk + 7                                             ## where (column) in the model summary matrix
          coefsw
          modsumt[ coefsw ] = fit1$coefficients                                 ## save coefficient values to MODel SUMmary 
          modsumt 
        } else {
          coefsw = c(1,coefsi+1) + xsk + 7                                      ## where (column) in the model summary matrix
          coefsw
          modsumt[ coefsw ] = fit1$coefficients                                 ## save coefficient values to MODel SUMmary 
          modsumt 
        }
        ## end i_, j_ coefficient calculation (not loop)  
      } else {
        if (family == "cox") {
          modsumt = data.frame( ii=i_, jj=j_, best=0, modsumlast[ 4:(2*xsk+7) ] )
          colnames(modsumt)[(xsk+8):(2*xsk+7)] = xsnames
        } else {
          modsumt = data.frame( ii=i_, jj=j_, best=0, modsumlast[ 4:(2*xsk+8) ] )
          colnames(modsumt)[(xsk+8):(2*xsk+8)] = c("Int",xsnames)
        }
      } 

      if ((i_==1) & (j_==1)) { 
        modsum  = modsumt                                                       ## store model summaries here in modsum
      } else {
        modsum = rbind( modsum, modsumt )                                       ## store model summaries here in modsum
      }

    }  ## end of j_ loop

    ## find best model from current step (number of model terms), and save for usage in next step 
    modsumi_ = modsum[(modsum$ii==i_),]                                         ## model summaries for step i_ 
#    modsumi_ 
    if (toupper(method)=="LOGLIK") { 
      imax = which.max( modsumi_$loglik )                                       ## find best model log likelihood,  in this iteration by
    } else {
      imax = which.max( modsumi_$concordance )                                  ## find best model concordance,  in this iteration by
    }
    imax
    modsum[ ((i_-1)*xsk + imax), 3] = 1                                         ## set best indicator for best model in entire model summary set  
    modsumlast = modsumi_[ imax , ]                                             ## model summary for best fit model in this LAST iteration  
    modsumlast 
    riskvarsp =  modsumlast[ 8:(xsk+7)]                                         ## predictors in best model (present indicators)
    riskvarsp
    if (family=="cox") { betalast = modsumlast[(xsk+8):(2*xsk+7)]
    } else { betalast = modsumlast[(xsk+8):(2*xsk+8)] }
#    riskvarsi = c(1:xsk)[riskvarsp==1]
#    riskvarsi 

  }  
  ##### end i_ loop ############################################################ 
  if (track>=1) { cat ("\n") }
  
##  colnames(modsum)[6] = "Concordance"
##  colnames(modsum)[7] = "C STD"
  rownames(modsum) = c(1:dim(modsum)[1]) 
#  print(class(modsum))
  class(modsum) <- c("stepreg")
#  print(class(modsum))
  return(modsum)
} 

##############################################################################################################################
##############################################################################################################################

##============================================================================================================================
##============================================================================================================================

#' Get predictors form a stepwise regression model. 
#'
#' @param modsumbest matrix with best predictors based upon number of model terms 
#' @param k_ Value for number of predictors in model 
#' @param risklist Riskset list 
#' @param risklistl Number of terms (length) in the riskset 
#'
#' @return input to best.preds()
#'
#' @noRd

preds_1 = function (modsumbest, k_, risklist, risklistl) {
  temp1 = modsumbest [ , 8:(risklistl+7) ]  
  temp2 = temp1[k_,]  
  temp3 = risklist[ (temp2 ==1) ]
  temp4 =  paste(temp3, collapse = " + " )   
  return(temp4) 
}

##### get lists for all best models of different sizes #########################

#' Get the best models for the steps of a stepreg() fit 
#'
#' @param modsum model summmary
#' @param risklist riskset list 
#'
#' @return best predictors at each step of a stepwise regression
#' 
#' @seealso 
#'    \code{\link{stepreg}} , \code{\link{cv.stepreg}} , \code{\link{nested.glmnetr}}   
#' 
#' @export
#'
best.preds = function(modsum, risklist) {
  risklistl = length( risklist )
  modsumbest = modsum[ (modsum$best==1), ]
#  print ( dim(modsumbest ))
  modsumbestd = dim(modsumbest) 
  k_ = 1 
  preds = data.frame(k_, preds_1(modsumbest, k_, risklist, risklistl) )                                          ## as.data.frame(k_, preds_1(k_) ) 
  for (k_ in 2:modsumbestd[1]) {
    preds_ = data.frame(k_, preds_1(modsumbest, k_, risklist, risklistl) ) 
    preds = rbind (preds, preds_)
  }
  return(preds)
}
##############################################################################################################################
##############################################################################################################################

#' Briefly summarize steps in a stepreg() output object, i.e. a stepwise regression fit
#'
#' @param object A stepreg() output object 
#' @param ... Additional arguments passed to the summary function.  
#'
#' @return Summarize a stepreg() object
#' 
#' @seealso 
#'    \code{\link{stepreg}} , \code{\link{cv.stepreg}} , \code{\link{nested.glmnetr}}
#' 
#' @export

summary.stepreg = function(object, ...) {
  if ( ( dim(object)[2] %% 2) ==0 ) { cox=0 }
  data1 = object[(object$best==1),]
  if (cox==1) { 
    k_ = dim(data1)[2]/2 - 3.5
    keepvars = c(1:(k_+1))[(colSums(abs(data1[,(k_+8):(2*k_+7)]))!=0)] ## + 7 + k_ 
  } else {
    k_ = dim(data1)[2]/2 - 4
    keepvars = c(1:(k_+1))[(colSums(abs(data1[,(k_+8):(2*k_+8)]))!=0)] ## + 7 + k_ 
  }
  keepvars = keepvars + 7 + k_ 
  data2 = data1[,c(1:7,keepvars)]
  return(data2)
}

##############################################################################################################################
##############################################################################################################################
Any scripts or data that you put into this service are public.
glmnetr documentation built on June 8, 2025, 10:12 a.m.
rdrr.io home R language documentation Run R code online
CRAN packages Bioconductor packages R-Forge packages GitHub packages
Note that we can't provide technical support on individual packages. You should contact the package authors for that.
glmnetr
Nested Cross Validation for the Relaxed Lasso and Other Machine Learning Models

R/stepreg_240422.R
In glmnetr: Nested Cross Validation for the Relaxed Lasso and Other Machine Learning Models

Defines functions summary.stepreg best.preds stepreg

Documented in best.preds stepreg summary.stepreg

Try the glmnetr package in your browser

R Package Documentation

Browse R Packages

We want your feedback!

glmnetr Nested Cross Validation for the Relaxed Lasso and Other Machine Learning Models

R/stepreg_240422.R In glmnetr: Nested Cross Validation for the Relaxed Lasso and Other Machine Learning Models

Defines functions summary.stepreg best.preds stepreg

Documented in best.preds stepreg summary.stepreg

Try the glmnetr package in your browser

R Package Documentation

Browse R Packages

We want your feedback!

glmnetr
Nested Cross Validation for the Relaxed Lasso and Other Machine Learning Models

R/stepreg_240422.R
In glmnetr: Nested Cross Validation for the Relaxed Lasso and Other Machine Learning Models