R/psfmi_lm_bw.R
In psfmi: Prediction Model Pooling, Selection and Performance Evaluation Across Multiply Imputed Datasets

Documented in psfmi_lm_bw

#' Backward selection of Linear regression models across multiply imputed data.
#'
#' \code{psfmi_lm_bw} Backward selection of Linear regression
#' models across multiply imputed data using selection methods RR, D1, D2, D4 and MPR.
#' Function is called by \code{psfmi_lm}.
#'
#' At every step the current model is fitted with \code{glm} in each imputed
#' dataset and pooled. One p-value per predictor is derived with the chosen
#' \code{method}; the predictor with the largest p-value is removed as long as
#' that p-value is not below \code{p.crit}. Main effects that are part of an
#' interaction term still in the model, and predictors listed in
#' \code{keep.P}, are never candidates for removal.
#'
#' @param data Data frame with stacked multiple imputed datasets.
#'   The original dataset that contains missing values must be excluded from the
#'   dataset. The imputed datasets must be distinguished by an imputation variable,
#'   specified under impvar, and starting by 1.
#' @param nimp A numerical scalar. Number of imputed datasets. This function
#'   defines no default; the value must be supplied by the caller.
#' @param impvar A character vector. Name of the variable that distinguishes the
#' imputed datasets.
#' @param Outcome Character vector containing the name of the continuous outcome variable.
#' @param P Character vector with the names of the predictor variables.
#'   At least one predictor variable has to be defined. Give predictors unique names
#'   and do not use predictor name combinations with numbers as, age2, BMI10, etc.
#' @param p.crit A numerical scalar. P-value selection criterium. A value of 1
#'   provides the pooled model without selection.
#' @param method A character vector to indicate the pooling method for p-values to pool the
#'   total model or used during predictor selection. This can be "RR", "D1", "D2",
#'   "D4" or "MPR". This function defines no default; the value must be
#'   supplied by the caller.
#' @param keep.P A single string or a vector of strings including the variables that are forced
#'   in the model during predictor selection. All type of variables are allowed.
#'
#' @return A list with the elements \code{data}, \code{RR_model} and
#'   \code{RR_model_final} (pooled model summaries per step and for the last
#'   step), \code{multiparm} and \code{multiparm_final} (p-values per step and
#'   for the last step), \code{formula_step}, \code{formula_final} and
#'   \code{formula_initial}, \code{predictors_in}, \code{predictors_out},
#'   \code{predictors_final}, \code{predictors_initial},
#'   \code{keep.predictors}, and the echoed settings \code{impvar},
#'   \code{nimp}, \code{Outcome}, \code{method}, \code{p.crit}, \code{call},
#'   \code{model_type} ("linear") and \code{direction} ("BW").
#'
#' @author Martijn Heymans, 2021
#' @keywords internal
#'
#' @export
psfmi_lm_bw <- function(data, nimp, impvar, Outcome, P, p.crit, method, keep.P)
{
  call <- match.call()

  RR.model <- P_rm_step <- fm_step <- imp.dt <- multiparm <- list()

  P_orig <- P
  keep.P <- sapply(as.list(keep.P), clean_P)
  P_orig_temp <- clean_P(P)

  # A single predictor that is also forced into the model leaves nothing to
  # select. %in% keeps the condition scalar when several keep.P are supplied.
  if (!is_empty(keep.P) && length(P_orig) == 1 &&
      any(P_orig_temp %in% keep.P)) {
    stop("\n", "No need to define keep.predictors. Exclude keep.predictors and set p.crit = 1","\n")
  }

  # Left-hand side shared by every formula built below
  Y <- paste(Outcome, "~")

  # Loop k, to pool models in multiply imputed datasets
  for (k in seq_len(length(P) + 1)) {

    # set regression formula fm
    fm <- as.formula(paste(Y, paste(P, collapse = "+")))

    # Fit the current model per imputed dataset for RR and MPR
    if (method == "MPR" || method == "RR") {

      use_mpr <- method == "MPR"
      if (use_mpr) {
        # One row per model term, one column per imputed dataset
        chi.p <- data.frame(
          matrix(0, length(attr(terms(fm), "term.labels")), nimp)
        )
      }

      fit <- list()
      for (i in seq_len(nimp)) {
        imp.dt[[i]] <- data[data[impvar] == i, ]
        fit[[i]] <- glm(fm, data = imp.dt[[i]])
        if (use_mpr) {
          # Third column of the Anova table holds the p-values; they are
          # only needed for the median p-rule, so skip the call for RR.
          chi.p[, i] <- car::Anova(fit[[i]])[[3]]
        }
      }

      # Rubin's Rules
      RR.model[[k]] <- summary(pool(fit))
      names(RR.model)[[k]] <- paste("Step", k)
    }

    # D1, D2 and D4 pooling methods
    if (method == "D1" || method == "D2" || method == "D4") {

      pool.p.val <- matrix(0, length(P), 2)
      P_test <- clean_P(P)

      for (j in seq_along(P)) {
        # Null model: the current model without predictor j
        cov.nam0 <- P[-j]
        if (length(P) == 1) {
          cov.nam0 <- "1"
        }

        # Predictor j also occurs inside other terms (e.g. an interaction):
        # drop every term that contains it. Names are matched literally so
        # that characters such as "*" or "." are not read as regex syntax.
        if (any(grepl(P_test[j], P_test[-j], fixed = TRUE))) {
          cov.nam0 <- P[-grep(P_test[j], P_test, fixed = TRUE)]
        }
        # Nothing left after exclusion: compare against the intercept-only model
        if (length(cov.nam0) == 0) {
          cov.nam0 <- "1"
        }

        form1 <- as.formula(paste(Y, paste(P, collapse = "+")))
        form0 <- as.formula(paste(Y, paste(cov.nam0, collapse = "+")))

        if (method == "D4") {
          data <- subset(data, data[impvar] <= nimp)
          imp_list <- data %>%
            group_split(data[, impvar], .keep = FALSE) %>%
            mitools::imputationList()

          # Full model per imputed dataset, pooled for the step summary
          fit1 <- with(
            data = imp_list,
            expr = glm(as.formula(paste(Y, paste(P, collapse = "+"))))
          )

          RR.model[[k]] <- summary(pool(fit1))
          names(RR.model)[[k]] <- paste("Step", k)

          res_D4 <- pool_D4(data = data, fm0 = form0, fm1 = form1,
                            nimp = nimp, impvar = impvar, robust = TRUE,
                            model_type = "linear")
          pool.p.val[j, ] <- c(res_D4$pval, res_D4$F)
        }

        if (method == "D1" || method == "D2") {
          fit1 <- fit0 <- imp.dt <- list()
          for (i in seq_len(nimp)) {
            imp.dt[[i]] <- data[data[impvar] == i, ]
            fit1[[i]] <- glm(form1, data = imp.dt[[i]])
            fit0[[i]] <- glm(form0, data = imp.dt[[i]])
          }
          RR.model[[k]] <- summary(pool(fit1))
          names(RR.model)[[k]] <- paste("Step", k)

          test_P <- mitml::testModels(fit1, fit0, method = method)
          pvalue <- test_P$test[4]
          fstat <- test_P$test[1]
          pool.p.val[j, ] <- c(pvalue, fstat)
        }
      }
      p.pool <- data.frame(pool.p.val)
      row.names(p.pool) <- P
      names(p.pool) <- c(paste("p-values", method), "F-statistic")
    }

    # MPR Pooling: median p-value across imputed datasets per term
    if (method == "MPR") {
      p.pool <- data.frame(apply(chi.p, 1, median))
      rownames(p.pool) <- P
      names(p.pool) <- c("p-value MPR")
    }

    # RR Pooling: column 6 of the pooled summary, first row dropped
    if (method == "RR") {
      p.pool <- data.frame(RR.model[[k]][-1, 6], row.names = P)
      names(p.pool) <- c("p-value RR")
    }

    # Extract regression formula's
    fm_step[[k]] <- formula(fm)
    names(fm_step)[[k]] <- paste("Step", k)
    # Extract multiparameter pooling
    multiparm[[k]] <- p.pool
    names(multiparm)[[k]] <- paste("Step", k)

    # Clean variable names for selection
    P_temp <- clean_P(row.names(p.pool))
    p.pool_temp <- p.pool
    row.names(p.pool_temp) <- P_temp

    # detect interaction terms and exclude variables
    # that are part of interaction
    if (any(grepl("[*]", P))) {
      P_int_id <- P_temp[grep("[*]", P_temp)]
      # Detect main effects of interaction terms
      P_main <- unique(unlist(str_split(P_int_id, "[*]")))
      # Exclude main effects from selection
      remove_id <- !row.names(p.pool_temp) %in% unique(c(P_main))
      p.pool <- p.pool[remove_id, , drop = FALSE]
      p.pool_temp <- p.pool_temp[remove_id, , drop = FALSE]
    }
    # Exclude variables to keep from selection
    if (!is_empty(keep.P)) {
      remove_P_keep <- !row.names(p.pool_temp) %in% keep.P
      p.pool <- p.pool[remove_P_keep, , drop = FALSE]
    }

    # p.crit of 1 means: pooled model only, no selection
    if (p.crit == 1) {
      break
    }

    # No removable candidates left
    if (nrow(p.pool) == 0) {
      break
    }

    # Select variables: candidate with the largest p-value, first on ties
    P_temp <- row.names(p.pool)
    P_excl <- which(p.pool[, 1] == max(p.pool[, 1]))
    if (length(P_excl) > 1) {
      P_excl <- P_excl[1]
    }

    if (p.pool[, 1][P_excl] < p.crit) {
      message("\n", "Selection correctly terminated, ",
              "\n", "No more variables removed from the model", "\n")
      break
    }

    # Reaching this point means the p-value is >= p.crit, so the predictor
    # is removed; always report it (also when it equals p.crit exactly).
    P_out <- P_temp[P_excl]
    message("Removed at Step ", k,
            " is - ", P_out)

    P_rm_step[[k]] <- P_out
    # Variable excluded on each step
    P <- P[!P %in% P_out]

    # All predictors removed: store the intercept-only model as extra step
    if (is_empty(P)) {
      fm_step[[k + 1]] <- as.formula(paste(Y, 1))
      names(fm_step)[[k + 1]] <- paste("Step", k + 1)
      multiparm[[k + 1]] <- 0
      names(multiparm)[[k + 1]] <- paste("Step", k + 1)
      fit <- list()
      for (i in seq_len(nimp)) {
        imp.dt[[i]] <- data[data[impvar] == i, ]
        fit[[i]] <- glm(fm_step[[k + 1]], data = imp.dt[[i]])
      }

      # Rubin's Rules
      RR.model[[k + 1]] <- summary(pool(fit))
      names(RR.model)[[k + 1]] <- paste("Step", k + 1)
      break
    }
  }
  # End k loop
  ##############################################################

  P_rm_step_final <- P_rm_step
  if (is_empty(P)) {
    P_rm_step <- lapply(P_rm_step, clean_P)
  } else {
    P_rm_step <- lapply(P_rm_step[-k], clean_P)
  }

  # Extract selected models: one row per step, one column per original
  # predictor, flagging the predictor removed at that step
  outOrder_step <- P_orig_temp
  if (!is_empty(P_rm_step)) {
    P_remove <- data.frame(
      do.call("rbind", lapply(P_rm_step, function(x) outOrder_step %in% x))
    )
    names(P_remove) <- P_orig
    row.names(P_remove) <- paste("Step", seq_along(P_rm_step))
    if (nrow(P_remove) != 1) {
      P_remove <- apply(P_remove, 2, function(x) ifelse(x, 1, 0))
      P_remove_final <- colSums(P_remove)
      P_remove <- rbind(P_remove, P_remove_final)
      row.names(P_remove)[nrow(P_remove)] <- "Removed"
    } else {
      P_remove <- matrix(apply(P_remove, 2, function(x) ifelse(x, 1, 0)),
                         1, length(P_orig))
      dimnames(P_remove) <- list("Removed", P_orig)
    }
  } else {
    P_remove <- matrix(rep(0, length(P_orig)), 1, length(P_orig))
    dimnames(P_remove) <- list("Removed", P_orig)
  }

  # Predictors whose "Removed" total is 0 are still in the model
  removed_row <- P_remove[nrow(P_remove), ]
  predictors_final <- names(removed_row[removed_row == 0])
  P_included <- as_tibble(predictors_final)
  if (length(P_orig) == 1 && !is_empty(P)) {
    P_included <- predictors_final <- P
  }

  formula_initial <- as.formula(paste(Y, paste(P_orig, collapse = "+")))

  if (is_empty(P) || p.crit == 1) {
    RR_model_step <- RR.model
    multiparm_step <- multiparm
    fm_step_total <- fm_step
    if (p.crit != 1) {
      names(RR_model_step) <- names(multiparm_step) <- names(fm_step_total) <-
        paste("Step", seq_len(k + 1), "- removal -",
              c(unlist(P_rm_step_final), "ended"))
    }
    if (p.crit == 1) {
      names(RR_model_step) <- names(multiparm_step) <- names(fm_step_total) <-
        paste("Step", 1, "- no variables removed -")
    }
    RR_model_final <- RR_model_step[k + 1]
    multiparm_final <- multiparm_step[k + 1]
    fm_step_final <- fm_step_total[k + 1]
    if (p.crit == 1) {
      # Without selection the final model is the initial model at step k
      fm_step_final <- formula_initial
      RR_model_final <- RR_model_step[k]
      multiparm_final <- multiparm_step[k]
    }
  }
  if (!is_empty(P) && p.crit != 1) {
    RR_model_step <- RR.model
    multiparm_step <- multiparm
    fm_step_total <- fm_step
    names(RR_model_step) <- names(multiparm_step) <- names(fm_step_total) <-
      paste("Step", seq_len(k), "- removal -",
            c(unlist(P_rm_step_final), "ended"))
    RR_model_final <- RR.model[k]
    multiparm_final <- multiparm[k]
    fm_step_final <- fm_step[k]
  }

  bw <-
    list(data = data, RR_model = RR_model_step, RR_model_final = RR_model_final,
         multiparm = multiparm_step, multiparm_final = multiparm_final,
         formula_step = fm_step_total, formula_final = fm_step_final,
         formula_initial = formula_initial,
         predictors_in = P_included, predictors_out = P_remove,
         impvar = impvar, nimp = nimp, Outcome = Outcome,
         method = method, p.crit = p.crit, call = call,
         model_type = "linear", direction = "BW",
         predictors_final = predictors_final,
         predictors_initial = P_orig, keep.predictors = keep.P)

  return(bw)
}

Any scripts or data that you put into this service are public.

psfmi documentation built on July 9, 2023, 7:02 p.m.

rdrr.io home R language documentation Run R code online

CRAN packages Bioconductor packages R-Forge packages GitHub packages

Note that we can't provide technical support on individual packages. You should contact the package authors for that.

psfmi
Prediction Model Pooling, Selection and Performance Evaluation Across Multiply Imputed Datasets

R/psfmi_lm_bw.R
In psfmi: Prediction Model Pooling, Selection and Performance Evaluation Across Multiply Imputed Datasets

Defines functions psfmi_lm_bw

Documented in psfmi_lm_bw

Try the psfmi package in your browser

R Package Documentation

Browse R Packages

We want your feedback!

psfmi Prediction Model Pooling, Selection and Performance Evaluation Across Multiply Imputed Datasets

R/psfmi_lm_bw.R In psfmi: Prediction Model Pooling, Selection and Performance Evaluation Across Multiply Imputed Datasets

Defines functions psfmi_lm_bw

Documented in psfmi_lm_bw

Try the psfmi package in your browser

R Package Documentation

Browse R Packages

We want your feedback!

psfmi
Prediction Model Pooling, Selection and Performance Evaluation Across Multiply Imputed Datasets

R/psfmi_lm_bw.R
In psfmi: Prediction Model Pooling, Selection and Performance Evaluation Across Multiply Imputed Datasets