R/psfmi_coxr_bw.R
In psfmi: Prediction Model Pooling, Selection and Performance Evaluation Across Multiply Imputed Datasets

Documented in psfmi_coxr_bw

#' Backward selection of Cox regression models in multiply imputed data.
#'
#' \code{psfmi_coxr_bw} Backward selection of Cox regression
#' models in multiply imputed data using selection methods RR, D1, D2 and MPR.
#' Function is called by \code{psfmi_coxr}.
#'
#' @param data Data frame with stacked multiple imputed datasets.
#'  The original dataset that contains missing values must be excluded from the
#'  dataset. The imputed datasets must be distinguished by an imputation variable,
#'  specified under impvar, and starting by 1.
#' @param nimp A numerical scalar. Number of imputed datasets. Default is 5.
#' @param impvar A character vector. Name of the variable that distinguishes the
#'  imputed datasets.
#' @param status The status variable, normally 0=censoring, 1=event.
#' @param time Follow up time.
#' @param P Character vector with the names of the predictor variables.
#'   At least one predictor variable has to be defined. Give predictors unique names
#'   and do not use predictor name combinations with numbers as, age2, BMI10, etc.
#' @param p.crit A numerical scalar. P-value selection criterium. A value of 1
#'   provides the pooled model without selection.
#' @param method A character vector to indicate the pooling method for p-values to pool the
#'   total model or used during predictor selection. This can be "RR", D1", "D2" or "MPR".
#'   See details for more information. Default is "RR".
#' @param keep.P A single string or a vector of strings including the variables that are forced
#'   in the model during predictor selection. All type of variables are allowed.
#' @param strata.P A single string including the strata variable. 
#'   
#' @author Martijn Heymans, 2020
#' @keywords internal
#'   
#' @export
psfmi_coxr_bw <- function(data, nimp, impvar, status, time, P, p.crit, method, keep.P, strata.P)
{
  call <- match.call()
  
  RR.model <- P_rm_step <- fm_step <- imp.dt <- multiparm <- list()
  
  P_orig <-
    P
  if(length(P_orig)==1 & any(grepl("strata", P_orig)))
    stop("\n", "strata variable is only defined, add more predictors","\n")
  
  keep.P <-
    sapply(as.list(keep.P), clean_P)
  
  P_orig_temp <-
    clean_P(P)
  
  if(!is_empty(keep.P))
    if(length(P_orig)==1)
      if(P_orig_temp == keep.P)
        stop("\n", "No need to define keep.predictors. Exclude keep.predictors and set p.crit = 1","\n")
  
  if(any(grepl("strata", P_orig))){
    P_length <- length(P)
  } else {
    P_length <- length(P)+1
  }
  
  # Loop k, to pool models in multiply imputed datasets
  for (k in 1:P_length) {

    # set regression formula fm
    Y <-
      c(paste0("Surv(", time, ",", status, ")~"))
    fm <-
      as.formula(paste(Y, paste(P, collapse = "+")))
    
    # Extract df of freedom for MPR
    if(method=="MPR" | method=="RR"){
      
      if(any(grepl("strata", P_orig))){
        if(length(attr(terms(fm), "term.labels"))>1){
          chi.LR <-
            data.frame(matrix(0, length(attr(terms(fm), "term.labels"))-1, nimp))
          chi.p <-
            data.frame(matrix(0, length(attr(terms(fm), "term.labels"))-1, nimp))
        } else {
          chi.LR <-
            data.frame(matrix(0, length(attr(terms(fm), "term.labels")), nimp))
          chi.p <-
            data.frame(matrix(0, length(attr(terms(fm), "term.labels")), nimp))
        }
      } else {
        chi.LR <-
          data.frame(matrix(0, length(attr(terms(fm), "term.labels")), nimp))
        chi.p <-
          data.frame(matrix(0, length(attr(terms(fm), "term.labels")), nimp))
      }
      
      
      fit <- list()
      for (i in 1:nimp) {
        imp.dt[[i]] <- data[data[impvar] == i, ]
        fit[[i]] <- coxph(fm, data = imp.dt[[i]])
        if(length(attr(terms(fm), "term.labels")) == 1){
          if(any(grepl("strata", P_orig))){
            chi.LR[, i] <-
              suppressWarnings(suppressMessages(car::Anova(fit[[i]],
                                                           test.statistic=c("Wald"))[-1, 2]))
            chi.p[, i] <-
              suppressWarnings(suppressMessages(car::Anova(fit[[i]],
                                                           test.statistic=c("Wald"))[-1, 4]))
          } else{
            chi.LR[, i] <-
              car::Anova(fit[[i]])[-1, 2]
            chi.p[, i] <-
              car::Anova(fit[[i]])[-1, 4]
          }
        } else {
          if(any(grepl("strata", P_orig))){
            chi.LR[, i] <-
              suppressWarnings(suppressMessages(car::Anova(fit[[i]],
                                                           test.statistic=c("Wald"))[, 1]))
            chi.p[, i] <-
              suppressWarnings(suppressMessages(car::Anova(fit[[i]],
                                                           test.statistic=c("Wald"))[, 3]))
          } else {
            chi.LR[, i] <-
              car::Anova(fit[[i]])[, 1]
            chi.p[, i] <-
              car::Anova(fit[[i]])[, 3]
          }
        }
      }
      
      # Rubin's Rules
      out.res <-
        suppressWarnings(summary(pool(fit)))
      HR <-
        exp(out.res$estimate)
      lower.EXP <-
        exp(out.res$estimate - (qt(0.975, out.res$df)*out.res$std.error))
      upper.EXP <-
        exp(out.res$estimate + (qt(0.975, out.res$df)*out.res$std.error))
      model.res <-
        data.frame(cbind(out.res, HR, lower.EXP, upper.EXP))
      RR.model[[k]] <-
        model.res
      names(RR.model)[[k]] <-
        paste("Step", k)#
    }
    
    # D1 and D2 pooling methods
    if(method=="D1" | method == "D2") {
      
      P_test <-
        clean_P(P)
      
      if(any(grepl("strata", P_orig))){
        if(length(P)==1){
          pool.p.val <-
            matrix(0, length(P), 2)
        } else {
          pool.p.val <-
            matrix(0, length(P_test[!P_test %in% strata.P]), 2)
        }
        P_test <- P_test[!P_test %in% strata.P]
        P <- P[!grepl("strata", P)]
      } else {
        pool.p.val <-
          matrix(0, length(P), 2)
      }
      
      for (j in 1:length(P)) {
        cov.nam0 <-
          P[-j]
        if (length(P) == 1) {
          cov.nam0 <-
            "1"
        }
        Y <-
          c(paste0("Surv(", time, ",", status, ")~"))
        
        if(any(grepl("strata", P_orig))){
          form1 <-
            as.formula(paste(Y, paste(c(P, P_orig[grepl("strata", P_orig)]), collapse = "+")))
          form0 <-
            as.formula(paste(Y, paste(c(cov.nam0, P_orig[grepl("strata", P_orig)]), collapse = "+")))
        } else{
          form1 <-
            as.formula(paste(Y, paste(P, collapse = "+")))
          form0 <-
            as.formula(paste(Y, paste(cov.nam0, collapse = "+")))
        }
        if(any(grepl(P_test[j], P_test[-j]))){
          cov.nam0 <-
            P[-grep(P_test[j], P_test)]
          form0 <-
            as.formula(paste(Y, paste(c(cov.nam0), collapse = "+")))
        }
        
        if(method =="D1" | method =="D2")
          fit1 <- fit0 <- imp.dt <- list()
        for (i in 1:nimp) {
          imp.dt[[i]] <-
            data[data[impvar] == i, ]
          fit1[[i]] <-
            coxph(form1, data = imp.dt[[i]])
          fit0[[i]] <-
            coxph(form0, data = imp.dt[[i]])
        }
        out.res1 <-
          suppressWarnings(summary(pool(fit1)))
        HR <-
          exp(out.res1$estimate)
        lower.EXP <-
          exp(out.res1$estimate - (qt(0.975, out.res1$df)*out.res1$std.error))
        upper.EXP <-
          exp(out.res1$estimate + (qt(0.975, out.res1$df)*out.res1$std.error))
        model.res1 <-
          data.frame(cbind(out.res1, HR, lower.EXP, upper.EXP))
        RR.model[[k]] <-
          model.res1
        names(RR.model)[[k]] <-
          paste("Step", k)
        
        test_P <-
          mitml::testModels(fit1, fit0, method = method)
        pvalue <-
          test_P$test[4]
        fstat <-
          test_P$test[1]
        pool.p.val[j, ] <-
          c(pvalue, fstat)
        
      }
      p.pool <-
        data.frame(pool.p.val)
      row.names(p.pool) <-
        P
      names(p.pool) <-
        c(paste("p-values", method), "F-statistic")
    fm <- form1  
    }
    
    # MPR Pooling
    if(method=="MPR") {
      p.pool <-
        data.frame(apply(chi.p, 1 , median))
      P_names <- P
      if(any(grepl("strata", P_names))){
        rownames(p.pool) <-
          P_names[!grepl("strata", P_names)]
      } else {
        rownames(p.pool) <-
          P
      }
      names(p.pool) <-
        c("p-value MPR")
    }
    
    # RR Pooling
    if(method=="RR") {
      p.pool <-
        data.frame(RR.model[[k]][, 6],
                   row.names=P)
      names(p.pool) <-
        c("p-value RR")
    }
    
    # Extract regression formula's
    fm_step[[k]] <-
      formula(fm)
    names(fm_step)[[k]] <-
      paste("Step", k)
    # Extract multiparameter pooling
    multiparm[[k]] <-
      p.pool
    names(multiparm)[[k]] <-
      paste("Step", k)
    
    # Clean variable names for selection
    P_temp <-
      clean_P(row.names(p.pool))
    p.pool_temp <-
      p.pool
    row.names(p.pool_temp) <-
      P_temp
    # detect interaction terms and exclude variables
    # that are part of interaction
    if(any(grepl("[*]", P))){
      P_int_id <-
        P_temp[grep("[*]", P_temp)]
      # Detect main effects of interaction terms
      P_main <-
        unique(unlist(str_split(P_int_id, "[*]")))
      # Exclude variables to keep and main effect from selection
      remove_id <-
        !row.names(p.pool_temp) %in% unique(c(P_main))
      p.pool <-
        p.pool[remove_id, , FALSE]
      p.pool_temp <-
        p.pool_temp[remove_id, , FALSE]
    }
    if(!is_empty(keep.P)){
      remove_P_keep <-
        !row.names(p.pool_temp) %in% keep.P
      p.pool <-
        p.pool[remove_P_keep, , FALSE]
    }
    
    if(p.crit==1){
      break()
    }
    
    if(nrow(p.pool)==0)
      break()
    
    # Select variables
    P_temp <-
      row.names(p.pool)
    P_excl <-
      which(p.pool[, 1] == max(p.pool[, 1]))
    if(length(P_excl) > 1) {
      P_excl <-
        P_excl[1]
    }
    
    if (p.pool[, 1][P_excl] < p.crit) {
      message("\n", "Selection correctly terminated, ",
              "\n", "No more variables removed from the model", "\n")
      (break)()
    }
    
    P_out <-
      P_temp[P_excl]
    
    if(p.pool[, 1][P_excl] > p.crit) {
      message("Removed at Step ", k,
              " is - ", P_out)
      
    }
    
    P_rm_step[[k]] <-
      P_out
    # Variable excluded on each step
    P <-
      P[!P %in% P_out]
    
    if(length(P)==1 & any(grepl("strata", P)))
      P <- NULL
    
    if(is_empty(P)){
      fm_step[[k+1]] <-
        as.formula(paste(Y, 1))
      names(fm_step)[[k+1]] <-
        paste("Step", k+1)
      multiparm[[k+1]] <-
        0
      names(multiparm)[[k+1]] <-
        paste("Step", k+1)
      fit <- list()
      for (i in 1:nimp) {
        imp.dt[[i]] <-
          data[data[impvar] == i, ]
        fit[[i]] <-
          coxph(fm_step[[k+1]], data = imp.dt[[i]])
      }
      
      if(is_empty(P)){
        RR.model[[k+1]] <-
          fit[[1]]
        names(RR.model)[[k+1]] <-
          paste("Step", k+1)
        break()
      }
      
      # Rubin's Rules
      out.res <-
        suppressWarnings(summary(pool(fit)))
      HR <-
        exp(out.res$estimate)
      lower.EXP <-
        exp(out.res$estimate - (qt(0.975, out.res$df)*out.res$std.error))
      upper.EXP <-
        exp(out.res$estimate + (qt(0.975, out.res$df)*out.res$std.error))
      model.res <-
        data.frame(cbind(out.res, HR, lower.EXP, upper.EXP))
      RR.model[[k+1]] <-
        model.res
      names(RR.model)[[k+1]] <-
        paste("Step", k+1)
      break()
    }
  }
  # End k loop
  ##############################################################
  
  P_rm_step_final <-
    P_rm_step
  if(is_empty(P)) {
    P_rm_step <-
      lapply(P_rm_step, clean_P)
  } else {
    P_rm_step <-
      lapply(P_rm_step[-k], clean_P)
  }
  
  # Extract selected models
  outOrder_step <-
    P_orig_temp
  if(!is_empty(P_rm_step)){
    P_remove <-
      data.frame(do.call("rbind",
                         lapply(P_rm_step, function(x) {
                           outOrder_step %in% x })))
    names(P_remove) <-
      P_orig
    row.names(P_remove) <-
      paste("Step", 1:length(P_rm_step))
    if(nrow(P_remove)!=1) {
      P_remove <-
        apply(P_remove, 2, function(x) ifelse(x, 1, 0))
      P_remove_final <-
        colSums(P_remove)
      P_remove <-
        rbind(P_remove, P_remove_final)
      row.names(P_remove)[nrow(P_remove)] <-
        "Removed"
    } else {
      P_remove <-
        matrix(apply(P_remove, 2, function(x) ifelse(x, 1, 0)), 1, length(P_orig))
      dimnames(P_remove) <-
        list("Removed", P_orig)
    }
  } else {
    P_remove <-
      matrix(rep(0, length(P_orig)), 1, length(P_orig))
    dimnames(P_remove) <-
      list("Removed", P_orig)
  }
  
  P_included <-
    as_tibble(names(P_remove[nrow(P_remove), ][P_remove[nrow(P_remove), ] == 0] ))
  predictors_final <-
    names(P_remove[nrow(P_remove), ][P_remove[nrow(P_remove), ] == 0])
  if(length(P_orig)==1 & !is_empty(P))
    P_included <- predictors_final <- P
  
  if(is_empty(P) | p.crit==1){
    RR_model_step <-
      RR.model
    multiparm_step <-
      multiparm
    fm_step_total <-
      fm_step
    if(p.crit!=1) names(RR_model_step) <- names(multiparm_step) <- names(fm_step_total) <-
      paste("Step", 1:(k+1), "- removal -", c(unlist(P_rm_step_final), "ended"))
    if(p.crit==1)  names(RR_model_step) <- names(multiparm_step) <- names(fm_step_total) <-
      paste("Step", 1, "- no variables removed -")
    RR_model_final <-
      RR_model_step[k+1]
    multiparm_final <-
      multiparm_step[k+1]
    fm_step_final <-
      fm_step_total[k+1]
    if(p.crit==1) {
      Y_initial <-
        c(paste0("Surv(", time, ",", status, ")~"))
      formula_initial <-
        as.formula(paste(Y_initial, paste(P_orig, collapse = "+")))
      fm_step_final <-
        formula_initial
      RR_model_final <-
        RR_model_step[k]
      multiparm_final <-
        multiparm_step[k]
    }
  }
  if(!is_empty(P) & p.crit !=1){
    RR_model_step <-
      RR.model
    multiparm_step <-
      multiparm
    fm_step_total <-
      fm_step
    names(RR_model_step) <-
      names(multiparm_step) <- names(fm_step_total) <-
      paste("Step", 1:k, "- removal -", c(unlist(P_rm_step_final), "ended"))
    RR_model_final <-
      RR.model[k]
    multiparm_final <-
      multiparm[k]
    fm_step_final <-
      fm_step[k]
  }
  
  Y_initial <-
    c(paste0("Surv(", time, ",", status, ")~"))
  formula_initial <-
    as.formula(paste(Y_initial, paste(P_orig, collapse = "+")))
  
  bw <-
    list(data = data, RR_model = RR_model_step, RR_model_final = RR_model_final,
         multiparm = multiparm_step, multiparm_final = multiparm_final,
         formula_step = fm_step_total, formula_final = fm_step_final,
         formula_initial = formula_initial, predictors_in = P_included,
         predictors_out = P_remove,
         impvar = impvar, nimp = nimp, status = status, time = time,
         method = method, p.crit = p.crit, call = call,
         model_type = "survival", direction = "BW",
         predictors_final = predictors_final, predictors_initial = P_orig,
         keep.predictors = keep.P, strata.variable = strata.P)
  return(bw)
}
Any scripts or data that you put into this service are public.
psfmi documentation built on July 9, 2023, 7:02 p.m.
rdrr.io home R language documentation Run R code online
CRAN packages Bioconductor packages R-Forge packages GitHub packages
Note that we can't provide technical support on individual packages. You should contact the package authors for that.
psfmi
Prediction Model Pooling, Selection and Performance Evaluation Across Multiply Imputed Datasets

R/psfmi_coxr_bw.R
In psfmi: Prediction Model Pooling, Selection and Performance Evaluation Across Multiply Imputed Datasets

Defines functions psfmi_coxr_bw

Documented in psfmi_coxr_bw

Try the psfmi package in your browser

R Package Documentation

Browse R Packages

We want your feedback!

psfmi Prediction Model Pooling, Selection and Performance Evaluation Across Multiply Imputed Datasets

R/psfmi_coxr_bw.R In psfmi: Prediction Model Pooling, Selection and Performance Evaluation Across Multiply Imputed Datasets

Defines functions psfmi_coxr_bw

Documented in psfmi_coxr_bw

Try the psfmi package in your browser

R Package Documentation

Browse R Packages

We want your feedback!

psfmi
Prediction Model Pooling, Selection and Performance Evaluation Across Multiply Imputed Datasets

R/psfmi_coxr_bw.R
In psfmi: Prediction Model Pooling, Selection and Performance Evaluation Across Multiply Imputed Datasets