R/psfmi_coxr_bw.R

Defines functions psfmi_coxr_bw

Documented in psfmi_coxr_bw

#' Backward selection of Cox regression models in multiply imputed data.
#'
#' \code{psfmi_coxr_bw} Backward selection of Cox regression
#' models in multiply imputed data using selection methods RR, D1, D2 and MPR.
#' Function is called by \code{psfmi_coxr}.
#'
#' @param data Data frame with stacked multiple imputed datasets.
#'  The original dataset that contains missing values must be excluded from the
#'  dataset. The imputed datasets must be distinguished by an imputation variable,
#'  specified under impvar, and starting by 1.
#' @param nimp A numerical scalar. Number of imputed datasets. Default is 5.
#' @param impvar A character vector. Name of the variable that distinguishes the
#'  imputed datasets.
#' @param status The status variable, normally 0=censoring, 1=event.
#' @param time Follow up time.
#' @param P Character vector with the names of the predictor variables.
#'   At least one predictor variable has to be defined. Give predictors unique names
#'   and do not use predictor name combinations with numbers as, age2, BMI10, etc.
#' @param p.crit A numerical scalar. P-value selection criterium. A value of 1
#'   provides the pooled model without selection.
#' @param method A character vector to indicate the pooling method for p-values to pool the
#'   total model or used during predictor selection. This can be "RR", D1", "D2" or "MPR".
#'   See details for more information. Default is "RR".
#' @param keep.P A single string or a vector of strings including the variables that are forced
#'   in the model during predictor selection. All type of variables are allowed.
#' @param strata.P A single string including the strata variable. 
#'   
#' @author Martijn Heymans, 2020
#' @keywords internal
#'   
#' @export
psfmi_coxr_bw <- function(data, nimp, impvar, status, time, P, p.crit, method, keep.P, strata.P)
{
  call <- match.call()
  
  RR.model <- P_rm_step <- fm_step <- imp.dt <- multiparm <- list()
  
  P_orig <-
    P
  if(length(P_orig)==1 & any(grepl("strata", P_orig)))
    stop("\n", "strata variable is only defined, add more predictors","\n")
  
  keep.P <-
    sapply(as.list(keep.P), clean_P)
  
  P_orig_temp <-
    clean_P(P)
  
  if(!is_empty(keep.P))
    if(length(P_orig)==1)
      if(P_orig_temp == keep.P)
        stop("\n", "No need to define keep.predictors. Exclude keep.predictors and set p.crit = 1","\n")
  
  if(any(grepl("strata", P_orig))){
    P_length <- length(P)
  } else {
    P_length <- length(P)+1
  }
  
  # Loop k, to pool models in multiply imputed datasets
  for (k in 1:P_length) {

    # set regression formula fm
    Y <-
      c(paste0("Surv(", time, ",", status, ")~"))
    fm <-
      as.formula(paste(Y, paste(P, collapse = "+")))
    
    # Extract df of freedom for MPR
    if(method=="MPR" | method=="RR"){
      
      if(any(grepl("strata", P_orig))){
        if(length(attr(terms(fm), "term.labels"))>1){
          chi.LR <-
            data.frame(matrix(0, length(attr(terms(fm), "term.labels"))-1, nimp))
          chi.p <-
            data.frame(matrix(0, length(attr(terms(fm), "term.labels"))-1, nimp))
        } else {
          chi.LR <-
            data.frame(matrix(0, length(attr(terms(fm), "term.labels")), nimp))
          chi.p <-
            data.frame(matrix(0, length(attr(terms(fm), "term.labels")), nimp))
        }
      } else {
        chi.LR <-
          data.frame(matrix(0, length(attr(terms(fm), "term.labels")), nimp))
        chi.p <-
          data.frame(matrix(0, length(attr(terms(fm), "term.labels")), nimp))
      }
      
      
      fit <- list()
      for (i in 1:nimp) {
        imp.dt[[i]] <- data[data[impvar] == i, ]
        fit[[i]] <- coxph(fm, data = imp.dt[[i]])
        if(length(attr(terms(fm), "term.labels")) == 1){
          if(any(grepl("strata", P_orig))){
            chi.LR[, i] <-
              suppressWarnings(suppressMessages(car::Anova(fit[[i]],
                                                           test.statistic=c("Wald"))[-1, 2]))
            chi.p[, i] <-
              suppressWarnings(suppressMessages(car::Anova(fit[[i]],
                                                           test.statistic=c("Wald"))[-1, 4]))
          } else{
            chi.LR[, i] <-
              car::Anova(fit[[i]])[-1, 2]
            chi.p[, i] <-
              car::Anova(fit[[i]])[-1, 4]
          }
        } else {
          if(any(grepl("strata", P_orig))){
            chi.LR[, i] <-
              suppressWarnings(suppressMessages(car::Anova(fit[[i]],
                                                           test.statistic=c("Wald"))[, 1]))
            chi.p[, i] <-
              suppressWarnings(suppressMessages(car::Anova(fit[[i]],
                                                           test.statistic=c("Wald"))[, 3]))
          } else {
            chi.LR[, i] <-
              car::Anova(fit[[i]])[, 1]
            chi.p[, i] <-
              car::Anova(fit[[i]])[, 3]
          }
        }
      }
      
      # Rubin's Rules
      out.res <-
        suppressWarnings(summary(pool(fit)))
      HR <-
        exp(out.res$estimate)
      lower.EXP <-
        exp(out.res$estimate - (qt(0.975, out.res$df)*out.res$std.error))
      upper.EXP <-
        exp(out.res$estimate + (qt(0.975, out.res$df)*out.res$std.error))
      model.res <-
        data.frame(cbind(out.res, HR, lower.EXP, upper.EXP))
      RR.model[[k]] <-
        model.res
      names(RR.model)[[k]] <-
        paste("Step", k)#
    }
    
    # D1 and D2 pooling methods
    if(method=="D1" | method == "D2") {
      
      P_test <-
        clean_P(P)
      
      if(any(grepl("strata", P_orig))){
        if(length(P)==1){
          pool.p.val <-
            matrix(0, length(P), 2)
        } else {
          pool.p.val <-
            matrix(0, length(P_test[!P_test %in% strata.P]), 2)
        }
        P_test <- P_test[!P_test %in% strata.P]
        P <- P[!grepl("strata", P)]
      } else {
        pool.p.val <-
          matrix(0, length(P), 2)
      }
      
      for (j in 1:length(P)) {
        cov.nam0 <-
          P[-j]
        if (length(P) == 1) {
          cov.nam0 <-
            "1"
        }
        Y <-
          c(paste0("Surv(", time, ",", status, ")~"))
        
        if(any(grepl("strata", P_orig))){
          form1 <-
            as.formula(paste(Y, paste(c(P, P_orig[grepl("strata", P_orig)]), collapse = "+")))
          form0 <-
            as.formula(paste(Y, paste(c(cov.nam0, P_orig[grepl("strata", P_orig)]), collapse = "+")))
        } else{
          form1 <-
            as.formula(paste(Y, paste(P, collapse = "+")))
          form0 <-
            as.formula(paste(Y, paste(cov.nam0, collapse = "+")))
        }
        if(any(grepl(P_test[j], P_test[-j]))){
          cov.nam0 <-
            P[-grep(P_test[j], P_test)]
          form0 <-
            as.formula(paste(Y, paste(c(cov.nam0), collapse = "+")))
        }
        
        if(method =="D1" | method =="D2")
          fit1 <- fit0 <- imp.dt <- list()
        for (i in 1:nimp) {
          imp.dt[[i]] <-
            data[data[impvar] == i, ]
          fit1[[i]] <-
            coxph(form1, data = imp.dt[[i]])
          fit0[[i]] <-
            coxph(form0, data = imp.dt[[i]])
        }
        out.res1 <-
          suppressWarnings(summary(pool(fit1)))
        HR <-
          exp(out.res1$estimate)
        lower.EXP <-
          exp(out.res1$estimate - (qt(0.975, out.res1$df)*out.res1$std.error))
        upper.EXP <-
          exp(out.res1$estimate + (qt(0.975, out.res1$df)*out.res1$std.error))
        model.res1 <-
          data.frame(cbind(out.res1, HR, lower.EXP, upper.EXP))
        RR.model[[k]] <-
          model.res1
        names(RR.model)[[k]] <-
          paste("Step", k)
        
        test_P <-
          mitml::testModels(fit1, fit0, method = method)
        pvalue <-
          test_P$test[4]
        fstat <-
          test_P$test[1]
        pool.p.val[j, ] <-
          c(pvalue, fstat)
        
      }
      p.pool <-
        data.frame(pool.p.val)
      row.names(p.pool) <-
        P
      names(p.pool) <-
        c(paste("p-values", method), "F-statistic")
    fm <- form1  
    }
    
    # MPR Pooling
    if(method=="MPR") {
      p.pool <-
        data.frame(apply(chi.p, 1 , median))
      P_names <- P
      if(any(grepl("strata", P_names))){
        rownames(p.pool) <-
          P_names[!grepl("strata", P_names)]
      } else {
        rownames(p.pool) <-
          P
      }
      names(p.pool) <-
        c("p-value MPR")
    }
    
    # RR Pooling
    if(method=="RR") {
      p.pool <-
        data.frame(RR.model[[k]][, 6],
                   row.names=P)
      names(p.pool) <-
        c("p-value RR")
    }
    
    # Extract regression formula's
    fm_step[[k]] <-
      formula(fm)
    names(fm_step)[[k]] <-
      paste("Step", k)
    # Extract multiparameter pooling
    multiparm[[k]] <-
      p.pool
    names(multiparm)[[k]] <-
      paste("Step", k)
    
    # Clean variable names for selection
    P_temp <-
      clean_P(row.names(p.pool))
    p.pool_temp <-
      p.pool
    row.names(p.pool_temp) <-
      P_temp
    # detect interaction terms and exclude variables
    # that are part of interaction
    if(any(grepl("[*]", P))){
      P_int_id <-
        P_temp[grep("[*]", P_temp)]
      # Detect main effects of interaction terms
      P_main <-
        unique(unlist(str_split(P_int_id, "[*]")))
      # Exclude variables to keep and main effect from selection
      remove_id <-
        !row.names(p.pool_temp) %in% unique(c(P_main))
      p.pool <-
        p.pool[remove_id, , FALSE]
      p.pool_temp <-
        p.pool_temp[remove_id, , FALSE]
    }
    if(!is_empty(keep.P)){
      remove_P_keep <-
        !row.names(p.pool_temp) %in% keep.P
      p.pool <-
        p.pool[remove_P_keep, , FALSE]
    }
    
    if(p.crit==1){
      break()
    }
    
    if(nrow(p.pool)==0)
      break()
    
    # Select variables
    P_temp <-
      row.names(p.pool)
    P_excl <-
      which(p.pool[, 1] == max(p.pool[, 1]))
    if(length(P_excl) > 1) {
      P_excl <-
        P_excl[1]
    }
    
    if (p.pool[, 1][P_excl] < p.crit) {
      message("\n", "Selection correctly terminated, ",
              "\n", "No more variables removed from the model", "\n")
      (break)()
    }
    
    P_out <-
      P_temp[P_excl]
    
    if(p.pool[, 1][P_excl] > p.crit) {
      message("Removed at Step ", k,
              " is - ", P_out)
      
    }
    
    P_rm_step[[k]] <-
      P_out
    # Variable excluded on each step
    P <-
      P[!P %in% P_out]
    
    if(length(P)==1 & any(grepl("strata", P)))
      P <- NULL
    
    if(is_empty(P)){
      fm_step[[k+1]] <-
        as.formula(paste(Y, 1))
      names(fm_step)[[k+1]] <-
        paste("Step", k+1)
      multiparm[[k+1]] <-
        0
      names(multiparm)[[k+1]] <-
        paste("Step", k+1)
      fit <- list()
      for (i in 1:nimp) {
        imp.dt[[i]] <-
          data[data[impvar] == i, ]
        fit[[i]] <-
          coxph(fm_step[[k+1]], data = imp.dt[[i]])
      }
      
      if(is_empty(P)){
        RR.model[[k+1]] <-
          fit[[1]]
        names(RR.model)[[k+1]] <-
          paste("Step", k+1)
        break()
      }
      
      # Rubin's Rules
      out.res <-
        suppressWarnings(summary(pool(fit)))
      HR <-
        exp(out.res$estimate)
      lower.EXP <-
        exp(out.res$estimate - (qt(0.975, out.res$df)*out.res$std.error))
      upper.EXP <-
        exp(out.res$estimate + (qt(0.975, out.res$df)*out.res$std.error))
      model.res <-
        data.frame(cbind(out.res, HR, lower.EXP, upper.EXP))
      RR.model[[k+1]] <-
        model.res
      names(RR.model)[[k+1]] <-
        paste("Step", k+1)
      break()
    }
  }
  # End k loop
  ##############################################################
  
  P_rm_step_final <-
    P_rm_step
  if(is_empty(P)) {
    P_rm_step <-
      lapply(P_rm_step, clean_P)
  } else {
    P_rm_step <-
      lapply(P_rm_step[-k], clean_P)
  }
  
  # Extract selected models
  outOrder_step <-
    P_orig_temp
  if(!is_empty(P_rm_step)){
    P_remove <-
      data.frame(do.call("rbind",
                         lapply(P_rm_step, function(x) {
                           outOrder_step %in% x })))
    names(P_remove) <-
      P_orig
    row.names(P_remove) <-
      paste("Step", 1:length(P_rm_step))
    if(nrow(P_remove)!=1) {
      P_remove <-
        apply(P_remove, 2, function(x) ifelse(x, 1, 0))
      P_remove_final <-
        colSums(P_remove)
      P_remove <-
        rbind(P_remove, P_remove_final)
      row.names(P_remove)[nrow(P_remove)] <-
        "Removed"
    } else {
      P_remove <-
        matrix(apply(P_remove, 2, function(x) ifelse(x, 1, 0)), 1, length(P_orig))
      dimnames(P_remove) <-
        list("Removed", P_orig)
    }
  } else {
    P_remove <-
      matrix(rep(0, length(P_orig)), 1, length(P_orig))
    dimnames(P_remove) <-
      list("Removed", P_orig)
  }
  
  P_included <-
    as_tibble(names(P_remove[nrow(P_remove), ][P_remove[nrow(P_remove), ] == 0] ))
  predictors_final <-
    names(P_remove[nrow(P_remove), ][P_remove[nrow(P_remove), ] == 0])
  if(length(P_orig)==1 & !is_empty(P))
    P_included <- predictors_final <- P
  
  if(is_empty(P) | p.crit==1){
    RR_model_step <-
      RR.model
    multiparm_step <-
      multiparm
    fm_step_total <-
      fm_step
    if(p.crit!=1) names(RR_model_step) <- names(multiparm_step) <- names(fm_step_total) <-
      paste("Step", 1:(k+1), "- removal -", c(unlist(P_rm_step_final), "ended"))
    if(p.crit==1)  names(RR_model_step) <- names(multiparm_step) <- names(fm_step_total) <-
      paste("Step", 1, "- no variables removed -")
    RR_model_final <-
      RR_model_step[k+1]
    multiparm_final <-
      multiparm_step[k+1]
    fm_step_final <-
      fm_step_total[k+1]
    if(p.crit==1) {
      Y_initial <-
        c(paste0("Surv(", time, ",", status, ")~"))
      formula_initial <-
        as.formula(paste(Y_initial, paste(P_orig, collapse = "+")))
      fm_step_final <-
        formula_initial
      RR_model_final <-
        RR_model_step[k]
      multiparm_final <-
        multiparm_step[k]
    }
  }
  if(!is_empty(P) & p.crit !=1){
    RR_model_step <-
      RR.model
    multiparm_step <-
      multiparm
    fm_step_total <-
      fm_step
    names(RR_model_step) <-
      names(multiparm_step) <- names(fm_step_total) <-
      paste("Step", 1:k, "- removal -", c(unlist(P_rm_step_final), "ended"))
    RR_model_final <-
      RR.model[k]
    multiparm_final <-
      multiparm[k]
    fm_step_final <-
      fm_step[k]
  }
  
  Y_initial <-
    c(paste0("Surv(", time, ",", status, ")~"))
  formula_initial <-
    as.formula(paste(Y_initial, paste(P_orig, collapse = "+")))
  
  bw <-
    list(data = data, RR_model = RR_model_step, RR_model_final = RR_model_final,
         multiparm = multiparm_step, multiparm_final = multiparm_final,
         formula_step = fm_step_total, formula_final = fm_step_final,
         formula_initial = formula_initial, predictors_in = P_included,
         predictors_out = P_remove,
         impvar = impvar, nimp = nimp, status = status, time = time,
         method = method, p.crit = p.crit, call = call,
         model_type = "survival", direction = "BW",
         predictors_final = predictors_final, predictors_initial = P_orig,
         keep.predictors = keep.P, strata.variable = strata.P)
  return(bw)
}

Try the psfmi package in your browser

Any scripts or data that you put into this service are public.

psfmi documentation built on July 9, 2023, 7:02 p.m.