R/utils.R

Defines functions remove_alias is_model_possible extract_from_model

Documented in extract_from_model

#' Extract objects from a model
#'
#' @param model an object of type "lm" or "coxph"
#' @param vector A character vector of object to extract
#'
#' @return A vector
#' @export
#'
#' @examples
extract_from_model <- function(model, vector){
  extracted <- model %>%
    broom::tidy() %>%
    dplyr::filter(term != "(Intercept)" & term != "Residuals")

  extract2(extracted, vector) %>%
    setNames(extracted$term)
}

is_model_possible <- function(model){
  isTRUE(class(model)[1] == "lm" && df.residual(model) != 0 && deviance(model) >= sqrt(.Machine$double.eps) |
           inherits(model, "glm") && isTRUE(model$converged) |
           inherits(model, "coxph") && !any(is.nan(sqrt(diag(model$var)))))
}

remove_alias <- function(vars, mod, correction = FALSE) {
  tab_mod <- broom::tidy(mod)
  alias <- dplyr::filter(tab_mod, is.nan(statistic) | is.infinite(statistic) | abs(estimate) < .Machine$double.eps) %>%
    magrittr::extract2("term")
  alias <- c(alias, names(which(is.na(coef(mod)))))
  tab <- mod$model[-1]
  expanded_vars <- map(names(tab), function(x){
    if (is.factor(tab[[x]])){
      paste0(x, levels(tab[[x]]))
    } else x
  })
  vari <- map_lgl(expanded_vars, ~ isTRUE(any(alias %in% .)))
  if(correction){
    corrected <- map_lgl(vars[vari], function(x){
      ligne <- which(names(mod$xlevels) == x)
      if(length(ligne)){
        nb <- paste0(names(mod$xlevels[ligne]), mod$xlevels[[ligne]]) %in% alias %>%
          sum()
        if(nb == length(mod$xlevels[[ligne]]) - 1) TRUE else FALSE
      } else TRUE
    })
    vari[which(vari)] <- corrected
  }

  return(vari)

}

get_big_vif <- function(tab, vardep, varindep, varajust, type, mod, left_form){
  vars <- c(varindep, varajust)
  elimine <- character(0)

  if(length(varindep) > 1) { ## first, test multicolinearity among varindep only
    mod_indep <- update_mod(tab, mod, vardep, varindep, type, left_form)
    infl <- suppressWarnings(car::vif(mod_indep))
    if(!is.null(dim(infl))) infl <- infl[, 1, drop = TRUE]
    elimine <- remove_big_vif(tab, vardep, varindep, character(0), type, infl)
    if(length(elimine)){
      varindep <- remove_elements(varindep, elimine)
      vars <- c(varindep, varajust)
      if (length(vars) > 1){
        mod <- update_mod(tab, mod, vardep, vars, type, left_form)
        infl <- suppressWarnings(car::vif(mod))
        if(!is.null(dim(infl))) infl <- infl[, 1, drop = TRUE]
      }
    }
  }


  if (length(vars) > 1){
    infl <- suppressWarnings(car::vif(mod))
    if(!is.null(dim(infl))) infl <- infl[, 1, drop = TRUE]
    elimine_ajust <- remove_big_vif(tab, vardep, varindep, varajust, type, infl, only_varajust = TRUE) # ##  test multicolinearity among varajust + varindep only without removing varindep
    if(length(elimine_ajust)){
      vars <- remove_elements(vars, elimine_ajust)
      if (length(vars) > 1){
        mod <- update_mod(tab, mod, vardep, vars, type, left_form)
        infl <- suppressWarnings(car::vif(mod))
        if(!is.null(dim(infl))) infl <- infl[, 1, drop = TRUE]
      }
      elimine <- c(elimine, elimine_ajust)
    }
    # big_vif <- which(infl > 5)
    # if (length(big_vif)){
    #   if (varindep[1] %in% names(big_vif)) infl[varindep[1]] <- 0
    #   remaining_varindep <- intersect(vars, varindep)
    #   if (all(remaining_varindep %in% names(big_vif))) infl[remaining_varindep] <- 0
      elimine <- append(elimine,
                        remove_big_vif(tab, vardep, intersect(varindep, vars), intersect(varajust, vars), type, infl, only_varajust = TRUE)) # if necessary, remove all other vars
    #}
  }
  return(elimine)
}

get_choix_var <- function(tab){
  lab <- label(tab)
  names(tab) %>%
    setNames(lab) %>%
    sort()
}

drop_levels <- function(tab, remove = FALSE){
  exLabels <- label(tab)
  tab %<>% droplevels()
  label(tab, self = FALSE) <- exLabels
  if (remove){
    enleve <- map(seq_len(ncol(tab)), function(i){
      x <- tab[[i]]
      if (count_items(x) < 2) i
    }) %>% compact() %>% list_c()
    if(length(enleve)) tab <- tab[-enleve]
  }

  return(tab)
}

prepare_model <- function(tab, remove = FALSE){
  na.exclude(tab) %>%
    drop_levels(remove)
}

#' Get the nearest (ceiling) thousand
#'
#' @param x a length one numeric vector
#'
#' @return the nearest thousand
#' @export
#'
#' @examples
nearest_up_thousand <- function(x){
  order <- 10^3
  divide_by(x, order) %>%
    ceiling %>%
    magrittr::multiply_by(order)
}


#' Get the predicted number of terms in a statistical model
#'
#' @param tab The data frame
#' @param varajust A character vector of adjustment variables
#'
#' @return
#' @export
#'
#' @examples
get_nvar_mod <- function(tab, varajust = NULL, remove1 = TRUE){
  if(!is.null(varajust)) {
    tab %<>%
      dplyr::select(-dplyr::one_of(varajust))
  }
  map_dbl(tab, function(x){
    if(is.numeric(x)) 1
    else if (is.factor(x)) {
      n <- nlevels(x)
      if (remove1) n-1 else n
    }
  }) %>%
    sum() %>%
    magrittr::subtract(1)
}

#' Format numbers
#'
#' @param numbers a numeric vector
#' @param digits number of digits
#'
#' @return a character vector
#' @export
#'
#' @examples
format_number <- function(numbers, digits = 3){
  map_chr(numbers, function(x){
    if (is.nan(x)) return("NaN")
    if (is.na(x)) return("-")
    if  (x > 1E6) return("+Inf")
    if  (x < -1E6) return("-Inf")
    if (abs(x) < 1E-4) return(base::format(0))
    puiss <- floor(log10(abs(x)) + 1)
    nsmall <- ifelse (puiss >= 3, 0, digits - puiss)
    if (nsmall < 0) nsmall <- 0
    if (digits < 1) {
      x <- round(x, digits)
      digits <- 1
    }
    base::format(x, digits = digits, nsmall = nsmall)
  })
}

#' Use sprintf for displaying numbers with the right digit number
#'
#' @param string the string
#' @param ... values to be passed into string. Only logical, integer, real and character vectors are supported, but some coercion will be done: see the ‘Details’ section. Up to 100.
#'
#' @return A formatted string
#' @export
#'
#' @examples
sprintf_number_table <- function(string, ...){
  .dots <- list(...) %>%
    map_if(is.numeric, format_number)
  do.call(sprintf, c(list(fmt = string), .dots))
}

#' @export
format_pval <- function(number, keepNA = FALSE){
  map_chr(number, function(x){
    if (is.nan(x)) return("NaN")
    if (is.na(x) && keepNA) return(NA)
    if(is.na(x) | is.character(x)) return("-")
    f <- if (x < 0.001) f <- "<0.001"
    else if(x < 0.01) f <- "<0.01"
    else  {
      digits <- if (x < .051 & x > 0.049) 3 else 2
      base::format.pval(x, eps = 0.001, digits = digits)
    }
  })
}


remove_na <- function(x, y, ..., drop_factor = FALSE){
  dots <- list(...)
  if(length(dots) > 0){
    names(dots) <- paste0("V", seq_along(dots))
    supp <- as_tibble(dots)
  }
  else supp <- NULL
  no_na <- tibble(x, y) %>%
    bind_cols(supp)
  if(!any(label(no_na) == ".missing", na.rm = TRUE)) no_na %<>% na.exclude
  if (any(map_lgl(no_na, is.factor)) & drop_factor){
    no_na <- mutate_if(no_na, is.factor, fct_drop)
  }
  label(no_na, self = FALSE)[c("x","y")] <- c(label(x), label(y))
  no_na
}

#' @export
label_cutOff <- function (breaks, included) {
  l <- length(breaks)
  map(seq_len(l-1), function(i){
    if (i == 1){
      if (!is.null(included) && !included)
        paste0("<", breaks[i+1])
      else
        paste0("≤", breaks[i+1])
    }
    else if (i < l-1)
      paste(breaks[i], "-", breaks[i+1])
    else if (i == l-1){
      if (!is.null(included) && !included)
        paste0("≥", breaks[i])
      else
        paste0(">", breaks[i])
    }
  }) %>% compact() %>% list_c()
}


#' Get the number(s) formatted in percentage
#'
#' @param nb a numeric vector
#' @param symbol displays "\%"
#'
#' @export
#'
#' @examples
pourcent <- function(nb, symbol = TRUE, arrondi = NULL){
  map_chr(nb, function(x){
    if (is.nan(x) | is.na(x)) return("-")
    if (is.null(arrondi)) arrondi <- 2
    val <- multiply_by(x, 100)
    val <- if (val > 1E-3) {
      base::format(val, digits = arrondi, nsmall = arrondi - 2)
    } else 0

    if (symbol) paste0(val, "%") else format(val)
  })
}

#' Nice display of the analysis
#'
#' @param table The analysed data frame
#' @param ...
#'
#' @return
#' @export
#'
#' @examples
show_table_markdown <- function(table){
  table %>%
    prepare_table_export()
}

prepare_table_export <- function(tab){
  tab %<>% select(-.variable)
  if (nrow(tab) > 1){
    tab$id[2:nrow(tab)] <- NA
  }
  tab
}

are_enough_levels <- function(tab, x){
  extract2(tab, x) %>%
    as.factor() %>%
    fct_drop() %>%
    nlevels() %>%
    is_greater_than(1)
}

are_enough_cor <- function(tab, x, y, univ){
  min_rows <- ifelse(univ, 0L, 3L)
  if (is.numeric(tab[[x]]) & is.numeric(tab[[y]])) {
    nrow(tab) > 3L
  } else if (is.factor(tab[[x]]) & is.factor(tab[[y]])) {
    all(table(tab) >= min_rows)
  } else if (is.factor(tab[[x]])){
    counts <- group_by(tab, dplyr::across(all_of(x)), .drop = FALSE) %>%
      dplyr::count()
    all(counts$n > min_rows)
  } else if (is.factor(tab[[y]])){
    counts <- group_by(tab, dplyr::across(all_of(y)), .drop = FALSE) %>%
      dplyr::count()
    all(counts$n > min_rows)
  }
}

remove_big_vif <- function(tab, vardep, varindep, varajust, type, infl, only_varajust = FALSE) {
  vars <- c(varindep, varajust)
  selected_vars <- (if(only_varajust) varajust else vars)
  ajust <- infl[which(names(infl) %in% selected_vars)]
  elimine <- character(0)
  while (length(ajust) > 0 && max(ajust, na.rm = TRUE) > 5 & length(vars) > 1){
    big_vif <- which(ajust > 5)
    if (length(varindep) && varindep[1] %in% names(big_vif)) ajust[varindep[1]] <- 0

    gros <- names(ajust[which.max(ajust)])
    selected_vars <- selected_vars[selected_vars != gros]
    vars <- vars[vars != gros]
    elimine <- append(elimine, gros)
    if (length(vars) > 1){
      formule <- paste(vardep, "~", paste(vars, collapse = "+"))
      if (type == "logistic")
        model <- glm(as.formula(formule), data = tab, family = "binomial")
      else if (type == "linear")
        model <- lm(as.formula(formule), data = tab)
      else if (type == "survival"){
        tab2 <- tab[, c(".time", vardep, vars)] %>%
          na.exclude()
        formule <- sprintf("Surv(.time, %s) ~ %s", vardep, paste(vars, collapse = "+"))
        model <- survival::coxph(formula = as.formula(formule), model = TRUE, data = tab2)
      }
      alias <- remove_alias(vars, model)
      while (any(alias)){
        elimine <- append(elimine, vars[alias])
        vars <- vars[!alias]
        model <- update_mod(tab, model, vardep, vars, type, sprintf("Surv(.time, %s)", vardep))
        alias <- remove_alias(vars, model)
      }
      infl <- suppressWarnings(car::vif(model))
      if(!is.null(dim(infl))) infl <- infl[, 1, drop = TRUE]
      ajust <- infl[which(names(infl) %in% selected_vars)]
    }
  }
  elimine
}

#' @export
add_class <- function(x, classe){
  structure(x, class = unique(c(classe, class(x))))
}


no_multibyte <- function(x){
  UseMethod("no_multibyte")
}

no_multibyte.data.frame <- function(x){
  map_lgl(x, function(y){
    no_multibyte.default(y)
  }) %>%
    all(na.rm = TRUE)
}

no_multibyte.default <- function(x){
  stri_enc_isutf8(x) %>%
    all(na.rm = TRUE)
}

remove_multibyte <- function(x){
  UseMethod("remove_multibyte")
}
remove_multibyte.data.frame <- function(x){
  modify_if(x, Negate(no_multibyte.default), function(y) {
    remove_multibyte.default(y)
  }) %>%
    setNames(remove_multibyte.default(names(x)))
}

remove_multibyte.default <- function(x){
  iconv(x, sub="")
}

remove_multibyte_if_any <- function(x){
  if(!no_multibyte(x)){
    remove_multibyte(x)
  } else {
    x
  }
}

solve_contrast <- function(tab, vardep, x, univ = FALSE) {
  if(!is.null(x)){
    if (identical(class(x), class(tab[[vardep]])) && isTRUE(all.equal(x, tab[[vardep]], check.attributes = FALSE))) return(TRUE)
    if(count_items(x) < 2) return(FALSE)
    tmp <- data.frame(a = x, b = tab[[vardep]]) %>%
      na.exclude()
    if (is.factor(tmp$a) & is.factor(tmp$b)){
      if (length(compare_terms(tmp))) return(FALSE)
    }
    are_enough_cor(tmp, "a", "b", univ) &&
      are_enough_levels(tmp, "a") && are_enough_levels(tmp, "b")
  } else FALSE
}

update_mod <- function(tab, model, vardep, vars, type, left_form = NULL){
  stats::update(model,
                       as.formula(
                         sprintf("%s ~ %s",
                                 ifelse(type == "survival", left_form, vardep),
                                 paste(vars, collapse = " + ")
                         )),
                       data = tab)
}

compare_terms <- function(tmp){
  f1 <- as.formula(sprintf("~ %s - 1", names(tmp)[1]))
  f2 <- as.formula(sprintf("~ %s - 1", names(tmp)[2]))
  ma <- model.matrix(f1, data = tmp)
  mb <- model.matrix(f2, data = tmp)
  m <- cbind(ma, mb)
  l <- ncol(m)
  map(seq_len(l), function(i){
    if (i < l){
      map(seq.int(i + 1, l), function(j){
        if (sum(m[, i] == m[, j]) == nrow(m)){
          names(tmp)
        }
      }) %>% compact()
    }
  }) %>% compact()
}

identical_model_frame <- function(tab, formula, type){
  mf <- model.frame(formula, data = tab)
  if (type == "survival"){
    exLabel <- names(which(label(tab) == attr(mf[[1]], "inputAttributes")$event$label))
    mf <- cbind(mf[[1]][, 2], mf)
    mf[[2]] <- mf[[2]][, 1]
    names(mf)[1] <- ".time"
    names(mf)[2] <- exLabel
  }
  l <- ncol(mf)
  ide <- map(seq_len(l), function(i){
    if (i < l){
      map(seq.int(i + 1, l), function(j){
        tmp <- mf[c(i,j)] %>%
          na.exclude()
        if (is.factor(tmp[[1]]) & is.factor(tmp[[2]])) {
          compare_terms(tmp) %>%
          unlist() %>%
          unique()
        } else if (sum(as.numeric(mf[[i]]) == as.numeric(mf[[j]])) > 0.95 * nrow(mf)){
                names(mf)[c(i, j)]
        }
      }) %>%
        compact()
    }
  }) %>%
    compact()
  if(length(ide)){
    reduce(ide, union)
  } else NULL
}

#' @export
remove_elements <- function(vector, ...){
  dots <- list(...) %>% unlist
  vector[!vector %in% dots]
}

#' @export
is_entier <- function(x){
  if(is.factor(x)) {
    lev <- suppressWarnings(as.numeric(as.character(levels(x))))
    all(is_wholenumber(lev), na.rm = TRUE) & nlevels(x) < 10 & nlevels(x) >= 2
  } else {
    lev <- unique(na.exclude(x))
    all(is_wholenumber(lev), na.rm = TRUE) & length(lev) < 10 & length(lev) >= 2
  }
}

is_wholenumber <-
  function(x, tol = .Machine$double.eps^0.5)  {
    if (any(is.na(x)))
      FALSE
    else
      abs(x - round(x)) < tol
  }

#' Find object in the parents of the calling environment
#'
#' @param name a length 1 character vector of the objet to find
#' @param parent list of environments
#'
#' @return the last environment where the  object is found
#'
find_env <- function(name, parents = parent.frame()){
  n <- 0
  env <- sys.frame(n)
  while(!identical(env, parents)){
    if (name %in% ls(env)) {
      found <-  env
    }
    n <- n + 1
    env <- sys.frame(n)
  }
  return(found)
}

create_tab_cens <- function(x, time, censure){
  tab_cens <- remove_na(time, x, censure, drop_factor = TRUE)
  names(tab_cens) <- c(".time", "x", "censure")
  tab_cens$censure %<>% as.character() %>% as.numeric()
  tab_cens
}


coef_to_prob <- function(x) exp(x) / (1 + exp(x))


regroup_quantile_calibration <- function(x, quantiles){
  q <- quantiles
  quantiles[length(quantiles)] <- 1
  x$pred <- cut(x$M, breaks = quantiles,
                labels = ((quantiles+lag(quantiles))/2)[-1]) %>%
    as.character() %>%
    as.numeric()
  if(is.factor(x$D)){
    x$D <- as.numeric(x$D) - 1
  }
  x %>%
    group_by(pred) %>%
    summarise(obs = list(binom.test(sum(D), length(D))))
}

#' @export
add_elements <- function(vector, ...){
  dots <- list(...) %>% unlist
  append(vector, dots)
}

keep_warning <- function(expr) {
  myWarnings <- NULL
  wHandler <- function(w) {
    if(!grepl("Vectorizing",w$message)) myWarnings <<- c(myWarnings, w$message)
    invokeRestart("muffleWarning")
  }
  val <- withCallingHandlers(expr, warning = wHandler)
  if (is.null(val)) return(NULL)
  structure(val, warning = myWarnings)
}

filter_glm_fit <- function(mod, tab, varindep, varajust, pred = 0){
  tab %<>% na.exclude()
  eps <- .Machine$double.eps*1000
  p <- predict(mod, type = "response")
  t <- tab[p < 1-eps & p > eps,]
  if(pred == 2){
    allVars <- prepare_variables(t, varindep, varajust, pred)
    formule <- sprintf(". ~ %s", paste(purrr::list_c(allVars), collapse = " + "))
    mod2 <- try2(update(mod, formula = formule, data = t))
  } else {
    mod2 <- try2(update(mod, data = t))
  }
  if (is_error(mod2)) return(NULL)
  if (all(round(coef(mod), 2) == round(coef(mod2),2) , na.rm = TRUE)) {
    attr(mod2, "warning") <- NULL
  }
  mod2
}

#' @export
get_min_class <- function(tab, vardep, type = "logistic"){
  min(table(tab[[vardep]]))
}

#' @export
find_best_precision <- function(variable){
  range(variable, na.rm = TRUE) %>%
    base::diff() %>%
    log10() %>%
    ceiling() %>%
    subtract(2) %>%
    10^.
}

qc <- function(...){
  exprs(...) %>%
    map_chr(as_name) %>%
    unname()
}

remove_missing_levels <- function(tab, mod){
  UseMethod("remove_missing_levels", mod)
}

remove_missing_levels.mira <- function(tab, mod){
  remove_missing_levels.default(tab, getfit(mod, 1))
}

remove_missing_levels.default <- function(tab, mod){
  train_lev <- mod$xlevels
  if(!length(train_lev)) return(tab)
  test_lev <- map(tab, function(x) levels(x))
  all_vars <- test_lev[names(train_lev)]
  if(identical(all_vars, train_lev)) return(tab)

  differences <- map(seq_along(all_vars), function(i) {
    setdiff(all_vars[[i]], train_lev[[i]])
  }) %>% setNames(names(all_vars))

  for (i in seq_along(differences)){
    if(length(differences[[i]])){
      levels(tab[[names(differences[i])]])[levels(tab[[names(differences[i])]]) == differences[[i]]] <- NA
    }
  }
  tab
}

tryCatch_all <- function(expr) {
  warn <- err <- NULL
  value <- withCallingHandlers(
    tryCatch(expr, error=function(e) {
      err <<- e
      NULL
    }), warning=function(w) {
      if (is.null(warn)){
        warn <<- w
      } else {
        warn$message <<- paste(warn$message, w$message, sep = " / ")
      }

      invokeRestart("muffleWarning")
    })
  structure(list(value=value, warning=warn, error=err),
            class = unique(c(class(err), class(warn), class(value))))
}

#' Simple tryCatch for errors and warnings
#'
#' @param expr expression to be evaluated.
#' @param errors character vector containing error message to bypass
#' @param warnings character vector containing warning message to bypass
#'
#' @return
#' @export
#'
try2 <- function(expr, errors, warnings){
  grepf <- function(pattern, x){
    if (identical(pattern, ".*")) TRUE else grepl(pattern, x, fixed = TRUE)
  }
  if(missing(errors)) errors <- NULL
  if(missing(warnings)) warnings <- NULL
  res <- tryCatch_all(expr)
  if (is_error(res)){
    all_cond <- map_lgl(errors, grepf, res$error$message)
    if (all(all_cond == FALSE, na.rm = TRUE)){
      warning(paste(Sys.time(), "-", "Error: ", res$error$message))
    }
    return(structure(list(), message = res$error$message, class = "error"))
  }
  if (is_warning(res)){
    all_cond <- map_lgl(warnings, grepf, res$warning$message)
    if (all(all_cond == FALSE, na.rm = TRUE)){
      warning(paste(Sys.time(), "-", "Error warning: ", res$warning$message))
    }
    if(is.null(res$value)) res$value <- list()
    return(structure(res$value, message = res$warning$message, class = unique(c(class(res$value), "warning"))))
  }
  res$value
}

#' Helper function to get error or warning class
#' @param x a variable
#' @export
is_error <- function(x){
  inherits(x, c("error", "try-error"))
}

#' @rdname is_error
#' @export
is_warning <- function(x){
  inherits(x, "warning") | !is.null(attr(x, "warning"))
}

#' @export
count_items <- function(x){
  length(unique(na.exclude(x)))
}

gettext <- function(..., domain = "R-simplestats"){
  base::gettext(..., domain = domain)
}

gettextf <- function(fmt, ..., domain = "R-simplestats"){
  base::gettextf(fmt, ..., domain = domain)
}
resample <- function(x, ...) x[sample.int(length(x), ...)]
KZARCA/simplestats documentation built on Feb. 19, 2024, 1:11 a.m.