R/prepare_tab.R

Defines functions homogeneise_tab create_tabi make_tab_survival standardize_tab replace_infinite remove_guillemets standardize_names_basic standardize_names transform_date replace_virgules lower_tab factor_strings remove_na_cols remove_na_rows make_name_matching

Documented in create_tabi make_tab_survival standardize_names standardize_tab

make_name_matching <- function(tab, trunc = TRUE, length = 40){
  df <- tibble(noms = names(tab))
  mutate(df,
         index = dplyr::row_number(),
         present = noms %in% names(remove_na_cols(tab)),
         name_matching = standardize_names_basic(noms),
         labs = standardize_names(noms, trunc = trunc, length = length))
}

remove_na_rows <- function(tab){
  na_rows <- which(rowSums(is.na(tab)) == ncol(tab))
  if (length(na_rows)){
    return(tab[-na_rows, , drop = FALSE])
  }
  return(tab)
}


remove_na_cols <- function(tab){
  na_cols <- which(colSums(is.na(tab)) == nrow(tab))
  if (length(na_cols)){
    return(tab[-na_cols])
  }
  return(tab)
}

factor_strings <- function(tab){
  as.data.frame(
    lapply(tab, function(x) {
      if(inherits(x, "POSIXct")) {
        return(as.character(x))
      }
      if(all(is.na(x))) return(NA)
      if (is.numeric(x) && (min(x, na.rm = TRUE) > 1E5 || max(x, na.rm = TRUE) >= 2^30)) x <- as.character(x)
      if(is.character(x) | is.factor(x)) {
        b <- factor(x)
        if (nlevels(b) >= 10L) return(as.character(x))
        x <- reorder(b, b, function(y)-length(y))
        levels(x) %<>% str_trunc(20)
      } else if (count_items(x) < 5 & is_entier(x)){
        x <- factor(x)
        levels(x) %<>% str_trunc(20)
      }
      if (is.factor(x) & nlevels(x) < 2 | is.numeric(x) & count_items(x) < 2){
        return(as.character(x))
      }
      x
    }), stringsAsFactors = FALSE
  )
}

lower_tab <- function(tab){
  as.data.frame(
    lapply(tab, function(x) {
      if(is.character(x) | is.factor(x)) {
        if (!all(grepl("^[A-Z]+$", x), na.rm = TRUE)) {
          iconv(x, sub = "") #%>%
          #tolower()
        } else x
      }
      else x
    }), stringsAsFactors = FALSE
  )
}

replace_virgules <- function(tab){
  as.data.frame(
    lapply(tab, function(x) {
      if (all(grepl("^-?[0-9]*[,\\.]?[0-9]*$", x) | is.na(x), na.rm = TRUE)){
        x <- suppressWarnings(as.numeric(gsub(",", ".", x)))
      }
      else
        x
    }), stringsAsFactors = FALSE
  )
}

transform_date <- function(tab){
  possibleFormats <- cbind(
    c(
      "^[0-3][0-9]/[0-1][0-9]/[1-2][0-9][0-9][0-9]$",
      "^[0-3][0-9]/[0-1][0-9]/[0-9][0-9]$",
      "^[0-3][0-9]-[0-1][0-9]-[1-2][0-9][0-9][0-9]$",
      "^[0-3][0-9]-[0-1][0-9]-[0-9][0-9]$",
      "^[1-2][0-9][0-9][0-9]-[0-1][0-9]-[0-3][0-9]$"
    ),
    c("%d/%m/%Y", "%d/%m/%y", "%d-%m-%Y", "%d-%m-%y", "%Y-%m-%d")
  )
  lapply(seq_along(tab), function(c) {
    column <- tab[[c]]
    walk(seq_len(nrow(possibleFormats)), function(i){
      format = possibleFormats[i, 1]
      if (any(grepl(format, column))) {
        conv <- map_dbl(column, function(cell){
          suppressWarnings(num <- as.numeric(cell))
          if (grepl(format, cell))
            as.Date(cell, format = possibleFormats[i, 2])
          else if (!is.na(num))
            as.Date(num, origin = "1899-12-30")
          else
            NA
        })

        tab[[c]] <<- as.Date(base::unname(conv), origin = "1970-01-01")
      }
    })
  })
  return(tab)
}


#' Create better colnames
#'
#' @param names character vector
#'
#' @return nice names vectors
#' @export
#'
#' @examples
standardize_names <- function(names, trunc = FALSE, length = 40){
  noms <- standardize_names_basic(names) %>%
    str_replace_all("_", " ") %>%
    str_replace("^L ","L'") %>%
    str_replace_all(" l "," l'") %>%
    str_replace("^C ","C'") %>%
    str_replace_all(" c "," c'") %>%
    str_replace("^D ","D'") %>%
    str_replace_all(" d "," d'") %>%
    str_replace_all("\n"," ") %>%
    trimws() %>%
    capitalize()
  if (trunc == TRUE){
    noms %<>% str_trunc(length)
  }
  noms
}

standardize_names_basic <- function(names){
  remove_multibyte_if_any(names) %>%
    trimws() %>%
    str_replace("^[\"\'](.*)[\"\']$", "\\1") %>%
    str_replace_all("[\\.]+"," ") %>%
    str_replace_all("[ ]+", " ") %>%
    str_replace_all("[\\[\\]]+", " ") %>%
    trimws()
}

remove_guillemets <- function(tab){
  mutate_if(tab, function(p) is.factor(p) & any(str_detect(p, "^[\"\'](.*)[\"\']"), na.rm = TRUE), function(x){
    re <- str_replace(x, "^[\"\'](.*)[\"\']", "\\1")
    suppressWarnings({
      if (!any(is.na(as.numeric(na.exclude(re))))){
        as.numeric(re)
      } else as.factor(re)
    })
  })
}

replace_infinite <- function(tab){
 lapply(tab, function(x){
   x[is.infinite(x)] <- NA
   x
 }) %>%
    as.data.frame(stringAsFactors = FALSE)
}


#' Prepare a data.frame for analysis
#'
#' This function \itemize{
#'    \item creates a label for each columns, allowing for a better display,
#'    \item removes empty cells on top of the data.frame
#'    \item standardize the dates
#'    \item lowers case of levels for character vectors
#'    \item factors qualitative variables
#'    \item removes quoting characters
#'}
#' @param tab the data.frame
#'
#' @return A standardized data.frame
#' @export
#'
#' @examples
standardize_tab <- function(tab){
  tab %<>% remove_na_rows() %>%
    remove_na_cols()
  tab <- tab[!is.na(names(tab)) & names(tab) != ""]
  labs <- standardize_names(names(tab), trunc = TRUE)
  names(tab) <- standardize_names_basic(names(tab))

  tab %<>%
    replace_infinite() %>%
    remove_na_rows() %>%
     replace_virgules() %>%
     transform_date() %>%
     lower_tab() %>%
     factor_strings() %>%
     remove_guillemets()

  names(tab) <- gsub("\\.", "_", names(tab))

  label(tab, self = FALSE) <- labs
  return(tab)
}

#' Add column .time
#'
#' Adds the column .time, which is the follow-up time
#' @param tab The data frame
#' @param vardep The dependent variable
#' @param passage If 1, The first level of the event variable is the absence of event; if 2, the second level is the absence of event
#' @param typeCensure If 1: Follow-up time computed from 2 dates: inclusion and censor; \
#' If 2: the follow-up time is already inside a column
#' @param dateInclusion Date vector of length equals to the number of rows of tab: inclusion date.
#' @param dateSortie Date vector of length equals to the number of rows of tab: censor or event.
#' @param var_time Name of the column indicating the follow-up time follow-up

#' @details This function requires either var_time or dateSortie and dateInclusion
#' @return A table with .time column
#' @export
#'
#' @examples
make_tab_survival <- function(tab, vardep, passage = 1, typeCensure = 2, dateInclusion = NULL,
                              dateSortie = NULL, var_time = NULL, limit = NULL){
  lev <- levels(tab[[vardep]])
  tab[[vardep]] <- relevel(tab[[vardep]], ref=ifelse(passage == 1, lev[1], lev[2]))
  lev <- levels(tab[[vardep]])
  #attr(tab[[vardep]], "scores") <- -table(tab[[vardep]])
  if (typeCensure == 1) {
    tab$.time <- as.numeric(dateSortie - dateInclusion)
  } else if (typeCensure == 2) {
    tab$.time <- tab[[var_time]]
    tab %<>% select(-one_of(var_time))
  }
  exLabel <- label(tab[[vardep]])
  tab[[vardep]] %<>%
    as.numeric %>%
    subtract(1)
  if(!is.null(limit) && !is.na(limit) && limit != 0){
    time2 <- pmin(limit, tab$.time)
    tab[[vardep]] <- ifelse(tab[[vardep]] == 1 & tab$.time > time2 | tab[[vardep]] == 0, 0, 1)
    tab$.time <- time2
  }
  label(tab[[vardep]]) <- exLabel
  tab
}


#' Create intermediate table
#'
#' @param tab a data frame
#' @param type a character vector. Possible values : "desc", "expl" and "pred"
#'
#' @return a curated data frame
#' @export
create_tabi <- function(tab, type, keep = NULL){
  tf <- Filter(function(x) count_items(x) > 1 & !inherits(x, "Date") & !is.character(x), tab)
  if (type == "desc"){
    return(tf)
  } else {
    if (ncol(tf) > 1){
      filtered <- Filter(function(x) get_propDM(x) <= 0.2, tf)
      removed <- get_large_missing(filtered)
      all_removed <- intersect(
        c(removed, setdiff(names(tf), names(filtered))),
        names(tab))

      final <- tf[setdiff(names(tf), all_removed)]
      if (type == "pred" && !is.null(keep)) final <- tf[union(intersect(names(tf), keep), names(final))]
      structure(final, removed = all_removed)
    } else {
      tf
    }
  }
}

#' @export
homogeneise_tab <- function(tab, before, after){
  purrr::walk2(before, after, function(x, y){
    if (is.factor(tab[[x]]) & !is.factor(tab[[y]]) | !is.factor(tab[[x]]) & is.factor(tab[[y]])) {
      exlab <- label(tab[[x]])
      tab[[x]] <<- as.numeric(as.character(tab[[x]]))
      label(tab[[x]]) <<- exlab
      exlab <- label(tab[[y]])
      tab[[y]] <<- as.numeric(as.character(tab[[y]]))
      label(tab[[y]]) <<- exlab
    }
  })
  all_vars <- purrr::map2(before, after, function(x, y){
    if (solve_contrast(tab, x, tab[[y]], univ = TRUE)) {
      c(x, y)
    }
  }) %>% compact() %>%
    purrr::list_c()
  tab[all_vars]
}
KZARCA/simplestats documentation built on July 30, 2024, 7:03 p.m.