R/loadData.R

Defines functions split_data_from_listing guess_ID_variable codegen_load_all_sas_data parse_formats list_freetext_markdown read_sas_formatted split_redcap_dataset read_redcap_formatted unlabel extract_labels

Documented in codegen_load_all_sas_data extract_labels guess_ID_variable list_freetext_markdown parse_formats read_redcap_formatted read_sas_formatted split_redcap_dataset unlabel

#' Extract the label attribute from data
#'
#' @param dat data in the form of a \code{\link[base]{list}}, \code{\link[base]{data.frame}}
#' or \code{\link[tibble]{tibble}}, or a vector
#'
#' @return list of labels
#' @examples
#' a <- c(1, 2)
#' attr(a, "label") <- "b"
#' identical(extract_labels(a), list(a = attr(a, "label")))
#' @export
#'
extract_labels <- function(dat) {
  if (inherits(dat, "list") | inherits(dat, "data.frame") | inherits(dat, "tbl")) {
    extracted_labels <- lapply(dat, function(x) attr(x, "label"))
  } else {
    nm <- deparse(substitute(dat))
    extracted_labels <- list(attr(dat, "label"))
    names(extracted_labels) <- nm
  }
  extracted_labels <- extracted_labels[!sapply(extracted_labels, is.null) &
    !sapply(extracted_labels, function(x) isTRUE(trimws(x) == ""))]
  return(extracted_labels)
}

#' Remove the label attribute from data
#'
#' @inheritParams extract_labels
#'
#' @return data with the labels removed
#' @examples
#' a <- c(1, 2)
#' attr(a, "label") <- "b"
#' identical(unlabel(a), c(1, 2))
#' @export
#'
unlabel <- function(dat) {
  unlabel_fun <- function(x) {
    if (inherits(x, "labelled")) {
      class(x) <- class(x)[class(x) != "labelled"]
    }
    if (!is.null(attr(x, "label"))){
      attr(x, "label") <- NULL
    }
    return(x)
  }
  if (inherits(dat, "data.frame") | inherits(dat, "tbl")) {
    dat <- mutate(dat, across(everything(), unlabel_fun))
  } else if (inherits(dat, "list")) {

  } else {
    dat <- unlabel_fun(dat)
  }
  dat
}

#' Convencience function to load datasets downloaded from a Redcap database
#'
#' This function is specifically tailored to the way the default import script
#' provided by a Redcap database functions. First, the \code{Hmisc} package is loaded.
#' The .csv file containing the data is assumed to be located in the current working directory.
#' Labels are assigned to all variables. Variables which are supposed to be factors are twice,
#' once as a factor and once in an unformatted way.
#'
#' This script removes the "unformatted factor" variables and properly assignes labels.
#'
#' @param path_to_redcap_script (character) Path to the (automatically generated) redcap script for data import
#'
#' @return tibble with data
#' @examples
#' path_to_redcap_script <- system.file("examples", "testredcap.r", package = "DescrTab2")
#' read_redcap_formatted(path_to_redcap_script)
#' @export
#' @importFrom Hmisc label label<-
read_redcap_formatted <- function(path_to_redcap_script = NULL) {
  stopifnot(is.character(path_to_redcap_script))
  source(path_to_redcap_script, encoding = "UTF-8", local = TRUE)
  data <- as_tibble(data)
  colnames_data <- names(data)
  for (colname in colnames_data) {
    if (str_detect(colname, "\\.factor$")) {
      fac <- data[[colname]]
      label(fac) <- label(data[[str_remove(colname, "\\.factor$")]])
      data[str_remove(colname, "\\.factor$")] <- fac
      data <- data[names(data) != colname]
    }
  }
  data
}

#' Split a dataset imported from Redcap into convenient subsets
#'
#' This function seperates a datasets into three parts: "Singular" data, which is the
#' data from non-repeating instruments. "missings_everywhere", which is data which is missing for each row.
#' The last parts are all the repeating instruments, which are referred to by their name as recorded in
#' \code{dat$redcap_repeat_instrument}.
#'
#' @param dat a \code{tibble} produced by \code{\link{read_redcap_formatted}}.
#' @param id_name (character) the name of the subject ID variable.
#'
#' @return a list of datasets separated into the categories as described
#' @export
#'
#' @examples
#' path_to_redcap_script <- system.file("examples", "testredcap.r", package = "DescrTab2")
#' dat <- read_redcap_formatted(path_to_redcap_script)
#' d <- split_redcap_dataset(dat, guess_ID_variable(dat, TRUE))
split_redcap_dataset <- function(dat, id_name = "patid") {
  missings_everywhere <-
    dat %>% select(!!id_name, where(~ (all(is.na(.x)) | all(.x == ""))))

  d <-
    lapply(
      as.character(unique(
        fct_explicit_na(dat$redcap_repeat_instrument)
      )),
      function(x) {
        dat %>%
          filter(fct_explicit_na(dat$redcap_repeat_instrument) == !!x)
      }
    )

  names(d) <-
    as.character(unique(fct_explicit_na(dat$redcap_repeat_instrument)))
  names(d)[names(d) == "(Missing)"] <- "Singular"

  d <- lapply(d, function(x) {
    x %>% select(where(~ !(all(is.na(.x)) | all(.x == ""))))
  })

  d[["missings_everywhere"]] <- missings_everywhere
  d
}

#' Convencience function to load SAS datasets
#'
#' @param path_to_data path to .sas7bdat file
#' @param path_to_format path to .sas7bcat file
#'
#' @return tibble with data
#' @export
#' @examples
#' path_to_data <- system.file("examples", "testsas.sas7bdat", package = "DescrTab2")
#' pat_to_format <- system.file("examples", "formats.sas7bcat", package = "DescrTab2")
#' read_sas_formatted(path_to_data, pat_to_format)
#' @importFrom haven read_sas
#'
read_sas_formatted <- function(path_to_data = NULL, path_to_format = NULL) {
  erg <- read_sas(
    path_to_data,
    path_to_format
  )
  erg <- erg %>%
    mutate(across(where(function(x) inherits(x, "haven_labelled")), as_factor))
  erg
}

#' Create a markdown listing from a character dataset
#'
#' @param dat a character \code{data.frame} or \code{tibble}.
#'
#' @return string containing markdown code listing all nonempty free text in the dataset
#' @examples
#' dat  <- data.frame(Freetext = c("Some text", "More text"))
#' list_freetext_markdown(dat)
#' # use inside a .Rmd document like this:
#' # `r list_freetext_markdown(dat)`
#' @export
#'
list_freetext_markdown <- function(dat) {
  dat <- as_tibble(dat)
  str <- ""
  for (i in 1:ncol(dat)) {
    var <- pull(dat, !!i)
    name <- names(dat[, i])[1]
    lab <- attr(var, "label")
    print_name <- if (is.null(lab)) name else paste0(lab, " (", name, ")")
    var <- var[!(var %in% c("", NA_character_))]
    if (length(var) > 0) {
      namerow <- paste0("**", print_name, "**\n\n")
      varrows <- paste0(" * ", var, "\n")
      str <- paste0(str, namerow, paste0(varrows, collapse = ""), "\n", collapse = "")
    }
  }
  str
}


#' Parse a text file containing format information
#'
#' Useful to extract factor formatting information contained in a proc format SAS statement.
#'
#' @param path_to_format_definition (string) Path to the text file to be parsed
#' @param ignore_keywords A vector of keywords to be ignored when searching for the name of
#' the variable to be formatted
#' @param encoding Encoding for the text file
#'
#' @return A named list with format definitions
#' @examples
#' tmpfile <- tempfile()
#' write(     "proc format;
#'              value yn  1=\"yes\"
#'                        0=\"no\";
#'              value sex 1=\"female\"
#'                        0=\"male\";
#'               run;",tmpfile)
#' parse_formats(tmpfile)
#' @export
parse_formats <- function(path_to_format_definition,
                          ignore_keywords = c("value"),
                          encoding = "ISO-8859-1") {
  ff <- file(path_to_format_definition, encoding = encoding)
  f <- readLines(ff) %>%
    paste0(collapse = "")
  close(ff)

  warning("The algorithm to extract SAS comments in this function is not implemented well.
This function works best if you manually remove all comments from the
text file and make sure there are no labels containing strings of the form '/*' or '*/'.")

  # strip "proc format;" and "run;" from the file
  tmp <- str_extract(f, "(?<=([pP][rR][oO][cC] [fF][oO][rR][mM][aA][tT])).*(?=[rR][uU][nN];)")
  # strip all comments delminited by /*  */
  tmp <- str_remove_all(tmp, "\\/\\*.*?\\*\\/")

  tmp <- strsplit(tmp, "")[[1]]

  i <- 1L

  strbuf <- ""
  current_delimiter <- NULL
  start_delim_index <- NULL
  end_delim_index <- NULL
  levels <- numeric()
  labels <- character()
  parse_varname <- TRUE
  parse_level <- FALSE
  parse_label <- FALSE

  format_list <- list()
  while (i <= length(tmp)) {
    current_char <- tmp[i]

    if (parse_varname) {
      strbuf <- paste0(strbuf, current_char)
      current_word <- str_extract(strbuf, "(?<=[\\s\\;])[^\\s]+(?=\\s)")
      if (!is.na(current_word)) {
        if (current_word %in% ignore_keywords) {
          strbuf <- str_replace(strbuf, "(?<=[\\s\\;])[^\\s]+(?=\\s)", "")
        } else {
          labels <- character()
          if (isTRUE(str_split(current_word, "")[[1]][[1]] == "$")) {
            char_factor <- TRUE
            varname <- paste0(str_split(current_word, "")[[1]][-1], collapse = "")
            levels <- character()
          } else {
            char_factor <- FALSE
            varname <- current_word
            levels <- numeric()
          }
          parse_varname <- FALSE
          parse_level <- TRUE
          strbuf <- current_char
        }
      }
    }
    if (parse_level) {
      if (isTRUE(current_char == ";")) {
        parse_level <- FALSE
        parse_varname <- TRUE
        current_delimiter <- NULL
        format_list[[varname]] <- list(levels = levels, labels = labels)
      }
      if (char_factor) {
        if (isTRUE(current_char == current_delimiter) | (isTRUE(str_detect(current_delimiter, "\\s")) && isTRUE(current_char == "="))) {
          end_delim_index <- i
        }
        if (!is.null(end_delim_index)) {
          current_level <- paste0(tmp[(start_delim_index + 1):(end_delim_index - 1)], collapse = "")
          if (!is.na(current_level)) {
            levels[length(levels) + 1] <- current_level
            parse_level <- FALSE
            parse_label <- TRUE
            current_delimiter <- NULL
            start_delim_index <- NULL
            end_delim_index <- NULL
            strbuf <- current_char
            i <- i + 1L
            next
          }
        }
        if (is.null(current_delimiter) && isTRUE(str_detect(current_char, "[\\'\"[:alpha:]\\.]"))) {
          if (str_detect(current_char, "[[:alpha:]\\.]")) {
            start_delim_index <- i - 1
            current_delimiter <- tmp[i - 1]
          } else {
            start_delim_index <- i
            current_delimiter <- current_char
          }
        }
      } else {
        strbuf <- paste0(strbuf, current_char)
        current_level <- str_extract(strbuf, "(?<=\\s)[[:digit:]\\.\\-]+(?=[\\s\\=])")
        if (!is.na(current_level)) {
          current_level <- if (isTRUE(current_level == ".")) NA_real_ else as.numeric(current_level)
          levels[length(levels) + 1] <- current_level
          parse_label <- TRUE
          parse_level <- FALSE
          strbuf <- current_char
        }
      }
    }
    if (parse_label) {
      if (isTRUE(current_char == current_delimiter) | (isTRUE(str_detect(current_delimiter, "\\s")) && isTRUE(current_char == ";"))) {
        end_delim_index <- i
      }
      if (!is.null(end_delim_index)) {
        current_label <- paste0(tmp[(start_delim_index + 1):(end_delim_index - 1)], collapse = "")
        if (!is.na(current_label)) {
          labels[length(labels) + 1] <- current_label
          parse_label <- FALSE
          if (isTRUE(current_char == ";")) {
            format_list[[varname]] <- list(levels = levels, labels = labels)
            parse_varname <- TRUE
          } else {
            parse_level <- TRUE
          }
          current_delimiter <- NULL
          start_delim_index <- NULL
          end_delim_index <- NULL
          strbuf <- current_char
          i <- i + 1L
          next
        }
      }
      if (is.null(current_delimiter) && str_detect(current_char, "[\\'\"[:alpha:]\\.]")) {
        if (str_detect(current_char, "[[:alpha:]\\.]")) {
          start_delim_index <- i - 1
          current_delimiter <- tmp[i - 1]
        } else {
          start_delim_index <- i
          current_delimiter <- current_char
        }
      }
    }
    i <- i + 1L
  }
  return(format_list)
}

#' Create code to load all SAS datasets in a folder.
#'
#' This is useful if you work with lots of separate SAS datasets spread out in the same folder.
#'
#' @param dir path to dataset folder
#' @param format path to format file
#'
#' @return NULL. Relevant code is printed to the console.
#' @examples
#' codegen_load_all_sas_data(system.file("examples", package = "DescrTab2"))
#' @export
codegen_load_all_sas_data <- function(dir, format = NULL) {
  e <- str_subset(list.files(dir), "\\.sas7bdat$")
  p <- paste0(dir, e)
  cat(paste0(
    "d.", str_replace(e, "\\.sas7bdat$", ""), " <- haven::read_sas(data_file = \"", p,
    "\", catalog_file = \"", format, "\")\n"
  ))
  cat(paste0("d <- list(", paste0("d.", str_replace(e, "\\.sas7bdat$", ""), collapse = ", "), ")\n"))
  cat(paste0("names(d) <- ", paste0("c(", paste0(paste0("\"", str_replace(e, "\\.sas7bdat$", ""), "\""),
    collapse = ", "
  ), ")")))
  return(NULL)
}

#' Make an educated guess about the name of the ID variable from a dataset
#'
#' @param dat a dataset with names (\code{list}, \code{data.frame}, \code{tibble})
#' @param suppressWarnings (logical) suppress warning messages if you know what you are dooing
#'
#' @return if exactly one possible
#' @export
#'
#' @examples
#' dat <- data.frame(ID = c(1,2,3,4,5),
#'                  other = c(1,2,3,4,5))
#' guess_ID_variable(dat)
#' @importFrom stringr str_to_lower
#' @importFrom magrittr `%>%`
guess_ID_variable <- function(dat, suppressWarnings = FALSE) {
  original_colnames <- names(dat)
  colnames <- str_to_lower(original_colnames)
  # common words to describe subject id
  ids1 <- str_detect(
    colnames,
    "(^subjectid$)|(^subid$)|(^subject$)|
(^patientid$)|(^patid$)|(^patient$)|
(^screeningno$)|(^screeno$)|(^screenno$)(^scrno$)|
(^randomid$)|(^randid$)|(^ranid$)|(^randomno$)|(^randno$)|(^ranno$)|(^rano$)
(^indices$)|(^index$)|(^idx$)"
  )
  # regex to catch "id" if it is not part of another word
  ids2 <- str_detect(colnames, "(?<![:alpha:])id(?![:alpha:])")
  ids <- original_colnames[ids1 | ids2]
  if (length(ids) > 1) {
    if (!isTRUE(suppressWarnings)) {
      warning("Multiple possible ID variables found. No candidate is chosen automatically.")
    }
    return(NULL)
  } else if (length(ids) == 1) {
    if (!isTRUE(suppressWarnings)) {
      warning(paste0(
        "One possible candidate for an ID variable found: ", ids,
        ". This variable is used as ID."
      ))
    }
    return(ids)
  } else {
    return(NULL)
  }
}
# nocov start
# work in progress
split_data_from_listing <- function(dat, n_split_levels = 10, n_split_characters = 35) {
  dat <- as_tibble(dat)
  idx_dat <- list()
  idx_list <- list()
  for (i in 1:ncol(dat)) {
    too_many_levels <- (is.character(pull(dat, !!i)) | inherits(pull(dat, !!i), "Date")) &
      (length(levels(factor(pull(dat, !!i)))) > n_split_levels)
    label_too_long <- length(attr(pull(dat, !!i), "label")) > n_split_characters
    name_too_long <- if (is.null(attr(pull(dat, !!i), "label"))) FALSE else length(names(dat[, i])[1]) > n_split_characters
    factor_levels_too_long <- (is.character(pull(dat, !!i)) | inherits(pull(dat, !!i), "Date") | is.factor(pull(dat, !!i))) &
      any(sapply(levels(factor(pull(dat, !!i))), function(x) length(x) > n_split_characters))

    if (too_many_levels | label_too_long | name_too_long | factor_levels_too_long) {
      idx_list[[length(idx_list) + 1]] <- i
    } else {
      idx_dat[[length(idx_dat) + 1]] <- i
    }
  }
  return(list(
    data = dat[, unlist(idx_dat)],
    list = dat[, unlist(idx_list)]
  ))
}
# nocov end

Try the DescrTab2 package in your browser

Any scripts or data that you put into this service are public.

DescrTab2 documentation built on Sept. 6, 2022, 9:05 a.m.