R/clean_data_atts.R

Defines functions clean_data_atts

Documented in clean_data_atts

#' "Clean" Data Frame Attributes
#'
#' @description
#' Drops name.lab and val.lab attributes associated with columns that are not
#' present in the data.frame (i.e., have been dropped) and re-arranges
#' data.frame attributes so that they appear in a clean, logical order.
#'
#' @details
#' labelr meta-data exist as data.frame attributes, added through interactive
#' use in a potentially haphazard order. This function, which is used inside
#' other labelr functions, drops labels for variables that are not (no longer)
#' present in the data.frame and re-arranges label and other data.frame
#' attributes to put them in a more, logical, user-readable order when accessed
#' via, e.g., `attributes()`.
#'
#' @param data a data.frame.
#'
#' @return A data.frame, with attributes re-arranged.
#' @export
#' @examples
#' # make toy demographic (age, gender, raceth) data set
#' set.seed(555)
#' df <- make_demo_data(n = 1000)
#'
#' # let's add variable VALUE labels for variable "raceth"
#' df <- add_val_labs(df,
#'   vars = "raceth", vals = c(1:7),
#'   labs = c("White", "Black", "Latino", "Asian", "AIAN", "Multi", "Other"),
#'   max.unique.vals = 50
#' )
#'
#' # let's add variable VALUE labels for variable "gender"
#' df <- add_val1(
#'   data = df, gender, vals = c(0, 1, 2),
#'   labs = c("M", "F", "O"), max.unique.vals = 50
#' )
#'
#' # let's add variable NAME labels
#' df <- add_name_labs(df, name.labs = c(
#'   "age" = "Age in years",
#'   "raceth" = "raceth category",
#'   "gender" = "gender assigned at birth"
#' ))
#'
#'
#' # let's add a frame label
#' df <- add_frame_lab(df, frame.lab = "This is a fictional data set that includes
#'                     demographic variables. It is generated by
#'                     labelr::make_demo_data")
#'
#' # show attributes
#' attributes(df)
#'
#' # re-arrange and show attributes
#' df2 <- clean_data_atts(df)
#' attributes(df2)
#'
#' # confirm that attributes from df are all present in df2
#' all(attributes(df) %in% attributes(df2)) # TRUE
clean_data_atts <- function(data) {
  # make this a Base R data.frame
  data <- as_base_data_frame(data)

  val_labs_to_check <- names(attributes(data))[grepl(
    "val.labs",
    names(attributes(data))
  )]

  name_labs_names <- names(attributes(data)[["name.labs"]])

  val_labs_names <- gsub("val.labs.", "", val_labs_to_check)

  u_fact_to_check <- names(attributes(data))[grepl(
    "u.factor",
    names(attributes(data))
  )]

  u_fact_names <- gsub("u.factor.", "", u_fact_to_check)

  o_fact_to_check <- names(attributes(data))[grepl(
    "o.factor",
    names(attributes(data))
  )]

  o_fact_names <- gsub("u.factor.", "", u_fact_to_check)

  all_var_names <- unique(c(
    name_labs_names, val_labs_names,
    u_fact_names, o_fact_names
  ))

  absent_var_names <- all_var_names[!all_var_names %in% names(data)]

  # drop unneeded val.labs and factor information
  if (length(absent_var_names) > 0) {
    for (i in seq_along(absent_var_names)) {
      this_var <- absent_var_names[i]
      this_val_lab_name <- paste0("val.labs.", this_var)
      this_u_fact_name <- paste0("u.factor.", this_var)
      this_o_fact_name <- paste0("o.factor.", this_var)
      attributes(data)[[this_val_lab_name]] <- NULL
      attributes(data)[[this_u_fact_name]] <- NULL
      attributes(data)[[this_o_fact_name]] <- NULL

      # if a name.lab exists for this_var, remove it
      if (!is.null(attributes(data)[["name.labs"]])) {
        if (!is.null(attributes(data)[["name.labs"]][this_var])) {
          names_lab_att <- attributes(data)[["name.labs"]]
          names_lab_att <- names_lab_att[!names(names_lab_att) %in% this_var]
          attributes(data)[["name.labs"]] <- names_lab_att
        }
      }
    }
  }

  # re-arrange attributes to be in a clean, logical order
  all_att_names <- names(attributes(data))

  core_att_names <- c("names", "row.names", "class")
  core_in <- core_att_names[core_att_names %in% all_att_names]

  frame_name <- c("frame.lab", "name.labs")
  frame_name_in <- frame_name[frame_name %in% all_att_names]

  val_lab_names <- paste0("val.labs.", names(data))
  val_lab_in <- val_lab_names[val_lab_names %in% all_att_names]

  fact_names <- c(
    paste0("u.factor.", names(data)),
    paste0("o.factor.", names(data))
  )

  fact_in <- fact_names[fact_names %in% all_att_names]

  names_in_combined <- c(core_in, frame_name_in, val_lab_in, fact_in)

  other_names_in <- all_att_names[!all_att_names %in% names_in_combined]

  final_names <- c(
    core_in, frame_name_in,
    fact_in, val_lab_in,
    other_names_in
  )

  final_atts <- attributes(data)[final_names]

  attributes(data) <- NULL
  attributes(data) <- final_atts

  # use any name.labs as label attributes
  for (i in names(data)) {
    name_lab <- suppressWarnings(get_name_labs(data, i)$lab)
    if (!is.na(name_lab[1]) && name_lab[1] != "NA") attr(data[[i]], "label") <- name_lab[1]
  }

  return(data)
}

Try the labelr package in your browser

Any scripts or data that you put into this service are public.

labelr documentation built on Sept. 11, 2024, 9:05 p.m.