R/format_vertical_data.R

Defines functions get_column_data restructure_as_in_xlsx get_col_names format_vertical_data

#' Extracts vertical sheet data using the supplied tidyxl_data and formatting_data
#'
#' @param tidyxl_data cell data extracted from a .xlsx file using the tidyxl
#' package
#' @param formatting_data formatting data extracted from a .xlsx file using the
#' tidyxl package
#' @param stat_pub_name the name of the statistical publication
#' @param drop_col whether or not to drop the column identifier column for each
#' series
#'
#' @noRd
#'
format_vertical_data <- function(tidyxl_data,
                                 formatting_data,
                                 stat_pub_name,
                                 frequency,
                                 drop_col = TRUE) {
  existing_cols <- sort(unique(tidyxl_data$col))

  row_names <- get_col_names(tidyxl_data, existing_cols)

  data_below_top_row <-
    dplyr::filter(.data = tidyxl_data, row > unique(row_names$row))

  restructured_data <-
    restructure_as_in_xlsx(existing_cols, row_names, data_below_top_row)

  pivoted_data <-
    tidyr::pivot_longer(
      data = restructured_data,
      cols = !tidyselect::contains("abn", ignore.case = TRUE) &
        dplyr::where(is.numeric),
      names_to = "series",
      values_to = "value"
    )

  name_cleaned_data <- clean_col_names(pivoted_data)

  extra_meta_data <-
    get_extra_meta_data(
      frequency = frequency,
      stat_pub_name = stat_pub_name,
      row_names, existing_cols, data_below_top_row, formatting_data
    )

  cleaned_data <-
    dplyr::left_join(
      x = name_cleaned_data,
      y = extra_meta_data,
      by = dplyr::join_by("col", "series"),
      relationship = "many-to-one"
    )

  cleaned_data <- dplyr::relocate(
    .data = cleaned_data, unit, .before = value
  )

  cleaned_data <- dplyr::relocate(
    .data = cleaned_data, statistics_publication_name, .before = 1
  )

  cleaned_data <- dplyr::relocate(
    .data = cleaned_data, frequency, .before = unit
  )

  if (drop_col) {
    cleaned_data <- dplyr::select(cleaned_data, !col)
  }

  return(cleaned_data)
}

#' Takes the tidyxl data and returns the data about the top row of names
#'
#' @param tidyxl_data cell data extracted from a .xlsx file using the tidyxl
#' package
#' @param existing_cols the number of valid columns in the .xlsx file
#'
#' @noRd
#'
get_col_names <- function(tidyxl_data, existing_cols) {
  row_names <-
    dplyr::filter(
      .data = tidyxl_data,
      all(data_type == "character"), setequal(existing_cols, col),
      .by = "row"
    )

  if (length(row_names$character) < 1) {
    cli::cli_abort(
      message = "Could not extract row names from the xlsx file.",
      class = "readapra_error_could_not_get_row_names"
    )
  }

  return(row_names)
}

#' Restructures the tidy_xl data so that it resembles what was originally in
#' the .xlsx file
#'
#' @param existing_cols the number of valid columns in the .xlsx file
#' @param row_names the tibble containing the row names data
#' @param data_below_top_row the tibble containing the data below the row names
#'
#' @noRd
#'
restructure_as_in_xlsx <- function(existing_cols,
                                   row_names,
                                   data_below_top_row) {
  list_column_data <-
    purrr::map(
      .x = existing_cols,
      .f = ~ get_column_data(.x, data = data_below_top_row)
    )
  rejoined_column_data <-
    purrr::reduce(
      .x = list_column_data,
      .f = dplyr::left_join,
      by = dplyr::join_by("row")
    )

  renamed_column_data <-
    rlang::set_names(
      x = dplyr::select(.data = rejoined_column_data, !row),
      nm = paste0(row_names$character, "_", existing_cols)
    )
  return(renamed_column_data)
}

#' Takes a tibble containing column data and extracts each column irrespective
#' of its data type
#'
#' @param col_num the column number in the sheet to extract
#' @param data the data to extract column number data from
#'
#' @noRd
#'
get_column_data <- function(col_num, data) {
  col_data <- dplyr::filter(.data = data, col == col_num)
  col_data <- dplyr::mutate(.data = col_data, date = lubridate::as_date(date))
  col_data <- dplyr::select(.data = col_data, row, error, logical, numeric, date, character)
  col_data <- dplyr::select(.data = col_data, dplyr::where(~ !all(is.na(.))))
}

#' Takes the pivoted data and cleans the column names and extracts the col column
#'
#' @param pivoted_data the pivoted data to be cleaned
#'
#' @noRd
#'
clean_col_names <- function(pivoted_data) {
  cleaned_names_data <- janitor::clean_names(pivoted_data)

  names(cleaned_names_data) <-
    stringr::str_remove_all(names(cleaned_names_data), "_\\d+$")

  cleaned_names_data$series <- remove_escape_sequences(cleaned_names_data$series)

  cleaned_names_data <-
    tidyr::separate_wider_regex(
      data = cleaned_names_data,
      cols = series,
      patterns = c(series = ".*", "_", col = ".*")
    )

  # Second run to align with meta data
  cleaned_names_data$series <- remove_escape_sequences(cleaned_names_data$series)

  cleaned_names_data <-
    dplyr::relocate(cleaned_names_data, col, .after = tidyselect::last_col())

  cleaned_names_data$col <- as.numeric(cleaned_names_data$col)

  names(cleaned_names_data) <-
    dplyr::case_when(
      stringr::str_detect(
        string = "^period$",
        pattern = stringr::str_trim(
          stringr::regex(names(cleaned_names_data), ignore_case = TRUE)
        )
      ) ~ "date",
      .default = names(cleaned_names_data)
    )

  return(cleaned_names_data)
}

#' Generates additional meta data to attach to the output tibble using the
#' formatting data (i.e unit and column number)
#'
#' @param stat_pub_name the name of the statistical publication
#' @param row_names the tibble containing the row names data
#' @param existing_cols the existing cols
#' @param data_below_top_row the tibble containing the data below the row names
#' @param formatting_data formatting data extracted from a .xlsx file using the
#' tidyxl package
#'
#' @noRd
#'
get_extra_meta_data <- function(stat_pub_name,
                                frequency,
                                row_names,
                                existing_cols,
                                data_below_top_row,
                                formatting_data) {
  column_binder <-
    dplyr::mutate(
      .data = tibble::tibble(series = row_names$character),
      col = existing_cols
    )

  extra_meta_data <- joined_formatting_data(data_below_top_row, formatting_data)

  # Sometimes formatting changes across column. Taking most frequent formatting.
  extra_meta_data <-
    dplyr::summarise(
      .data = extra_meta_data,
      count = dplyr::n(),
      .by = c(col, unit)
    )
  extra_meta_data <- dplyr::slice_max(.data = extra_meta_data, count, by = col)
  extra_meta_data <- dplyr::select(.data = extra_meta_data, col, unit)

  # Joining data together
  extra_meta_data <- dplyr::left_join(column_binder, extra_meta_data, by = "col")
  extra_meta_data <- clean_unit_data(extra_meta_data)
  extra_meta_data$series <- remove_escape_sequences(extra_meta_data$series)
  extra_meta_data$statistics_publication_name <- stat_pub_name
  extra_meta_data$frequency <- frequency

  return(extra_meta_data)
}

#' Safely get the vertical sheet data
#'
#' @noRd
#'
safe_format_vertical_data <- purrr::safely(format_vertical_data)

#' Attempts to get the QADIPS Key Stats sheet data and if it encounters an error
#' it throws a warning and returns a empty tibble
#'
#' @param tidyxl_data cell data extracted from a .xlsx file using the tidyxl
#' package
#' @param formatting_data formatting data extracted from a .xlsx file using the
#' tidyxl package
#' @param stat_pub_name the name of the statistical publication
#' @param col whether or not to drop the column identifier column for each
#' series
#' @param call the caller environment
#'
#' @keywords internal
#' @noRd
#'
attempt_format_vertical_data <-
  function(tidyxl_data,
           formatting_data,
           stat_pub_name,
           frequency,
           drop_col = TRUE,
           call = rlang::caller_env()) {
    outcome <- safe_format_vertical_data(
      tidyxl_data, formatting_data, stat_pub_name, frequency,
      drop_col = drop_col
    )
    if (!is.null(outcome$error)) {
      cli::cli_abort(
        message = "The .xlsx file was in an unrecognised structure and could not be imported.",
        class = "readapra_error_vertical_data_unreadable",
        call = call
      )
    } else {
      return(outcome$result)
    }
  }

Try the readapra package in your browser

Any scripts or data that you put into this service are public.

readapra documentation built on April 12, 2025, 1:48 a.m.