R/data_wide.R

Defines functions get_smoking_wide

Documented in get_smoking_wide

#' Get data from ONS on England smoking prevalence
#'
#' @description The data from ONS is in wide form and the spreadsheet has many
#' tabs. This data extraction relates to the data Table 1. Proportion of
#' cigarette smokers, by sex and age, Great Britain, 1974 to 2019.
#'
#' @param clean Logical, default is to remove blank rows and columns but if the
#' original is required then `clean = FALSE`
#'
#' @return data frame
#' @export
get_smoking_wide <- function(clean = TRUE) {
  url1 <- "https://www.ons.gov.uk/file?uri=%2fpeoplepopulationandcommunity%2fhealthandsocialcare%2fdrugusealcoholandsmoking%2fdatasets%2fadultsmokinghabitsingreatbritain%2f2019/adultsmokinghabitsingreatbritain2019final.xls"
  p1f <- withr::local_tempfile(fileext = "xls")

  utils::download.file(url1, p1f, mode = "wb")

  df <- readxl::read_xls(path = p1f, sheet = "Table 1", skip = 5)

  if (clean) {

    # Create the age bands and gender labels
    cnames_age <- c(
      "16-24",
      "25-34",
      "35-49",
      "50-59",
      "60 and over",
      "All aged 16 and over"
    )

    cnames_gender <- c(
      "Men",
      "Women",
      "All persons"
    )

    # remove the hyphen as this necessitates the use of back ticks in variable names

    cnames_clean <- stringr::str_remove(cnames_age, "-")

    # remove spaces in the age bands like '60 and over'

    cnames_clean <- stringr::str_replace_all(cnames_clean, stringr::fixed(" "), "")

    # Code to tidy up the spreadsheet but retain the wide form, merges the two
    # headers of sex and age bands together. Keeping the two apart will mean duplication
    # as there are multiple columns for each name (All persons, Women or 16-24, 16-24
    # instead of All person 16-24, Women 16-24).

    clean_data <- readxl::read_xls(path = p1f, sheet = "Table 1", skip = 5) %>%
      janitor::clean_names() %>%
      janitor::remove_empty(c("rows", "cols")) %>%
      tidyr::pivot_longer(-all_persons_aged_16_and_over,
        values_to = "values",
        names_to = "names"
      ) %>%
      tidyr::fill(values) %>%
      dplyr::mutate(
        all_persons_aged_16_and_over =
          dplyr::case_when(
            is.na(all_persons_aged_16_and_over) & .$values %in% cnames_age ~ "cname_age",
            is.na(all_persons_aged_16_and_over) & .$values %in% cnames_gender ~ "cname_gender",
            TRUE ~ all_persons_aged_16_and_over
          )
      ) %>%
      dplyr::filter(!stringr::str_detect(all_persons_aged_16_and_over, tolower("weight")))

    #### Take the top two rows to make new column names

    cnames_added <- clean_data %>%
      dplyr::filter(all_persons_aged_16_and_over == "cname_gender") %>%
      cbind(clean_data %>%
        dplyr::filter(all_persons_aged_16_and_over == "cname_age") %>%
        dplyr::rename(
          age_values = values,
          age_names = names
        )) %>%
      dplyr::select(
        names,
        values,
        age_values
      ) %>%
      tidyr::unite(values, values:age_values, sep = " ") %>%
      dplyr::mutate(
        all_persons_aged_16_and_over = "header",
        values =
          dplyr::case_when(
            stringr::str_detect(.$values, "All persons") == TRUE
            ~ stringr::str_replace(
              .$values, "All persons",
              "Allpersons"
            ),
            # Remove space for later use of separate()
            # which uses space to determine first part of string
            TRUE ~ .$values
          )
      )

    # Put the column names and data back into one table
    df <- clean_data %>%
      dplyr::filter(
        all_persons_aged_16_and_over != "cname_gender",
        all_persons_aged_16_and_over != "cname_age"
      ) %>%
      dplyr::union(cnames_added) %>%
      tidyr::pivot_wider(
        names_from = names,
        values_from = values
      ) %>%
      janitor::row_to_names(row_number = nrow(.), remove_rows_above = FALSE) %>%
      dplyr::filter(
        stringr::str_length(.$header) <= 8,
        .$header != "Notes"
      ) %>%
      dplyr::mutate(
        header = substring(.$header, 1, 4),
        dplyr::across(tidyselect::vars_select_helpers$where(is.character), as.numeric),
        dplyr::across(tidyselect::vars_select_helpers$where(is.numeric), round, 1)
      )
  }
  df
}
Lextuga007/smokingPrevalence documentation built on Dec. 18, 2021, 4:34 a.m.