pttdatahaku: Data management in PTT

Documented in agg_abolished_mun agg_regions

#' Aggregate abolished municipalicities
#'
#' @param x A data.frame like object.
#' @param value_cols A character vector of name of columns to aggregate.
#' @param n_col A locigal to include number of observations in aggragation
#' @param na.rm	logical. Should missing values (including NaN) be removed?
#' @export
#'
#' @examples
#' x <- data.frame(
#'   alue_code = c("KU911", "KU541"),
#'    alue_name = c("A", "B"), values = c(1,2))
#' agg_abolished_mun(x)
#' agg_abolished_mun(x, n_col = TRUE)
#'
agg_abolished_mun <- function(x, value_cols = c("values"), n_col = FALSE, na.rm = FALSE){

  # Should think better solution to calculate n only if n_col = TRUE
  if (hasName(x, "n")) warning("Column n will be overwriten")

  y <- x %>%
    statficlassifications::join_abolished_mun("alue_code") %>%
    select(!alue_name) %>%
    group_by(across(!any_of(value_cols))) %>%
    summarise(across(any_of(value_cols), sum, na.rm = na.rm), n = n()) %>%
    ungroup() |>
    mutate(alue_name = statficlassifications::codes_to_names(alue_code)) %>%
    relocate(names(x))

  if (!n_col) y$n <- NULL

  y <- add_ptt_attr(y, x)
  y
}


#' Aggregate to other region level
#'
#' @param x A data.frame like object.
#' @param from A name of reginal classification.
#' @param to A name of reginal classification.
#' @param value_cols A character vector of name of columns to aggregate.
#' @param pass_region_codes Region codes to be passed true. For example whole
#'        country "SSS". Does not affect aggregation. Default NULL.
#' @param na.rm	logical. Should missing values (including NaN) be removed?
#' @param all_to_regions logical. Should all to regions included
#'        even if not in data.
#' @param custom_key A data.frame. Custom classification key in same form as
#'        as key from \code{\link[statficlassifications]{get_regionkey}}.
#' @param by_name A locigal to join by name or code (default).
#' @export
#'
#' @import dplyr
#' @examples
#' x <- data.frame(alue_code = c("SSS", "KU049", "KU091", "KU109"),
#'                 alue_name = c("KOKO MAA", "Espoo", "Helsinki", "Hämeenlinna"), values = c(1,1,1,2))
#' agg_regions(x, na.rm = TRUE)
#' agg_regions(x, na.rm = TRUE, pass_region_codes = "SSS")
#' agg_regions(x, na.rm = TRUE, pass_region_codes = "SSS", all_to_regions = FALSE)
#' z <- dplyr::mutate(x, values2 = c(3,4,5,6))
#' agg_regions(z, na.rm = TRUE, value_cols = c("values", "values2"))
#' agg_regions(x, na.rm = TRUE,
#'             custom_key = data.frame(
#'                kunta_code = c("SSS", "KU049", "KU091", "KU109"),
#'                kunta_name = c("KOKO MAA", "Espoo", "Helsinki", "Hämeenlinna"),
#'                maakunta_code = as.factor(c("SSS", "MK1", "MK1", "MK2")),
#'                maakunta_name = as.factor(c("SSS", "Maakunta1", "Maakunta1", "Maakunta2")))
#'                )
#' agg_regions(x, na.rm = TRUE,
#'             custom_key = data.frame(
#'                kunta_code = c("SSS", "KU049", "KU091", "KU109"),
#'                kunta_name = c("KOKO MAA", "Espoo", "Helsinki", "Hämeenlinna"),
#'                maakunta_code = as.factor(c("SSS", "MK1", "MK1", "MK2")),
#'                maakunta_name = as.factor(c("SSS", "Maakunta1", "Maakunta1", "Maakunta2"))),
#'                by_name = TRUE
#'                )
agg_regions <- function(x, from = "kunta", to = "maakunta",
                        value_cols = c("values"),
                        pass_region_codes = NULL, na.rm = FALSE,
                        all_to_regions = TRUE,
                        custom_key = NULL,
                        by_name = FALSE){

  if (is.null(custom_key)){
    region_key <- statficlassifications::get_regionkey(from, to)
  } else {
    region_key <- custom_key
  }

  if (by_name) {
    region_key[paste0(from, "_code")] <- NULL
    by_var <- "alue_name"
  } else{
    region_key[paste0(from, "_name")] <- NULL
    by_var <- "alue_code"
  }

  region_key <-
    region_key |>
    rename_with(~gsub(paste0("^", from, "_"), "alue_", .x)) |>
    rename_with(~gsub(paste0("^", to, "_"), "to_", .x))


  y <- x %>%
    mutate(check = 1) |>
    right_join(region_key, by = by_var) %>%
    {if (!all_to_regions) filter(., !is.na(check)) else .} %>%
    select(-check) |>
    select(-any_of(c("alue_name", "alue_code"))) %>%
    rename(alue_code = to_code, alue_name = to_name) %>%
    group_by(across(!all_of(value_cols))) %>%
    summarise_at(value_cols, sum, na.rm = na.rm) %>%
    ungroup() |>
    relocate(names(x))

  if (!is.null(pass_region_codes)){
    y <- bind_rows(
      filter(x, alue_code %in% pass_region_codes),
      y
    ) |>
      droplevels()
  }

  y <- add_ptt_attr(y, x)
  y
}

#' Add regional aggragation
#'
#' @describeIn agg_regions
#'
#' @export

add_regional_agg <- function(x, from = "kunta", to = "maakunta"){

  y <- bind_rows(x, agg_regions(x, from = from, to = to)) %>%
    droplevels()
  y
}

#' Re-add ptt attributes back
#'
#' @param to A df to put attributes
#' @param from A df to get attributes froms
#'


add_ptt_attr <- function(to, from){
  attr(to, "citation") <- attr(from, "citation")
  attr(to, "codes_names") <- attr(from, "codes_names")
  to
}


#' Aggregate to yearly data
#'
#' Time variable have to be Date in time column.
#'
#' @param x A data.frame like object.
#' @param na.rm logical. For \code{\link[base]{sum}}, should missing values
#'        (including NaN) be removed?
#' @export
#'
#'
agg_yearly <- function(x, na.rm = FALSE){

  y <- x %>%
    mutate(time = lubridate::ymd(lubridate::year(time), truncated = 2)) %>%
    group_by(across(!values)) %>%
    summarise(values = sum(values, na.rm = na.rm), .groups = "drop") %>%
    ungroup() |>
    relocate(names(x))

  y <- add_ptt_attr(y, x)
  y
}


#' Remove columns with unique value.
#'
#'
#' @param data a data.frame with columns with unique values
#'
#' @return data.frame
#' @export
#'
#' @examples
#'  data <- data.frame(var1 = letters[1:10],
#'                     var2 = rnorm(10),
#'                     var3 = "a")
#'
#'  data <- rm_empty_cols(data)
rm_empty_cols <- function(data) {

  data[,sapply(names(data), function(x) {length(unique(data[[x]])) > 1})]

}

#' Select columns and filter SSS from columns not selected
#'
#' Only removes columns that contain variable SSS. Note that the data has
#' to have codes.
#'
#' @param data data.frame to modify
#' @param ... chr, column names to select
#' @param SSS logical, whether to leave SSS to selected columns
#'
#' @return data.frame
#' @export
#'
#' @examples
#' data <- cbind(tidyr::crossing(var1 = c("SSS", 1, 2),
#'                        var2 = c("SSS", "a", "b"),
#'                        var3 = c("SSS", "c", "d")),
#'               value = rnorm(27))
#' data |> ptt_select(var3, value)
#' data |> ptt_select(var2, var3, value)
#'
ptt_select <- function(data, ..., SSS = FALSE) {

  sel_cols <- sapply(substitute(list(...)), deparse)[-1]

  if(any(!sel_cols %in% names(data))) {
    stop(paste(sel_cols[!sel_cols %in% names(data)], "not in the data."))
  }

  not_sel_cols <- names(data)[!names(data) %in% sel_cols]

  for(col in not_sel_cols[!not_sel_cols %in% c("time", "value")]) {
    data <- data[data[[col]] == "SSS",]
  }

  if(!SSS) {
    for(col in sel_cols[!sel_cols %in% c("time", "value")]) {
      data <- data[data[[col]] != "SSS",]
    }
  }

  rm_empty_cols(data)
}


#' Aggregate based on key
#'
#' @param x A data.frame like object.
#' @param by A character vector of variables to join by.
#' @param na.rm	logical. Should missing values (including NaN) be removed?
#' @param key A data.frame. A classification key in same form as
#'        as key from \code{\link[statficlassifications]{get_regionkey}}.
#' @export
#'
#' @import dplyr
#' @examples
#' x <- data.frame(alue_code = c("SSS", "KU049", "KU091", "KU109"), values = c(1,1,1,2))
#' agg_regions(x, na.rm = TRUE)
#' agg_regions(x, na.rm = TRUE, pass_region_codes = "SSS")
#' agg_regions(x, na.rm = TRUE, pass_region_codes = "SSS", all_to_regions = FALSE)
#' z <- dplyr::mutate(x, values2 = c(3,4,5,6))
#' agg_regions(z, na.rm = TRUE, value_cols = c("values", "values2"))
#' agg_key(x, na.rm = TRUE,
#'             key = data.frame(
#'                alue_code = c("SSS", "KU049", "KU091", "KU109"),
#'                maakunta_name = c("SSS", "Maakunta1", "Maakunta1", "Maakunta2"))
#'                )
#' agg_key(x,
#'             .fns = mean,
#'             key = data.frame(
#'                alue_code = c("SSS", "KU049", "KU091", "KU109"),
#'                maakunta_name = c("SSS", "Maakunta1", "Maakunta1", "Maakunta2"))
#'                )
#'xx <- data.frame(alue_code = c("SSS", "KU049", "KU091", "KU109"), values = c(1,2,1,1), size = c(1,2,1,1))
#' agg_key(xx,
#'             .fns = weighted.mean,
#'             w = size,
#'             key = data.frame(
#'                alue_code = c("SSS", "KU049", "KU091", "KU109"),
#'                maakunta_name = c("SSS", "Maakunta1", "Maakunta1", "Maakunta2"))
#'                )
agg_key <- function(x, by = NULL,
                    .fns = sum,
                        value_cols = c("values", "value"),
                    w = NULL,
                         na.rm = FALSE,
                        all_to_regions = TRUE,
                        key = NULL){

# Value_cols
  value_cols <- intersect(value_cols, names(x))
  if (is.null(value_cols)) rlang::abort("values_cols must be specified")


# by = NULL, copied from dplyr standardise_join_by
  if (is.null(by)) {
    by <- intersect(names(x), names(key))
    if (length(by) == 0) {
      rlang::abort(c("`by` must be supplied when `x` and `key` have no common variables.",
              i = "use by = character()` to perform a cross-join."))
    }

  }

  # region_key[paste0(from, "_name")] <- NULL
  # region_key <-
  #   region_key |>
  #   rename_with(~gsub(paste0("^", from, "_"), "alue_", .x)) |>
  #   rename_with(~gsub(paste0("^", to, "_"), "to_", .x))

# join
  y <- x %>%
    mutate(check = 1) |>
    right_join(key, by = by)

#check join
    check_na <- which(is.na(y[["check"]]))
    if (any(check_na)) {
      message("Data includes classes not in key. Key missing for rows: ",
              paste(check_na, collapse = ", "))
      }

  y <- y |>
    select(-check) |>
    select(-all_of(by)) |>
    group_by(across(!any_of(c(value_cols, as_label(enquo(w)))))) |>
    summarise(across(value_cols, .fns = ~.fns(.x,  w = {{w}}, na.rm = na.rm))) |>
    ungroup()

  # if (!is.null(pass_codes)){
  #   y <- bind_rows(
  #     filter(x, alue_code %in% pass_codes),
  #     y
  #   ) |>
  #     droplevels()
  # }

  y <- add_ptt_attr(y, x)
  y
}