R/basic_tbl_tools.R

Defines functions .count_variable tbl_mutate tbl_summarise tbl_pivot_longer tbl_allocate .tbl_allocate filter_across tbl_distinct_variables

Documented in filter_across tbl_allocate tbl_distinct_variables tbl_mutate tbl_pivot_longer tbl_summarise

#' Distinct Set of Variables
#'
#' @param data
#' @param variables vector of variables for distinct analays
#' @param id_variables vector of id variables
#' @param append_slug if not `NULL` slug to add to names
#' @param to_arrow_table if `TRUE` coerce to arrow
#'
#' @return
#' @export
#'
#' @examples
#' library(asbtools)
#' tbl_distinct_variables(iris, "Species")
#' tbl_distinct_variables(data = ggplot2::diamonds, variables = c("cut", "color"))
#'
tbl_distinct_variables <-
  function(data, variables = NULL, id_variables = NULL, append_slug = NULL, to_arrow_table = F) {
    if (length(variables) == 0) {
      "Enter variables" %>% message()
      return(data)
    }
    actual_vars <- names(data)[names(data) %in% variables]

    if (length(actual_vars) == 0) {
      return(data)
    }
    data <-
      data %>%
      select(one_of(actual_vars)) %>%
      distinct() %>%
      collect() %>%
      as_tibble()

    if (length(actual_vars) == 1) {
      data <- data %>%
        filter(!is.na(!!sym(actual_vars))) %>%
        arrange(!!sym(actual_vars))
    }

    if (length(actual_vars) > 1) {
      data <- data %>%
        arrange(!!sym(actual_vars[[1]]))
    }

    if (length(append_slug) > 0) {
      names(data)[names(data) %in% c(data %>% select(-one_of(id_variables)) %>% names())] <-
        names(data)[names(data) %in% c(data %>% select(-one_of(id_variables)) %>% names())] %>%
        str_c(append_slug, sep = "_")
    }

    if (to_arrow_table) {
      data <- arrow::arrow_table(data)
    }

    gc(verbose = T,
       reset = T,
       full = T)

    data


    data
  }
#' Filter across features
#'
#' @param data
#' @param filter_across_columns
#' @param exclude_filters
#' @param keep_filters
#'
#' @return
#' @export
#'
#' @examples
filter_across <-
  function(data,
           filter_across_columns = NULL,
           exclude_filters = NULL,
           keep_filters = NULL) {
    if (length(filter_across_columns) == 0) {
      return(data)
    }

    if (length(exclude_filters) > 0) {
      filter_slug <-
        exclude_filters %>% str_c(collapse = "|")

      filter_across_columns %>%
        walk(function(x) {
          glue("Filtering out {x} excluding {filter_slug}") %>% message()
          data <<- data %>%
            filter(!(!!sym(x) %>% str_detect(filter_slug)))
        })
    }

    if (length(keep_filters) > 0) {
      filter_slug <-
        keep_filters %>% str_c(collapse = "|")

      filter_across_columns %>%
        walk(function(x) {
          glue("Filtering {x} keeping {filter_slug}") %>% message()
          data <<-
            data %>%
            filter((!!sym(x) %>% str_detect(filter_slug)))
        })
    }

    data
  }

.tbl_allocate <-
  function(data,
           allocation_variable = NULL,
           split_variable = NULL,
           split_separator = "\\|",
           split_data = TRUE,
           is_already_allocated = F,
           remove_orginal_allocation = TRUE) {
    if (length(allocation_variable) == 0) {
      message("Enter allocation variable")
      return(data)
    }

    if (length(split_variable) == 0) {
      message("Enter spit variable")
      return(data)
    }

    if (is_already_allocated) {
      amt_var <- allocation_variable
    } else {
      amt_var <-
        glue("{allocation_variable}_allocated") %>% as.character()

    }


    data <- data %>%
      mutate(
        count_split = !!sym(split_variable) %>% str_count(split_separator) + 1,
        UQ(amt_var) := (1 / count_split) * (!!sym(allocation_variable))
      ) %>%
      select(-count_split)

    if (remove_orginal_allocation) {
      data <- data %>%
        select(-one_of(allocation_variable))
    }

    if (split_data) {
      data <-
        data %>%
        separate_rows(!!sym(split_variable), sep = split_separator) %>%
        mutate_if(is.character, str_squish)
    }

    data

  }

#' Allocate summarized data
#'
#' @param data
#' @param split_separator
#' @param keep_columns
#' @param remove_columns
#' @param split_data
#' @param remove_orginal_allocation
#' @param allocation_variable
#' @param split_variables
#' @param summarise_groups
#' @param widen_variable
#' @param count_variable
#' @param distinct_variables
#' @param amount_variables
#' @param mean_variables
#' @param median_variables
#' @param min_variables
#' @param max_variables
#' @param first_variables
#' @param last_variables
#' @param variance_variables
#' @param sd_variables
#' @param which_max_variables
#' @param which_min_variables
#' @param remove_top_amount
#' @param coalesce_numeric
#' @param return_message
#'
#' @return
#' @export
#'
#' @examples
tbl_allocate <-
  function(data,
           allocation_variable = NULL,
           split_variables = NULL,
           split_separator = "\\|",
           keep_columns = NULL,
           remove_columns = NULL,
           split_data = TRUE,
           summarise_groups = NULL,
           widen_variable = NULL,
           count_variable = "count",
           distinct_variables = NULL,
           amount_variables = NULL,
           mean_variables = NULL,
           median_variables = NULL,
           min_variables = NULL,
           max_variables = NULL,
           first_variables = NULL,
           last_variables = NULL,
           variance_variables = NULL,
           sd_variables = NULL,
           which_max_variables = NULL,
           which_min_variables = NULL,
           remove_top_amount = TRUE,
           coalesce_numeric = F,
           remove_orginal_allocation = TRUE,
           return_message = TRUE) {
    if (length(keep_columns) > 0) {
      data <-
        data %>%
        select(one_of(c(
          keep_columns, split_variables, allocation_variable
        )))
    }

    if (length(remove_columns) > 0) {
      data <- data %>%
        select(-one_of(remove_columns))
    }

    seq_along(split_variables) %>%
      walk((function(x) {
        split_var <-
          split_variables[[x]]

        if (return_message) {
          glue("\n\nAllocating {allocation_variable} over {split_var}\n\n") %>% message()
        }

        if (x != 1) {
          alloc_var <-
            glue("{allocation_variable}_allocated") %>% as.character()
          is_already_allocated <- T
          remove_alloc <- F
        } else {
          alloc_var <- allocation_variable
          remove_alloc <- T
          is_already_allocated <- F
        }

        data <<-
          .tbl_allocate(
            data = data,
            allocation_variable = alloc_var,
            split_variable = split_var,
            split_data = split_data,
            remove_orginal_allocation = remove_alloc,
            split_separator = split_separator,
            is_already_allocated = is_already_allocated
          )

      }))

    if (length(summarise_groups) > 0) {
      if (return_message) {
        glue("Summarizing by {str_c(summarise_groups, collapse = ', ')}") %>% message()
      }
      amt_col <-
        glue("{allocation_variable}_allocated") %>% as.character()
      data <- tbl_summarise(
        data = data,
        group_variables = summarise_groups,
        widen_variable = widen_variable,
        count_variable = count_variable,
        distinct_variables = distinct_variables,
        amount_variables = amt_col,
        mean_variables = mean_variables,
        median_variables = median_variables,
        min_variables = min_variables,
        max_variables = max_variables,
        first_variables = first_variables,
        last_variables = last_variables,
        variance_variables = variance_variables,
        sd_variables = sd_variables,
        coalesce_numeric = coalesce_numeric,
        top_variables = 1,
        calculation_variable = amt_col,
        which_max_variables = which_max_variables,
        which_min_variables = which_min_variables,
        remove_top_amount = remove_top_amount,
      )
    }

    data
  }

#' Title
#'
#' @param data
#' @param long_columns
#' @param numeric_groups
#' @param names_to
#' @param values_to
#' @param drop_na
#' @param names_prefix
#' @param names_pattern
#' @param ...
#'
#' @return
#' @export
#'
#' @example inst/examples/tbl_pivot_longer_examples.R
#'
tbl_pivot_longer <-
  function(data,
           long_columns = NULL,
           numeric_groups = NULL,
           names_to = "feature",
           values_to = "value",
           drop_na = T,
           names_prefix = NULL,
           names_pattern = NULL,
           ...) {
    if (length(long_columns) == 0) {
      long_cols <- data %>%
        select_if(is.numeric) %>% names() %>%
        discard(function(x) {
          x %in% numeric_groups
        })

    }

    if (length(long_columns) > 0) {
      long_cols <- long_columns
    }

    data %>%
      tidyr::pivot_longer(
        cols = long_cols,
        names_to = names_to,
        values_to = values_to,
        values_drop_na = drop_na,
        names_prefix = names_prefix,
        names_pattern = names_pattern
      )
  }

# summarise ---------------------------------------------------------------

#' Summarise across tibble
#'
#' @param data
#' @param group_variables
#' @param widen_variable
#' @param count_variable
#' @param distinct_variables
#' @param amount_variables
#' @param mean_variables
#' @param median_variables
#' @param min_variables
#' @param max_variables
#' @param first_variables
#' @param last_variables
#' @param variance_variables
#' @param sd_variables
#' @param top_variables
#' @param calculation_variable
#' @param which_max_variables
#' @param which_min_variables
#' @param unique_variables
#' @param coalesce_numeric
#' @param remove_top_amount
#' @param filters
#' @param ...
#' @param unique_concatenator
#' @param append_slug
#' @param pmin_variables
#' @param pmax_variables
#'
#' @return
#' @export
#'
#' @examples
#' library(tidyverse)
#' library(asbviz)
#' ggplot2::diamonds %>%
#' mutate_if(is.factor, as.character) %>%
#' tbl_summarise(
#' unique_variables = "clarity",
#' calculation_variable = "price",
#' amount_variable = "price"
#' )
#'
#' ggplot2::diamonds %>%
#' tbl_summarise(
#' group = "cut",
#' unique_variables = "clarity",
#' calculation_variable = "price",
#' amount_variable = "price"
#' )
#'
#' ggplot2::diamonds %>%
#' tbl_summarise(
#' widen_variable = "cut",
#' unique_variables = "clarity",
#' calculation_variable = "price",
#' amount_variable = "price"
#' )
#'
#'
#'
tbl_summarise <-
  function(data,
           group_variables = NULL,
           append_slug = NULL,
           widen_variable = NULL,
           count_variable = "count",
           distinct_variables = NULL,
           amount_variables = NULL,
           mean_variables = NULL,
           top_variables = NULL,
           calculation_variable = NULL,
           median_variables = NULL,
           min_variables = NULL,
           pmin_variables = NULL,
           max_variables = NULL,
           pmax_variables = NULL,
           which_max_variables = NULL,
           which_min_variables = NULL,
           unique_variables = NULL,
           unique_concatenator = " | ",
           first_variables = NULL,
           last_variables = NULL,
           variance_variables = NULL,
           sd_variables = NULL,
           coalesce_numeric = F,
           remove_top_amount = T,
           filters = c("UNNAMED", "UNKNOWN"),
           ...) {
    if (length(group_variables) == 0 & length(widen_variable) == 0) {
      group_slugs <- NULL
    } else {
      group_slugs <- c(group_variables, widen_variable) %>% unique()
    }

    all_data <- tibble()

    across_length <-
      length(distinct_variables) + length(amount_variables) + length(mean_variables) + length(median_variables) +
      length(min_variables) + length(max_variables) + length(first_variables) + length(last_variables) + length(variance_variables) + length(sd_variables) +
      length(unique_variables) + length(pmin_variables) + length(pmax_variables)

    has_across <-  across_length > 0

    if (across_length + length(count_variable) == 0) {
      "No summary variables" %>% message()
      return(tibble())
    }

    if (length(calculation_variable) == 0 &
        length(amount_variables) > 0) {
      calculation_variable <- amount_variables[[1]]
    }

    analysis_vars <-
      c(
        group_variables,
        widen_variable,
        distinct_variables,
        amount_variables,
        mean_variables,
        top_variables,
        calculation_variable,
        median_variables,
        min_variables,
        pmin_variables,
        max_variables,
        pmax_variables,
        which_max_variables,
        which_min_variables,
        unique_variables,
        first_variables,
        last_variables,
        variance_variables,
        sd_variables
      ) %>%
      unique()

    data <- data %>%
      select(one_of(analysis_vars))

    if (length(count_variable) > 0) {
      all_data <-
        data %>%
        group_by(!!!syms(group_slugs)) %>%
        summarise(UQ(count_variable) := n(),
                  .groups = "drop")
    }

    if (has_across) {
      if (length(calculation_variable) == 0 & length(amount_variables) > 0) {
        calculation_variable <- amount_variables[[1]]
      }

      if (length(calculation_variable) == 0 && length(amount_variables) == 0) {
        calculation_variable <- ""
      }

      df <-
        data %>%
        group_by(!!!syms(group_slugs)) %>%
        summarise(
          across(
            .cols = all_of(amount_variables),
            .fns = ~ {
              sum(.x, na.rm = T)
            },
            .names = "{.col}_total"
          ),
          across(
            .cols = all_of(distinct_variables),
            .fns = ~ {
              n_distinct(.x, na.rm = T)
            },
            .names = "count_{.col}_distinct"
          ),
          across(
            .cols = all_of(mean_variables),
            .fns = ~ {
              mean(.x, na.rm = T)
            },
            .names = "{.col}_mean"
          ),
          across(
            .cols = all_of(median_variables),
            .fns = ~ {
              median(.x, na.rm = T)
            },
            .names = "{.col}_median"
          ),
          across(
            .cols = all_of(min_variables),
            .fns = ~ {
              min(.x, na.rm = T)
            },
            .names = "{.col}_min"
          ),
          across(
            .cols = all_of(pmin_variables),
            .fns = ~ {
              pmin(.x, na.rm = T)
            },
            .names = "{.col}_pmin"
          ),
          across(
            .cols = all_of(max_variables),
            .fns = ~ {
              max(.x, na.rm = T)
            },
            .names = "{.col}_max"
          ),
          across(
            .cols = all_of(pmax_variables),
            .fns = ~ {
              pmax(.x, na.rm = T)
            },
            .names = "{.col}_pmax"
          ),
          across(
            .cols = all_of(unique_variables),
            .fns = ~ {
              .x %>%
                str_split("\\|") %>% flatten_chr() %>% str_squish() %>%
                unique() %>% str_c(collapse = unique_concatenator)
            },
            .names = "{.col}_unique"
          ),
          across(
            .cols = all_of(sd_variables),
            .fns = ~ {
              sd(.x, na.rm = T)
            },
            .names = "{.col}_sd"
          ),
          across(
            .cols = all_of(variance_variables),
            .fns = ~ {
              sd(.x, na.rm = T)
            },
            .names = "{.col}_variance"
          ),
          across(
            .cols = all_of(first_variables),
            .fns = ~ {
              first(.x, na.rm = T)
            },
            .names = "{.col}_first"
          ),
          across(
            .cols = all_of(which_max_variables),
            .fns = ~ {
              .x[which.max(!!sym(calculation_variable))]
            },
            .names = "{.col}_which_max"
          ),
          across(
            .cols = all_of(which_min_variables),
            .fns = ~ {
              .x[which.min(!!sym(calculation_variable))]
            },
            .names = "{.col}_which_min"
          ),
          across(
            .cols = all_of(last_variables),
            .fns = ~ {
              last(.x, na.rm = T)
            },
            .names = "{.col}_first"
          ),
          ...,
          .groups = "drop"
        )



      if (nrow(all_data) > 0) {
        if (length(group_slugs) > 0) {
          all_data <- all_data %>%
            left_join(df, by = group_slugs)
        }
        if (length(group_slugs) == 0) {
          all_data <- all_data %>%
            bind_cols(df)
        }
      }

      if (nrow(all_data) == 0) {
        all_data <- df
      }
    }

    if (length(widen_variable) > 0) {



      long_cols <- all_data |>
        select(-one_of(group_slugs)) |>
        mutate_if(is.logical, as.numeric) |>
        mutate_if(is.factor, as.numeric) |>
        select_if(is.numeric) |>
        names()

      all_data <-
        all_data |>
        mutate_if(is.logical, as.numeric) |>
        mutate_if(is.factor, as.numeric) |>
        pivot_longer(cols = long_cols) |>
        unite(feature, name, !!sym(widen_variable), sep = "_") |>
        spread(feature, value) |>
        janitor::clean_names()

      if (coalesce_numeric) {
        all_data <- all_data %>%
          mutate_if(is.numeric,  ~ {
            coalesce(.x, 0L)
          })
      }

    }

    if (length(top_variables) > 0) {
      df_top <-
        tbl_top_n_groups(
          data = data,
          group_variables = group_variables,
          top_variables = top_variables,
          calculation_variable = calculation_variable,
          filters = filters,
          top = 1,
          remove_top_amount = remove_top_amount
        )

      if (length(widen_variable) > 0) {
        all_data <- all_data |> janitor::clean_names()
      }

      all_data <-
        all_data %>%
        left_join(df_top, by = group_variables)
    }

    if (coalesce_numeric) {
      all_data <- all_data %>%
        mutate_if(is.numeric,  ~ {
          coalesce(.x, 0L)
        })
    }

    all_data <- all_data %>%
      mutate_if(is.character, list(function(x) {
        x %>% coalesce("UNKNOWN")
      }))

    bad_totals <-
      names(all_data) %>% str_detect("_total_total") %>% sum(na.rm = T) > 0

    if (bad_totals) {
      new_var <-
        names(all_data)[names(all_data) %>% str_detect("_total_total$")] %>%
        str_remove_all("_total")
      names(all_data)[names(all_data) %>% str_detect("_total_total$")] <-
        names(all_data)[names(all_data) %>% str_detect("_total_total$")] %>%
        str_remove_all("_total")

      names(all_data)[names(all_data) %>% str_detect(new_var)] <-
        names(all_data)[names(all_data) %>% str_detect(new_var)] %>% str_c("_total")
    }


    if (length(append_slug) > 0) {
      names(all_data)[names(all_data) %in% c(all_data %>% select(-one_of(group_variables)) %>% names())] <-
        names(all_data)[names(all_data) %in% c(all_data %>% select(-one_of(group_variables)) %>% names())] %>%
        str_c(append_slug, sep = "_")
    }


    all_data

  }

#' Mutate Data with Various New Features
#'
#' @param data
#' @param group_variables
#' @param absolute_change_columns
#' @param cumulative_sum_columns
#' @param percent_change_columns
#' @param index_columns
#' @param index_calculation_variable
#' @param lag_columns
#' @param cumulative_mean_columns
#' @param min_rank_columns
#' @param pct_rank_columns
#' @param dense_rank_columns
#' @param lead_columns
#' @param cummulative_min_columns
#' @param cumulative_max_columns
#' @param ...
#'
#' @return
#' @export
#'
#' @example inst/examples/tbl_mutate_examples.R
tbl_mutate <-
  function(data,
           group_variables = NULL,
           absolute_change_columns = NULL,
           cumulative_sum_columns = NULL,
           cumulative_mean_columns = NULL,
           cummulative_min_columns = NULL,
           cumulative_max_columns = NULL,
           percent_change_columns = NULL,
           index_columns = NULL,
           index_calculation_variable = NULL,
           index_override_value = NULL,
           lag_columns = NULL,
           lead_columns = NULL,
           min_rank_columns = NULL,
           pct_rank_columns = NULL,
           dense_rank_columns = NULL,
           ...) {
    data <-
      data %>%
      group_by(!!!syms(group_variables)) %>%
      mutate(
        across(
          .cols = all_of(absolute_change_columns),
          .fns = ~ {
            .x - lag(.x, na.rm = T)
          },
          .names = "{.col}_absolute_change"
        ),
        across(
          .cols = all_of(cumulative_sum_columns),
          .fns = ~ {
            cumsum(.x)
          },
          .names = "{.col}_cumulative_total"
        ),
        across(
          .cols = all_of(lag_columns),
          .fns = ~ {
            lag(.x, na.rm = T)
          },
          .names = "{.col}_lag"
        ),
        across(
          .cols = all_of(lead_columns),
          .fns = ~ {
            lead(.x, na.rm = T)
          },
          .names = "{.col}_lead"
        ),
        across(
          .cols = all_of(cumulative_mean_columns),
          .fns = ~ {
            cummean(.x)
          },
          .names = "{.col}_cumulative_mean"
        ),
        across(
          .cols = all_of(cummulative_min_columns),
          .fns = ~ {
            cummin(.x)
          },
          .names = "{.col}_cumulative_min"
        ),
        across(
          .cols = all_of(cumulative_max_columns),
          .fns = ~ {
            cummax(.x)
          },
          .names = "{.col}_cumulative_max"
        ),
        across(
          .cols = all_of(percent_change_columns),
          .fns = ~ {
            (.x / lag(.x, na.rm = T)) - 1
          },
          .names = "pct_change_{.col}"
        ),
        across(
          .cols = all_of(min_rank_columns),
          .fns = ~ {
            .x %>% dplyr::min_rank()
          },
          .names = "rank_min_{.col}"
        ),
        across(
          .cols = all_of(pct_rank_columns),
          .fns = ~ {
            .x %>% dplyr::min_rank()
          },
          .names = "rank_pct_{.col}"
        ),
        across(
          .cols = all_of(dense_rank_columns),
          .fns = ~ {
            .x %>% dplyr::min_rank()
          },
          .names = "rank_dense_{.col}"
        )
      )

    if (length(index_columns) > 0 &
        length(index_calculation_variable) > 0) {
      if (length(index_override_value) == 0) {
        min_index <-
          data %>%
          select(all_of(index_calculation_variable)) %>%
          distinct() %>%
          pull() %>%
          min(na.rm = T)

        data <-
          data %>%
          arrange(!!sym(index_calculation_variable)) %>%
          mutate(
            across(
              .cols = all_of(index_columns),
              .fns = ~ {
                (.x / (.x[which.min(!!sym(index_calculation_variable))])) * 100
              },
              .names = "index_{index_calculation_variable}_{min_index}_{.col}"
            )
          )
      }
      if (length(index_override_value) > 0) {
        data <-
          data %>%
          arrange(!!sym(index_calculation_variable)) %>%
          mutate(
            across(
              .cols = all_of(index_columns),
              .fns = ~ {
                (.x / (.x[which(!!sym(index_calculation_variable) == index_override_value)])) * 100
              },
              .names = "index_{index_calculation_variable}_{index_override_value}_{.col}"
            )
          )
      }

    }

    if (length(group_variables) > 0) {
      data <-
        data %>%
        ungroup()
    }

    data <- data %>% janitor::clean_names()

    data


  }

# count -------------------------------------------------------------------
.count_variable <-
  function(data,
           matching_variables = c("^name", "^type"),
           exclude_columns = c("nameContract"),
           numeric_columns = NULL,
           include_logical = T,
           include_factor = T,
           weight = NA,
           include_proportion = T) {
    data <-
      data %>%
      dplyr::select(which(colMeans(is.na(.)) < 1))
    match_slugs <- matching_variables %>% str_c(collapse = "|")
    select_names <- c()
    char_names <-
      data %>% select(matches(match_slugs)) %>% select_if(list(function(x) {
        is.character(x)
      })) %>% names()
    select_names <- select_names %>% append(char_names)

    if (include_logical) {
      select_names <-
        select_names %>% append(data %>% select_if(is.logical) %>% names())
    }

    if (include_factor) {
      select_names <-
        select_names %>% append(data %>% select_if(is.factor) %>% names())
    }

    if (length(numeric_columns) > 0) {
      select_names <-
        select_names %>%
        append(data %>%
                 select_if(is.numeric) %>% names() %>% keep(function(x) {
                   x %in% numeric_columns
                 }))
    }


    if (length(exclude_columns) > 0) {
      select_columns <-
        select_names %>% discard(function(x) {
          x %in% exclude_columns
        })
    } else {
      select_columns <-
        select_names
    }


    if (is.na(weight)) {
      df_counts <-
        select_columns %>%
        map_dfr(function(col) {
          d <- data %>% count(!!sym(col), name = "count", sort = T) %>%
            rename(variable := !!sym(col)) %>%
            mutate(variable = as.character(variable),
                   column = col) %>%
            select(column, everything()) %>%
            mutate_if(is.numeric, as.numeric)

          if (include_proportion) {
            new_variable <- "pct_count"
            d <- d %>%
              mutate(!!sym(new_variable) := count / sum(count))
          }
          return(d)
        })
    } else {
      df_counts <-
        select_columns %>%
        map_dfr(function(col) {
          d <-
            data %>% count(
              !!sym(col),
              name = weight,
              wt = !!sym(weight),
              sort = T
            ) %>%
            rename(variable := !!sym(col)) %>%
            mutate(variable = as.character(variable),
                   column = col) %>%
            select(column, everything()) %>%
            mutate_if(is.numeric, as.numeric)

          if (include_proportion) {
            new_var <- str_c("pct_", weight %>% make_clean_names())
            d <-
              d %>%
              mutate(!!sym(new_var) := !!sym(weight) / sum(!!sym(weight)))

            d
          }
        })
    }

    df_counts <-
      df_counts %>%
      mutate(variable = variable %>% coalesce("UNKNOWN"))

    df_counts

  }

#' Build a tibble of counts
#'
#' @param data
#' @param include_logical
#' @param include_factor
#' @param unite_features
#' @param include_proportion
#' @param matching_variables
#' @param weight_columns
#' @param exclude_columns
#' @param numeric_columns
#' @param scale_results
#'
#' @return
#' @export
#'
#' @examples
tbl_count <-
  function(data,
           matching_variables = c("^name", "^type"),
           weight_columns = c(NA),
           exclude_columns = NULL,
           include_logical = T,
           include_factor = T,
           unite_features = T,
           numeric_columns = NULL,
           scale_results = F,
           include_proportion = T) {
    if (length(weight_columns) == 0) {
      stop("Enter weights")
    }

    all_data <-
      weight_columns %>%
      map(function(weight) {
        .count_variable(
          data = data,
          matching_variables = matching_variables,
          exclude_columns = exclude_columns,
          include_logical = include_logical,
          weight = weight,
          include_factor = include_factor,
          numeric_columns = numeric_columns,
          include_proportion = include_proportion
        )
      })

    all_data <-
      all_data %>% reduce(left_join, by = c("column", "variable"))

    if (unite_features) {
      all_data <-
        all_data %>%
        unite(item, column, variable, remove = F) %>%
        mutate(item = item %>% map_chr(janitor::make_clean_names))
    }

    if (scale_results) {
      all_data <- all_data %>%
        pre_process_data(scale_data = T, center = T)
    }

    all_data
  }


# groups ------------------------------------------------------------------

.top_group <-
  function(data,
           group_variables = "parent_keywords",
           top_variable = "name_awardee_clean",
           calculation_variable = "amount_award_allocated",
           filters = c("UNNAMED", "UNKNOWN"),
           top = 1,
           remove_top_amount = T) {
    new_var_name <- glue("{top_variable}_top") %>% as.character()
    amount_var <-
      glue("{calculation_variable}_{top_variable}_top") %>% as.character()

    if (length(filters) > 0) {
      filter_slugs <- str_c(filters, collapse = "|")
      data <- data %>%
        filter(!(!!sym(top_variable) %>% str_detect(filter_slugs)))
    }

    if (length(group_variables) > 0) {
      data <-
        data %>%
        filter(!is.na((!!sym(top_variable)))) %>%
        group_by(!!!syms(c(group_variables, top_variable))) %>%
        summarise(UQ(amount_var) := sum(!!sym(calculation_variable), na.rm = T)) %>%
        collect() %>%
        ungroup() %>%
        arrange(desc(!!sym(amount_var))) %>%
        group_by(!!!syms(group_variables)) %>%
        slice(1:top) %>%
        ungroup() %>%
        rename(UQ(new_var_name) := top_variable)
    }

    if (length(group_variables) == 0) {
      data <-
        data %>%
        filter(!is.na((!!sym(top_variable)))) %>%
        group_by(!!!sym(top_variable)) %>%
        summarise(UQ(amount_var) := sum(!!sym(calculation_variable), na.rm = T)) %>%
        collect() %>%
        ungroup() %>%
        arrange(desc(!!sym(amount_var))) %>%
        slice(1:top) %>%
        ungroup() %>%
        rename(UQ(new_var_name) := top_variable)
    }

    if (remove_top_amount) {
      data <- data %>%
        select(-one_of(amount_var))
    }
    data
  }

#' Top Groups
#'
#' @param data
#' @param group_variables
#' @param top_variables
#' @param calculation_variable
#' @param filters
#' @param top
#' @param remove_top_amount
#'
#' @return
#' @export
#'
#' @examples
tbl_top_n_groups <-
  function(data,
           group_variables = NULL,
           top_variables = NULL,
           calculation_variable = "amount_award_allocated",
           filters = c("UNNAMED", "UNKNOWN"),
           top = 1,
           remove_top_amount = T) {
    if (length(calculation_variable) == 0) {
      "Enter calculation variable" %>% message()
      return(data)
    }

    if (length(top_variables) == 0) {
      "Enter top variables" %>% message()
      return(data)
    }
    all_data <-
      top_variables %>%
      map(function(x) {
        .top_group(
          data = data,
          group_variables = group_variables,
          top_variable = x,
          calculation_variable = calculation_variable,
          filters = filters,
          top = top,
          remove_top_amount = remove_top_amount
        )
      })

    if (length(group_variables) == 0) {
      all_data <-
        all_data %>% reduce(bind_cols)
    }

    if (length(group_variables) > 0) {
      all_data <-
        all_data %>% reduce(left_join, by  = group_variables)
    }

    all_data
  }



# feature_correlations ----------------------------------------------------

#' Correlation tibble
#'
#' @param data
#' @param correlation_method	a character string indicating which correlation coefficient (or covariance) is to be computed. One of "pearson" (default), "kendall", or "spearman": can be abbreviated.
#' @param correlation_used an optional character string giving a method for computing covariances in the presence of missing values. This must be (an abbreviation of) one of the strings "everything", "all.obs", "complete.obs", "na.or.complete", or "pairwise.complete.obs".
#' @param diagonal
#' @param remove_columns
#' @param include_logical if `TRUE` includes logical features
#' @param include_factors if `TRUE` converts factors to variables
#' @param character_to_factor if `TRUE` converts character vectors to factor
#' @param full_rank if `TRUE` returns full rank dummy variables
#'
#' @return
#' @export
#'
#' @examples
tbl_correlations <-
  function(data,
           correlation_method = "pearson",
           correlation_used = "pairwise.complete.obs",
           diagonal = NA,
           remove_columns = NULL,
           include_logical = T,
           include_factors = F,
           character_to_factor = F,
           exclude_feature_columns = NULL,
           exclude_feature_correlated_columns = NULL,
           full_rank = T) {
    options(warn = -1)
    if (length(remove_columns)) {
      data <-
        data %>%
        select(-one_of(remove_columns))
    }

    if (character_to_factor)  {
      data <- data %>%
        mutate(across(is.factor, as.factor))
    }

    if (include_factors) {
      data <- data %>%
        dummify_data(snake_names = T,
                     is_full_rank = full_rank)
    }

    if (include_logical) {
      data <- data %>%
        mutate(across(is.logical, as.numeric))
    }

    data <-
      data %>%
      select_if(is.numeric) %>%
      correlate(use = correlation_used,
                diagonal = diagonal,
                method = correlation_method) %>%
      rename(feature = term)

    data <-
      data %>%
      pivot_longer(cols = data %>% select(-feature) %>% names(),
                   names_to = "feature_correlated") %>%
      mutate(correlation_method,
             correlation_used,
             .before = "feature") %>%
      filter(feature != feature_correlated) %>%
      mutate(
        value_absolute = abs(value),
        type_correlation = case_when(value > 0 ~ "positive",
                                     TRUE ~ "negative")
      ) %>%
      arrange(feature, -value_absolute)

    if (length(exclude_feature_columns) > 0) {
      exclude_feature_columns_slugs <-
        str_c(exclude_feature_columns, collapse = " | ")

      data <- data %>%
        filter(!feature %>% str_detect(exclude_feature_columns_slugs))
    }

    if (length(exclude_feature_correlated_columns) > 0) {
      exclude_feature_correlated_columns_slugs <-
        str_c(exclude_feature_correlated_columns, collapse = " | ")

      data <- data %>%
        filter(!feature_correlated %>% str_detect(exclude_feature_correlated_columns_slugs))
    }


    data


  }

#' Dimension Reduced Correlations
#'
#' @param data
#' @param exclude_dimensional_correlations
#' @param methods a character string indicating which correlation coefficient (or covariance) is to be computed. One of "pearson" (default), "kendall", or "spearman": can be abbreviated.
#' @param correlation_method an optional character string giving a method for computing covariances in the presence of missing values. This must be (an abbreviation of) one of the strings "everything", "all.obs", "complete.obs", "na.or.complete", or "pairwise.complete.obs".
#' @param correlation_used
#' @param diagonal
#' @param remove_columns
#' @param include_logical
#' @param include_factors
#' @param character_to_factor
#' @param full_rank
#'
#' @return
#' @export
#'
#' @examples
tbl_dimension_correlations <-
  function(data,
           exclude_dimensional_correlations = T,
           methods = c(
             "gng",
             "cur",
             "h2oae",
             "h2oglrm",
             "ica",
             "isomap",
             "kpca",
             "lle",
             "mds",
             "nmf",
             "pca",
             "spca",
             "svd",
             "tsne",
             "umap"
           ),
           correlation_method = "pearson",
           correlation_used = "pairwise.complete.obs",
           diagonal = NA,
           remove_columns = NULL,
           include_logical = T,
           include_factors = F,
           exclude_clusters = F,
           character_to_factor = F,
           full_rank = T) {
    slugs <-
      glue("{methods}_") %>% as.character() %>% str_c(collapse = "|")

    data <- tbl_correlations(
      data = data,
      correlation_method = correlation_method,
      correlation_used = correlation_used,
      diagonal = diagonal,
      remove_columns = remove_columns,
      include_logical = include_logical,
      include_factors = include_factors,
      character_to_factor = character_to_factor,
      full_rank = full_rank
    )

    data <-
      data %>%
      filter(feature %>% str_detect(slugs)) %>%
      rename(dimension = feature)

    if (exclude_dimensional_correlations) {
      data <-
        data %>%
        filter(!feature_correlated %>% str_detect(slugs))
    }

    if (exclude_clusters) {
      data <-
        data %>%
        filter(!feature_correlated %>% str_detect("is_cluster"))
    }

    data <-
      data %>%
      separate(
        dimension,
        into = c("dimension_method", "dimension_number"),
        remove = F
      ) %>%
      mutate(dimension_number = as.numeric(dimension_number)) %>%
      arrange(dimension_number, dimension_method) %>%
      mutate(
        dimension_number = case_when(
          nchar(dimension_number) == 1 ~ str_c("00", dimension_number),
          nchar(dimension_number) == 2 ~ str_c("0", dimension_number),
          TRUE ~ as.character(dimension_number)
        ) %>% factor(ordered = T)
      )

    data


  }

#' Correlation Clusters
#'
#' @param data
#' @param exclude_cluster_correlations
#' @param methods a character string indicating which correlation coefficient (or covariance) is to be computed. One of "pearson" (default), "kendall", or "spearman": can be abbreviated.
#' @param correlation_method an optional character string giving a method for computing covariances in the presence of missing values. This must be (an abbreviation of) one of the strings "everything", "all.obs", "complete.obs", "na.or.complete", or "pairwise.complete.obs".
#' @param correlation_used
#' @param diagonal
#' @param remove_columns
#' @param include_logical
#' @param include_factors
#' @param character_to_factor
#' @param full_rank
#'
#' @return
#' @export
#'
#' @examples
tbl_cluster_correlations <-
  function(data,
           exclude_cluster_correlations = T,
           methods = c(
             "som",
             "cmeans",
             "emc",
             "hardcl",
             "hopach",
             "h2okmeans",
             "kmeans",
             "ngas",
             "pam",
             "pamk",
             "spec"
           ),
           correlation_method = "pearson",
           correlation_used = "pairwise.complete.obs",
           diagonal = NA,
           remove_columns = NULL,
           include_logical = T,
           include_factors = F,
           character_to_factor = F,
           full_rank = T) {
    slugs <-
      glue("{methods}_") %>% as.character() %>% str_c(collapse = "|")

    data <- tbl_correlations(
      data = data,
      correlation_method = correlation_method,
      correlation_used = correlation_used,
      diagonal = diagonal,
      remove_columns = remove_columns,
      include_logical = include_logical,
      include_factors = include_factors,
      character_to_factor = character_to_factor,
      full_rank = full_rank
    )

    data <-
      data %>%
      filter(feature %>% str_detect(slugs)) %>%
      rename(cluster = feature) %>%
      mutate(
        cluster = cluster %>% str_remove_all("is_cluster_"),
        feature_correlated = feature_correlated %>% str_remove_all("is_cluster_")
      )

    if (exclude_cluster_correlations) {
      data <-
        data %>%
        filter(!feature_correlated %>% str_detect(slugs))
    }

    data <-
      data %>%
      separate(cluster,
               into = c("cluster_method", "cluster_number"),
               remove = F) %>%
      mutate(cluster_number = as.numeric(cluster_number)) %>%
      arrange(cluster_number, cluster_method) %>%
      mutate(
        cluster_number = case_when(
          nchar(cluster_number) == 1 ~ str_c("00", cluster_number),
          nchar(cluster_number) == 2 ~ str_c("0", cluster_number),
          TRUE ~ as.character(cluster_number)
        ) %>% factor(ordered = T)
      )

    data


  }


# nest --------------------------------------------------------------------


#' Nest a tibble by grouping variables and specified other variables
#'
#' @param data a `tibble`
#' @param grouping_variables vector of variables to group by
#' @param nesting_variables if not `NULL` vector of other variables to select
#' @param data_column_name if not `NULL` new nested data column name
#'
#' @return `tibble`
#' @export
#'
#' @examples
#'
#' gapminder::gapminder %>%
#' tbl_nest(grouping_variables = "year",
#' data_column_name = "ttdata")
#'
#'
#' gapminder::gapminder %>%
#' tbl_nest(grouping_variables = c("continent", "country"),
#' nesting_variables = c("pop", "year"))
#'
#'
tbl_nest <- function(data,
                     grouping_variables = NULL,
                     nesting_variables = NULL,
                     data_column_name = NULL) {
  if (length(grouping_variables) == 0) {
    "Define grouping variable(s)" %>% message()
    return(data)
  }

  if (length(nesting_variables) == 0) {
    data <- data %>%
      group_by(!!!syms(grouping_variables)) %>%
      nest() %>%
      ungroup()
  }

  if (length(nesting_variables) > 0) {
    data <-
      data %>%
      select(one_of(c(
        grouping_variables, nesting_variables
      ))) %>%
      group_by(!!!syms(grouping_variables)) %>%
      nest() %>%
      ungroup()

  }

  if (length(data_column_name) > 0) {
    data <-
      data %>%
      rename(UQ(data_column_name) := data)
  }

  data
}


# groups ------------------------------------------------------------------


#' Create grouping variable
#'
#' @param data
#' @param group_variables vector of grouping variables
#' @param group_name name of new column
#' @param sep separator
#' @param remove remove original columns
#' @param override_group_name
#'
#' @return `tibble`
#' @export
#'
#' @examples
#' library(tidyverse)
#' library(asbtools)
#' diamonds %>% tbl_group_variables(group_variables = c("cut", "clarity"))
#' diamonds %>% tbl_group_variables(group_variables = c("cut", "clarity"), group_name = NULL)
#' diamonds %>% tbl_group_variables(group_variables = c("cut", "clarity"), group_name = NULL, remove = F)
#'

tbl_group_variables <-
  function(data,
           group_variables = NULL,
           group_name =  "group",
           override_group_name = F,
           sep = "|",
           remove = T) {
    if (length(group_variables) == 0) {
      message("No group variables")
      return(data)
    }

    if (length(group_name) == 0 | override_group_name) {
      group_name <-
        group_variables %>% make_clean_names() %>% str_c(collapse = "_")
    }

    data %>%
      unite(
        col = !!sym(group_name),
        all_of(group_variables),
        sep = sep,
        remove = remove
      ) %>%
      select(one_of(group_name), everything())
  }


#' Create id variables
#'
#' @param data a `tibble`
#' @param id_variables vector of id variables
#' @param id_name name of the new id variable
#' @param remove remove original variable
#' @param sep id separator
#' @param override_id_name
#'
#' @return
#' @export
#'
#' @examples
#' library(tidyverse)
#' library(asbtools)
#' diamonds %>% tbl_id_variables()
#' diamonds %>% tbl_id_variables(id_variables = c("cut", "clarity"), id_name = "id")
#' diamonds %>% tbl_id_variables(id_variables = c("cut", "clarity"), id_name = NULL)
#' diamonds %>% tbl_id_variables(id_variables = c("cut", "clarity"), group_name = NULL, remove = F)

tbl_id_variables <-
  function(data,
           id_variables = NULL,
           id_name = "id",
           override_id_name = F,
           remove = T,
           sep = "|") {

    if (length(id_name) == 0 && length(id_variables) == 0) {
      id_name <- "id"
    }

    if (length(id_name) == 0 && !override_id_name) {
      id_name <- "id"
    }

    has_id <- id_name %in% names(data)

    if (has_id) {
      data <- data %>% select(-one_of(id_name))
    }

    if (length(id_variables) == 0) {
      "No identified id variables using row id" %>% message()
      data <- data %>%
        mutate(UQ(id_name) := 1:n()) %>%
        select(one_of(id_name), everything())
      return(data)
    }


    if (length(id_name) == 0 | override_id_name) {
      id_name <-
        id_variables %>% make_clean_names() %>% str_c(collapse = "_") %>%
        str_c("id", ., sep = "_")
    }


    data <-
      data %>%
      unite(
        col = !!sym(id_name),
        all_of(id_variables),
        sep = sep,
        remove = remove
      ) %>%
      select(one_of(id_name), everything())

    data_rows <- nrow(data)

    distinct_id <- data %>%
      distinct(!!sym(id_name)) %>% nrow()

    if (data_rows != distinct_id) {
      glue("Warning the data contains {data_rows} rows while there are {distinct_id} in the {str_c(id_variables, collapse = '_')} you assigned to {id_name} consider overriding to a row id") %>% message()
    }


    data
  }
abresler/asbtools documentation built on July 28, 2022, 11:04 p.m.