pointblank: Data Validation and Organization of Metadata for Local and Remote Tables

#------------------------------------------------------------------------------#
# 
#                 _         _    _      _                _    
#                (_)       | |  | |    | |              | |   
#   _ __    ___   _  _ __  | |_ | |__  | |  __ _  _ __  | | __
#  | '_ \  / _ \ | || '_ \ | __|| '_ \ | | / _` || '_ \ | |/ /
#  | |_) || (_) || || | | || |_ | |_) || || (_| || | | ||   < 
#  | .__/  \___/ |_||_| |_| \__||_.__/ |_| \__,_||_| |_||_|\_\
#  | |                                                        
#  |_|                                                        
#  
#  This file is part of the 'rstudio/pointblank' project.
#  
#  Copyright (c) 2017-2024 pointblank authors
#  
#  For full copyright and license information, please look at
#  https://rstudio.github.io/pointblank/LICENSE.html
# 
#------------------------------------------------------------------------------#


# Get the column names for a table
# - works across all supported data sources
# - returns 'character' of length n
get_table_column_names <- function(data) {
  
  colnames(data)
}

# Get the total number of rows in a table
# - works across all supported data sources
# - returns 'numeric' of length 1
get_table_total_rows <- function(data) {

  as.numeric(dplyr::pull(dplyr::count(data, name = "n"), n))
}

# Get the total number of rows in a table
# - works across all supported data sources
# - returns 'integer' of length 1
get_table_total_columns <- function(data) {

  ncol(data)
}

# Get the total number of NA/NULLs in a table
# - works across all supported data sources
# - returns 'numeric' of length 1
get_table_total_missing_values <- function(data) {
  
  collected <- 
    dplyr::collect(
      dplyr::summarise_all(
        data, ~ sum(ifelse(is.na(.), 1, 0), na.rm = TRUE)
      )
    )

  sum(as.vector(t(collected)))
}

# Get the total number of distinct rows in a table
# - works across all supported data sources
# - returns 'numeric' of length 1
get_table_total_distinct_rows <- function(data) {

  as.numeric(dplyr::pull(dplyr::count(dplyr::distinct(data), name = "n"), n))
}

# Get the total number of distinct rows in a table column
# (must be a table with a single column)
# - works across all supported data sources
# - returns 'integer' of length 1
get_table_column_distinct_rows <- function(data_column) {
  
  as.integer(get_table_total_distinct_rows(data = data_column))
}

# Get the total number of NA/NULLs in a table column
# (must be a table with a single column)
# - works across all supported data sources
# - returns 'numeric' of length 1
get_table_column_na_values <- function(data_column) {
  
  get_table_total_missing_values(data = data_column)
}

# Get the total number of Inf/-Inf values in a table column
# (must be a table with a single column)
# - works across all supported data sources
# - returns 'integer' of length 1
get_table_column_inf_values <- function(data_column) {
  
  if (!inherits(data_column, "tbl_dbi") &&
      !inherits(data_column, "tbl_spark")) {
    
    inf_cells <-
      data_column %>%
      dplyr::pull(1) %>%
      is.infinite() %>%
      sum()
    
  } else {
    inf_cells <- 0L
  }
  
  inf_cells
}

# Get a 1-row tibble of mean, min, and max values for a table column
# (must be a table with a single column)
# - works across all supported data sources
# - returns 'tibble' of 1 row, 3 columns
get_table_column_summary <- function(data_column, round = 2) {
  
  data_column %>%
    dplyr::summarize_all(
      .funs = list(
        ~ mean(., na.rm = TRUE),
        ~ min(., na.rm = TRUE),
        ~ max(., na.rm = TRUE)
      )
    ) %>%
    dplyr::collect() %>%
    dplyr::summarize_all(~ round(., round)) %>%
    dplyr::mutate_all(.funs = as.numeric)
}

# Get a list of quantile statistics for a table column
# (must be a table with a single column)
# - works only with local data frames
# - returns a 'list' with 9 elements
get_df_column_qtile_stats <- function(data_column) {
  
  data_column %>%
    dplyr::summarize_all(
      .funs = list(
        min = ~ min(., na.rm = TRUE),
        p05 = ~ unname(stats::quantile(., probs = 0.05, na.rm = TRUE)),
        q_1 = ~ unname(stats::quantile(., probs = 0.25, na.rm = TRUE)),
        med = ~ unname(stats::quantile(., probs = 0.50, na.rm = TRUE)),
        q_3 = ~ unname(stats::quantile(., probs = 0.75, na.rm = TRUE)),
        p95 = ~ unname(stats::quantile(., probs = 0.95, na.rm = TRUE)),
        max = ~ max(., na.rm = TRUE),
        iqr = ~ stats::IQR(., na.rm = TRUE)
      )
    ) %>%
    dplyr::mutate(range = max - min) %>%
    dplyr::summarize_all(~ round(., 2)) %>%
    as.list()
}

# Get the mean value of a column in a table
# - works only with `tbl_dbi` objects
# - returns 'numeric' of length 1
get_dbi_column_mean <- function(data_column) {
  
  x <- dplyr::rename(data_column, a = 1)
  x <- dplyr::group_by(x)
  x <- dplyr::summarize(x, "__mean__" = mean(a, na.rm = TRUE))
  x <- dplyr::pull(x, `__mean__`)
  
  x
}

# nocov start

# Get the mean value of a column in a table
# - works only with `tbl_spark` objects
# - returns 'numeric' of length 1
get_spark_column_mean <- function(data_column) {
  
  get_dbi_column_mean(data_column = data_column)
}

# nocov end

# Get the variance value of a column in a table, after obtaining the mean value
# - works only with `tbl_dbi` objects
# - returns 'numeric' of length 1
get_dbi_column_variance <- function(data_column, mean_value) {
  
  x <- dplyr::rename(data_column, a = 1)
  x <- dplyr::mutate(x, "__diff__" = (!!mean_value - a)^2)
  x <- dplyr::group_by(x)
  x <- dplyr::summarize(x, "__var__"  = mean(`__diff__`, na.rm = TRUE))
  x <- dplyr::pull(x, `__var__`)
  
  x
}

# nocov start

# Get the variance value of a column in a table, after obtaining the mean value
# - works only with `tbl_spark` objects
# - returns 'numeric' of length 1
get_spark_column_variance <- function(data_column, mean_value) {
  
  get_dbi_column_variance(data_column = data_column, mean_value = mean_value)
}

# Get a list of quantile statistics for a Spark DataFrame column
# (must be a table with a single column)
# - works only with Spark DataFrames
# - returns a 'list' with 9 elements
get_spark_column_qtile_stats <- function(data_column) {
  
  column_name <- colnames(data_column)
  
  quantiles <- 
    unname(
      sparklyr::sdf_quantile(
        data_column, column_name,
        probabilities = c(0, 0.05, 0.25, 0.5, 0.75, 0.95, 1)
      )
    )

  list(
    min = quantiles[1],
    p05 = quantiles[2],
    q_1 = quantiles[3],
    med = quantiles[4],
    q_3 = quantiles[5],
    p95 = quantiles[6],
    max = quantiles[7],
    iqr = quantiles[5] - quantiles[3],
    range = quantiles[7] - quantiles[1]
  ) %>% 
    lapply(FUN = function(x) round(x, 2))
}

# nocov end

# Get a list of quantile statistics for a DBI table column
# (must be a table with a single column)
# - works only with `tbl_dbi` objects
# - returns a 'list' with 9 elements
get_dbi_column_qtile_stats <- function(data_column) {
  
  data_arranged <- 
    data_column %>%
    dplyr::rename(a = 1) %>%
    dplyr::filter(!is.na(a)) %>%
    dplyr::arrange(a) %>%
    utils::head(6E8)
  
  n_rows <- get_table_total_rows(data = data_arranged)
  
  quantile_rows <- floor(c(0.05, 0.25, 0.5, 0.75, 0.95) * n_rows)
  
  # Should there be any rows estimated as `0`, change those to `1`
  quantile_rows[quantile_rows == 0] <- 1
  
  dplyr::tibble(
    min = data_arranged %>% 
      dplyr::summarize(a = min(a, na.rm = TRUE)) %>%
      dplyr::pull(a) %>%
      as.numeric(),
    p05 = data_arranged %>% 
      utils::head(quantile_rows[1]) %>%
      dplyr::arrange(dplyr::desc(a)) %>%
      utils::head(1) %>%
      dplyr::pull(a) %>%
      as.numeric(),
    q_1 = data_arranged %>% 
      utils::head(quantile_rows[2]) %>%
      dplyr::arrange(dplyr::desc(a)) %>%
      utils::head(1) %>%
      dplyr::pull(a) %>%
      as.numeric(),
    med = data_arranged %>% 
      utils::head(quantile_rows[3]) %>%
      dplyr::arrange(dplyr::desc(a)) %>%
      utils::head(1) %>%
      dplyr::pull(a) %>%
      as.numeric(),
    q_3 = data_arranged %>%
      utils::head(quantile_rows[4]) %>%
      dplyr::arrange(dplyr::desc(a)) %>%
      utils::head(1) %>%
      dplyr::pull(a) %>%
      as.numeric(),
    p95 = data_arranged %>%
      utils::head(quantile_rows[5]) %>%
      dplyr::arrange(dplyr::desc(a)) %>%
      utils::head(1) %>%
      dplyr::pull(a) %>%
      as.numeric(),
    max = data_arranged %>%
      dplyr::summarize(a = max(a, na.rm = TRUE)) %>%
      dplyr::pull(a) %>%
      as.numeric()
  ) %>%
    dplyr::mutate(
      iqr = q_3 - q_1,
      range = max - min
    ) %>%
    dplyr::summarize_all(~ round(., 2)) %>%
    as.list()
}

get_table_column_nchar_stats <- function(data_column) {
  
  data_column %>%
    dplyr::mutate_all(.funs = nchar) %>%
    dplyr::rename(nchar = 1) %>%
    dplyr::summarize_all(
      .funs = list(
        mean = ~ mean(., na.rm = TRUE),
        min = ~ min(., na.rm = TRUE),
        max = ~ max(., na.rm = TRUE)
      )
    ) %>%
    dplyr::collect() %>%
    dplyr::mutate_all(.funs = as.numeric) %>%
    as.list()
}

# Get a ggplot2 histogram plot of character lengths for a table column
# (must be a table with a single column)
# - works across all supported data sources
# - returns a ggplot object
get_table_column_histogram <- function(data_column, lang, locale) {
  
  # TODO: Use locale value to get proper settings comma format
  
  x_label <- get_lsv("table_scan/plot_lab_string_length")[[lang]]
  y_label <- get_lsv("table_scan/plot_lab_count")[[lang]]
  
  suppressWarnings(
    data_column %>%
      dplyr::mutate_all(.funs = nchar) %>%
      dplyr::rename(nchar = 1) %>%
      dplyr::group_by(nchar) %>%
      dplyr::summarize(n = dplyr::n()) %>%
      dplyr::collect() %>%
      dplyr::filter(!is.na(nchar)) %>%
      dplyr::mutate_all(.funs = as.numeric) %>%
      ggplot2::ggplot(ggplot2::aes(x = nchar, y = n)) +
      ggplot2::geom_col(fill = "steelblue", width = 0.5) +
      ggplot2::geom_hline(yintercept = 0, color = "#B2B2B2") +
      ggplot2::labs(x = x_label, y = y_label) +
      ggplot2::scale_y_continuous(labels = scales::comma_format()) +
      ggplot2::theme_minimal()
  )
}

# Get a tibble of binned missing value proportions for a table
# - works only with `tbl_dbi` objects
# - returns 'tibble' of 20 bins * y columns rows, 4 columns
get_tbl_dbi_missing_tbl <- function(data) {
  
  n_rows <- get_table_total_rows(data = data)
  col_names <- get_table_column_names(data = data)
  
  if (n_rows < 20) {
    n_breaks <- n_rows
  } else {
    n_breaks <- 20
  }
  
  cuts <- floor(seq(from = 1, to = n_rows, length.out = n_breaks + 1))[-1]
  bin_size <- cuts[1]
  
  frequency_list <- 
    lapply(
      col_names,
      FUN = function(x__) {
        
        col_num <- which(col_names %in% x__)
        
        if (length(cuts) < 20) {
          bin_num <- cuts
        } else {
          bin_num <- 1:20
        }
        
        missing_tally <- 0L
        
        missing_freq <- 
          vapply(
            bin_num,
            FUN.VALUE = numeric(1),
            FUN = function(x) {
              
              missing_n_span <- 
                data %>% 
                dplyr::select(1, dplyr::one_of(x__))
              
              if (ncol(missing_n_span) == 1) {
                
                missing_n_span <- dplyr::rename(missing_n_span, a = 1)
                
              } else {
                
                missing_n_span <- dplyr::rename(missing_n_span, a = 2)
              }
              
              missing_n_span <- 
                missing_n_span %>%
                utils::head(cuts[x]) %>%
                dplyr::summarize_all(
                  ~ sum(ifelse(is.na(.), 1, 0), na.rm = TRUE)
                ) %>%
                dplyr::pull(a) %>%
                as.integer()
              
              missing_bin <- missing_n_span - missing_tally
              
              missing_tally <<- missing_tally + missing_bin
              
              as.numeric(missing_bin / bin_size)
            }
          )
        
        dplyr::tibble(
          col_name = x__,
          col_num = col_num,
          bin_num = bin_num,
          value = missing_freq
        )
      }
    )
  
  frequency_list %>%
    dplyr::bind_rows() %>%
    dplyr::mutate(value = ifelse(value == 0, NA_real_, value)) %>%
    dplyr::mutate(col_name = factor(col_name, levels = colnames(data)))
}

# nocov start

# Get a tibble of binned missing value proportions for a table
# - works only with `tbl_spark` objects
# - returns 'tibble' of 20 bins * y columns rows, 4 columns
get_tbl_spark_missing_tbl <- function(data) {
  
  get_tbl_dbi_missing_tbl(data = data)
}

# nocov end

# Get a tibble of binned missing value proportions for a table
# - works only with data frame objects
# - returns 'tibble' of 20 bins * y columns rows, 4 columns
get_tbl_df_missing_tbl <- function(data) {
  
  n_rows <- get_table_total_rows(data = data)
  col_names <- get_table_column_names(data = data)
  
  if (n_rows < 20) {
    n_breaks <- n_rows
  } else {
    n_breaks <- 20
  }
  
  cuts_ <- floor(seq(from = 1, to = n_rows, length.out = n_breaks + 1))[-1]
  
  frequency_list <- 
    lapply(
      col_names,
      FUN = function(x__) {
        
        data <- dplyr::select(data, dplyr::one_of(x__))
        data <- tibble::rowid_to_column(data)
        data <- 
          dplyr::mutate(data, `::cut_group::` = dplyr::case_when(
            rowid < cuts_[1]  ~ 1L,
            rowid < cuts_[2]  ~ 2L,
            rowid < cuts_[3]  ~ 3L,
            rowid < cuts_[4]  ~ 4L,
            rowid < cuts_[5]  ~ 5L,
            rowid < cuts_[6]  ~ 6L,
            rowid < cuts_[7]  ~ 7L,
            rowid < cuts_[8]  ~ 8L,
            rowid < cuts_[9]  ~ 9L,
            rowid < cuts_[10] ~ 10L,
            rowid < cuts_[11] ~ 11L,
            rowid < cuts_[12] ~ 12L,
            rowid < cuts_[13] ~ 13L,
            rowid < cuts_[14] ~ 14L,
            rowid < cuts_[15] ~ 15L,
            rowid < cuts_[16] ~ 16L,
            rowid < cuts_[17] ~ 17L,
            rowid < cuts_[18] ~ 18L,
            rowid < cuts_[19] ~ 19L,
            rowid < cuts_[20] ~ 20L,
            TRUE ~ 20L
          ))
        data <- dplyr::select(data, -rowid)
        data <- dplyr::group_by(data, `::cut_group::`)
        data <- dplyr::summarize_all(data, ~ sum(is.na(.)) / dplyr::n())
        data <- dplyr::select(data, -1)
        data <- dplyr::mutate(data, col_num = which(col_names %in% x__))
        data <- dplyr::mutate(data, bin_num = 1:n_breaks)
        data <- dplyr::mutate(data, col_name = x__)
        data <- dplyr::rename(data, value = 1)
        data <- dplyr::select(data, col_name, col_num, bin_num, value)
        
        data
      }
    )
  
  missing_tbl <- dplyr::bind_rows(frequency_list)
  missing_tbl <- 
    dplyr::mutate(missing_tbl, value = ifelse(value == 0, NA_real_, value))
  missing_tbl <- 
    dplyr::mutate(
      missing_tbl,
      col_name = factor(col_name, levels = col_names)
    )
  
  missing_tbl
}

# Get a tibble of missing value proportions, by column, for a table
# - works across all supported data sources
# - returns 'tibble' of x columns rows, 3 columns
get_missing_by_column_tbl <- function(data) {
  
  col_names <- get_table_column_names(data = data)
  
  missing_by_column_list <-
    lapply(
      col_names,
      FUN = function(x__) {
        
        data <- dplyr::select(data, dplyr::one_of(x__))
        data <- dplyr::group_by(data)
        data <- 
          dplyr::summarize_all(
            data,
            ~ sum(ifelse(is.na(.), 1, 0), na.rm = TRUE) / dplyr::n()
          )
        data <- dplyr::collect(data)
        data <- dplyr::mutate(data, col_num = which(col_names %in% x__))
        data <- dplyr::mutate(data, col_name = x__)
        data <- dplyr::rename(data, value = 1)
        
        data
      })
  
  missing_by_column_tbl <- dplyr::bind_rows(missing_by_column_list)
  missing_by_column_tbl <- 
    dplyr::mutate(missing_by_column_tbl, value = round(value, 2))
  missing_by_column_tbl <-
    dplyr::mutate(
      missing_by_column_tbl,
      col_name = factor(col_name, levels = colnames(data))
    )
  
  missing_by_column_tbl
}

# nocov start

# Get a ggplot2 plot of missing values by column (with up to 20 bins),
# supported by the `frequency_tbl` and `missing_by_column_tbl` objects
# - works across all supported data sources
# - returns a ggplot object
get_missing_value_plot <- function(data, frequency_tbl, missing_by_column_tbl) {
  
  n_rows <- get_table_total_rows(data = data)
  
  ggplot2::ggplot(
    frequency_tbl,
    ggplot2::aes(x = col_name, y = bin_num, fill = value)
  ) +
    ggplot2::geom_tile(color = "white", linejoin = "bevel") +
    ggplot2::scale_fill_gradientn(
      colours = c("gray85", "black"),
      na.value = "#A1C1E5",
      limits = c(0, 1)
    ) +
    ggplot2::scale_y_continuous(
      breaks = c(0, 1, 20),
      labels = c("", as.character(n_rows), "1")
    ) +
    ggplot2::geom_label(
      data = missing_by_column_tbl,
      mapping = ggplot2::aes(
        x = col_name,
        y = -0.2,
        label = value,
        color = value
      ),
      fill = "white",
      show.legend = FALSE
    ) +
    ggplot2::labs(x = "", y = "") + 
    ggplot2::theme_minimal() +
    ggplot2::theme(
      axis.text.x = ggplot2::element_text(
        angle = 90,
        vjust = 0.5,
        hjust = 1,
        size = 10,
        margin = ggplot2::margin(t = -1)
      ),
      axis.text.y = ggplot2::element_text(
        angle = 90,
        hjust = 0,
        margin = ggplot2::margin(r = -3)
      ),
      panel.grid = ggplot2::element_blank(),
      legend.direction = "horizontal",
      legend.title = ggplot2::element_blank(),
      legend.position = c(0.5, 1.0),
      plot.margin = ggplot2::unit(c(1, 0.5, 0, 0), "cm"),
      legend.key.width = ggplot2::unit(2.0, "cm"),
      legend.key.height = ggplot2::unit(3.0, "mm")
    )
}

get_table_slice_gt <- function(data_column,
                               locale) {
  
  data_column %>%
    gt::gt() %>%
    gt::fmt_percent(columns = 3, locale = locale) %>%
    gt_missing(columns = 1, missing_text = "**NA**") %>%
    gt::text_transform(
      locations = gt::cells_body(columns = 1),
      fn = function(x) ifelse(x == "**NA**", "<code>NA</code>", x)
    ) %>%
    gt::tab_options(
      table.border.top.style = "none",
      table.width = "100%"
    )
}

# nocov end

# Get a vector of column labels for a data frame
# - works only with data frame objects
# - returns 'character' of length n (NA if label not present or empty)
get_tbl_df_column_labels <- function(data) {
  
  n_columns <- get_table_total_columns(data = data)
  
  column_labels <- rep(NA_character_, n_columns)
  
  if (!inherits(data, "data.frame")) {
    return(column_labels)
  }
  
  vapply(
    data,
    FUN.VALUE = character(1),
    USE.NAMES = FALSE,
    FUN = function(x) {
      label <- attr(x, "label", exact = TRUE)
      if (is.null(label)) label <- NA_character_
      label
    }
  )
}
rich-iannone/pointblank documentation built on June 29, 2024, 4:09 p.m.
rdrr.io home R language documentation Run R code online
CRAN packages Bioconductor packages R-Forge packages GitHub packages
Note that we can't provide technical support on individual packages. You should contact the package authors for that.
rich-iannone/pointblank
Data Validation and Organization of Metadata for Local and Remote Tables

R/utils-profiling.R
In rich-iannone/pointblank: Data Validation and Organization of Metadata for Local and Remote Tables

Defines functions get_table_total_missing_values get_table_total_columns get_table_total_rows get_table_column_names

R Package Documentation

Browse R Packages

We want your feedback!

rich-iannone/pointblank Data Validation and Organization of Metadata for Local and Remote Tables

R/utils-profiling.R In rich-iannone/pointblank: Data Validation and Organization of Metadata for Local and Remote Tables

Defines functions get_table_total_missing_values get_table_total_columns get_table_total_rows get_table_column_names

R Package Documentation

Browse R Packages

We want your feedback!

rich-iannone/pointblank
Data Validation and Organization of Metadata for Local and Remote Tables

R/utils-profiling.R
In rich-iannone/pointblank: Data Validation and Organization of Metadata for Local and Remote Tables