Nothing
#
# _ _ _ _ _
# (_) | | | | | | | |
# _ __ ___ _ _ __ | |_ | |__ | | __ _ _ __ | | __
# | '_ \ / _ \ | || '_ \ | __|| '_ \ | | / _` || '_ \ | |/ /
# | |_) || (_) || || | | || |_ | |_) || || (_| || | | || <
# | .__/ \___/ |_||_| |_| \__||_.__/ |_| \__,_||_| |_||_|\_\
# | |
# |_|
#
# This file is part of the 'rich-iannone/pointblank' package.
#
# (c) Richard Iannone <riannone@me.com>
#
# For full copyright and license information, please look at
# https://rich-iannone.github.io/pointblank/LICENSE.html
#
get_column_roles <- function(data) {
if (!is_a_table_object(data)) {
return(NULL)
}
col_types <- get_column_types(data)
col_roles <- character(length(col_types$name))
for (i in seq_along(col_types$name)) {
col_type_i <- col_types$r_col_type[i]
col_name_i <- col_types$name[i]
data_column <- data %>% dplyr::select({{ col_name_i }})
col_role_i <-
switch(
col_type_i,
character = get_column_role_character(data_column),
numeric = get_column_role_numeric(data_column),
factor = "string.categorical",
ordered = "string.ordinal",
integer = "integer.discrete",
integer64 = "integer.discrete",
logical = "boolean.logical.categorical",
POSIXct = "datetime.continuous",
Date = "date.discrete",
list = "list_object",
NA_character_
)
col_roles[i] <- col_role_i
}
col_roles
}
# This returns a `data_column` with NULL values removed using the
# sampling procedure of `first_n` or `random_n`
get_non_null_col_sample <- function(
data_column,
sample_type = c("first_n", "random_n"),
sample_n = 1000,
sample_min_n = 1,
make_distinct = FALSE
) {
sample_type <- match.arg(sample_type)
# Stop function if there isn't exactly one column in `data_column`
if (ncol(data_column) != 1) {
stop(
"The table given as `data_column` must have exactly one column.",
call. = FALSE
)
}
# Filter out all NA/NULL values from the `data_column`
data_column <-
data_column %>%
dplyr::rename(a__ = 1) %>%
dplyr::filter(!is.na(a__))
# Obtain a subsample of non-NULL values in the column
if (sample_type == "first_n") {
data_column <- utils::head(data_column, sample_n)
} else {
data_column <- dplyr::slice_sample(data_column, n = sample_n)
}
if (make_distinct) {
data_column <- dplyr::distinct(data_column)
}
# Count the values in the column and return `NULL` if it the
# count is below the threshold set `sample_min_n`
data_column_count <- dplyr::pull(dplyr::count(data_column), n)
if (data_column_count < sample_min_n) {
return(NULL)
}
data_column
}
# This provides the cardinality of a column and we may choose
# include the exclude any NA group from this count
get_column_cardinality <- function(
data_column,
include_na = TRUE
) {
data_column_groups <-
data_column %>%
dplyr::rename(a__ = 1) %>%
dplyr::distinct()
data_column_has_any_na <-
data_column_groups %>%
dplyr::filter(is.na(a__)) %>%
dplyr::distinct() %>%
dplyr::collect() %>%
nrow() %>%
as.logical()
data_column_groups_n <-
data_column_groups %>%
dplyr::count() %>%
dplyr::pull(n) %>%
as.integer()
if (data_column_has_any_na && !include_na) {
data_column_groups_n <- data_column_groups_n - 1L
}
data_column_groups_n
}
# Check if a column of numerical values is fully integer-like
is_column_integerlike <- function(data_column) {
data_vals <-
data_column %>%
dplyr::rename(a__ = 1) %>%
dplyr::distinct() %>%
dplyr::collect() %>%
dplyr::pull(a__)
rlang::is_integerish(data_vals)
}
# Get the types for a column; this provides one or more R types and also
# the SQL type if the `data_column` comes from a database source
get_column_types <- function(data_column) {
column_info <- get_tbl_information(tbl = data_column)
column_info["tbl_src_details"] <- NULL
column_info["db_tbl_name"] <- NULL
names(column_info) <- c("tbl_src", "name", "r_col_type", "db_col_type")
column_info
}
get_string_length_stats <- function(data_column) {
data_column %>%
dplyr::rename(a__ = 1) %>%
dplyr::mutate(str_length__ = nchar(a__)) %>%
dplyr::group_by() %>%
dplyr::summarize(
min = min(str_length__, na.rm = TRUE),
mean = mean(str_length__, na.rm = TRUE),
max = max(str_length__, na.rm = TRUE)
) %>%
dplyr::ungroup() %>%
as.list()
}
# nolint start
col_name_possibly__id <- function(col_name) {
patterns <-
c(
"^(GU|gu|Gu|gU|UU|uu|Uu|uU)?(i|I)(d|D)$",
"^(GU|gu|Gu|gU|UU|uu|Uu|uU)?(i|I)(d|D)[ \\._]",
"[ \\._](GU|gu|Gu|gU|UU|uu|Uu|uU)?(i|I)(d|D)$",
"[ \\._](GU|gu|Gu|gU|UU|uu|Uu|uU)?(i|I)(d|D)[ \\._]",
"[a-z]ID$",
"^Id[A-Z]"
)
check_patterns(col_name, patterns)
}
col_name_possibly__cat <- function(col_name) {
patterns <-
c(
"^(CATG?|Catg?|catg?|TYPE|Type|type)$",
"^(CATG?|Catg?|catg?|TYPE|Type|type)[ \\._]",
"[ \\._](CATG?|Catg?|catg?|TYPE|Type|type)$",
"[ \\._](CATG?|Catg?|catg?|TYPE|Type|type)[ \\._]",
"[a-z](CATG?|Catg?|TYPE|Type)$",
"[A-Z](catg?|type)$",
"^(Catg?|catg?|type|Type)[A-Z]",
"^(CATG?|TYPE)[a-z]"
)
check_patterns(col_name, patterns)
}
col_name_possibly__ind <- function(col_name) {
patterns <-
c(
"^((i|I)(n|N)(d|D)|(i|I)(s|S))$",
"^((i|I)(n|N)(d|D)|(i|I)(s|S))[ \\._]",
"[ \\._]((i|I)(n|N)(d|D)|(i|I)(s|S))$",
"[ \\._]((i|I)(n|N)(d|D)|(i|I)(s|S))[ \\._]",
"[a-z](IS|IND)$",
"[A-Z](is|ind)$",
"^(Is|Ind)[A-Z]",
"^(IS|IND)[a-z]"
)
check_patterns(col_name, patterns)
}
col_name_possibly__count <- function(col_name) {
patterns <-
c(
"^((n|N)|(c|C)(t|T)|(c|C)(o|O)(u|U)(n|N)(t|T))$",
"^((n|N)|(c|C)(t|T)|(c|C)(o|O)(u|U)(n|N)(t|T))[ \\._]",
"[ \\._]((n|N)|(c|C)(t|T)|(c|C)(o|O)(u|U)(n|N)(t|T))$",
"[ \\._]((n|N)|(c|C)(t|T)|(c|C)(o|O)(u|U)(n|N)(t|T))[ \\._]",
"[a-z](N|C(t|T|ount|OUNT))$",
"[A-Z](n|ct|count)$",
"^(n|Ct|Count)[A-Z]",
"^(N|CT|COUNT)[a-z]"
)
check_patterns(col_name, patterns)
}
col_name_possibly__amount <- function(col_name) {
patterns <-
c(
"^((a|A)(m|M)(t|T)|(a|A)(m|M)(o|O)(u|U)(n|N)(t|T))$",
"^((a|A)(m|M)(t|T)|(a|A)(m|M)(o|O)(u|U)(n|N)(t|T))[ \\._]",
"[ \\._]((a|A)(m|M)(t|T)|(a|A)(m|M)(o|O)(u|U)(n|N)(t|T))$",
"[ \\._]((a|A)(m|M)(t|T)|(a|A)(m|M)(o|O)(u|U)(n|N)(t|T))[ \\._]",
"[a-z](Amt|Amount|AMT|AMOUNT)$",
"[A-Z](amt|amount)$",
"^(Amt|Amount|amt|amount)[A-Z]",
"^(AMT|AMOUNT)[a-z]"
)
check_patterns(col_name, patterns)
}
col_name_possibly__value <- function(col_name) {
patterns <-
c(
"^((v|V)(a|A)(l|L)|(v|V)(a|A)(l|L)(u|U)(e|E))$",
"^((v|V)(a|A)(l|L)|(v|V)(a|A)(l|L)(u|U)(e|E))[ \\._]",
"[ \\._]((v|V)(a|A)(l|L)|(v|V)(a|A)(l|L)(u|U)(e|E))$",
"[ \\._]((v|V)(a|A)(l|L)|(v|V)(a|A)(l|L)(u|U)(e|E))[ \\._]",
"[a-z](Val|Value|VAL|VALUE)$",
"[A-Z](val|value)$",
"^(Val|Value|val|value)[A-Z]",
"^(VAL|VALUE)[a-z]"
)
check_patterns(col_name, patterns)
}
col_name_possibly__date <- function(col_name) {
patterns <-
c(
"^((d|D)(t|T)|(d|D)(a|A)(t|T)(e|E))$",
"^((d|D)(t|T)|(d|D)(a|A)(t|T)(e|E))[ \\._]",
"[ \\._]((d|D)(t|T)|(d|D)(a|A)(t|T)(e|E))$",
"[ \\._]((d|D)(t|T)|(d|D)(a|A)(t|T)(e|E))[ \\._]",
"[a-z](Dt|Date|DT|DATE)$",
"[A-Z](dt|date)$",
"^(Dt|Date|dt|date)[A-Z]",
"^(DT|DATE)[a-z]"
)
check_patterns(col_name, patterns)
}
col_name_possibly__datetime <- function(col_name) {
patterns <-
c(
"^(time|Time|TIME|datetime|Datetime|DATETIME|timestamp|Timestamp|TIMESTAMP)$",
"^(time|Time|TIME|datetime|Datetime|DATETIME|timestamp|Timestamp|TIMESTAMP)[ \\._]",
"[ \\._](time|Time|TIME|datetime|Datetime|DATETIME|timestamp|Timestamp|TIMESTAMP)$",
"[ \\._](time|Time|TIME|datetime|Datetime|DATETIME|timestamp|Timestamp|TIMESTAMP)[ \\._]",
"[a-z](Time|TIME|Datetime|DATETIME|Timestamp|TIMESTAMP)$",
"^(time|Time|datetime|Datetime|timestamp|Timestamp)[A-Z]",
"^(TIME|DATETIME|TIMESTAMP)[a-z]"
)
check_patterns(col_name, patterns)
}
col_name_possibly__latitude <- function(col_name) {
patterns <-
c(
"^((l|L)(a|A)(t|T)|(l|L)(a|A)(t|T)(i|I)(t|T)(u|U)(d|D)(e|E))$",
"^((l|L)(a|A)(t|T)|(l|L)(a|A)(t|T)(i|I)(t|T)(u|U)(d|D)(e|E))[ \\._]",
"[ \\._]((l|L)(a|A)(t|T)|(l|L)(a|A)(t|T)(i|I)(t|T)(u|U)(d|D)(e|E))$",
"[ \\._]((l|L)(a|A)(t|T)|(l|L)(a|A)(t|T)(i|I)(t|T)(u|U)(d|D)(e|E))[ \\._]",
"[a-z](L(at|atitude|AT|ATITUDE))$",
"[A-Z](l(at|atitude))$",
"^(Lat|lat|Latitude|latitude)[A-Z]",
"^(LAT|LATITUDE)[a-z]"
)
check_patterns(col_name, patterns)
}
col_name_possibly__longitude <- function(col_name) {
patterns <-
c(
"^((l|L)(o|O)(n|N)(g|G)?|(l|L)(o|O)(n|N)(g|G)(i|I)(t|T)(u|U)(d|D)(e|E))$",
"^((l|L)(o|O)(n|N)(g|G)?|(l|L)(o|O)(n|N)(g|G)(i|I)(t|T)(u|U)(d|D)(e|E))[ \\._]",
"[ \\._]((l|L)(o|O)(n|N)(g|G)?|(l|L)(o|O)(n|N)(g|G)(i|I)(t|T)(u|U)(d|D)(e|E))$",
"[ \\._]((l|L)(o|O)(n|N)(g|G)?|(l|L)(o|O)(n|N)(g|G)(i|I)(t|T)(u|U)(d|D)(e|E))[ \\._]",
"[a-z](L(on|ong|ongitude|ON|ONG|ONGITUDE))$",
"[A-Z](l(on|ong|ongitude))$",
"^(Lon|lon|Longitude|longitude)[A-Z]",
"^(LON|LONG|LONGITUDE)[a-z]"
)
check_patterns(col_name, patterns)
}
col_name_possibly__country <- function(col_name) {
patterns <-
c(
"^(COUNTRY|Country|country|CTRY|Ctry|ctry|(iso|ISO)2|(iso|ISO)3)$",
"^(COUNTRY|Country|country|CTRY|Ctry|ctry|(iso|ISO)2|(iso|ISO)3)[ \\._]",
"[ \\._](COUNTRY|Country|country|CTRY|Ctry|ctry|(iso|ISO)2|(iso|ISO)3)$",
"[ \\._](COUNTRY|Country|country|CTRY|Ctry|ctry|(iso|ISO)2|(iso|ISO)3)[ \\._]",
"[a-z](COUNTRY|Country|CTRY|Ctry|ISO2|ISO3)$",
"[A-Z](country|ctry|iso2|iso3)$",
"^(Country|country|Ctry|ctry|(iso|ISO)2|(iso|ISO)3)[A-Z]",
"^(COUTNRY|CTRY|(iso|ISO)2|(iso|ISO)3)[a-z]"
)
check_patterns(col_name, patterns)
}
col_name_possibly__subd <- function(col_name) {
# kw_lower <-
# c("state", "province", "prov", "r.gio", "division", "district",
# "distrito", "land", "pref")
#
# kw_upper <- toupper(kw_lower)
#
# kw_title <- tools::toTitleCase(kw_lower)
#
# kw_all_pipe <-
# paste0("(", paste(c(kw_lower, kw_upper, kw_title), collapse = "|"), ")")
#
# kw_ut_pipe <-
# paste0("(", paste(c(kw_upper, kw_title), collapse = "|"), ")")
#
# kw_lt_pipe <-
# paste0("(", paste(c(kw_lower, kw_title), collapse = "|"), ")")
#
# kw_u_pipe <-
# paste0("(", paste(kw_upper, collapse = "|"), ")")
#
#
# patterns <-
# c(
# paste0("^", kw_all_pipe, "$"),
# paste0("^", kw_all_pipe, "[ \\._]"),
# paste0("[ \\._]", kw_all_pipe, "$"),
# paste0("[ \\._]", kw_all_pipe, "[ \\._]"),
# paste0("[a-z]", kw_ut_pipe, "$"),
# paste0("^", kw_lt_pipe, "[A-Z]")
# )
patterns <-
c(
"^(state|province|prov|r.gio|division|district|distrito|land|pref|STATE|PROVINCE|PROV|R.GIO|DIVISION|DISTRICT|DISTRITO|LAND|PREF|State|Province|Prov|R.gio|Division|District|Distrito|Land|Pref|NAME|name)$",
"^(state|province|prov|r.gio|division|district|distrito|land|pref|STATE|PROVINCE|PROV|R.GIO|DIVISION|DISTRICT|DISTRITO|LAND|PREF|State|Province|Prov|R.gio|Division|District|Distrito|Land|Pref|NAME|name)[ \\._]",
"[ \\._](state|province|prov|r.gio|division|district|distrito|land|pref|STATE|PROVINCE|PROV|R.GIO|DIVISION|DISTRICT|DISTRITO|LAND|PREF|State|Province|Prov|R.gio|Division|District|Distrito|Land|Pref|NAME|name)$",
"[ \\._](state|province|prov|r.gio|division|district|distrito|land|pref|STATE|PROVINCE|PROV|R.GIO|DIVISION|DISTRICT|DISTRITO|LAND|PREF|State|Province|Prov|R.gio|Division|District|Distrito|Land|Pref|NAME|name)[ \\._]",
"[a-z](STATE|PROVINCE|PROV|R.GIO|DIVISION|DISTRICT|DISTRITO|LAND|PREF|State|Province|Prov|R.gio|Division|District|Distrito|Land|Pref|NAME|name)$",
"[A-Z](state|province|prov|r.gio|division|district|distrito|land|pref)$",
"^(state|province|prov|r.gio|division|district|distrito|land|pref|State|Province|Prov|R.gio|Division|District|Distrito|Land|Pref)[A-Z]",
"^(STATE|PROVINCE|PROV|R.GIO|DIVISION|DISTRICT|DISTRITO|LAND|PREF)[a-z]"
)
check_patterns(col_name, patterns)
}
col_name_possibly__currency_type <- function(col_name) {
patterns <-
c(
"^(CURRENCY|Currency|currency|CURR?|Curr?|curr?)$",
"^(CURRENCY|Currency|currency|CURR?|Curr?|curr?|CY|cy)[ \\._]",
"[ \\._](CURRENCY|Currency|currency|CURR?|Curr?|curr?|CY|cy)$",
"[ \\._](CURRENCY|Currency|currency|CURR?|Curr?|curr?|CY|cy)[ \\._]",
"[a-z](CURRENCY|Currency|CURR?|Curr?|CY)$",
"[A-Z](currency|curr?|cy)$",
"^(Currency|currency|Curr?|curr?|cy)[A-Z]",
"^(CURRENCY|CURR?|CY)[a-z]"
)
check_patterns(col_name, patterns)
}
col_name_possibly__url <- function(col_name) {
patterns <-
c(
"^(URL|url|ADDR|Addr|addr|WEB|Web|web|SITE|Site|site|LINK|Link|link|PAGE|Page|page|WWW|www|Net|net)$",
"^(URL|url|ADDR|Addr|addr|WEB|Web|web|SITE|Site|site|LINK|Link|link|PAGE|Page|page|WWW|www|Net|net)[ \\._]",
"[ \\._](URL|url|ADDR|Addr|addr|WEB|Web|web|SITE|Site|site|LINK|Link|link|PAGE|Page|page|WWW|www|Net|net)$",
"[ \\._](URL|url|ADDR|Addr|addr|WEB|Web|web|SITE|Site|site|LINK|Link|link|PAGE|Page|page|WWW|www|Net|net)[ \\._]",
"[a-z](URL|ADDR|Addr|WEB|Web|SITE|Site|LINK|Link|PAGE|Page|WWW|Net)$",
"[A-Z](url|addr|web|site|link|page|www|net)$",
"^(url|Addr|addr|Web|web|Site|site|Link|link|Page|page|www|Net|net)[A-Z]",
"^(URL|ADDR|WEB|SITE|LINK|PAGE|WWW)[a-z]"
)
check_patterns(col_name, patterns)
}
col_name_possibly__email <- function(col_name) {
patterns <-
c(
"^(E?-?MAIL|E?-?mail|e?-?mail|ADDR|Addr|addr)$",
"^(E?-?MAIL|E?-?mail|e?-?mail|ADDR|Addr|addr)[ \\._]",
"[ \\._](E?-?MAIL|E?-?mail|e?-?mail|ADDR|Addr|addr)$",
"[ \\._](E?-?MAIL|E?-?mail|e?-?mail|ADDR|Addr|addr)[ \\._]",
"[a-z](E?-?MAIL|E?-?mail|ADDR|Addr)$",
"[A-Z](e?-?mail|addr)$",
"^(E?-?mail|e?-?mail|Addr|addr)[A-Z]",
"^(E?-?MAIL|ADDR)[a-z]"
)
check_patterns(col_name, patterns)
}
col_name_possibly__ip <- function(col_name) {
patterns <-
c(
"^(IP|ip|IPv[46]|ipv[46]|HOST|Host|host|NET|Net|net)$",
"^(IP|ip|IPv[46]|ipv[46]|HOST|Host|host|NET|Net|net)[ \\._]",
"[ \\._](IP|ip|IPv[46]|ipv[46]|HOST|Host|host|NET|Net|net)$",
"[ \\._](IP|ip|IPv[46]|ipv[46]|HOST|Host|host|NET|Net|net)[ \\._]",
"[a-z](IP|IPv[46]|HOST|Host|NET|Net)$",
"[A-Z](ip|ipv[46]|host|net)$",
"^(ip|IPv[46]|ipv[46]|Host|host|Net|net)[A-Z]",
"^(IP|IPv[46]|ipv[46]|HOST|NET)[a-z]"
)
check_patterns(col_name, patterns)
}
# nolint end
check_patterns <- function(x, patterns) {
for (pat in patterns) {
if (grepl(pat, x)) {
return(TRUE)
}
}
FALSE
}
get_column_role_character <- function(data_column) {
column_name <- colnames(data_column)
row_count <- get_table_total_rows(data_column)
cardinality <- get_column_cardinality(data_column)
column_samp <- get_non_null_col_sample(data_column)
if (is.null(column_samp)) {
return("string")
}
length_stats <- get_string_length_stats(column_samp)
# Get the distinct non-null items from the column sample
column_items <-
column_samp %>%
dplyr::distinct() %>%
dplyr::pull()
if (col_name_possibly__country(column_name)) {
# We might assume this is a column containing country names;
# it's useful to further probe how this is represented since
# there are a few standardized ways to refer to countries
if (any(column_items %in% countries$name)) {
prop_column_items <-
sum(column_items %in% countries$name) / length(column_items)
if (prop_column_items >= 0.5) {
return("country:iso3166-1-esn.string.categorical")
}
}
if (any(column_items %in% countries$alpha_3)) {
prop_column_items <-
sum(column_items %in% countries$alpha_3) / length(column_items)
if (prop_column_items >= 0.9) {
return("country:iso3166-1-a-3.string.categorical")
}
}
if (any(column_items %in% countries$alpha_2)) {
prop_column_items <-
sum(column_items %in% countries$alpha_2) / length(column_items)
if (prop_column_items >= 0.75) {
return("country:iso3166-1-a-2.string.categorical")
}
}
}
if (col_name_possibly__subd(column_name)) {
# The guess here is that this column may containing subdivisions;
# of one or maybe more countries; this warrant further investigation
# since even first-level subdivisions are standardized to great extent
# Extract all fully-alphabetical subdivision names to
# serve as a large set
subd_names <-
subdivisions %>%
dplyr::select(subd_name) %>%
dplyr::filter(grepl("[A-Z][A-Z]", subd_name)) %>%
dplyr::pull()
# If any of the column items matches any one of the
# `subd_names` then further inspect to see if there's a
# country-specific match
if (any(column_items %in% subd_names)) {
frac <- c()
for (co in names(subd_list_main)) {
# obtain fractional amounts of subdivision names in each main country
frac_co <-
sum(subd_list_main[[co]] %in% column_items) /
length(subd_list_main[[co]])
frac_co <- stats::setNames(frac_co, co)
frac <- c(frac, frac_co)
}
# Sort `frac` by decreasing value
frac <- sort(frac, decreasing = TRUE)
# Get the non-zero values
frac <- frac[frac != 0]
# Get the length of non-zero values
frac_length <- length(frac)
if (frac_length == 1) {
c_code <- names(frac)
return(
paste0("country_subd:iso3166-2[", c_code, "].string.categorical")
)
}
# If there's no country-specific matching, we're still
# somewhat sure this column has to do with country subdivisions
return("country_subd:iso3166-2.string.categorical")
}
# Extract all subdivision short names to serve as a large set
subd_names <-
subdivisions %>%
dplyr::filter(country_alpha_3 %in% names(subd_list_main)) %>%
dplyr::pull(name_en)
# If any of the column items matches any one of the
# `subd_names` then further inspect to see if there's a
# country-specific match
if (any(column_items %in% subd_names) &&
sum(column_items %in% subd_names) > 1) {
frac <- c()
for (co in names(subd_list_main)) {
subd_list_co <-
subdivisions %>%
dplyr::filter(country_alpha_3 == {{ co }}) %>%
dplyr::pull(name_en)
# obtain fractional amounts of subdivision names in each main country
frac_co <-
sum(subd_list_co %in% column_items) /
length(subd_list_co)
frac_co <- stats::setNames(frac_co, co)
frac <- c(frac, frac_co)
}
# Sort `frac` by decreasing value
frac <- sort(frac, decreasing = TRUE)
# Get the non-zero values
frac <- frac[frac != 0]
# Get the length of non-zero values
frac_length <- length(frac)
if (frac_length == 1) {
c_code <- names(frac)
return(
paste0("country_subd:iso3166-2-sn[", c_code, "].string.categorical")
)
}
# If there's no country-specific matching, we're still
# somewhat sure this column has to do with country subdivisions
return("country_subd:iso3166-2-sn.string.categorical")
}
}
if (col_name_possibly__currency_type(column_name)) {
# Perhaps this column contains currency types and the only way to
# be more certain is to check against 3-letter and 3-digit currency
# codes
prop_column_items_a <-
sum(column_items %in% currencies$currency_a) /
length(currencies$currency_a)
if (prop_column_items_a >= 0.9) {
return("currency:iso4217-a.string.categorical")
}
prop_column_items_n <-
sum(column_items %in% currencies$currency_n) /
length(currencies$currency_n)
if (prop_column_items_n >= 0.9) {
return("currency:iso4217-num.string.categorical")
}
}
if (col_name_possibly__url(column_name)) {
# This could very well be a URL and it's easy to check for that
# Check that values in the column follow URL formatting rules
prop_column_items_url <-
sum(grepl(regex_url(), column_items)) /
length(column_items)
if (prop_column_items_url >= 0.8) {
return("url.string")
}
}
if (col_name_possibly__email(column_name)) {
# This could very well be an email address; this isn't too
# difficult to verify
# Check that values in the column follow email formatting rules
prop_column_items_email <-
sum(grepl(regex_email(), column_items)) /
length(column_items)
if (prop_column_items_email >= 0.8) {
return("email.string")
}
}
if (col_name_possibly__ip(column_name)) {
# An IP address? Might possibly be the case. Let's check on that
# Check column for IPv4 addresses
prop_column_items_ipv4 <-
sum(grepl(regex_ipv4_address(), column_items)) /
length(column_items)
if (prop_column_items_ipv4 >= 0.9) {
return("ip.string")
}
# Check column for IPv6 addresses
prop_column_items_ipv6 <-
sum(grepl(regex_ipv6_address(), column_items)) /
length(column_items)
if (prop_column_items_ipv6 >= 0.1) {
return("ip.string")
}
}
if (col_name_possibly__id(column_name)) {
# If the column prominently has 'ID' in it's name, it's
# best to believe it's an ID column of some sort
return("id.string")
}
if (col_name_possibly__cat(column_name)) {
# This is a category or type column judging by the name;
# the naming is all-important in this case
return("string.categorical")
}
if (cardinality < 50 && row_count > 200) {
return("string.categorical")
}
if (col_name_possibly__datetime(column_name)) {
# Determine if the column sample has strings that are
# parsable as datetime values
parsable <- FALSE
for (s_fmt in strptime_8601_formats) {
parsed <- strptime(column_items, format = s_fmt)
prop_parsed <- sum(!is.na(parsed)) / length(parsed)
if (prop_parsed > 0.8) {
parsable <- TRUE
break
}
}
if (parsable) {
return("datetime.string")
}
}
if (col_name_possibly__date(column_name)) {
# Determine if the column sample has strings that are
# parsable as datetime values
parsable <- FALSE
for (s_fmt in strptime_date_formats) {
parsed <- strptime(column_items, format = s_fmt)
prop_parsed <- sum(!is.na(parsed)) / length(parsed)
if (prop_parsed > 0.8) {
parsable <- TRUE
break
}
}
if (parsable) {
return("date.string")
}
}
return("string")
}
get_column_role_numeric <- function(data_column) {
column_name <- colnames(data_column)
row_count <- get_table_total_rows(data_column)
cardinality <- get_column_cardinality(data_column)
column_samp <- get_non_null_col_sample(data_column)
if (is.null(column_samp)) {
return("numeric")
}
# Initialize the role fragment
role <- ""
# Get the distinct non-null items from the column sample
column_items <-
column_samp %>%
dplyr::distinct() %>%
dplyr::pull()
if (col_name_possibly__latitude(column_name)) {
# Determine if the column sample has values that mostly lie
# within the valid range of latitude values
within_bounds <- column_items <= 90 & column_items >= -90
prop_within_bounds <- sum(!is.na(within_bounds)) / length(within_bounds)
if (prop_within_bounds > 0.8) {
role <- "geo:latitude."
}
}
if (col_name_possibly__longitude(column_name)) {
# Determine if the column sample has values that mostly lie
# within the valid range of longitude values
within_bounds <- column_items <= 180 & column_items >= -180
prop_within_bounds <- sum(!is.na(within_bounds)) / length(within_bounds)
if (prop_within_bounds > 0.8) {
role <- "geo:longitude."
}
}
if (row_count > 200 && is_column_integerlike(data_column = column_samp)) {
return(paste0(role, "numeric.discrete"))
} else {
return(paste0(role, "numeric.continuous"))
}
return(paste0(role, "numeric"))
}
strptime_8601_formats <-
c(
"%Y-%m-%d %H:%M:%S",
"%Y-%m-%dT%H:%M:%S",
"%Y/%m/%d %H:%M:%S"
)
strptime_date_formats <-
c(
"%Y-%m-%d",
"%Y-%m-%d",
"%Y/%m/%d"
)
Any scripts or data that you put into this service are public.
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.