R/column_roles.R

Defines functions get_column_role_numeric get_column_role_character check_patterns col_name_possibly__ip col_name_possibly__email col_name_possibly__url col_name_possibly__currency_type col_name_possibly__subd col_name_possibly__country col_name_possibly__longitude col_name_possibly__latitude col_name_possibly__datetime col_name_possibly__date col_name_possibly__value col_name_possibly__amount col_name_possibly__count col_name_possibly__ind col_name_possibly__cat col_name_possibly__id get_string_length_stats get_column_types is_column_integerlike get_column_cardinality get_non_null_col_sample get_column_roles

#
#                _         _    _      _                _    
#               (_)       | |  | |    | |              | |   
#  _ __    ___   _  _ __  | |_ | |__  | |  __ _  _ __  | | __
# | '_ \  / _ \ | || '_ \ | __|| '_ \ | | / _` || '_ \ | |/ /
# | |_) || (_) || || | | || |_ | |_) || || (_| || | | ||   < 
# | .__/  \___/ |_||_| |_| \__||_.__/ |_| \__,_||_| |_||_|\_\
# | |                                                        
# |_|                                                        
# 
# This file is part of the 'rich-iannone/pointblank' package.
# 
# (c) Richard Iannone <riannone@me.com>
# 
# For full copyright and license information, please look at
# https://rich-iannone.github.io/pointblank/LICENSE.html
#


get_column_roles <- function(data) {
  
  if (!is_a_table_object(data)) {
    return(NULL)
  }
  
  col_types <- get_column_types(data)
  
  col_roles <- character(length(col_types$name))
  
  for (i in seq_along(col_types$name)) {
    
    col_type_i <- col_types$r_col_type[i]
    col_name_i <- col_types$name[i]
    data_column <- data %>% dplyr::select({{ col_name_i }})
    
    col_role_i <- 
      switch(
        col_type_i,
        character = get_column_role_character(data_column),
        numeric = get_column_role_numeric(data_column),
        factor = "string.categorical",
        ordered = "string.ordinal",
        integer = "integer.discrete",
        integer64 = "integer.discrete",
        logical = "boolean.logical.categorical",
        POSIXct = "datetime.continuous",
        Date = "date.discrete",
        list = "list_object",
        NA_character_
      )
    
    col_roles[i] <- col_role_i 
  }
  
  col_roles
}

# This returns a `data_column` with NULL values removed using the
# sampling procedure of `first_n` or `random_n`
get_non_null_col_sample <- function(
    data_column,
    sample_type = c("first_n", "random_n"),
    sample_n = 1000,
    sample_min_n = 1,
    make_distinct = FALSE
) {
  
  sample_type <- match.arg(sample_type)
  
  # Stop function if there isn't exactly one column in `data_column`
  if (ncol(data_column) != 1) {
    
    stop(
      "The table given as `data_column` must have exactly one column.",
      call. = FALSE
    )
  }
  
  # Filter out all NA/NULL values from the `data_column`
  data_column <- 
    data_column %>%
    dplyr::rename(a__ = 1) %>%
    dplyr::filter(!is.na(a__))
  
  # Obtain a subsample of non-NULL values in the column
  if (sample_type == "first_n") {
    data_column <- utils::head(data_column, sample_n)
  } else {
    data_column <- dplyr::slice_sample(data_column, n = sample_n)
  }
  
  if (make_distinct) {
    data_column <- dplyr::distinct(data_column)
  }
  
  # Count the values in the column and return `NULL` if it the
  # count is below the threshold set `sample_min_n`
  data_column_count <- dplyr::pull(dplyr::count(data_column), n)
  
  if (data_column_count < sample_min_n) {
    return(NULL)
  }
  
  data_column
}

# This provides the cardinality of a column and we may choose
# include the exclude any NA group from this count
get_column_cardinality <- function(
    data_column,
    include_na = TRUE
) {
  
  data_column_groups <-
    data_column %>%
    dplyr::rename(a__ = 1) %>%
    dplyr::distinct()

  data_column_has_any_na <-
    data_column_groups %>%
    dplyr::filter(is.na(a__)) %>%
    dplyr::distinct() %>%
    dplyr::collect() %>%
    nrow() %>%
    as.logical()
  
  data_column_groups_n <-
    data_column_groups %>%
    dplyr::count() %>%
    dplyr::pull(n) %>%
    as.integer()
  
  if (data_column_has_any_na && !include_na) {
    data_column_groups_n <- data_column_groups_n - 1L
  }
  
  data_column_groups_n
}

# Check if a column of numerical values is fully integer-like
is_column_integerlike <- function(data_column) {
  
  data_vals <- 
    data_column %>%
    dplyr::rename(a__ = 1) %>%
    dplyr::distinct() %>%
    dplyr::collect() %>%
    dplyr::pull(a__)

  rlang::is_integerish(data_vals)
}

# Get the types for a column; this provides one or more R types and also
# the SQL type if the `data_column` comes from a database source
get_column_types <- function(data_column) {
  
  column_info <- get_tbl_information(tbl = data_column)
  
  column_info["tbl_src_details"] <- NULL
  column_info["db_tbl_name"] <- NULL
  
  names(column_info) <- c("tbl_src", "name", "r_col_type", "db_col_type")

  column_info
}

get_string_length_stats <- function(data_column) {
  
  data_column %>%
    dplyr::rename(a__ = 1) %>%
    dplyr::mutate(str_length__ = nchar(a__)) %>%
    dplyr::group_by() %>%
    dplyr::summarize(
      min = min(str_length__, na.rm = TRUE),
      mean = mean(str_length__, na.rm = TRUE),
      max = max(str_length__, na.rm = TRUE)
    ) %>%
    dplyr::ungroup() %>%
    as.list()
}

# nolint start

col_name_possibly__id <- function(col_name) {
  
  patterns <- 
    c(
      "^(GU|gu|Gu|gU|UU|uu|Uu|uU)?(i|I)(d|D)$",
      "^(GU|gu|Gu|gU|UU|uu|Uu|uU)?(i|I)(d|D)[ \\._]",
      "[ \\._](GU|gu|Gu|gU|UU|uu|Uu|uU)?(i|I)(d|D)$",
      "[ \\._](GU|gu|Gu|gU|UU|uu|Uu|uU)?(i|I)(d|D)[ \\._]",
      "[a-z]ID$",
      "^Id[A-Z]"
    )
  
  check_patterns(col_name, patterns)
}

col_name_possibly__cat <- function(col_name) {
  
  patterns <- 
    c(
      "^(CATG?|Catg?|catg?|TYPE|Type|type)$",
      "^(CATG?|Catg?|catg?|TYPE|Type|type)[ \\._]",
      "[ \\._](CATG?|Catg?|catg?|TYPE|Type|type)$",
      "[ \\._](CATG?|Catg?|catg?|TYPE|Type|type)[ \\._]",
      "[a-z](CATG?|Catg?|TYPE|Type)$",
      "[A-Z](catg?|type)$",
      "^(Catg?|catg?|type|Type)[A-Z]",
      "^(CATG?|TYPE)[a-z]"
    )
  
  check_patterns(col_name, patterns)
}

col_name_possibly__ind <- function(col_name) {
  
  patterns <- 
    c(
      "^((i|I)(n|N)(d|D)|(i|I)(s|S))$",
      "^((i|I)(n|N)(d|D)|(i|I)(s|S))[ \\._]",
      "[ \\._]((i|I)(n|N)(d|D)|(i|I)(s|S))$",
      "[ \\._]((i|I)(n|N)(d|D)|(i|I)(s|S))[ \\._]",
      "[a-z](IS|IND)$",
      "[A-Z](is|ind)$",
      "^(Is|Ind)[A-Z]",
      "^(IS|IND)[a-z]"
    )
  
  check_patterns(col_name, patterns)
}

col_name_possibly__count <- function(col_name) {
  
  patterns <- 
    c(
      "^((n|N)|(c|C)(t|T)|(c|C)(o|O)(u|U)(n|N)(t|T))$",
      "^((n|N)|(c|C)(t|T)|(c|C)(o|O)(u|U)(n|N)(t|T))[ \\._]",
      "[ \\._]((n|N)|(c|C)(t|T)|(c|C)(o|O)(u|U)(n|N)(t|T))$",
      "[ \\._]((n|N)|(c|C)(t|T)|(c|C)(o|O)(u|U)(n|N)(t|T))[ \\._]",
      "[a-z](N|C(t|T|ount|OUNT))$",
      "[A-Z](n|ct|count)$",
      "^(n|Ct|Count)[A-Z]",
      "^(N|CT|COUNT)[a-z]"
    )
  
  check_patterns(col_name, patterns)
}

col_name_possibly__amount <- function(col_name) {
  
  patterns <- 
    c(
      "^((a|A)(m|M)(t|T)|(a|A)(m|M)(o|O)(u|U)(n|N)(t|T))$",
      "^((a|A)(m|M)(t|T)|(a|A)(m|M)(o|O)(u|U)(n|N)(t|T))[ \\._]",
      "[ \\._]((a|A)(m|M)(t|T)|(a|A)(m|M)(o|O)(u|U)(n|N)(t|T))$",
      "[ \\._]((a|A)(m|M)(t|T)|(a|A)(m|M)(o|O)(u|U)(n|N)(t|T))[ \\._]",
      "[a-z](Amt|Amount|AMT|AMOUNT)$",
      "[A-Z](amt|amount)$",
      "^(Amt|Amount|amt|amount)[A-Z]",
      "^(AMT|AMOUNT)[a-z]"
    )
  
  check_patterns(col_name, patterns)
}

col_name_possibly__value <- function(col_name) {
  
  patterns <- 
    c(
      "^((v|V)(a|A)(l|L)|(v|V)(a|A)(l|L)(u|U)(e|E))$",
      "^((v|V)(a|A)(l|L)|(v|V)(a|A)(l|L)(u|U)(e|E))[ \\._]",
      "[ \\._]((v|V)(a|A)(l|L)|(v|V)(a|A)(l|L)(u|U)(e|E))$",
      "[ \\._]((v|V)(a|A)(l|L)|(v|V)(a|A)(l|L)(u|U)(e|E))[ \\._]",
      "[a-z](Val|Value|VAL|VALUE)$",
      "[A-Z](val|value)$",
      "^(Val|Value|val|value)[A-Z]",
      "^(VAL|VALUE)[a-z]"
    )
  
  check_patterns(col_name, patterns)
}

col_name_possibly__date <- function(col_name) {
  
  patterns <- 
    c(
      "^((d|D)(t|T)|(d|D)(a|A)(t|T)(e|E))$",
      "^((d|D)(t|T)|(d|D)(a|A)(t|T)(e|E))[ \\._]",
      "[ \\._]((d|D)(t|T)|(d|D)(a|A)(t|T)(e|E))$",
      "[ \\._]((d|D)(t|T)|(d|D)(a|A)(t|T)(e|E))[ \\._]",
      "[a-z](Dt|Date|DT|DATE)$",
      "[A-Z](dt|date)$",
      "^(Dt|Date|dt|date)[A-Z]",
      "^(DT|DATE)[a-z]"
    )
  
  check_patterns(col_name, patterns)
}

col_name_possibly__datetime <- function(col_name) {
  
  patterns <- 
    c(
      "^(time|Time|TIME|datetime|Datetime|DATETIME|timestamp|Timestamp|TIMESTAMP)$",
      "^(time|Time|TIME|datetime|Datetime|DATETIME|timestamp|Timestamp|TIMESTAMP)[ \\._]",
      "[ \\._](time|Time|TIME|datetime|Datetime|DATETIME|timestamp|Timestamp|TIMESTAMP)$",
      "[ \\._](time|Time|TIME|datetime|Datetime|DATETIME|timestamp|Timestamp|TIMESTAMP)[ \\._]",
      "[a-z](Time|TIME|Datetime|DATETIME|Timestamp|TIMESTAMP)$",
      "^(time|Time|datetime|Datetime|timestamp|Timestamp)[A-Z]",
      "^(TIME|DATETIME|TIMESTAMP)[a-z]"
    )
  
  check_patterns(col_name, patterns)
}

col_name_possibly__latitude <- function(col_name) {
  
  patterns <- 
    c(
      "^((l|L)(a|A)(t|T)|(l|L)(a|A)(t|T)(i|I)(t|T)(u|U)(d|D)(e|E))$",
      "^((l|L)(a|A)(t|T)|(l|L)(a|A)(t|T)(i|I)(t|T)(u|U)(d|D)(e|E))[ \\._]",
      "[ \\._]((l|L)(a|A)(t|T)|(l|L)(a|A)(t|T)(i|I)(t|T)(u|U)(d|D)(e|E))$",
      "[ \\._]((l|L)(a|A)(t|T)|(l|L)(a|A)(t|T)(i|I)(t|T)(u|U)(d|D)(e|E))[ \\._]",
      "[a-z](L(at|atitude|AT|ATITUDE))$",
      "[A-Z](l(at|atitude))$",
      "^(Lat|lat|Latitude|latitude)[A-Z]",
      "^(LAT|LATITUDE)[a-z]"
    )
  
  check_patterns(col_name, patterns)
}

col_name_possibly__longitude <- function(col_name) {
  
  patterns <- 
    c(
      "^((l|L)(o|O)(n|N)(g|G)?|(l|L)(o|O)(n|N)(g|G)(i|I)(t|T)(u|U)(d|D)(e|E))$",
      "^((l|L)(o|O)(n|N)(g|G)?|(l|L)(o|O)(n|N)(g|G)(i|I)(t|T)(u|U)(d|D)(e|E))[ \\._]",
      "[ \\._]((l|L)(o|O)(n|N)(g|G)?|(l|L)(o|O)(n|N)(g|G)(i|I)(t|T)(u|U)(d|D)(e|E))$",
      "[ \\._]((l|L)(o|O)(n|N)(g|G)?|(l|L)(o|O)(n|N)(g|G)(i|I)(t|T)(u|U)(d|D)(e|E))[ \\._]",
      "[a-z](L(on|ong|ongitude|ON|ONG|ONGITUDE))$",
      "[A-Z](l(on|ong|ongitude))$",
      "^(Lon|lon|Longitude|longitude)[A-Z]",
      "^(LON|LONG|LONGITUDE)[a-z]"
    )
  
  check_patterns(col_name, patterns)
}

col_name_possibly__country <- function(col_name) {
  
  patterns <- 
    c(
      "^(COUNTRY|Country|country|CTRY|Ctry|ctry|(iso|ISO)2|(iso|ISO)3)$",
      "^(COUNTRY|Country|country|CTRY|Ctry|ctry|(iso|ISO)2|(iso|ISO)3)[ \\._]",
      "[ \\._](COUNTRY|Country|country|CTRY|Ctry|ctry|(iso|ISO)2|(iso|ISO)3)$",
      "[ \\._](COUNTRY|Country|country|CTRY|Ctry|ctry|(iso|ISO)2|(iso|ISO)3)[ \\._]",
      "[a-z](COUNTRY|Country|CTRY|Ctry|ISO2|ISO3)$",
      "[A-Z](country|ctry|iso2|iso3)$",
      "^(Country|country|Ctry|ctry|(iso|ISO)2|(iso|ISO)3)[A-Z]",
      "^(COUTNRY|CTRY|(iso|ISO)2|(iso|ISO)3)[a-z]"
    )
  
  check_patterns(col_name, patterns)
}

col_name_possibly__subd <- function(col_name) {
  
  # kw_lower <-
  #   c("state", "province", "prov", "r.gio", "division", "district",
  #     "distrito", "land", "pref")
  # 
  # kw_upper <- toupper(kw_lower)
  # 
  # kw_title <- tools::toTitleCase(kw_lower)
  # 
  # kw_all_pipe <- 
  #   paste0("(", paste(c(kw_lower, kw_upper, kw_title), collapse = "|"), ")")
  # 
  # kw_ut_pipe <- 
  #   paste0("(", paste(c(kw_upper, kw_title), collapse = "|"), ")")
  # 
  # kw_lt_pipe <- 
  #   paste0("(", paste(c(kw_lower, kw_title), collapse = "|"), ")")
  #
  # kw_u_pipe <- 
  #   paste0("(", paste(kw_upper, collapse = "|"), ")")
  #
  # 
  # patterns <- 
  #   c(
  #     paste0("^", kw_all_pipe, "$"),
  #     paste0("^", kw_all_pipe, "[ \\._]"),
  #     paste0("[ \\._]", kw_all_pipe, "$"),
  #     paste0("[ \\._]", kw_all_pipe, "[ \\._]"),
  #     paste0("[a-z]", kw_ut_pipe, "$"),
  #     paste0("^", kw_lt_pipe, "[A-Z]")
  #   )
  
  patterns <- 
    c(
      "^(state|province|prov|r.gio|division|district|distrito|land|pref|STATE|PROVINCE|PROV|R.GIO|DIVISION|DISTRICT|DISTRITO|LAND|PREF|State|Province|Prov|R.gio|Division|District|Distrito|Land|Pref|NAME|name)$",
      "^(state|province|prov|r.gio|division|district|distrito|land|pref|STATE|PROVINCE|PROV|R.GIO|DIVISION|DISTRICT|DISTRITO|LAND|PREF|State|Province|Prov|R.gio|Division|District|Distrito|Land|Pref|NAME|name)[ \\._]",
      "[ \\._](state|province|prov|r.gio|division|district|distrito|land|pref|STATE|PROVINCE|PROV|R.GIO|DIVISION|DISTRICT|DISTRITO|LAND|PREF|State|Province|Prov|R.gio|Division|District|Distrito|Land|Pref|NAME|name)$",
      "[ \\._](state|province|prov|r.gio|division|district|distrito|land|pref|STATE|PROVINCE|PROV|R.GIO|DIVISION|DISTRICT|DISTRITO|LAND|PREF|State|Province|Prov|R.gio|Division|District|Distrito|Land|Pref|NAME|name)[ \\._]",
      "[a-z](STATE|PROVINCE|PROV|R.GIO|DIVISION|DISTRICT|DISTRITO|LAND|PREF|State|Province|Prov|R.gio|Division|District|Distrito|Land|Pref|NAME|name)$",
      "[A-Z](state|province|prov|r.gio|division|district|distrito|land|pref)$",
      "^(state|province|prov|r.gio|division|district|distrito|land|pref|State|Province|Prov|R.gio|Division|District|Distrito|Land|Pref)[A-Z]",
      "^(STATE|PROVINCE|PROV|R.GIO|DIVISION|DISTRICT|DISTRITO|LAND|PREF)[a-z]"
    )
  
  check_patterns(col_name, patterns)
}

col_name_possibly__currency_type <- function(col_name) {
  
  patterns <- 
    c(
      "^(CURRENCY|Currency|currency|CURR?|Curr?|curr?)$",
      "^(CURRENCY|Currency|currency|CURR?|Curr?|curr?|CY|cy)[ \\._]",
      "[ \\._](CURRENCY|Currency|currency|CURR?|Curr?|curr?|CY|cy)$",
      "[ \\._](CURRENCY|Currency|currency|CURR?|Curr?|curr?|CY|cy)[ \\._]",
      "[a-z](CURRENCY|Currency|CURR?|Curr?|CY)$",
      "[A-Z](currency|curr?|cy)$",
      "^(Currency|currency|Curr?|curr?|cy)[A-Z]",
      "^(CURRENCY|CURR?|CY)[a-z]"
    )
  
  check_patterns(col_name, patterns)
}

col_name_possibly__url <- function(col_name) {
  
  patterns <- 
    c(
      "^(URL|url|ADDR|Addr|addr|WEB|Web|web|SITE|Site|site|LINK|Link|link|PAGE|Page|page|WWW|www|Net|net)$",
      "^(URL|url|ADDR|Addr|addr|WEB|Web|web|SITE|Site|site|LINK|Link|link|PAGE|Page|page|WWW|www|Net|net)[ \\._]",
      "[ \\._](URL|url|ADDR|Addr|addr|WEB|Web|web|SITE|Site|site|LINK|Link|link|PAGE|Page|page|WWW|www|Net|net)$",
      "[ \\._](URL|url|ADDR|Addr|addr|WEB|Web|web|SITE|Site|site|LINK|Link|link|PAGE|Page|page|WWW|www|Net|net)[ \\._]",
      "[a-z](URL|ADDR|Addr|WEB|Web|SITE|Site|LINK|Link|PAGE|Page|WWW|Net)$",
      "[A-Z](url|addr|web|site|link|page|www|net)$",
      "^(url|Addr|addr|Web|web|Site|site|Link|link|Page|page|www|Net|net)[A-Z]",
      "^(URL|ADDR|WEB|SITE|LINK|PAGE|WWW)[a-z]"
    )
  
  check_patterns(col_name, patterns)
}

col_name_possibly__email <- function(col_name) {
  
  patterns <- 
    c(
      "^(E?-?MAIL|E?-?mail|e?-?mail|ADDR|Addr|addr)$",
      "^(E?-?MAIL|E?-?mail|e?-?mail|ADDR|Addr|addr)[ \\._]",
      "[ \\._](E?-?MAIL|E?-?mail|e?-?mail|ADDR|Addr|addr)$",
      "[ \\._](E?-?MAIL|E?-?mail|e?-?mail|ADDR|Addr|addr)[ \\._]",
      "[a-z](E?-?MAIL|E?-?mail|ADDR|Addr)$",
      "[A-Z](e?-?mail|addr)$",
      "^(E?-?mail|e?-?mail|Addr|addr)[A-Z]",
      "^(E?-?MAIL|ADDR)[a-z]"
    )
  
  check_patterns(col_name, patterns)
}

col_name_possibly__ip <- function(col_name) {
  
  patterns <- 
    c(
      "^(IP|ip|IPv[46]|ipv[46]|HOST|Host|host|NET|Net|net)$",
      "^(IP|ip|IPv[46]|ipv[46]|HOST|Host|host|NET|Net|net)[ \\._]",
      "[ \\._](IP|ip|IPv[46]|ipv[46]|HOST|Host|host|NET|Net|net)$",
      "[ \\._](IP|ip|IPv[46]|ipv[46]|HOST|Host|host|NET|Net|net)[ \\._]",
      "[a-z](IP|IPv[46]|HOST|Host|NET|Net)$",
      "[A-Z](ip|ipv[46]|host|net)$",
      "^(ip|IPv[46]|ipv[46]|Host|host|Net|net)[A-Z]",
      "^(IP|IPv[46]|ipv[46]|HOST|NET)[a-z]"
    )
  
  check_patterns(col_name, patterns)
}

# nolint end

check_patterns <- function(x, patterns) {
  
  for (pat in patterns) {
    
    if (grepl(pat, x)) {
      return(TRUE)
    } 
  }
  
  FALSE
}

get_column_role_character <- function(data_column) {
  
  column_name <- colnames(data_column)
  row_count <- get_table_total_rows(data_column)
  cardinality <- get_column_cardinality(data_column)
  column_samp <- get_non_null_col_sample(data_column)
  
  if (is.null(column_samp)) {
    return("string")
  }
  
  length_stats <- get_string_length_stats(column_samp)
  
  # Get the distinct non-null items from the column sample
  column_items <- 
    column_samp %>%
    dplyr::distinct() %>%
    dplyr::pull()
  
  if (col_name_possibly__country(column_name)) {
    
    # We might assume this is a column containing country names;
    # it's useful to further probe how this is represented since
    # there are a few standardized ways to refer to countries
    
    if (any(column_items %in% countries$name)) {
      
      prop_column_items <-
        sum(column_items %in% countries$name) / length(column_items)
      
      if (prop_column_items >= 0.5) {
        return("country:iso3166-1-esn.string.categorical")
      }
    }
    
    if (any(column_items %in% countries$alpha_3)) {
      
      prop_column_items <-
        sum(column_items %in% countries$alpha_3) / length(column_items)
      
      if (prop_column_items >= 0.9) {
        return("country:iso3166-1-a-3.string.categorical")
      }
    }
    
    if (any(column_items %in% countries$alpha_2)) {
      
      prop_column_items <-
        sum(column_items %in% countries$alpha_2) / length(column_items)
      
      if (prop_column_items >= 0.75) {
        return("country:iso3166-1-a-2.string.categorical")
        
      }
    }
  }
  
  if (col_name_possibly__subd(column_name)) {
    
    # The guess here is that this column may containing subdivisions;
    # of one or maybe more countries; this warrant further investigation
    # since even first-level subdivisions are standardized to great extent

    # Extract all fully-alphabetical subdivision names to
    # serve as a large set
    subd_names <- 
      subdivisions %>% 
      dplyr::select(subd_name) %>%
      dplyr::filter(grepl("[A-Z][A-Z]", subd_name)) %>% 
      dplyr::pull()
    
    # If any of the column items matches any one of the
    # `subd_names` then further inspect to see if there's a
    # country-specific match
    if (any(column_items %in% subd_names)) {
      
      frac <- c()
      
      for (co in names(subd_list_main)) {
        
        # obtain fractional amounts of subdivision names in each main country
        frac_co <- 
          sum(subd_list_main[[co]] %in% column_items) /
          length(subd_list_main[[co]])
        
        frac_co <- stats::setNames(frac_co, co)
        
        frac <- c(frac, frac_co)
      }
      
      # Sort `frac` by decreasing value
      frac <- sort(frac, decreasing = TRUE)
      
      # Get the non-zero values
      frac <- frac[frac != 0]
      
      # Get the length of non-zero values
      frac_length <- length(frac)
      
      if (frac_length == 1) {
        c_code <- names(frac)
        return(
          paste0("country_subd:iso3166-2[", c_code, "].string.categorical")
        )
      }
      
      # If there's no country-specific matching, we're still
      # somewhat sure this column has to do with country subdivisions
      return("country_subd:iso3166-2.string.categorical")
    }
    
    # Extract all subdivision short names to serve as a large set
    subd_names <- 
      subdivisions %>%
      dplyr::filter(country_alpha_3 %in% names(subd_list_main)) %>%
      dplyr::pull(name_en)
    
    # If any of the column items matches any one of the
    # `subd_names` then further inspect to see if there's a
    # country-specific match
    if (any(column_items %in% subd_names) && 
        sum(column_items %in% subd_names) > 1) {
      
      frac <- c()
      
      for (co in names(subd_list_main)) {
        
        subd_list_co <- 
          subdivisions %>%
          dplyr::filter(country_alpha_3 == {{ co }}) %>%
          dplyr::pull(name_en)
        
        # obtain fractional amounts of subdivision names in each main country
        frac_co <- 
          sum(subd_list_co %in% column_items) /
          length(subd_list_co)
        
        frac_co <- stats::setNames(frac_co, co)
        
        frac <- c(frac, frac_co)
      }
      
      # Sort `frac` by decreasing value
      frac <- sort(frac, decreasing = TRUE)
      
      # Get the non-zero values
      frac <- frac[frac != 0]
      
      # Get the length of non-zero values
      frac_length <- length(frac)
      
      if (frac_length == 1) {
        c_code <- names(frac)
        return(
          paste0("country_subd:iso3166-2-sn[", c_code, "].string.categorical")
        )
      }
      
      # If there's no country-specific matching, we're still
      # somewhat sure this column has to do with country subdivisions
      return("country_subd:iso3166-2-sn.string.categorical")
    }
  }
  
  if (col_name_possibly__currency_type(column_name)) {
    
    # Perhaps this column contains currency types and the only way to
    # be more certain is to check against 3-letter and 3-digit currency
    # codes
    
    prop_column_items_a <-
      sum(column_items %in% currencies$currency_a) / 
      length(currencies$currency_a)
    
    if (prop_column_items_a >= 0.9) {
      return("currency:iso4217-a.string.categorical")
    }
    
    prop_column_items_n <-
      sum(column_items %in% currencies$currency_n) / 
      length(currencies$currency_n)
    
    if (prop_column_items_n >= 0.9) {
      return("currency:iso4217-num.string.categorical")
    }
  }

  if (col_name_possibly__url(column_name)) {
    
    # This could very well be a URL and it's easy to check for that
    
    # Check that values in the column follow URL formatting rules
    prop_column_items_url <-
      sum(grepl(regex_url(), column_items)) / 
      length(column_items)
    
    if (prop_column_items_url >= 0.8) {
      return("url.string")
    }
  }
    
  if (col_name_possibly__email(column_name)) {
    
    # This could very well be an email address; this isn't too
    # difficult to verify
    
    # Check that values in the column follow email formatting rules
    prop_column_items_email <-
      sum(grepl(regex_email(), column_items)) / 
      length(column_items)
    
    if (prop_column_items_email >= 0.8) {
      return("email.string")
    }
  }
  
  if (col_name_possibly__ip(column_name)) {
    
    # An IP address? Might possibly be the case. Let's check on that
    
    # Check column for IPv4 addresses
    prop_column_items_ipv4 <-
      sum(grepl(regex_ipv4_address(), column_items)) / 
      length(column_items)
    
    if (prop_column_items_ipv4 >= 0.9) {
      return("ip.string")
    }
    
    # Check column for IPv6 addresses
    prop_column_items_ipv6 <-
      sum(grepl(regex_ipv6_address(), column_items)) / 
      length(column_items)
    
    if (prop_column_items_ipv6 >= 0.1) {
      return("ip.string")
    }
  }
  
  if (col_name_possibly__id(column_name)) {
    
    # If the column prominently has 'ID' in it's name, it's
    # best to believe it's an ID column of some sort
    
    return("id.string")
  }
  
  if (col_name_possibly__cat(column_name)) {
    
    # This is a category or type column judging by the name;
    # the naming is all-important in this case
    
    return("string.categorical")
  }
  
  if (cardinality < 50 && row_count > 200) {
    return("string.categorical")
  }
  
  if (col_name_possibly__datetime(column_name)) {
    
    # Determine if the column sample has strings that are
    # parsable as datetime values

    parsable <- FALSE
    
    for (s_fmt in strptime_8601_formats) {
      
      parsed <- strptime(column_items, format = s_fmt)
      
      prop_parsed <- sum(!is.na(parsed)) / length(parsed)
      
      if (prop_parsed > 0.8) {
        parsable <- TRUE
        break
      }
    }
    
    if (parsable) {
      return("datetime.string")
    }
  }
  
  if (col_name_possibly__date(column_name)) {
    
    # Determine if the column sample has strings that are
    # parsable as datetime values
    
    parsable <- FALSE
    
    for (s_fmt in strptime_date_formats) {
      
      parsed <- strptime(column_items, format = s_fmt)
      
      prop_parsed <- sum(!is.na(parsed)) / length(parsed)
      
      if (prop_parsed > 0.8) {
        parsable <- TRUE
        break
      }
    }
    
    if (parsable) {
      return("date.string")
    }
    
  }
  
  return("string")
}

get_column_role_numeric <- function(data_column) {
  
  column_name <- colnames(data_column)
  row_count <- get_table_total_rows(data_column)
  cardinality <- get_column_cardinality(data_column)
  column_samp <- get_non_null_col_sample(data_column)
  
  if (is.null(column_samp)) {
    return("numeric")
  }
  
  # Initialize the role fragment
  role <- ""
  
  # Get the distinct non-null items from the column sample
  column_items <- 
    column_samp %>%
    dplyr::distinct() %>%
    dplyr::pull()
  
  if (col_name_possibly__latitude(column_name)) {
    
    # Determine if the column sample has values that mostly lie
    # within the valid range of latitude values
    
    within_bounds <- column_items <= 90 & column_items >= -90
    
    prop_within_bounds <- sum(!is.na(within_bounds)) / length(within_bounds)
    
    if (prop_within_bounds > 0.8) {
      role <- "geo:latitude."
    }
  }
  
  if (col_name_possibly__longitude(column_name)) {
    
    # Determine if the column sample has values that mostly lie
    # within the valid range of longitude values
    
    within_bounds <- column_items <= 180 & column_items >= -180
    
    prop_within_bounds <- sum(!is.na(within_bounds)) / length(within_bounds)
    
    if (prop_within_bounds > 0.8) {
      role <- "geo:longitude."
    }
  }
  
  if (row_count > 200 && is_column_integerlike(data_column = column_samp)) {
    
    return(paste0(role, "numeric.discrete"))
    
  } else {
    return(paste0(role, "numeric.continuous"))
  }
  
  return(paste0(role, "numeric"))
}

strptime_8601_formats <-
  c(
    "%Y-%m-%d %H:%M:%S",
    "%Y-%m-%dT%H:%M:%S",
    "%Y/%m/%d %H:%M:%S"
  )

strptime_date_formats <-
  c(
    "%Y-%m-%d",
    "%Y-%m-%d",
    "%Y/%m/%d"
  )

Try the pointblank package in your browser

Any scripts or data that you put into this service are public.

pointblank documentation built on April 25, 2023, 5:06 p.m.