#' Prepare Firm Name Table
#'
#' @param .tab
#' A Dataframe with at least 3 columns: \cr
#' id: A company identifier \cr
#' ...: any number of columns with company names \cr
#' country: The country of incorporation of the firm
#'
#' @param .regex_names A regular expression that identifies the columns for the company names
#'
#' @return A dataframe
#' @export
#'
#' @examples
#' .path <- system.file("extdata", "test_tables.xlsx", package = "RFirmMatch")
#' .tab <- openxlsx::read.xlsx(.path)
#' prepare_tables(.tab)
#' rm(.tab)
prepare_tables <- function(.tab, .regex_names = "name") {
name_orig <- name_clean <- country <- id_name <- id <- dup <- NULL
# Check if firm id and country code column is available
if (!"id" %in% colnames(.tab)) {
stop("Table must have a firm identifier column ('id')")
}
if (!"country" %in% colnames(.tab)) {
stop("Table must have a country column column ('country')")
}
# Check if country is in iso-3166 alpha 3 standard
if (!all(.tab[["country"]] %in% c(ISOcodes::ISO_3166_1[["Alpha_3"]], NA_character_))) {
stop("Countries must be in ISO 3166 Alpha 3 Format")
}
# Check if table contains more than one firm name column
i_int_names <- sum(grepl(.regex_names, colnames(.tab)))
# Put table into long format
i_tab <- .tab
if (i_int_names == 0) {
stop("You must provide a regular expression to capture all columns that contain a firm name", call. = FALSE)
} else {
i_tab <- i_tab %>%
tidyr::pivot_longer(dplyr::matches(.regex_names), names_to = "id_name", values_to = "name_orig") %>%
dplyr::filter(!is.na(name_orig)) %>%
dplyr::group_by(id) %>%
dplyr::mutate(id_name = dplyr::row_number()) %>%
dplyr::ungroup()
}
i_tab %>%
dplyr::mutate(name_clean = helper_standardize_names(name_orig)) %>%
dplyr::distinct(id, name_clean, country, .keep_all = TRUE) %>%
dplyr::mutate(dup = duplicated(name_clean)) %>%
dplyr::select(id, id_name, name_orig, name_clean, country, dup, dplyr::everything())
}
#' Extract Legal Forms from Company Names
#'
#' @param .tab A table prepared by prepare_tables()
#' @param .tab_lf A dataframe generated by make_legal_form_table()
#' @param .workers Number of parallel workers
#' @param .progress Show progress bar?
#'
#' @return A datframe
#' @export
#'
#' @examples
#' .path <- system.file("extdata", "test_tables.xlsx", package = "RFirmMatch")
#' .tab <- openxlsx::read.xlsx(.path)
#' .tab <- prepare_tables(.tab)
#' extract_legal_form(.tab, make_legal_form_table())
#' rm(.tab)
#'
#'
#' ## DEBUG
#' .tab_lf = NULL
#' .workers = 2
#' .progress = FALSE
#'
extract_legal_form <- function(.tab, .tab_lf, .workers = 1, .progress = FALSE) {
id_tmp <- legal_form <- name_clean <- id <- id_name <- name_orig <- name_adj <-
country <- dup <- loc <- NULL
if (!is.null(.tab_lf)) {
i_tab_lf <- dplyr::bind_rows(RFirmMatch::table_legal_forms, .tab_lf)
} else {
i_tab_lf <- RFirmMatch::table_legal_forms
}
ichr_lf <- unique(i_tab_lf[["legal_form"]])
i_chr_name <- .tab[["name_clean"]]
future::plan("multisession", workers = .workers)
# Search for Legal Forms at the end of the string
i_tab_lf_extract <- furrr::future_map(
.x = purrr::set_names(ichr_lf, ichr_lf),
.f = ~ {
check_end <- which(endsWith(i_chr_name, paste0(" ", .x)))
check_mid <- - which(grepl(paste0(" ", .x, " "), i_chr_name, fixed = TRUE))
return(c(check_end, check_mid))
},
.options = furrr::furrr_options(seed = TRUE, globals = "i_chr_nameO"),
.progress = .progress
) %>%
purrr::compact() %>%
tibble::enframe(name = "legal_form", value = "id_tmp") %>%
tidyr::unnest(id_tmp) %>%
dplyr::mutate(
loc = dplyr::if_else(id_tmp > 0, "end", "mid"),
id_tmp = abs(id_tmp)
) %>%
dplyr::arrange(dplyr::desc(nchar(legal_form)), loc) %>%
dplyr::distinct(id_tmp, .keep_all = TRUE)
future::plan("default")
.tab %>%
dplyr::mutate(id_tmp = dplyr::row_number()) %>%
dplyr::left_join(i_tab_lf_extract, by = "id_tmp") %>%
dplyr::mutate(name_adj = dplyr::case_when(
is.na(legal_form) ~ name_clean,
!is.na(legal_form) ~ name_clean %>%
stringi::stri_replace_last_fixed(., legal_form, " ") %>%
gsub("\\s+", " ", .) %>%
trimws()
)) %>%
dplyr::select(id, id_name, name_orig, name_clean, name_adj, country, dup, legal_form, loc, dplyr::everything(), -id_tmp)
}
#' Match company names
#'
#' @param .tab0 Company Table
#' @param .tab1 Matching Table
#' @param .col_match Column name used for matching
#' @param .type c("full", "sub", "approx")
#' @param .min_char only used if .type == "sub"
#' @param .max_dist only used if .type == "approx"
#' @param .method only used if .type == "approx"
#' @param .workers Number of parallel workers (only used for .type == "approx")
#' @param .progress Show progress bar?
#' @return A Dataframe
#' @export
#'
#' @examples
#' .path <- system.file("extdata", "test_tables.xlsx", package = "RFirmMatch")
#' .tab1 <- openxlsx::read.xlsx(.path, 1)
#' .tab1 <- .tab1 %>% prepare_tables() %>% extract_legal_form(make_legal_form_table())
#'
#' .tab2 <- openxlsx::read.xlsx(.path, 2)
#' .tab2 <- .tab2 %>% prepare_tables() %>% extract_legal_form(make_legal_form_table())
#'
#' match_name(.tab1, .tab2, name_clean, "full")
#' match_name(.tab1, .tab2, name_clean, "sub")
#' match_name(.tab1, .tab2, name_clean, "approx")
#'
#'
#' ## DEBUG
#' .col_match <- quote(name_clean)
#' .min_char = 0.8
#' .max_dist = .1
#' .method = "osa"
#' .workers = 1
match_name <- function(
.tab0, .tab1, .col_match, .type = c("full", "sub", "approx"),
.min_char = 0.25, .max_dist = .1,
.method = c("osa", "lv", "dl", "hamming", "lcs", "qgram", "cosine", "jaccard", "jw", "soundex"),
.workers = 1, .progress = FALSE
) {
id_tmp <- id_tmp_0 <- id_tmp_1 <- . <- match_0 <- match_1 <- tmp_0 <- tmp_1 <-
name_clean <- n_char_0 <- n_char_1 <- method <- sim <- char <- id_0 <- id_1 <- id_name_0 <-
id_name_1 <- name_orig_0 <- name_orig_1 <- name_clean_0 <- name_clean_1 <-
name_adj_0 <- name_adj_1 <- legal_form_0 <- legal_form_1 <- country_0 <-
country_1 <- dup_0 <- dup_1 <- max_dist <- n_char <- NULL
.type <- match.arg(.type)
if (.type == "full") {
i_tab0 <- dplyr::mutate(.tab0, match = {{ .col_match }})
i_tab1 <- dplyr::mutate(.tab1, match = {{ .col_match }})
i_tab <- dplyr::inner_join(i_tab0, i_tab1, by = "match", suffix = c("_0", "_1")) %>%
dplyr::mutate(sim = 1, char = 1, method = "full") %>%
dplyr::distinct(id_0, id_1, .keep_all = TRUE) %>%
dplyr::filter(nchar(match) >= 2)
} else if (.type == "sub") {
i_tmp0 <- helper_get_substrings_table(.tab0, {{ .col_match }}, .min_char)
i_tmp1 <- helper_get_substrings_table(.tab1, {{ .col_match }}, .min_char)
i_tab <- dplyr::inner_join(i_tmp0, i_tmp1, by = "match", suffix = c("_0", "_1")) %>%
dplyr::arrange(id_0, dplyr::desc(nchar(match))) %>%
dplyr::distinct(id_0, id_1, .keep_all = TRUE) %>%
dplyr::left_join(
y = dplyr::mutate(.tab0, tmp = {{ .col_match }}),
by = c("id_0" = "id"), suffix = c("_0", "_1")) %>%
dplyr::left_join(
y = dplyr::mutate(.tab1, tmp = {{ .col_match }}),
by = c("id_1" = "id"), suffix = c("_0", "_1")) %>%
dplyr::mutate(
sim = stringdist::stringsim(tmp_0, tmp_1),
char = nchar(match) / nchar(tmp_0),
method = "sub"
) %>% dplyr::select(-tmp_0, -tmp_1)
} else if (.type == "approx") {
i_tab0 <- .tab0 %>%
dplyr::mutate(match = {{ .col_match }}, n_char = nchar(match)) %>%
dplyr::filter(n_char >= 2) %>%
split(., nchar(.[["match"]]))
i_tab1 <- .tab1 %>%
dplyr::mutate(match = {{ .col_match }}, n_char = nchar(match)) %>%
dplyr::filter(n_char >= 2)
i_tab1 <- purrr::map(
.x = as.integer(names(i_tab0)),
.f = ~ dplyr::filter(i_tab1, n_char >= floor(.x * (1 - .max_dist)) & ceiling(n_char <= .x * (1 + .max_dist)))
)
future::plan("multisession", workers = .workers)
i_tab <- furrr::future_map2_dfr(
.x = i_tab0,
.y = i_tab1,
.f = ~ fuzzyjoin::stringdist_inner_join(.x, .y, "match", max_dist = .max_dist, method = .method),
.options = furrr::furrr_options(seed = TRUE, globals = c(".method", ".max_dist")),
.progress = .progress
) %>%
`colnames<-`(gsub("\\.x$", "_0", colnames(.))) %>%
`colnames<-`(gsub("\\.y$", "_1", colnames(.))) %>%
dplyr::mutate(sim = stringdist::stringsim(match_0, match_1)) %>%
dplyr::mutate(char = NA_integer_, match = NA_character_, method = "approx") %>%
dplyr::select(-match_0, -match_1, -n_char_0, -n_char_1)
future::plan("default")
}
i_tab <- i_tab %>%
dplyr::select(
method, match, sim, char, id_0, id_1, id_name_0, id_name_1, name_orig_0, name_orig_1,
name_clean_0, name_clean_1, name_adj_0, name_adj_1, legal_form_0, legal_form_1,
country_0, country_1, dup_0, dup_1, dplyr::everything()
)
return(i_tab)
}
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.