R/extract_legal_form.R

Defines functions extract_legal_form

Documented in extract_legal_form

# Generated by fusen: do not edit by hand

#' Extract Legal Forms
#' 
#' Description
#' 
#' @param .tab 
#' A dataframe (either the source or target dataframe)
#' @param .col_name 
#' The column with firm names
#' @param .col_country 
#' Optionally, a column with iso3 country codes
#' @param .legal_forms 
#' A dataframe with legal forms
#' @param .workers 
#' Number of cores to utilize (Default all cores determined by future::availableCores())
#'
#' @return A dataframe
#' 
#' @importFrom rlang :=
#' 
#' @export
#' @examples
#' extract_legal_form(
#'   .tab = table_source[1:100, ], 
#'   .col_name = "name", 
#'   .col_country = "iso3",
#'   .workers = 1
#'   )
extract_legal_form <- function(
  .tab, .col_name, .col_country = NULL, .legal_forms = data.frame(), 
  .workers = future::availableCores()
  ) {
  tmp <- legal_form_orig <- legal_form_stand <- legal_form <- name <- lf_stand <-
    lf_orig <- NULL
  
  .tab <- tibble::as_tibble(.tab)
  
  if (nrow(.legal_forms) == 0) {
    tab_lf_ <- get("legal_form_all")
  } else {
    tab_lf_ <- .legal_forms
  }
  
  if (is.null(.col_country)) {
    tab_lf_ <- tab_lf_ %>%
      dplyr::distinct(legal_form_orig, legal_form_stand) %>%
      dplyr::distinct(legal_form_orig, .keep_all = TRUE)
    join_by_ <- "legal_form_orig"
  } else {
    colnames(tab_lf_) <- c(.col_country, "legal_form_orig", "legal_form_stand")
    join_by_ <- c(.col_country, "legal_form_orig")
  }

  tab_ <- standardize_data(.tab, .col_name)
  lf_ <- unique(tab_lf_[["legal_form_orig"]])
  nm_ <- tab_[[.col_name]]

  f_ <- carrier::crate(function(.lf, .nm) which(endsWith(.nm, paste0(" ", .lf))))
  future::plan("multisession", workers = .workers)
  lf_ext_ <- furrr::future_map(
    .x = purrr::set_names(lf_, lf_),
    .f = ~ f_(.x, nm_),
    .options = furrr::furrr_options(seed = TRUE)
  )
  future::plan("default")
  lf_ext_ <- lf_ext_ %>%
    purrr::compact() %>%
    tibble::enframe(name = "legal_form_orig", value = "tmp") %>%
    tidyr::unnest(tmp) %>%
    dplyr::arrange(dplyr::desc(nchar(legal_form_orig))) %>%
    dplyr::distinct(tmp, .keep_all = TRUE)

  tab_ %>%
    dplyr::mutate(tmp = dplyr::row_number()) %>%
    dplyr::left_join(lf_ext_, by = "tmp") %>%
    dplyr::left_join(tab_lf_, by = join_by_) %>%
    dplyr::rename(
      lf_stand = legal_form_stand, 
      lf_orig = legal_form_orig
      ) %>%
    dplyr::relocate(lf_stand, .after = !!dplyr::sym(.col_name)) %>%
    dplyr::relocate(lf_orig, .after = !!dplyr::sym(.col_name)) %>%
    dplyr::mutate(
      !!dplyr::sym(paste0(.col_name, "_adj")) := trimws(
        stringi::stri_replace_last_fixed(!!dplyr::sym(.col_name), lf_orig, "")
      ),
      .after = !!dplyr::sym(.col_name)
    ) %>%
    dplyr::mutate(
      !!dplyr::sym(paste0(.col_name, "_adj")) := dplyr::if_else(
        condition = is.na(!!dplyr::sym(paste0(.col_name, "_adj"))),
        true = !!dplyr::sym(.col_name), 
        false = !!dplyr::sym(paste0(.col_name, "_adj"))
        )) %>%
    dplyr::mutate(
      !!dplyr::sym(paste0(.col_name, "_std")) := dplyr::if_else(
        condition = !is.na(lf_stand),
        true = paste(!!dplyr::sym(paste0(.col_name, "_adj")), lf_stand),
        false = !!dplyr::sym(paste0(.col_name, "_adj"))
      ),
      .after = !!dplyr::sym(paste0(.col_name, "_adj"))
    ) %>%
    dplyr::select(-tmp) %>%
    dplyr::mutate(!!dplyr::sym(.col_name) := .tab[[.col_name]])
  
  
}
MatthiasUckert/Rmatch documentation built on Jan. 3, 2022, 11:09 p.m.