library(kokudosuuchiUtils)
packageDescription("kokudosuuchiUtils")$Built
library(dplyr, warn.conflicts = FALSE)

Read data

html_data <- readr::read_csv(rprojroot::find_package_root_file("inst/extdata/html_data.csv"))

Extract enum from 列挙型

html_data %>%
    filter(stringr::str_detect(.data$type, "列挙型")) %>%
    select(identifier, code, name, type) %>%
    mutate_all(funs(stringr::str_replace_all(., "\\s", " "))) %>% 
    knitr::kable()

Collect results into enum_code.

enum_code <- list()

Enum type ("列挙型") is written in one of the following formats:

  1. 列挙型 (only we can do for this is to ignore)
  2. 列挙型(type1、type2、type3) (implicitly numbered by sequence of numbers)
  3. 列挙型(type1=1、type2=2、type3=3)
  4. 列挙型(1:type1, 2:type2、3:type3)
  5. multiple version of 4.

Ignore 1.

html_data_enums <- html_data %>%
    filter(stringr::str_detect(.data$type, "列挙型[^$]"))

Join codes from KSJMetadata_code.

html_data_enums <- html_data_enums %>%
  left_join(KSJMetadata_code, by = c("identifier", "name")) %>%
  select(identifier, name, code = code.y, type)

N04 should be ignored, as it is not about the codes in data but ones in colnames.

filter(html_data_enums, is.na(code)) %>%
  knitr::kable()

Extract 2. Enum of implicit codes

html_data_enums %>%
  filter(!stringr::str_detect(.data$type, ":|:|=")) %>%
  mutate_all(funs(stringr::str_replace_all(., "\\s", " "))) %>% 
  knitr::kable()

A03

enum_code_type2 <- html_data_enums %>%
  filter(!stringr::str_detect(.data$type, ":|:|=")) %>%
  # ignore other than A03
  filter(.data$identifier == "A03") %>%
  { setNames(.$type, .$code) } %>%
  purrr::map(stringr::str_replace, pattern = "^列挙型((.*))$", replacement = "\\1") %>%
  purrr::map(stringr::str_split, pattern = "、") %>%
  purrr::map(1L) %>%
  purrr::map(~ setNames(., seq_along(.)))

Extract 3.

html_data_enums %>%
  filter(stringr::str_detect(.data$type, "=")) %>%
  mutate_all(funs(stringr::str_replace_all(., "\\s", " "))) %>% 
  knitr::kable()
enum_code_type3 <- html_data_enums %>%
  filter(stringr::str_detect(.data$type, "=")) %>%
  { setNames(.$type, .$code) } %>%
  purrr::map(stringr::str_replace, pattern = "^列挙型((.*))$", replacement = "\\1") %>%
  purrr::map(stringr::str_split, pattern = "(?<=\\d)、") %>%
  purrr::map(1L) %>%
  purrr::map(function(x) {
    m <- stringr::str_split_fixed(x, "=", 2)
    setNames(m[, 1, drop = TRUE], m[, 2, drop = TRUE])
  })

Extract 4.

html_data_enums %>%
  filter(stringr::str_detect(.data$type, ":|:")) %>%
  mutate_all(funs(stringr::str_replace_all(., "\\s", " "))) %>% 
  knitr::kable()

As mentioned above, treat N04 differently.

enum_code_type4 <- html_data_enums %>%
  mutate(type = stringr::str_replace_all(.data$type, "\\s", "")) %>% 
  # ignore N04
  filter(stringr::str_detect(.data$type, ":|:"), .data$identifier != "N04") %>%
  { setNames(.$type, .$code) } %>%
  purrr::map(stringr::str_replace, pattern = "^列挙型((.*))$", replacement = "\\1") %>%
  purrr::map(stringr::str_split, pattern = "、") %>%
  purrr::map(1L) %>%
  purrr::map(function(x) {
    m <- stringr::str_split_fixed(x, ":|:", 2)
    setNames(m[, 2, drop = TRUE], m[, 1, drop = TRUE])
  })

Extract enum from コードリスト

html_data %>%
    filter(stringr::str_detect(.data$type, "コードリスト.*[:=:=]")) %>%
    select(identifier, code, name, type) %>%
    mutate_all(funs(stringr::str_replace_all(., "\\s", " "))) %>% 
    knitr::kable()
html_data_psuedo_enums_num_first <- html_data %>%
  filter(stringr::str_detect(.data$type, "コードリスト.*\\d[:=:=]"))

html_data_psuedo_enums_code_first <- html_data %>%
  filter(stringr::str_detect(.data$type, "コードリスト.*[:=:=]\\d"))

enum_code_psuedo_num_first <- html_data_psuedo_enums_num_first$type %>%
  stringi::stri_trans_nfkc() %>%
  stringr::str_replace_all(c(
    "^コードリスト型?(「都市計画決定コード」)?" = "",
    "※.*$" = "",
    "^\\(|\\)$" = ""
    )
  ) %>%
  # workaround for P24_011
  stringr::str_replace_all("(?<!^)(\\d)(?=:)", "、\\1") %>%
  stringr::str_split("、") %>%
  purrr::map(function(x) {
    m <- stringr::str_split_fixed(x, "[:=]", 2)
    setNames(m[, 2, drop = TRUE], m[, 1, drop = TRUE])
  }) %>%
  setNames(html_data_psuedo_enums_num_first$code)

enum_code_psuedo_code_first <- html_data_psuedo_enums_code_first$type %>%
  stringi::stri_trans_nfkc() %>%
  stringr::str_replace_all(c(
    "^コードリスト型?(\\(種別コード\\)|「種別コード」)?" = "",
    "^\\(|\\)$" = ""
    )
  ) %>%
  # workaround for P24_011
  stringr::str_replace_all("(?<!^)(\\d)(?=:)", "、\\1") %>%
  stringr::str_split("、") %>%
  purrr::map(function(x) {
    m <- stringr::str_split_fixed(x, "[:=]", 2)
    setNames(m[, 1, drop = TRUE], m[, 2, drop = TRUE])
  }) %>%
  setNames(html_data_psuedo_enums_code_first$code)
enum_codes_all <- c(
  enum_code_type2,
  enum_code_type3, 
  enum_code_type4,
  enum_code_psuedo_code_first,
  enum_code_psuedo_num_first
)

Write data

devtools::use_data(enum_codes_all, overwrite = TRUE)
file.copy(rprojroot::find_package_root_file("data/enum_codes_all.rda"),
          rprojroot::find_package_root_file("../kokudosuuchi/data/"),
          overwrite = TRUE)


yutannihilation/kokudosuuchiUtils documentation built on May 6, 2019, 5:04 p.m.