Due to encoding problem, this Rmd cannot be knitted by "Knit" button
library(kokudosuuchiUtils) packageDescription("kokudosuuchiUtils")$Built
library(dplyr, warn.conflicts = FALSE)
html_data <- readr::read_csv(rprojroot::find_package_root_file("inst/extdata/zokusei.csv")) colnames(html_data) <-(c("identifier", "table_num", "name", "description", "type")) html_data <- html_data %>% # remove rows without types filter(!is.na(.data$type)) %>% # remove description select(-.data$description)
excel_data <- readr::read_csv(rprojroot::find_package_root_file("inst/extdata/shape_property_table_xls.csv"))
N05_005b
and N05_005e
are crammed into one cellSee https://github.com/yutannihilation/kokudosuuchiUtils/issues/3#issuecomment-327374894
indices_double_code <- html_data$name == "設置期間(設置開始)(N05_005b)設置期間(設置終了)(N05_005e)" html_data_part_double_code <- html_data[indices_double_code, ] %>% mutate(name = stringr::str_split(.data$name, "(?<!^)(?=設置期間)")) %>% tidyr::unnest(.data$name) html_data <- bind_rows(html_data[!indices_double_code, ], html_data_part_double_code)
linebreak_pattern <- "\\s*[\\n\\r]+\\s*" comment_pattern <- "(?<=[\\))])[^\\((]+$" html_data <- html_data %>% # remove unneeded rows filter(!is.na(.data$type)) %>% mutate(name = stringr::str_replace_all(.data$name, linebreak_pattern, "")) %>% # insert separators mutate(name = stringr::str_replace(.data$name, "([\\((][A-Z][0-9a-z\\-]+[\\*※]?[_\\-][A-Za-z0-9\\-_ 〜]+[\\))])", "%NINJA%\\1%NINJA%")) %>% # separate by the separators tidyr::separate(col = .data$name, into = c("name", "code", "note"), sep = "%NINJA%", fill = "right") %>% # clean up codes mutate(code = stringr::str_replace_all(code, "[(\\()\\)]", ""), note = stringr::str_trim(note), note = dplyr::if_else(note == "", NA_character_, note))
(See Extract Attribute Names And Codes)
Tilda represents a sequence of numbers like:
P16_015~026 P17_006~
We need to translate this to the corresponding sequence of codes like:
P16_015 P16_016 P16_017 ...
(Be careful: "〜"(\u301c
) is not "~" or "~"(\uff5e
).)
indices_tilda <- dplyr::coalesce(stringr::str_detect(html_data$code, "〜"), FALSE) # extract tilda html_data_part_tilda <- html_data[indices_tilda, ] %>% # trim numbers from name mutate(name = stringr::str_replace(.data$name, "(?\\d+[\\-〜][\\dn]+)?", "")) %>% # e.g. P16_015~026 -> prefix: P16, begin: 015, end: 026 tidyr::separate(code, into = c("prefix", "begin", "end"), regex = "_|〜", fill = "right") %>% mutate_at(c("begin", "end"), funs(readr::parse_integer)) %>% # fill NAs with appropriate end numbers mutate(end = dplyr::coalesce(.data$end, 30L)) %>% # expand rows mutate(code = purrr::pmap(., function(prefix, begin, end, ...) sprintf("%s_%03d", prefix, seq(begin, end)))) %>% tidyr::unnest(.data$code) %>% # add sequencial numbers to names (e.g. 備考 -> 備考1, 備考2, ...) group_by(identifier) %>% mutate(name = paste0(name, row_number())) %>% # we don't need prefix, begin and end anymore select(-(prefix:end))
html_data_part_tilda
html_data <- bind_rows(html_data[!indices_tilda, ], html_data_part_tilda)
XX
XX
represents prefecture codes. But the codes with this seems only used for layer names, not for property names. So we ignore this here.
html_data <- html_data %>% filter(!coalesce(stringr::str_detect(.data$code, "(?<=[^X])XX$"), FALSE))
XXXX
XXXX
represents year and needs special considerations. We ignore this here.
# paste this result on kokudosuuchi KSJMetadata_code_year_cols <- html_data %>% filter(stringr::str_detect(.data$code, "(?<=[^X])XXXX$")) %>% { tibble::tibble( pattern = stringr::str_replace(.$code, "XXXX", "\\(\\\\d{4}\\)"), replacement = stringr::str_c(.$name, "\\1年度") ) }
devtools::use_data(KSJMetadata_code_year_cols, overwrite = TRUE) readr::write_csv(KSJMetadata_code_year_cols, rprojroot::find_package_root_file("inst/extdata/KSJMetadata_code_year_cols.csv")) file.copy(rprojroot::find_package_root_file("data/KSJMetadata_code_year_cols.rda"), rprojroot::find_package_root_file("../kokudosuuchi/data/"), overwrite = TRUE)
html_data <- html_data %>% filter(!coalesce(stringr::str_detect(.data$code, "(?<=[^X])XXXX$"), FALSE))
* or ※ means:
indices_asterisk <- dplyr::coalesce(stringr::str_detect(html_data$code, "\\*|※"), FALSE) # see https://github.com/yutannihilation/kokudosuuchiUtils/issues/17#issuecomment-328083456 table_for_asterisk <- bind_rows( tibble::tibble(identifier = "A34", kome = TRUE, lower_alpha = c("a", "b", "c", "d")), tibble::tibble(identifier = "A34", kome = FALSE, lower_alpha = c("f", "g")), tibble::tibble(identifier = "A35a", kome = TRUE, lower_alpha = c("a", "b")), tibble::tibble(identifier = "A35b", kome = TRUE, lower_alpha = c("d", "e")), tibble::tibble(identifier = "P22", kome = FALSE, lower_alpha = c("a", "b")), tibble::tibble(identifier = "P23", kome = FALSE, lower_alpha = c("a", "b")) ) html_data_asterisk <- html_data[indices_asterisk, ] %>% mutate(kome = stringr::str_detect(.data$code, "※")) %>% tidyr::separate(code, into = c("prefix", "number"), sep = "\\*|※") %>% inner_join(table_for_asterisk, by = c("identifier", "kome")) %>% mutate(code = stringr::str_c(.data$prefix, .data$lower_alpha, .data$number)) %>% select(-prefix, -lower_alpha, -number, -kome) html_data <- bind_rows(html_data[!indices_asterisk, ], html_data_asterisk)
readr::write_csv(html_data, rprojroot::find_package_root_file("inst/extdata/html_data.csv"))
For L05, data from HTML is wrong; they are L01_...
, but should be L05_...
. Moreovere, L01_007
should be 地区名, not 事業主体名.
So remove this from html_data
and use data from Excel.
html_data <- html_data %>% filter(!.data$identifier %in% c("L05"))
Compensations:
L05_003
: 行政コードL05_013
: 用途地域コードFor P03 and P12, data from Excel is wrong; they are P03_XXX
(3 digits), but the format in actual data is P03_XXXX
(four digits).
excel_data <- excel_data %>% filter(!category %in% c("P03", "P12"))
The website does not describe L03-a
code per code. Note that, the codes in L03-a
are ambiguous and cannot be determined.
html_data <- html_data %>% filter(!.data$identifier %in% c("L03-a"))
First, we delete rows without code from html_data
. They might be useful for getting codelist urls, but are not needed here.
Before removing the rows, extract texts so that we can join this later.
codelist <- readr::read_csv(rprojroot::find_package_root_file("inst/extdata/codelist.csv")) %>% mutate(text = stringr::str_replace_all(text, "\\s", "")) html_data_text <- select(html_data, identifier, name, text = type) %>% filter(.data$text %in% !! codelist$text) %>% distinct(identifier, name, text) %>% # These are ambiguous as the translation depends on the year. filter(!(.data$identifier == "S05-c" & .data$name %in% c("駅コード", "駅名"))) %>% filter(!(.data$identifier == "S05-d" & .data$name %in% c("着ゾーンコード", "発ゾーンコード"))) # confirm theare are no duplicated rows count(html_data_text, identifier, name, sort = TRUE) %>% filter(n > 1)
html_data <- filter(html_data, !is.na(code))
Excel data has some codes that are ambiguous. Add IDs to each column set so that it can be used afterwards.
excel_data <- excel_data %>% group_by(category) %>% mutate(item_id = cumsum(coalesce(item != lag(item), FALSE))) %>% ungroup() %>% select(identifier = category, item_id, tag, code, name)
Now it's time to join.
merged_data_from_html <- html_data %>% left_join(excel_data, by = c("identifier", "name", "code")) %>% select(identifier, name, item_id, tag, code, text = type) merged_data_from_excel <- excel_data %>% filter(!.data$code %in% !! unique(merged_data_from_html$code)) %>% select(identifier, name, item_id, tag, code) %>% # try to merge texts left_join(html_data_text, by = c("identifier", "name")) merged_data <- bind_rows( html = merged_data_from_html, excel = merged_data_from_excel, .id = "source" ) %>% arrange(identifier, item_id, code)
Note that merged_data
has more identifiers than data available from API. We may filter unavailable data.
unique(merged_data$identifier)
merged_data_w_codelist <- merged_data %>% left_join(codelist, by = c("identifier", "text")) %>% select(source, identifier, name, item_id, tag, code, correspondence_table = url_basename)
If data is included in codelist
merged_data_w_enum <- merged_data_w_codelist %>% mutate(correspondence_table = if_else(.data$code %in% names(enum_codes_all), .data$code, .data$correspondence_table))
merged_data_manually <- merged_data_w_enum %>% mutate(correspondence_table = case_when( .data$code == "L05_003" ~ "AdminAreaCd.html", .data$code == "L05_013" ~ "useDistrict.html", TRUE ~ .data$correspondence_table))
data(KSJMetadata_description_url) identifier_to_url <- setNames(KSJMetadata_description_url$url, KSJMetadata_description_url$identifier) KSJMetadata_code <- merged_data_manually %>% mutate(source = if_else(.data$source == "html", identifier_to_url[.data$identifier], "http://nlftp.mlit.go.jp/ksj/gml/shape_property_table.xls"))
Currently, kokudosuuchi's code assumes that different item_id
s have different numbers of colnames. L03-a
is my headache.
KSJMetadata_code %>% group_by(identifier) %>% filter(any(duplicated(code))) %>% count(identifier, item_id) %>% filter(any(duplicated(n)))
devtools::use_data(KSJMetadata_code, overwrite = TRUE) readr::write_csv(KSJMetadata_code, rprojroot::find_package_root_file("inst/extdata/KSJMetadata_code.csv"))
file.copy(rprojroot::find_package_root_file("data/KSJMetadata_code.rda"), rprojroot::find_package_root_file("../kokudosuuchi/data/"), overwrite = TRUE)
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.