library(kokudosuuchiUtils)
packageDescription("kokudosuuchiUtils")$Built
library(dplyr, warn.conflicts = FALSE)

Download

shape_property_table_xls <- fetch_shape_property_table_xls()

Extract tilda

Also expand tildas in Excel data. This tilda is \uff5e.

indices_tilda <- dplyr::coalesce(stringr::str_detect(shape_property_table_xls$code, "~"), FALSE)
shape_property_table_xls[indices_tilda, ]
# add rowid to convert afterwords
data_orig <- tibble::rowid_to_column(shape_property_table_xls, var = "row_order") %>%
  mutate(row_order = as.numeric(row_order))

# extract tilda
data_part_tilda <- data_orig[indices_tilda, ] %>%
  tidyr::extract(code, into = c("prefix", "begin", "end"), regex = "(P11_\\d+)_(\\d+)~(\\d+)") %>%
  mutate_at(c("begin", "end"), funs(readr::parse_integer)) %>%
  # fill NAs with appropriate end numbers
  mutate(end = dplyr::coalesce(.data$end, 30L)) %>% 
  # expand rows
  mutate(code = purrr::pmap(., function(prefix, begin, end, ...) sprintf("%s_%d", prefix, seq(begin, end)))) %>%
  tidyr::unnest(.data$code) %>%
  # add sequencial numbers to names (e.g. 備考 -> 備考1, 備考2, ...)
  group_by(category) %>%
  mutate(name = paste0(name, row_number()),
         # add sequential numbers to row_order
         row_order = row_order + row_number() / 10^ceiling(log10(n() + 1))) %>%
  # we don't need prefix, begin and end anymore
  select(-(prefix:end))
excel_data <- bind_rows(data_orig[!indices_tilda, ],
                        data_part_tilda) %>%
  arrange(row_order) %>%
  select(-row_order)

Workaround for S05-b

excel_data <- excel_data %>%
  mutate(category = if_else(stringr::str_detect(.data$code, "^S05b"), "S05-b", .data$category))

Add item_id and re-fill item and tag

Merged cells are unreliable except for the top cell of the category. We need to correct toe category

excel_data <- excel_data %>%
  tidyr::extract(code, into = c("prefix", "num"), regex = "^([A-Z]\\d+[^_]+)_([\\d_]+)$", remove = FALSE) %>%
  mutate(num = readr::parse_number(stringr::str_replace(num, "_", ".")),
         is_sequential_code = coalesce(num > lag(num) & prefix == lag(prefix), TRUE),
         is_sequential_category = coalesce(category == lag(category), TRUE),
         is_start_of_different_item = !is_sequential_code | !is_sequential_category) %>% 
  group_by(category) %>% 
  mutate(item_id = cumsum(is_start_of_different_item)) %>%
  group_by(category, item_id) %>%
  mutate(item = first(item), tag = first(tag)) %>%
  ungroup() %>% 
  select(category, item, tag, code, name, notes)

Save

readr::write_csv(excel_data,
                 path = rprojroot::find_package_root_file("inst/extdata/shape_property_table_xls.csv"))


yutannihilation/kokudosuuchiUtils documentation built on May 6, 2019, 5:04 p.m.