rosstat:

library(sf)
library(tidyverse)
library(rosstat)
library(mapview)
data(rosstat_regions)

rst = read_rosstat('data-raw/12-03.docx')
# tab = readxl::read_excel('data-raw/12-03.xlsx', col_names = FALSE)

tbls = docxtractr::read_docx('data-raw/12-03.docx') %>%
  docxtractr::docx_extract_all_tbls()

max_dist = 2

# Count the number of rows in table header -- N

tab = tbls[[1]]
n = ncol(tab)


N = 1
for (v in 1:nrow(tab)) {
  if (tab[v,1] != "") {
    N = v-1
    break
  }
}

# Attach remaining tables from the bottom
k = 2
while (k <= length(tbls)){
  tab_new = tbls[[k]][-(1:N), ]
  tab = tab %>% dplyr::bind_rows(tab_new)
  k = k + 1
}

tab = setNames(tab, c('reg', 2:n))

# Count the number of columns


# Convert rosstat table to tidy format ------------------------------------

resource = vector(mode = "integer", length = N) # empty cells that should be filled
d = vector(mode = "integer", length = N) # empty cells that should NOT be filled
g = vector(mode = "integer", length = N-1) # a difference — is used to define header row type
nvals = vector(mode = "integer", length = N)

# Calculate number of empty cells (excluding NA) and to-be-filled cells

for (r in 1:N) {
  row = tab %>%
    slice(r) %>%
    unlist(use.names=FALSE)
  d[r] = length(row[row == ""]) - length(row[is.na(row)])
  nvals[r] = n - length(row[row == ""])
  resource[r] = (n - d[r]) / nvals[r]
}

# Calculate header row type

for (r in 1:N-1) {
  g[r] = d[r+1]-d[r]
}

# For every row beginning from one before last, for every column
for (i in (N-1):1) {
  m = 1
  for (j in 1:(n-1)) {

    # If empty cell then do nothing

    if ((tab[i, j] == "") | (tab[i+1, j] == "") | (tab[i+1, j+1] == "")) next

    # If g > 0 (e.g. in the next row there are more empty non-filled cells then in current)
    # then duplicate the value until filled cell is reached

    if (g[i]>0) {
      for (k in n:(j+1)) {
        tab[i,k] = tab[i,k-1]
      }
    } else {         # if g <= 0, then spread remaining values
      if (m < resource[i]){
        for (k in n:(j+1)) {
          tab[i,k] = tab[i,k-1]
        }
        m = m+1

      } else {
        m = 1
      }
    }
  }
}



# join ISO identifiers and filter only regions
tab = tab %>%
  fuzzyjoin::stringdist_left_join(rosstat_regions[, c('name_local', 'iso'), drop = TRUE],
                                  by = c('reg' = 'name_local'), max_dist = max_dist) %>%
  dplyr::select(-name_local)

# reconstruct a long form of a table

values = tab %>%
  dplyr::slice(N+1:n()) %>%
  magrittr::set_names(c('reg', 2:n, 'iso')) %>%
  tidyr::gather(var, value, 2:n) %>%
  dplyr::mutate(var = as.numeric(var),
                value = as.numeric(value))

# newtab = matrix(ncol = 3, nrow = 1) %>% data.frame()
#
# for (i in (N+1):nrow(tab)){
#   for (j in 2:n){
#     new_row = c(tab[i, n+1], j, tab[i, j])
#     newtab = dplyr::bind_rows(newtab, new_row)
#   }
# }
#
# colnames(newtab) <- c('iso', 'var', 'value')
#
# values = newtab[-1, ] %>%
#   dplyr::filter(!is.na(iso)) %>%
#   dplyr::mutate(value = as.numeric(value))

classifier = tab %>%
  select(2:n) %>%
  slice(1:N) %>%
  rownames_to_column() %>%
  gather(var, value, -rowname) %>%
  mutate(var = as.integer(var)) %>%
  spread(rowname, value) %>%
  mutate(pathString =
           dplyr::select(., -var) %>%
           apply(1, function(X){
             paste(X, collapse = '/')
           })
         )

# classifier$pathString = classifier %>%
#   dplyr::select(-var) %>%
#   apply(1, function(X){
#     paste(X, collapse = '/')
#   })

# colnames(classifier) <- c(paste0('Level', 1:(N+1)), 'var')
# rownames(classifier) <- 1:(n-1)