library(sf)
library(tidyverse)
library(rosstat)
library(mapview)
data(rosstat_regions)
rst = read_rosstat('data-raw/12-03.docx')
# tab = readxl::read_excel('data-raw/12-03.xlsx', col_names = FALSE)
tbls = docxtractr::read_docx('data-raw/12-03.docx') %>%
docxtractr::docx_extract_all_tbls()
max_dist = 2
# Count the number of rows in table header -- N
tab = tbls[[1]]
n = ncol(tab)
N = 1
for (v in 1:nrow(tab)) {
if (tab[v,1] != "") {
N = v-1
break
}
}
# Attach remaining tables from the bottom
k = 2
while (k <= length(tbls)){
tab_new = tbls[[k]][-(1:N), ]
tab = tab %>% dplyr::bind_rows(tab_new)
k = k + 1
}
tab = setNames(tab, c('reg', 2:n))
# Count the number of columns
# Convert rosstat table to tidy format ------------------------------------
resource = vector(mode = "integer", length = N) # empty cells that should be filled
d = vector(mode = "integer", length = N) # empty cells that should NOT be filled
g = vector(mode = "integer", length = N-1) # a difference — is used to define header row type
nvals = vector(mode = "integer", length = N)
# Calculate number of empty cells (excluding NA) and to-be-filled cells
for (r in 1:N) {
row = tab %>%
slice(r) %>%
unlist(use.names=FALSE)
d[r] = length(row[row == ""]) - length(row[is.na(row)])
nvals[r] = n - length(row[row == ""])
resource[r] = (n - d[r]) / nvals[r]
}
# Calculate header row type
for (r in 1:N-1) {
g[r] = d[r+1]-d[r]
}
# For every row beginning from one before last, for every column
for (i in (N-1):1) {
m = 1
for (j in 1:(n-1)) {
# If empty cell then do nothing
if ((tab[i, j] == "") | (tab[i+1, j] == "") | (tab[i+1, j+1] == "")) next
# If g > 0 (e.g. in the next row there are more empty non-filled cells then in current)
# then duplicate the value until filled cell is reached
if (g[i]>0) {
for (k in n:(j+1)) {
tab[i,k] = tab[i,k-1]
}
} else { # if g <= 0, then spread remaining values
if (m < resource[i]){
for (k in n:(j+1)) {
tab[i,k] = tab[i,k-1]
}
m = m+1
} else {
m = 1
}
}
}
}
# join ISO identifiers and filter only regions
tab = tab %>%
fuzzyjoin::stringdist_left_join(rosstat_regions[, c('name_local', 'iso'), drop = TRUE],
by = c('reg' = 'name_local'), max_dist = max_dist) %>%
dplyr::select(-name_local)
# reconstruct a long form of a table
values = tab %>%
dplyr::slice(N+1:n()) %>%
magrittr::set_names(c('reg', 2:n, 'iso')) %>%
tidyr::gather(var, value, 2:n) %>%
dplyr::mutate(var = as.numeric(var),
value = as.numeric(value))
# newtab = matrix(ncol = 3, nrow = 1) %>% data.frame()
#
# for (i in (N+1):nrow(tab)){
# for (j in 2:n){
# new_row = c(tab[i, n+1], j, tab[i, j])
# newtab = dplyr::bind_rows(newtab, new_row)
# }
# }
#
# colnames(newtab) <- c('iso', 'var', 'value')
#
# values = newtab[-1, ] %>%
# dplyr::filter(!is.na(iso)) %>%
# dplyr::mutate(value = as.numeric(value))
classifier = tab %>%
select(2:n) %>%
slice(1:N) %>%
rownames_to_column() %>%
gather(var, value, -rowname) %>%
mutate(var = as.integer(var)) %>%
spread(rowname, value) %>%
mutate(pathString =
dplyr::select(., -var) %>%
apply(1, function(X){
paste(X, collapse = '/')
})
)
# classifier$pathString = classifier %>%
# dplyr::select(-var) %>%
# apply(1, function(X){
# paste(X, collapse = '/')
# })
# colnames(classifier) <- c(paste0('Level', 1:(N+1)), 'var')
# rownames(classifier) <- 1:(n-1)
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.