# M A I N ----------------------------------------------------------------------
if (FALSE)
{
# 1. Source main.R first!
# 2. Source this script
file_database <- kwb.file::to_file_database(files)
file_database$files
file_database$folders
# Get all tables from one file
tables <- get_text_tables_from_xlsx(file = files[1])
# Get table metadata
table_info <- get_table_info(tables)
# Create column metadata from the table headers
column_info <- create_column_metadata(tables)
# Add column "skip": If the user puts an "x" into this column, the
# corresponding table will not be imported.
table_info$skip <- ""
# Write table metadata to "<basename>_META.csv"
export_table_metadata(
structure(table_info, file = kwb.utils::getAttribute(tables, "file"))
)
# import_table_metadata returns NULL if no metadata file exists
import_table_metadata(files[5])
# Select all file indices
indices <- seq_along(files)
indices <- 1:2
# Change indices to test with less files
#indices <- 16
# Clear the screen
kwb.utils::clearConsole()
# Get all tables from all files
system.time(all_tables <- lapply(indices, function(index) {
cat("File index:", index)
get_text_tables_from_xlsx(files[index])
}))
# user system elapsed
# 93.604 2.936 97.305
names(all_tables) <- file_database$files$file_id[indices]
# Create column metadata for all tables
column_info_list <- lapply(all_tables, create_column_metadata)
column_info <- rbindAll(
column_info_list,
nameColumn = "file_id", namesAsFactor = FALSE
)
x <- compact_column_info(column_info)
nrow(x)
# 6141
column_info <- suggest_column_name(column_info)
column_info <- merge(column_info, file_database$files)
column_info <- merge(column_info, file_database$folders)
base_dir <- kwb.utils::getAttribute(file_database$folders, "base_dir")
file_metadata <- file.path(base_dir, "METADATA_columns_tmp.csv")
write.csv(column_info, file_metadata, row.names = FALSE)
# TODO: Rename METADATA_columns_tmp.csv to METADATA_columns.csv, let the user
# modify the file and read back into column_info
#
#column_info <- read_column_info(safePath(base_dir, "METADATA_columns.csv"))
# Use column info to convert the text tables into data frames
all_data <- text_matrices_to_data_frames(all_tables, column_info)
lapply(all_data, function(all_tables) lapply(all_tables, utils::head))
file_database$files$file_id
# Get the path to a log file
logfile_summary <- tempfile("table_summary_", fileext = ".txt")
logfile_headers <- tempfile("table_headers_", fileext = ".txt")
# Write a summary of the read structure to the log file
utils::capture.output(file = logfile_summary, {
for (tables in all_tables) print_table_summary(tables)
})
# Let's have a look at the tables in one Excel file only
tables <- all_tables[[1]]
# Get a description of the sheets in that file
get_sheet_info(tables)
# Get a description of tables in that file
get_table_info(tables)
# Get the name of the file that was read
kwb.utils::getAttribute(tables, "file")
# The tables are named by sheet number and table number within the sheet
# The numbers are hexadecimal, i.e a = 10, f = 15, 10 = 16, ff = 255,
names(tables)
# "table_01_01" "table_02_01"
# Try to guess the header rows for each table...
n_headers <- sapply(names(tables), function(table_id) {
guess_number_of_headers_from_text_matrix(
tables[[table_id]],
table_id = table_id
)
})
print_logical_matrix(guess_header_matrix(x = tables$table_01_01, n_max))
print_logical_matrix(guess_header_matrix(x = tables$table_02_01, n_max))
print_header_guesses(tables, n_max, file = logfile_headers)
lapply(all_tables[[3]], guess_header_matrix, 4)
utils::head(x <- tables$table_02_01)
is_empty <- (is.na(x) | x == "")
print_logical_matrix(utils::head(is_empty))
print_logical_matrix(utils::head(is_empty), invert = TRUE)
}
# Text Matrices to data frames -------------------------------------------------
if (FALSE) {
# Convert text matrices of known format
tables <- get_text_tables_from_xlsx(file = files[1])
selected <- grepl("^table_02_", names(tables))
tables_year_well <- lapply(tables[selected], text_matrix_to_numeric_matrix)
data_frames_year_well <- lapply(tables_year_well, as.data.frame)
str(data_frames_year_well$table_02_01)
str(data_frames_year_well$table_02_02)
}
# print_header_guesses ---------------------------------------------------------
print_header_guesses <- function(
text_matrices, n_max = 5, file = NULL, dbg = TRUE) {
if (!is.null(file)) {
debug_formatted(dbg, "Writing output to '%s'... ", file)
utils::capture.output(file = file, print_header_guesses(text_matrices, n_max))
debug_ok(dbg)
} else {
# matrix_name <- "table_03_01"
for (matrix_name in names(text_matrices)) {
header <- guess_header_matrix(x = text_matrices[[matrix_name]], n_max)
debug_formatted(dbg, "\n%s:\n", matrix_name)
print_logical_matrix(header)
}
}
}
# guess_header_matrix ----------------------------------------------------------
guess_header_matrix <- function(x, n_max = 10) {
stopifnot(is.character(x))
kwb.utils::stopIfNotMatrix(x)
x_head <- as.data.frame(utils::head(x, n_max))
do.call(cbind, lapply(x_head, function(column_values) {
sapply(seq_along(column_values), function(i) {
as.integer(!(column_values[i] %in% column_values[-(1:i)]))
})
}))
}
# text_matrix_to_numeric_matrix ------------------------------------------------
text_matrix_to_numeric_matrix <- function(x) {
print(x)
matrix(
as.numeric(x[-1, -1]),
nrow = nrow(x) - 1,
dimnames = list(x[-1, 1], x[1, -1])
)
}
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.