#' @title Summary Tables
#' @description Generate summary tables from the preprocessed data frame.
#' @param df.preprocessed Preprocessed data.frame to be summarized
#' @param df.orig Original data.frame for comparisons
#' @param output.folder Output folder path (including the trailing separator)
#' @return NULL
#' @author Leo Lahti \email{leo.lahti@@iki.fi}
#' @references See citation("bibliographica")
#' @export
#' @examples \dontrun{generate_summary_tables(df.preprocessed, df.orig)}
#' @keywords utilities
generate_summary_tables <- function (df.preprocessed, df.orig, output.folder = "output.tables/") {
# Circumvent build warnings
df <- author <- author_name <- author_birth <- author_death <- author_pseudonyme <- author_gender <- name <- NULL
mean_pagecounts_multivol <- mean_pagecounts_singlevol <- mean_pagecounts_issue <- NULL
obl <- NULL
tally <- NULL
desc <- NULL
original_name <- NULL
original_date <- NULL
final_author_id <- NULL
final_author_birth <- NULL
final_author_death <- NULL
publisher <- NULL
gatherings <- NULL
original_extent <- NULL
final_pagecount <- NULL
physical_extent <- NULL
estimated_pagecount <- NULL
publication_year <- NULL
publication_interval <- NULL
publication_interval_from <- NULL
publication_interval_till <- NULL
width <- NULL
height <- NULL
  # Ensure compatibility: align the original data rows with the preprocessed data by original_row
df.orig <- df.orig[match(df.preprocessed$original_row, df.orig$original_row),]
message("Write summaries of field entries and count stats for all fields")
for (field in setdiff(names(df.preprocessed),
c(names(df.preprocessed)[grep("language", names(df.preprocessed))] ,
"row.index", "paper", "publication_decade",
"publication_year", "publication_year_from", "publication_year_till",
"publication_interval",
"publication_interval_from",
"publication_interval_till",
"subject_topic",
"pagecount", "obl", "obl.original", "original_row", "dissertation",
"synodal", "language", "original", "unity", "author_birth", "author_death",
"gatherings.original", "width.original", "height.original",
"longitude", "latitude", "page", "item", "publisher.printedfor",
"publisher", "author_pseudonyme",
"control_number", "system_control_number",
"author_name", "author", "area", "width", "height", "gender"))) {
message(field)
message("Accepted entries in the preprocessed data")
s <- write_xtable(df.preprocessed[[field]], paste(output.folder, field, "_accepted.csv", sep = ""), count = TRUE)
message("Discarded entries")
if ((field %in% names(df.preprocessed)) && (field %in% names(df.orig))) {
inds <- which(is.na(df.preprocessed[[field]]))
original <- as.vector(na.omit(as.character(df.orig[[field]][inds])))
tmp <- write_xtable(original, paste(output.folder, field, "_discarded.csv", sep = ""), count = TRUE)
}
message("Nontrivial conversions")
if (field %in% names(df.preprocessed) && (field %in% names(df.orig)) && !field %in% c("dimension", "title")) {
message(field)
inds <- which(!is.na(df.preprocessed[[field]]))
original <- as.character(df.orig[[field]][inds])
polished <- as.character(df.preprocessed[[field]][inds])
tab <- cbind(original = original, polished = polished)
      # Exclude trivial cases where original and polished match (case-insensitive)
      tab <- tab[!tolower(tab[, "original"]) == tolower(tab[, "polished"]), , drop = FALSE]
tab <- as.data.frame(tab)
x <- tab %>% group_by(original, polished) %>%
tally() %>%
arrange(desc(n))
tmp <- write.table(x, file = paste(output.folder, field, "_conversion_nontrivial.csv", sep = ""),
quote = FALSE,
row.names = FALSE
)
}
}
# --------------------------------------------------------------
message("..author conversion")
o <- gsub("\\]", "", gsub("\\[", "", gsub("\\.+$", "", as.character(df.orig$author_name))))
x <- as.character(df.orig$author_date)
inds <- which(!is.na(x) & !(tolower(o) == tolower(x)))
x1 <- as.character(df.preprocessed[inds, "author"])
x2 <- as.character(df.preprocessed[inds, "author_birth"])
x3 <- as.character(df.preprocessed[inds, "author_death"])
li <- list(
original_name = o[inds],
original_date = x[inds],
final_author_id = x1,
final_author_birth = x2,
final_author_death = x3
)
df <- as.data.frame(li)
x <- df %>%
group_by(original_name, original_date, final_author_id, final_author_birth, final_author_death) %>%
tally() %>%
arrange(desc(n))
write.table(x,
file = paste(output.folder, paste("author_conversion_nontrivial.csv", sep = "_"), sep = ""),
quote = FALSE,
row.names = FALSE
)
# -----------------------------------------------------
message("subject_topic")
field <- "subject_topic"
entries <- unlist(strsplit(as.character(df.preprocessed[[field]]), ";"), use.names = FALSE)
s <- write_xtable(entries, paste(output.folder, field, "_accepted.csv", sep = ""), count = TRUE)
message("Discarded entries")
if ((field %in% names(df.preprocessed)) && (field %in% names(df.orig))) {
inds <- which(is.na(df.preprocessed[[field]]))
original <- as.vector(na.omit(as.character(df.orig[[field]][inds])))
tmp <- write_xtable(original, paste(output.folder, field, "_discarded.csv", sep = ""), count = TRUE)
}
message("Nontrivial conversions")
if (field %in% names(df.preprocessed) && (field %in% names(df.orig)) && !field %in% c("dimension", "title")) {
message(field)
inds <- which(!is.na(df.preprocessed[[field]]))
original <- as.character(df.orig[[field]][inds])
polished <- as.character(df.preprocessed[[field]][inds])
tab <- cbind(original = original, polished = polished)
    # Exclude trivial cases where original and polished match (case-insensitive)
    tab <- tab[!tolower(tab[, "original"]) == tolower(tab[, "polished"]), , drop = FALSE]
tmp <- write_xtable(tab, paste(output.folder, field, "_conversion_nontrivial.csv", sep = ""), count = TRUE)
}
# -----------------------------------------------------
message("Author")
  # Separate tables for real names and pseudonyms
tab <- df.preprocessed %>% filter(!author_pseudonyme) %>%
select(author, author_gender)
tmp <- write_xtable(tab,
paste(output.folder, paste("author_accepted.csv", sep = "_"), sep = ""),
count = FALSE, sort.by = "author")
# -------------------------------------------------
message("Pseudonyme")
tab <- df.preprocessed %>% filter(author_pseudonyme) %>% select(author)
tmp <- write_xtable(tab, paste(output.folder, "pseudonyme_accepted.csv", sep = ""),
count = TRUE, sort.by = "author")
# ----------------------------------------------------
message("..author")
o <- as.character(df.orig[["author_name"]])
x <- as.character(df.preprocessed[["author"]])
inds <- which(is.na(x))
tmp <- write_xtable(o[inds],
paste(output.folder, paste("author_discarded.csv", sep = "_"), sep = ""),
count = TRUE)
# --------------------------------------------
# Author gender
message("Accepted entries in the preprocessed data")
s <- write_xtable(df.preprocessed[["author_gender"]],
paste(output.folder, "author_gender_accepted.csv", sep = ""),
count = TRUE)
message("Discarded gender")
inds <- which(is.na(df.preprocessed[["author_gender"]]))
dg <- condense_spaces(gsub("\\.", " ", tolower(as.vector(na.omit(as.character(df.preprocessed$author[inds]))))))
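  # Drop entries that are bare initials (e.g. "j a-b"); these cannot be
  # resolved to a gender and only add noise to the output.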
inds <- grep("^[a-z] [a-z]-[a-z]$", dg)
if (length(inds)>0) {
dg <- dg[-inds]
}
tmp <- write_xtable(dg,
paste(output.folder, "author_gender_discarded.csv", sep = ""),
count = TRUE)
message("Author gender tables realized in the final data")
tab <- data.frame(list(name = pick_firstname(df.preprocessed$author_name),
gender = df.preprocessed$author_gender))
tab <- tab[!is.na(tab$gender), ] # Remove NA gender
write_xtable(subset(tab, gender == "male")[,-2],
paste(output.folder, "gender_male.csv", sep = ""))
write_xtable(subset(tab, gender == "female")[,-2],
paste(output.folder, "gender_female.csv", sep = ""))
# Unknown gender
tmp <- unname(pick_firstname(df.preprocessed$author_name)[is.na(df.preprocessed$author_gender)])
tmp <- condense_spaces(gsub("\\.", " ", tolower(tmp)))
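  # Filter out bare initials and similar uninformative name fragments before
  # writing the table of names with unknown gender.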
inds <- unique(
c(which(nchar(tmp) == 1),
grep("^[a-z] [a-z] von$", tmp),
grep("^[a-z]{1,2} [a-z]{1,2}$", tmp),
grep("^[a-z] [a-z] [a-z]$", tmp)))
if (length(inds) > 0) { tmp <- tmp[-inds] }
tmpg <- write_xtable(tmp, paste(output.folder, "gender_unknown.csv", sep = ""))
# Unresolved (ambiguous) gender
  tmp <- unname(pick_firstname(df.preprocessed$author_name)[which(df.preprocessed$author_gender == "ambiguous")])
tmp <- condense_spaces(gsub("\\.", " ", tolower(tmp)))
inds <- c(which(nchar(tmp) == 1),
grep("^[a-z]{1,2} [a-z]{1,2}$", tmp),
grep("^[a-z]{1,2} [a-z]{1,2} von$", tmp),
grep("^von$", tmp),
grep("^[a-z] von$", tmp),
grep("^[a-z] [a-z] [a-z]$", tmp))
if (length(inds) > 0) { tmp <- tmp[-inds] }
tmpg <- write_xtable(tmp, paste(output.folder, "gender_ambiguous.csv", sep = ""))
#-------------------------------------------------
## PUBLISHER
# The result is visible at
# https://github.com/COMHIS/fennica/blob/master/inst/examples/publisher.md
# and that page is generated from
# inst/extdata/publisher.Rmd
message("Accepted publishers")
x <- df.preprocessed[, c("publisher", "self_published")] %>%
group_by(publisher, self_published) %>%
tally() %>%
arrange(desc(n))
s <- write.table(x,
file = paste(output.folder, field, "_accepted.csv", sep = ""),
quote = FALSE, row.names = FALSE
)
message("Discarded publishers")
if ((field %in% names(df.preprocessed)) && (field %in% names(df.orig))) {
inds <- which(is.na(df.preprocessed[[field]]))
original <- as.vector(na.omit(as.character(df.orig[[field]][inds])))
# Remove trivial cases to simplify output
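    # These patterns match "[s.n.]"-style placeholders (sine nomine, i.e.
    # publisher unknown), optionally followed by a year.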
inds <- c(grep("^\\[*s\\.*n\\.*\\]*[0-9]*\\.*$", tolower(original)),
grep("^\\[*s\\.*n\\.*\\[*[0-9]*$", tolower(original)))
if (length(inds) > 0) {
original <- original[-inds]
}
tmp <- write_xtable(original, paste(output.folder, field, "_discarded.csv", sep = ""), count = TRUE)
}
message("publisher conversions")
o <- as.character(df.orig[[field]])
x <- as.character(df.preprocessed[[field]])
inds <- which(!is.na(x) & !(tolower(o) == tolower(x)))
tmp <- write_xtable(cbind(original = o[inds],
polished = x[inds]),
paste(output.folder, paste(field, "conversion_nontrivial.csv", sep = "_"),
sep = ""), count = TRUE)
# --------------------------------------------
## CORPORATE
# The result is visible at
# https://github.com/COMHIS/fennica/blob/master/inst/examples/publisher.md
# and that page is generated from
# inst/extdata/publisher.Rmd
message("Accepted corporates")
field <- "corporate"
s <- write_xtable(df.preprocessed[[field]],
paste(output.folder, field, "_accepted.csv", sep = ""),
count = TRUE)
message("Discarded corporates")
if ((field %in% names(df.preprocessed)) && (field %in% names(df.orig))) {
inds <- which(is.na(df.preprocessed[[field]]))
original <- as.vector(na.omit(as.character(df.orig[[field]][inds])))
# Remove trivial cases to simplify output
inds <- c(grep("^\\[*s\\.*n\\.*\\]*[0-9]*\\.*$", tolower(original)),
grep("^\\[*s\\.*n\\.*\\[*[0-9]*$", tolower(original)))
if (length(inds) > 0) {
original <- original[-inds]
}
tmp <- write_xtable(original, paste(output.folder, field, "_discarded.csv", sep = ""), count = TRUE)
}
message("corporate conversions")
nam <- "corporate"
o <- as.character(df.orig[[nam]])
x <- as.character(df.preprocessed[[nam]])
inds <- which(!is.na(x) & !(tolower(o) == tolower(x)))
tmp <- write_xtable(cbind(original = o[inds],
polished = x[inds]),
paste(output.folder, paste(nam, "conversion_nontrivial.csv", sep = "_"),
sep = ""), count = TRUE)
# ----------------------------
message("Pagecount conversions")
o <- as.character(df.orig[["physical_extent"]])
g <- as.character(df.preprocessed$gatherings)
x <- as.character(df.preprocessed[["pagecount"]])
  # Show only the page counts that were originally available;
  # estimated page counts are excluded here.
inds <- which(!is.na(x) & !(tolower(o) == tolower(x)) &
!is.na(df.preprocessed[["pagecount.orig"]]))
tmp <- cbind(gatherings = g[inds],
original_extent = o[inds],
final_pagecount = x[inds]
)
xx <- as.data.frame(tmp) %>% group_by(gatherings, original_extent, final_pagecount) %>%
tally() %>%
arrange(desc(n))
write.table(xx,
file = paste(output.folder, "pagecount_conversions.csv", sep = ""),
quote = FALSE,
row.names = FALSE)
# ----------------------------------------------
message("Discard summaries")
inds <- which(is.na(df.preprocessed$pagecount.orig))
tmp <- cbind(
gatherings = g[inds],
physical_extent = o[inds],
estimated_pagecount = x[inds]
)
x <- as.data.frame(tmp) %>% group_by(gatherings, physical_extent, estimated_pagecount) %>%
tally() %>%
arrange(desc(n))
write.table(x, file=paste(output.folder, "pagecount_discarded.csv", sep = ""), quote = FALSE, row.names = FALSE)
# --------------------------------------------
message("Conversion: publication year")
# Publication year
o <- gsub("\\.$", "", as.character(df.orig[["publication_time"]]))
x <- df.preprocessed[, c("publication_year", "publication_year_from", "publication_year_till")]
tab <- cbind(original = o, x)
tab <- tab[!is.na(tab$publication_year),]
xx <- as.data.frame(tab) %>% group_by(original, publication_year) %>% tally() %>% arrange(desc(n))
tmp <- write.table(xx,
file = paste(output.folder, "publication_year_conversion.csv",
sep = ""), quote = FALSE, row.names = FALSE)
message("Discarded publication year")
o <- as.character(df.orig[["publication_time"]])
x <- as.character(df.preprocessed[["publication_year"]])
inds <- which(is.na(x))
tmp <- write_xtable(o[inds],
paste(output.folder, "publication_year_discarded.csv", sep = ""),
count = TRUE)
# --------------------------------------------
message("Accepted publication frequency")
if ("publication_frequency_text" %in% names(df.preprocessed)) {
publication_frequency_annual <- NULL
dfp <- df.preprocessed[, c("publication_frequency_text",
"publication_frequency_annual")]
# Remove NA
inds <- is.na(dfp$publication_frequency_text) &
is.na(dfp$publication_frequency_annual)
dfp <- dfp[!inds,]
xx <- dfp %>% group_by(publication_frequency_text, publication_frequency_annual) %>%
tally() %>%
arrange(desc(publication_frequency_annual))
tmp <- write.table(xx,
file = paste(output.folder, "publication_frequency_accepted.csv", sep = ""),
quote = FALSE, row.names = FALSE
)
message("Conversion: publication frequency")
o <- cbind(original_frequency = condense_spaces(tolower(gsub("\\.$", "", as.character(df.orig[["publication_frequency"]])))),
original_interval = condense_spaces(tolower(gsub("\\.$", "", as.character(df.orig[["publication_interval"]])))),
original_time = condense_spaces(tolower(gsub("\\.$", "", as.character(df.orig[["publication_time"]]))))
)
x <- df.preprocessed[, c("publication_frequency_text", "publication_frequency_annual")]
tab <- cbind(x, o)
tab$publication_frequency_annual <- round(tab$publication_frequency_annual, 2)
tab <- tab[which(!rowMeans(is.na(tab[, 1:3])) == 1),] # Remove NA cases
xx <- tab %>% group_by(publication_frequency_text, publication_frequency_annual) %>%
tally() %>%
arrange(desc(n))
tmp <- write.table(xx,
file = paste(output.folder, "publication_frequency_conversion.csv",
sep = ""), quote = FALSE, row.names = FALSE)
message("Discarded publication frequency")
o <- as.character(df.orig[["publication_frequency"]])
x1 <- as.character(df.preprocessed[["publication_frequency_annual"]])
x2 <- as.character(df.preprocessed[["publication_frequency_text"]])
inds <- which(is.na(x1) & is.na(x2))
tmp <- write_xtable(o[inds],
paste(output.folder, "publication_frequency_discarded.csv", sep = ""),
count = TRUE)
}
# --------------------------------------------
message("Conversion: publication interval")
if ("publication_interval_from" %in% names(df.preprocessed)) {
# Publication interval
o <- tolower(gsub("\\.$", "", as.character(df.orig[["publication_interval"]])))
x <- df.preprocessed[, c("publication_interval_from", "publication_interval_till")]
tab <- cbind(original = o, x)
tab <- tab[!is.na(tab$publication_interval_from) | !is.na(tab$publication_interval_till),]
xx <- tab %>% group_by(original) %>% tally() %>% arrange(desc(n))
tmp <- write.table(xx,
paste(output.folder, "publication_interval_conversion_nontrivial.csv",
sep = ""), quote = FALSE, row.names = FALSE)
message("Discarded publication interval")
o <- df.orig[, c("publication_interval", "publication_time", "publication_frequency")]
o$publication_time <- gsub("^\\[", "", gsub("\\]$", "", gsub("\\.$", "", o$publication_time)))
x <- df.preprocessed[,c("publication_interval_from", "publication_interval_till")]
x2 <- df.preprocessed[, c("publication_frequency_annual", "publication_frequency_text")]
inds <- which(rowSums(is.na(x)) == 2 & rowSums(is.na(x2)) == 2 )
o <- o[inds,]
inds <- is.na(unlist(o[,1])) & grepl("^[0-9]+$", unlist(o[, 2])) & is.na(unlist(o[,3]))
xx <- o[!inds,] %>% group_by(publication_interval) %>% tally() %>% arrange(desc(n))
tmp <- write.table(xx,
file = paste(output.folder, "publication_interval_discarded.csv", sep = ""),
quote = FALSE, row.names = FALSE
)
message("Accepted publication interval")
o <- as.character(df.orig[["publication_interval"]])
x <- df.preprocessed[c("publication_interval_from", "publication_interval_till")]
inds <- which(rowSums(!is.na(x))>0)
xx <- x[inds,] %>% group_by(publication_interval_from, publication_interval_till) %>% tally() %>% arrange(desc(n))
tmp <- write.table(xx,
file = paste(output.folder, "publication_interval_accepted.csv", sep = ""),
quote = FALSE, row.names = FALSE
)
}
# --------------------------------------------
message("Authors with missing life years")
  tab <- df.preprocessed %>%
    filter(!is.na(author_name) & (is.na(author_birth) | is.na(author_death))) %>%
    select(author_name, author_birth, author_death)
  tmp <- write.table(tab, file = paste(output.folder, "authors_missing_lifeyears.csv", sep = ""),
                     quote = FALSE, row.names = FALSE)
message("Ambiguous authors with many birth years")
births <- split(df.preprocessed$author_birth, df.preprocessed$author_name)
births <- births[sapply(births, length, USE.NAMES = FALSE) > 0]
  many.births <- lapply(
    births[names(which(sapply(births, function (x) { length(unique(na.omit(x))) }) > 1))],
    function (x) { sort(unique(na.omit(x))) })
dfs <- df.preprocessed[df.preprocessed$author_name %in% names(many.births),
c("author_name", "author_birth", "author_death")]
dfs <- unique(dfs)
dfs <- dfs %>% arrange(author_name, author_birth, author_death)
write.table(dfs, paste(output.folder, "author_life_ambiguous.csv", sep = ""), quote = F, sep = "\t", row.names = FALSE)
# -------------------------------------------------------
message("Undefined language")
  rm(dfs); rm(tmp); gc() # Cleanup: remove large temporaries before garbage collection
# Remove "und" from the list ("Undetermined")
f <- system.file("extdata/language_abbreviations.csv", package = "bibliographica")
  abrv <- read_mapping(f, include.lowercase = TRUE, self.match = TRUE,
                       ignore.empty = FALSE, mode = "table", sep = "\t")
# List unique languages that occur in the data
lang <- tolower(df.orig$language)
lang <- unique(lang[!is.na(lang)])
lang <- unlist(strsplit(lang, ";"), use.names = FALSE)
lang <- unique(lang)
lang <- lang[!grepl("^[0-9]$", lang)] # Remove numerics
# Remove the known ones (und is Undetermined)
known.abbreviations <- setdiff(abrv$synonyme, "und") # und = Undetermined
discarded.lang <- c("*", ".", "^,", "", "-", "\\\\?", "&")
unknown.lang <- lang[!lang %in% c(known.abbreviations, discarded.lang)]
message("Write unknown languages")
if (length(unknown.lang)>0) {
ltab <- table(df.orig$language)
#spl <- unlist(strsplit(names(ltab), ";"), use.names = FALSE)
# Count occurrences for each unknown lang
# TODO should be easy to speed up by considering unique entries only
  # and then summing up their stats
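  # (A possible, untested speed-up sketch: split names(ltab) on ";", sum ltab
  # over the components, and read the totals for unknown.lang from that table.)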
# Identify hits 0/1
u <- sapply(unknown.lang, function (ul) grepl(paste("^", ul, "$", sep = ""), names(ltab))) |
sapply(unknown.lang, function (ul) grepl(paste("^", ul, ";", sep = ""), names(ltab))) |
sapply(unknown.lang, function (ul) grepl(paste(";", ul, ";", sep = ""), names(ltab))) |
sapply(unknown.lang, function (ul) grepl(paste(";", ul, "$", sep = ""), names(ltab)))
# Multiply by counts of each case
u <- apply(u, 2, function (x) {x * ltab})
# Sum up the occurrence counts for each unknown language
u <- colSums(u)
u <- u[u > 0]
u <- rev(sort(u))
tab <- cbind(term = names(u), n = unname(u))
    tmp <- write.csv(tab,
                     file = paste(output.folder, "language_discarded.csv", sep = ""),
                     quote = FALSE, row.names = FALSE)
} else {
write.csv("No entries.", file = paste(output.folder, "language_discarded.csv", sep = ""))
}
message("Accepted languages")
known.lang <- lang[lang %in% known.abbreviations]
tmp <- write_xtable(map(known.lang, abrv), paste(output.folder, "language_accepted.csv", sep = ""), count = TRUE)
message("Language conversions")
field = "language"
original <- as.character(df.orig[[field]])
polished <- as.character(df.preprocessed[[field]])
tab <- cbind(original = original, polished = polished)
tmp <- write_xtable(tab, paste(output.folder, field, "_conversions.csv", sep = ""), count = TRUE)
# ---------------------------------------------------------
message("Page counts")
use.fields <- intersect(c("pagecount", "volnumber", "volcount"), names(df.preprocessed))
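  # Combine the original physical extent with whichever of the parsed
  # pagecount and volume fields are present in the preprocessed data.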
tab <- cbind(original = df.orig$physical_extent, df.preprocessed[, use.fields])
# For clarity: remove ECCO and Manually augmented pagecounts from ESTC data
if ("pagecount_from" %in% names(df) & nrow(df.preprocessed) == nrow(df.orig)) {
tab <- tab[df.preprocessed$pagecount_from %in% c("estc"),]
}
tmp <- write.table(tab, file = "output.tables/conversions_physical_extent.csv", quote = F, row.names = F)
message("Physical dimension info")
tab <- cbind(original = df.orig$physical_dimension,
df.preprocessed[, c("gatherings.original", "width.original", "height.original", "obl.original", "gatherings", "width", "height", "obl", "area")])
tmp <- write.table(tab, file = "output.tables/conversions_physical_dimension.csv", quote = F, row.names = F)
message("Accepted / Discarded dimension info")
inds <- which(is.na(df.preprocessed[["area"]]))
xx <-
data.frame(original = as.character(df.orig$physical_dimension)[inds],
df.preprocessed[inds, c("gatherings", "width", "height", "obl")])
xx <- xx %>% group_by(original, gatherings, width, height, obl) %>% tally() %>% arrange(desc(n))
  tmp <- write.table(xx,
             file = paste(output.folder, "physical_dimension_incomplete.csv", sep = ""),
             quote = FALSE, row.names = FALSE)
#-----------------------------------------------------------------------
message("All summary tables generated.")
gc()
return(NULL)
}