# For all parameter options for rmarkdown slide rendering, see: # http://rmarkdown.rstudio.com/developer_parameterized_reports.html # More header options in Pandoc manual # http://pandoc.org/MANUAL.html#templates if (params$update.bibliographica) { library(devtools) install_github("ropengov/bibliographica") } # Read parameters from the header mydate <- params$start min.year <- params$min.year max.year <- params$max.year # Selected place (if any) selected.place <- params$place # Time window for the plots time.window <- params$time.window # Limit the length of names to print on screen / figures nchar <- 40 library(devtools) library(estc) library(magrittr) library(bibliographica) library(reshape2) library(gridExtra) library(knitr) library(ggmap) #library(stringr) library(dplyr) library(tidyr) library(ggplot2) library(sorvi) knitr::opts_chunk$set(echo = FALSE) #knitr::opts_chunk$set(fig.path = "figure_slides/", dev="CairoPNG") knitr::opts_chunk$set(fig.path = "figure_slides/") # Set locale # tmp <- Sys.setlocale(locale="UTF-8") # Nice theme theme_set(theme_bw(26)) # Nice default themes # https://github.com/cttobin/ggthemr#palettes #ggthemr('fresh', text_size = 20) # ggthemr('greyscale', text_size = 20) #ggthemr('light', text_size = 20) # ggthemr('pale', text_size = 20) #ggthemr_reset() # Reset theme # Default number of top hits to show ntop <- 20
## SELECT AND AUGMENT THE COMPLETE DATA SET (df.orig) # Read the preprocessed data df.orig <- readRDS(datafile.preprocessed) # Keep the complete data # Add Metadata ID field idfield <- "system_control_number" df.orig$id <- sapply(strsplit(as.character(df.orig[, idfield]), ")"), function (xi) {xi[[length(xi)]]}) # Add document type df.orig$document_type <- rep(NA, nrow(df.orig)) df.orig$document_type[df.orig$pagecount > 32] <- "Books" df.orig$document_type[df.orig$pagecount <= 32] <- "Pamphlets" df.orig$document_type <- factor(df.orig$document_type) # Set the desired time window (default one decade) if (time.window == 10) { df.orig$publication_time <- df.orig$publication_decade } else { df.orig$publication_time <- time.window * floor(df.orig$publication_year / time.window) } # -------------------------------------- ## PICK SELECTED SUBSET OF THE DATA (df.preprocessed AND df.preprocessed.allplaces) # Now pick selected subset (time, place, language..) df.preprocessed <- df.orig # Selected time window df.preprocessed <- df.preprocessed %>% filter(publication_year >= min.year & publication_year <= max.year) # Selected language if (!params$language == "any") { df.preprocessed <- df.preprocessed %>% filter(language %in% params$language) } # Selected document type if (!params$document.type == "All") { df.preprocessed <- df.preprocessed %>% filter(document_type %in% params$document.type) # Remove issues if Pamphlets are selected if (params$document.type == "Pamphlets") { df.preprocessed <- df.preprocessed %>% filter(!issue) } } # Selected place df.preprocessed.allplaces <- df.preprocessed if (!selected.place == "All") { # Keep version with all places # Make version with selected place df.preprocessed <- df.preprocessed %>% filter(publication_place %in% selected.place) } # --------------------------------------- ## PICK SELECTED DOCUMENTS (df0) # Read the custom list of entry IDs # Including also the number of hits per document # and document total length for normalization purposes idfile <- params$idsource custom.ids <- read.table(idfile, skip = 1)[, c(1, 2, 4)] ids.orig <- as.character(custom.ids[,1]) custom.ids[,1] <- gsub("\\,$", "", as.character(custom.ids[,1])) custom.ids[,2] <- as.numeric(as.character(custom.ids[,2])) custom.ids[,3] <- as.numeric(as.character(custom.ids[,3])) names(custom.ids) <- c("id", "freq", "length") # Remove leading zeroes from the IDs to get them compatible with our versions custom.ids$id <- apply(cbind(substr(custom.ids$id, 1, 1), gsub("^[A-Z]0*", "", custom.ids$id)), 1, function (x) {paste(x, collapse = "")}) #--------------------------------------------------------- # Pick entries with the given IDs.. must remove the source ID from the ESTC strings df0 <- subset(df.preprocessed, id %in% custom.ids$id) # Selected place # Keep also version with all places df0.allplaces <- subset(df.preprocessed.allplaces, id %in% custom.ids$id) # Not found notfound <- setdiff(custom.ids$id, df0$id) # Combine duplicated customIDs custom.ids <- custom.ids %>% group_by(id) %>% summarise(freq = sum(freq, na.rm = T), length = sum(length, na.rm = T) ) # Add normalization field: hits per 1000 words custom.ids$freq.normalized <- 1e3 * custom.ids$freq/custom.ids$length # Number and percentage of the custom list IDs that were found # in the very original data hitn0 <- sum(custom.ids$id %in% df.orig$id, na.rm = TRUE) hitp0 <- 100 * hitn0/nrow(custom.ids) # Hits after final subsetting hitn1 <- nrow(df0) hitp1 <- 100 * hitn1/nrow(custom.ids) # Duplicated entries duplicated.ids.in.estc <- sum(duplicated(df.preprocessed$id), na.rm = T) duplicated.ids.in.query <- sum(duplicated(custom.ids$id), na.rm = T)
Selected place: r params$place
Selected language: r params$language
Document format: r params$document.type
Query IDs (n=r nrow(custom.ids)
) from file: r params$idsource
IDs found from complete ESTC: n=r hitn0
(r round(hitp0, 2)
%)
IDs found after subsetting (time, place, language, document type): n=r hitn1
(r round(hitp1, 2)
%)
Duplicated IDs in ESTC: n = r duplicated.ids.in.estc
(r round(100 * duplicated.ids.in.estc/nrow(df.preprocessed), 1)
%). Very few, each occurrence is considered separately.
Duplicated IDs in the query: r duplicated.ids.in.query
(r round(100 * duplicated.ids.in.query/nrow(custom.ids), 1)
%). According to a single check, these are often series with same ID over many issues. The total sum of query hits and lengths is used.
# Document length vs. normalized hit frequency: p <- ggplot(custom.ids, aes(x = length, y = freq.normalized)) + geom_point() + scale_x_log10() + scale_y_log10() + xlab("Document length") + ylab("Hits per unit length (x1000)") print(p)
Left: self-explanatory; Right: fraction of the ECCO query compared to all selected documents (time, language). If a specific place is selected, two bars are shown: one for the selected place and one for all places.
# If selected place is given, then show # both the selected and all places myfield <- "titlecount" df1 <- NULL if (!selected.place == "All") { df1 <- df0.allplaces } # Standard timeline # If place selection is applied then show both the # selected place and total data p1 <- plot_timeline(df0, x = "publication_decade", y = myfield, nmin = 0, mode = "absolute") + ylab("Title count (n)") + ggtitle("Total title count timeline ()") #guides(fill = "none") if (!selected.place == "All") { p1 <- p1 + scale_fill_manual("Place", values = c("black", "darkgray"), labels = c(selected.place, "All")) } p1 <- p1 + scale_y_log10() # Compare the selected field in between the two data sets df <- timeline_relative(df0, df.preprocessed, myfield) df$group <- rep(selected.place, nrow(df)) if (!selected.place == "All") { df.allplaces <- timeline_relative(df0.allplaces, df.preprocessed.allplaces, myfield) df.allplaces$group <- rep("All", nrow(df.allplaces)) df <- bind_rows(df, df.allplaces) } df$group <- factor(df$group) # Mark NAs to 0 df2 <- df %>% select(publication_time, group, fraction) df2 <- unique(df2) df2 <- df2 %>% spread(publication_time, fraction, fill = 0) df2 <- df2 %>% gather(publication_time, fraction, -group) df2$publication_time <- as.numeric(as.character(df2$publication_time)) p2 <- ggplot(df2, aes(y = fraction, x = publication_time, fill = group)) + geom_bar(stat = "identity", position = "dodge", color = "black") + scale_fill_manual(values = c("black", "darkgray")) + ylab("Fraction of total documents (%)") + xlab("Publication time") + ggtitle("Query relative to all docs") # guides(fill = "none") if (!selected.place == "All") { p2 <- p2 + scale_fill_manual("Place", values = c("black", "darkgray"), labels = c(selected.place, "All")) } grid.arrange(p1, p2, nrow = 1)
Left: self-explanatory; Right: fraction of the ECCO query compared to all selected documents (time, language). If a specific place is selected, two bars are shown: one for the selected place and one for all places.
# If selected place is given, then show # both the selected and all places myfield <- "paper" df1 <- NULL if (!params$place == "All") { df1 <- df0.allplaces } # Standard timeline # If place selection is applied then show both the # selected place and total data p1 <- plot_timeline(df0, x = "publication_decade", y = myfield, nmin = 0, mode = "absolute") + ylab("Paper (sheets)") + ggtitle("Total paper consumption timeline ()") # guides(fill = "none") if (!is.null(df1)) { p1 <- p1 + scale_fill_manual("Place", values = c("black", "darkgray"), labels = c(selected.place, "All")) } p1 <- p1 + scale_y_log10() # Compare the selected field in between the two data sets df <- timeline_relative(df0, df.preprocessed, myfield) df$group <- rep(selected.place, nrow(df)) if (!selected.place == "All") { df.allplaces <- timeline_relative(df0.allplaces, df.preprocessed.allplaces, myfield) df.allplaces$group <- rep("All", nrow(df.allplaces)) df <- bind_rows(df, df.allplaces) } df$group <- factor(df$group) # Mark NAs to 0 df2 <- df %>% select(publication_time, group, fraction) df2 <- unique(df2) df2 <- df2 %>% spread(publication_time, fraction, fill = 0) df2 <- df2 %>% gather(publication_time, fraction, -group) df2$publication_time <- as.numeric(as.character(df2$publication_time)) p2 <- ggplot(df2, aes(y = fraction, x = publication_time, fill = group)) + geom_bar(stat = "identity", position = "dodge", color = "black") + scale_fill_manual(values = c("black", "darkgray")) + ylab("Fraction of total documents (%)") + xlab("Publication time") + ggtitle("Query relative to all docs") #guides(fill = "none") if (!selected.place == "All") { p2 <- p2 + scale_fill_manual("Place", values = c("black", "darkgray"), labels = c(selected.place, "All")) } grid.arrange(p1, p2, nrow = 1)
This figure is shown only when document.type "All" is selected.
if (params$document.type == "All") { df <- df0 df2 <- df %>% group_by(publication_year, document_type) %>% summarize(paper = sum(paper, na.rm = TRUE), n = n()) %>% filter(!is.na(document_type)) p <- ggplot(df2, aes(x = publication_year, y = paper, group = document_type, color = document_type)) p <- p + geom_point() + scale_y_log10() p <- p + geom_smooth(method = "loess", span = 0.1) p <- p + xlab("Year") p <- p + ylab("Paper (sheets)") print(p) }
The top authors by title count (left; total hits across all editions shown in the middle); or by hits per edition (right)
df <- df0 field <- "author" p1 <- p2 <- p3 <- NULL p1 <- top_plot(df, field, ntop) + ggtitle("Title count") + ylab("Title count (n)") # Use unnormalized frequency for now df$hits <- unlist(custom.ids[match(df$id, custom.ids$id), "freq"], use.names = F) df$names <- df[[field]] dfs.total <- df %>% group_by(names) %>% filter(!is.na(names)) %>% summarise(count = sum(hits, na.rm = T), n = n()) %>% mutate(hits.per.edition = count/n) # Limit title length dfs.total$names <- substr(as.character(dfs.total$names), 1, nchar) # Select top-N dfs <- dfs.total %>% arrange(desc(count)) %>% head(ntop) dfs$names <- droplevels(factor(dfs$names, levels = rev(dfs$names))) theme_set(theme_bw(15)) p <- ggplot(dfs, aes(x = names, y = count)) p <- p + geom_bar(stat = "identity") p <- p + coord_flip() v <- 10^(1:max(na.omit(round(log10(max(dfs$count)))))) p <- p + scale_y_log10(breaks = v, labels = v) #p <- p + scale_y_log10(breaks = v, labels = v) p <- p + ylab("Term occurrences (n)") + xlab("") p <- p + ggtitle("Total hits") p1 <- p p <- ggplot(dfs, aes(x = names, y = hits.per.edition)) p <- p + geom_bar(stat = "identity") p <- p + coord_flip() v <- 10^(1:max(na.omit(round(log10(max(dfs$hits.per.edition)))))) p <- p + scale_y_log10(breaks = v, labels = v) p <- p + ylab("Hits per edition (n)") + xlab("") p <- p + ggtitle("Hits/edition") p2 <- p # Select top-N dfs2 <- dfs.total %>% arrange(desc(hits.per.edition)) %>% head(ntop) dfs2$names <- droplevels(factor(dfs2$names, levels = rev(dfs2$names))) p <- ggplot(dfs2, aes(x = names, y = hits.per.edition)) p <- p + geom_bar(stat = "identity") p <- p + coord_flip() v <- 10^(1:max(na.omit(round(log10(max(dfs$hits.per.edition)))))) p <- p + scale_y_log10(breaks = v, labels = v) p <- p + ylab("Hits per edition (n)") + xlab("") p <- p + ggtitle("Hits/edition") p3 <- p grid.arrange(p1, p2, p3, nrow = 1)
df <- df0 theme_set(theme_bw(20)) top.authors <- names(top(df, field = "author", n = 10)) dfs <- df %>% filter(author %in% top.authors) %>% group_by(author, publication_time) %>% tally() %>% arrange(publication_time) v <- seq(min(dfs$publication_time), max(dfs$publication_time), 20) p <- ggplot(dfs, aes(x = publication_time, y = n, fill = author)) + geom_bar(stat = "identity", position = "stack", color = "black") + xlab("Publication time") + ylab("Title count (n)") + scale_fill_grey() + scale_x_continuous(breaks = v, labels = v) + guides(fill = guide_legend("Author", reverse = TRUE)) #ggtitle("Title count timeline for the top authors") print(p)
The top places defined by title count (left); Relation between titlecount and query hits (right). Note that document lengths are not taken into account here.
df <- df0 p <- NULL v <- 10^(1:max(na.omit(round(log10(max(table(df$publication_place))))))) p1 <- top_plot(df, "publication_place", ntop) + ggtitle(paste("Title count")) + ylab("Title count (n)") + scale_y_log10(breaks = v, labels = v) field <- "publication_place" df$names <- df[[field]] df$hits <- unlist(custom.ids[match(df$id, custom.ids$id), "freq"], use.names = F) # Title counts dfs <- df %>% group_by(names) %>% filter(!is.na(names)) %>% summarise(n = n(), hits = sum(hits, na.rm = T)) p2 <- ggplot(dfs, aes(x = n, y = hits, label = names)) + geom_text() + geom_smooth(method = "lm") + scale_x_log10() + scale_y_log10() + xlab("Title count") + ylab("Query hits") + ggtitle("Comparison of title count vs. query hits") grid.arrange(p1, p2, nrow = 1)
Up to r nchar
first characters for each title are shown.
df <- df0 # Limit title length df$title <- substr(as.character(df$title), 1, nchar) p <- NULL v <- 10^(1:max(na.omit(round(log10(max(table(df$title))))))) p1 <- top_plot(df, "title", ntop) + ggtitle(paste("Title count")) + ylab("Title count (n)") + scale_y_log10(breaks = v, labels = v) print(p1)
The title length is limited up to r nchar
first characters:
df <- df0 v <- 10^(1:max(na.omit(round(log10(max(table(df$title))))))) # Use unnormalized frequency for now field <- "title" df$hits <- unlist(custom.ids[match(df$id, custom.ids$id), "freq"], use.names = F) df$names <- df[[field]] dfs.total <- df %>% group_by(names) %>% filter(!is.na(names)) %>% summarise(n = n(), # number of editions count = sum(hits, na.rm = T)) %>% # number of hits mutate(hits.per.edition = count/n) # Limit title length dfs.total$names <- substr(as.character(dfs.total$names), 1, nchar) # Select top-N dfs <- dfs.total %>% arrange(desc(count)) %>% head(ntop) # Retrieve document length from the ID info dfs$document_length <- custom.ids[match(df$id[match(dfs$names, df$title)], custom.ids$id), "length"] dfs$names <- droplevels(factor(dfs$names, levels = rev(dfs$names))) theme_set(theme_bw(15)) p <- ggplot(dfs, aes(x = names, y = count)) p <- p + geom_bar(stat = "identity") p <- p + coord_flip() v <- 10^(1:max(na.omit(round(log10(max(dfs$count)))))) p <- p + scale_y_log10(breaks = v, labels = v) #p <- p + scale_y_log10(breaks = v, labels = v) p <- p + ylab("Term occurrences (n)") + xlab("") p <- p + ggtitle("Total hits") p1 <- p p <- ggplot(dfs, aes(x = names, y = hits.per.edition)) p <- p + geom_bar(stat = "identity") p <- p + coord_flip() v <- 10^(1:max(na.omit(round(log10(max(dfs$hits.per.edition)))))) p <- p + scale_y_log10(breaks = v, labels = v) p <- p + ylab("Hits per edition (n)") + xlab("") p <- p + ggtitle("Hits/edition") p2 <- p # Select top-N dfs.total <- dfs.total %>% arrange(desc(hits.per.edition)) %>% head(ntop) dfs.total$names <- droplevels(factor(dfs.total$names, levels = rev(dfs.total$names))) p <- ggplot(dfs.total, aes(x = names, y = hits.per.edition)) p <- p + geom_bar(stat = "identity") p <- p + coord_flip() v <- 10^(1:max(na.omit(round(log10(max(dfs$hits.per.edition)))))) p <- p + scale_y_log10(breaks = v, labels = v) p <- p + ylab("Hits per edition (n)") + xlab("") p <- p + ggtitle("Hits/edition") p3 <- p grid.arrange(p1, p2, p3, nrow = 1)
The top publishers can be defined by title count (left) or by the total hits across all documents (right).
df <- df0 field <- "publisher" p <- NULL p1 <- top_plot(df, field, ntop, max.char = nchar) + ggtitle("Title count") + ylab("Title count (n)")
# Use unnormalized frequency for now df$hits <- unlist(custom.ids[match(df$id, custom.ids$id), "freq"], use.names = F) df$names <- df[[field]]
dfs <- df %>% group_by(names) %>% filter(!is.na(names)) %>% summarise(count = sum(hits, na.rm = T)) %>% arrange(desc(count)) %>% head(ntop)
# Limit title length dfs$names <- substr(as.character(dfs$names), 1, nchar)
dfs$names <- droplevels(factor(dfs$names, levels = rev(unique(dfs$names)))) theme_set(theme_bw(15)) p <- ggplot(dfs, aes(x = names, y = count)) p <- p + geom_bar(stat = "identity") p <- p + coord_flip() p <- p + ylab("Term occurrences (n)") + xlab("") p <- p + ggtitle("Query hits") p2 <- p grid.arrange(p1, p2, nrow = 1)
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.