# Disabled alternative set of global chunk options, kept for reference
#opts_chunk$set(comment=NA, fig.width=6, fig.height=6)
# Write the figures generated by this document under figure/ with the
# file name prefix "dimension-"
opts_chunk$set(fig.path = "figure/dimension-")

Document size comparisons

These tables can be used to verify the accuracy of the conversions from the raw data to final estimates:

The estimated dimensions are based on the following auxiliary information sheets:

Left: final gatherings vs. final document dimension (width x height). Right: original gatherings vs. original heights, where both are available. The point size indicates the number of documents for each case. The red dots indicate the estimated height that is used when only gatherings information is available.

# Gatherings vs. area: one point per observed (gatherings, area) pair,
# sized by the number of documents that share that pair.
df <- df.preprocessed
dfs <- df %>% filter(!is.na(area) & !is.na(gatherings))
dfs <- dfs[, c("gatherings", "area")]
# Count only the combinations that actually occur. A full cross-tabulation
# (table + melt) also produces zero-count cells, which become -Inf under the
# log10 size scale below and are dropped with warnings. dplyr:: is spelled
# out so that a count() from another attached package cannot mask it.
dfm <- dplyr::count(dfs, gatherings, area, name = "documents")
# Keep the level order of the source data on the gatherings axis
dfm$gatherings <- factor(dfm$gatherings, levels = levels(df$gatherings))
p <- ggplot(dfm, aes(x = gatherings, y = area))
p <- p + scale_y_continuous(trans = "log2")
p <- p + geom_point(aes(size = documents))
p <- p + scale_size(trans = "log10")
p <- p + ggtitle("Gatherings vs. area")
p <- p + xlab("Size (gatherings)")
p <- p + ylab("Size (area)")
# Flip so that gatherings run along the vertical axis
p <- p + coord_flip()
print(p)

# Compare given dimensions to gatherings
# (not so much data with width so skip that)
df2 <- filter(df, !is.na(height) | !is.na(width))
df2 <- df2[!is.na(as.character(df2$gatherings)),]
df3 <- filter(df2, !is.na(height))
ss <- sheet_sizes()
# Look up the height listed for each document's gatherings in the sheet size
# table. Indexing the column vector (rather than ss[rows, "height"]) always
# yields a plain vector, also when ss is a tibble.
df3$gathering.height.estimate <- ss$height[match(df3$gatherings, ss$gatherings)]
# Number of documents for each observed (gatherings, height) pair
df4 <- df3 %>% group_by(gatherings, height) %>% tally()
# One estimate marker per gatherings value. De-duplicating on these two
# columns only avoids drawing the same red point once per distinct document row.
height.estimates <- unique(df3[, c("gatherings", "gathering.height.estimate")])
p <- ggplot(df4, aes(y = gatherings, x = height))
p <- p + geom_point(aes(size = n))
p <- p + geom_point(data = height.estimates,
                    aes(y = gatherings, x = gathering.height.estimate),
                    color = "red")
p <- p + ylab("Gatherings (original)") + xlab("Height (original)")
p <- p + ggtitle("Gatherings vs. height")
print(p)

# Gatherings vs. paper: one point per observed (gatherings, paper) pair,
# sized by the number of documents that share that pair.
dfs <- df %>% filter(!is.na(paper) & !is.na(gatherings))
dfs <- dfs[, c("gatherings", "paper")]
# Count only the combinations that actually occur. A full cross-tabulation
# (table + melt) also produces zero-count cells, which become -Inf under the
# log10 size scale below and are dropped with warnings. dplyr:: is spelled
# out so that a count() from another attached package cannot mask it.
dfm <- dplyr::count(dfs, gatherings, paper, name = "documents")
# Keep the level order of the source data on the gatherings axis
dfm$gatherings <- factor(dfm$gatherings, levels = levels(df$gatherings))
p <- ggplot(dfm, aes(x = gatherings, y = paper))
p <- p + scale_y_continuous(trans = "log2")
p <- p + geom_point(aes(size = documents))
p <- p + scale_size(trans = "log10")
p <- p + ggtitle("Gatherings vs. paper")
p <- p + xlab("Size (gatherings)")
p <- p + ylab("Paper (sheets)")
# Flip so that gatherings run along the vertical axis
p <- p + coord_flip()
print(p)

Left: Document dimension histogram (surface area); Middle: Paper consumption histogram; Right: title count per gatherings.

# Histogram of the area column, drawn on a log10 x-axis
p <- ggplot(df, aes(x = area)) +
  geom_histogram() +
  scale_x_log10() +
  xlab("Document surface area (log10)") +
  ggtitle("Document dimension (surface area)")
print(p)

# Histogram of the paper column, drawn on a log10 x-axis
p <- ggplot(df, aes(x = paper)) +
  geom_histogram() +
  scale_x_log10() +
  xlab("Paper (log10 sheets)") +
  ggtitle("Paper consumption histogram")
print(p)

# Bar chart of the number of titles per gatherings value, with the counts
# on a log10 axis.
p <- ggplot(df, aes(x = gatherings))
p <- p + geom_bar()
# Number of decimal digits in the largest count; used to place one axis break
# per power of ten. Computed arithmetically because nchar() on a number
# depends on how R formats it as a string (e.g. 100000L becomes "1e+05").
# The floor of 1 keeps the result finite when there are no counts at all.
max.count <- max(table(df$gatherings), 1)
n <- floor(log10(max.count)) + 1
p <- p + scale_y_log10(breaks = 10^(0:n))
p <- p + ggtitle("Title count")
p <- p + xlab("Size (gatherings)")
p <- p + ylab("Title count")
# Flip so that gatherings run along the vertical axis
p <- p + coord_flip()
print(p)

Gatherings timelines

# Threshold handed to timeline() as its nmin argument
nmin <- 15

Popularity of different document sizes over time. Left: absolute title counts. Right: relative title counts. Gatherings with fewer than `r nmin` documents in every decade are excluded:

# Keep only the rows whose gatherings value is not the string "NA"
dfs <- filter(df, gatherings != "NA")

# Draw the gatherings timeline twice: first with absolute title counts,
# then with percentages
for (timeline.mode in c("absolute", "percentage")) {
  res <- timeline(dfs, group = "gatherings", nmin = nmin, mode = timeline.mode)
  print(res$plot)
}

Title count versus paper consumption

# Per publication year: number of rows (n) and summed paper (p), with
# missing paper values ignored
d <- summarise(
  group_by(df.preprocessed, publication_year),
  n = n(),
  p = sum(paper, na.rm = TRUE)
)

# Each year is drawn as its own text label; both axes are on a log10 scale
p <- ggplot(d, aes(x = p, y = n)) +
  # geom_point(aes(color = publication_year), size = 3) +
  geom_text(aes(label = publication_year), size = 3) +
  scale_x_log10() +
  scale_y_log10() +
  xlab("Paper consumption") +
  ylab("Title count") +
  ggtitle("Paper vs. Title count by year")
print(p)


rOpenGov/bibliographica documentation built on April 10, 2022, 8:51 p.m.