# Plot defaults: black-and-white ggplot2 theme with base font size 20
theme_set(theme_bw(base_size = 20))

# Store knitr figures under the "figure/" subdirectory of output.folder
# (output.folder is defined outside this section and is expected to end
# with a path separator, since it is concatenated as-is).
# opts_chunk$set(comment = NA, fig.width = 6, fig.height = 6)
opts_chunk$set(fig.path = paste0(output.folder, "figure/"))

# NOTE(review): presumably the number of top entries to show in summaries;
# not referenced in this section -- confirm against the parent document.
ntop <- 20

Document size comparisons

These tables can be used to verify the accuracy of the conversions from the raw data to final estimates:

The estimated dimensions are based on the following auxiliary information sheets:

Left: final gatherings vs. final document dimensions (surface area, i.e. width x height). Right: original gatherings vs. original heights where both are available. The point size indicates the number of documents in each case. The red dots indicate the estimated height that is used when only gathering information is available.

# Gatherings vs. surface area: one point per observed (gatherings, area)
# combination, sized by the number of documents sharing that combination.
df <- df.preprocessed
dfs <- df %>% filter(!is.na(area) & !is.na(gatherings))
dfs <- dfs[, c("gatherings", "area")]

# Count only the combinations that actually occur. A full cross-tabulation
# (melt(table(dfs))) would also emit every zero-count pair, which becomes
# log10(0) = -Inf under the log10 size scale below, and would pass the
# numeric area values through the table's character dimnames.
dfm <- dfs %>%
  group_by(gatherings, area) %>%
  tally() %>%
  ungroup()
names(dfm) <- c("gatherings", "area", "documents")
dfm$gatherings <- factor(dfm$gatherings, levels = levels(df$gatherings))

p <- ggplot(dfm, aes(x = gatherings, y = area))
# Keep every gatherings level on the axis, including those without any
# documents, as the full cross-tabulation did.
p <- p + scale_x_discrete(drop = FALSE)
p <- p + scale_y_continuous(trans = "log2")
p <- p + geom_point(aes(size = documents))
p <- p + scale_size(trans = "log10")
p <- p + ggtitle("Gatherings vs. area")
p <- p + xlab("Size (gatherings)")
p <- p + ylab("Size (area)")
# Flip so that the gatherings classes are listed along the vertical axis
p <- p + coord_flip()
print(p)

# Compare given dimensions to gatherings
# (not so much data with width so skip that)
df2 <- filter(df, !is.na(height) | !is.na(width))
df2 <- df2[!is.na(as.character(df2$gatherings)), ]
df3 <- filter(df2, !is.na(height))

# Look up the height listed for each gatherings class in the sheet size table
ss <- sheet_sizes()
df3$gathering.height.estimate <- ss[match(df3$gatherings, ss$gatherings), "height"]

# Number of documents per observed (gatherings, height) combination
df4 <- df3 %>%
  group_by(gatherings, height) %>%
  tally() %>%
  ungroup()

p <- ggplot(df4, aes(y = gatherings, x = height))
p <- p + geom_point(aes(size = n))
# Red reference points: one per distinct (gatherings, estimate) pair.
# Deduplicating on these two columns only avoids redrawing an identical
# point for every document row, which unique() over all columns would do.
p <- p + geom_point(
  data = unique(df3[, c("gatherings", "gathering.height.estimate")]),
  aes(y = gatherings, x = gathering.height.estimate),
  color = "red"
)
p <- p + ylab("Gatherings (original)") + xlab("Height (original)")
p <- p + ggtitle("Gatherings vs. height")
print(p)

Left: document dimension histogram (surface area). Right: title count per gatherings class.

# Histogram of document surface area, drawn on a log10-scaled x axis
p <- ggplot(df, aes(x = area)) +
  geom_histogram() +
  xlab("Document surface area (log10)") +
  ggtitle("Document dimension (surface area)") +
  scale_x_log10()
print(p)

# Number of titles per gatherings class, with counts on a log10 axis.
# The digit count of the largest class size sets the highest power-of-ten
# break, so the breaks run 1, 10, ..., 10^n.
n <- nchar(max(na.omit(table(df$gatherings))))
p <- ggplot(df, aes(x = gatherings)) +
  geom_bar() +
  scale_y_log10(breaks = 10^(0:n)) +
  ggtitle("Title count") +
  xlab("Size (gatherings)") +
  ylab("Title count") +
  coord_flip()
print(p)


rOpenGov/bibliographica documentation built on April 10, 2022, 8:51 p.m.