# Alias the preprocessed data as `df`; the code below refers to it by this name.
df <- df.preprocessed

Open analytical ecosystems for digital humanities

Open science principles

Library catalogues: the data

# Sum paper consumption and count titles for each publication year.
df2 <- df %>%
  group_by(publication_year) %>%
  summarize(paper = sum(paper, na.rm = TRUE), n = n())

# Plot yearly paper consumption via regression_plot, then add title and labels.
p <- regression_plot(paper ~ publication_year, df2) +
  ggtitle("Total annual paper consumption") +
  xlab("Year") +
  ylab("Paper consumption")


Publishing “history” in Britain and North America 1470-1800

Research questions

  1. Who wrote history?
  2. Where was it published?
  3. How does the publishing of history change over the early modern period?

ESTC raw data

Hierarchical information, only some fields relevant for our study


Workflow, step by step


Load the data and tools

Load the data and tools in R:

#kable(t(df.orig[22495, ]))

Polishing page counts

Raw page counts

# Pick six random rows of the original data and keep the distinct raw
# `physical_extent` entries found there, as character strings.
picked_rows <- sample(nrow(df.orig), 6)
rawpages <- as.character(unique(df.orig[picked_rows, "physical_extent"]))

Polish page counts


Document dimension field

#kable(as.character(sample(unique(df.orig$physical_dimension), 6)))

Polish document dimensions

Pick dimension information

#kable(polish_dimensions("10 cm (12⁰)"))

Fill missing dimensions

Estimate missing dimensions

#kable(polish_dimensions("10 cm (12⁰)", fill = TRUE))

Publication place

Many versions of London:

# Publication place entries of the original data as a character vector.
x <- as.character(df.orig[, "publication_place"])

# Hand every entry containing the string "London" to top_plot (ntop = 20).
london_variants <- grep("London", x, value = TRUE)
top_plot(london_variants, ntop = 20)

In total `r length(unique(x[grep("London", x)]))` unique places with the string London - tidying up and synonym lists!

Ambiguous authors

# Author names associated with more than two distinct birth-year values
# (unique() also counts NA as a value).
birth_years <- split(df$author_birth, df$author_name)
a <- which(vapply(birth_years, function(x) length(unique(x)), integer(1)) > 2)

# Life-span columns for those authors, one row per distinct combination.
dfa <- df[, c("author_name", "author_birth", "author_death")]
# NOTE(review): the argument of the negated test was lost in the source
# (`! & (...)`); is.na(author_name) is assumed here - confirm.
dfa <- filter(dfa, !is.na(author_name) & (author_name %in% names(a)))
dfa <- dfa[!duplicated(dfa), ]
#dfa <- dfa[match(names(a), dfa$author_name),]
dfa <- arrange(dfa, author_birth)
# Order authors by birth year
#dfa$author_name <- factor(dfa$author_name, levels = dfa$author_name)
# Random factor label per row, used only to colour the segments;
# seq_len() keeps this valid when dfa has no rows.
dfa$index <- sample(factor(seq_len(nrow(dfa))))

# One horizontal segment per row, from birth year to death year.
p <- ggplot(dfa) +
  geom_segment(
    aes(y = author_name, yend = author_name,
        x = author_birth, xend = author_death, color = index),
    size = 2
  ) +
  theme(axis.text.y = element_text(size = 9)) +
  xlab("Author life span (year)") +
  ylab("") +
  # "none" replaces the deprecated `FALSE` for hiding a legend.
  guides(color = "none")

Author gender

Enriching data by external information

# Sample 20 distinct author names, extract the first names with
# polish_author, and show the gender that get_gender reports for them.
sampled_names <- sample(unique(df$author_name), 20)
first_names <- polish_author(sampled_names)$names$first
as.matrix(get_gender(first_names)$gender)



Who wrote history?

Who wrote history?

Top-10 authors (number of titles)

# Top-author plot based on the `author.unique` field.
# NOTE(review): the slide heading says "Top-10" but 20 is passed here
# (presumably the number of top entries, cf. `ntop = 20` elsewhere) - confirm.
top_plot(df, "author.unique", 20)

Who wrote history?

Top-10 female authors (number of titles)

# Keep only the rows whose author gender is "female", then plot the top
# authors among them (20 is passed to top_plot).
df2 <- filter(df, author_gender == "female")
top_plot(df2, "author.unique", 20)

Who wrote history?

Title count vs. paper consumption

# Paper consumption and title count for each author.
# NOTE(review): this block was garbled in the source. The argument of the
# negated filter was lost (is.na(author.unique) is assumed, matching the
# grouping variable), and the pipeline ended in a dangling `%>%`; a final
# step (possibly a filter on `docs`) may be missing - confirm.
df2 <- df %>%
  filter(!is.na(author.unique)) %>%
  group_by(author.unique) %>%
  summarize(paper = sum(paper, na.rm = TRUE),
            docs = n())

Document count vs. paper for top authors

# Scatter of title count against paper consumption, drawing each point as
# the author's name.
ggplot(df2, aes(x = docs, y = paper)) +
  geom_text(aes(label = author.unique), size = 4)

Who wrote history?

Gender distribution for authors over time. Note that the name-gender mappings change over time. This has not been taken into account yet.

# Overall share of each author gender value.
tab <- table(df$author_gender)
round(tab / sum(tab), 3)

# Male/female author counts and percentages per decade, keeping only decades
# with more than 25 documents.
# NOTE(review): the name of the count column was stripped from the source
# (`= n()`, `100*n.male/`, `filter( > 25)`); `n.total` is assumed.
dfd <- df %>%
  group_by(publication_decade) %>%
  summarize(
    n.male = sum(author_gender == "male", na.rm = TRUE),
    n.female = sum(author_gender == "female", na.rm = TRUE),
    n.total = n()
  ) %>%
  mutate(
    p.male = 100 * n.male / n.total,
    p.female = 100 * n.female / n.total
  ) %>%
  filter(n.total > 25)

# Same summary per publication year.
dfy <- df %>%
  group_by(publication_year) %>%
  summarize(
    n.male = sum(author_gender == "male", na.rm = TRUE),
    n.female = sum(author_gender == "female", na.rm = TRUE),
    n.total = n()
  ) %>%
  mutate(
    p.male = 100 * n.male / n.total,
    p.female = 100 * n.female / n.total
  ) %>%
  filter(n.total > 25)

# Female author percentage per decade.
p <- regression_plot(p.female ~ publication_decade, dfd,
                     main = "Female authors proportion") +
  ylab("Female authors (%)")

Who wrote history?

Other questions to explore

# Alternative subsets to explore; each assignment overwrites `df2`, so only
# the last one feeds the plot below.
# NOTE(review): the column name of the first filter was lost in the source;
# `publication_place` is assumed - confirm.
df2 <- df %>% filter(publication_place == "London")
df2 <- df %>% filter(language == "French")
df2 <- df %>% filter(publication_year >= 1700 & publication_year < 1800)
top_plot(df2, "author.unique", 10)

2. Where was history published ?

Top-10 places (number of titles)

# Top-places plot based on the `publication_place` field; 10 is passed as the
# third argument (presumably the number of top entries - confirm).
top_plot(df, "publication_place", 10)

Where was history published ?

# Paper consumption and title count per decade for documents published in
# France or Germany.
df2 <- df %>%
  filter(publication_country %in% c("France", "Germany")) %>%
  group_by(publication_decade, publication_country) %>%
  summarize(paper = sum(paper, na.rm = TRUE), docs = n())

# Title counts per decade, coloured by country, with a smoothed trend.
p <- ggplot(
  df2,
  aes(x = publication_decade, y = docs, color = publication_country)
)
p <- p + geom_point()
p <- p + geom_smooth()

Where was history published ?

Title count vs. paper

# Paper consumption and title count for each publication place.
# NOTE(review): this block was garbled in the source. The argument of the
# negated filter was lost (is.na(publication_place) is assumed, matching the
# grouping variable) and the line(s) between the summary and `aes(` were
# missing; a plain `ggplot(df2, ...)` call assigned to `p` is assumed, and an
# intermediate filter may have been dropped - confirm.
df2 <- df %>%
  filter(!is.na(publication_place)) %>%
  group_by(publication_place) %>%
  summarize(paper = sum(paper, na.rm = TRUE),
            docs = n())

# NOTE(review): values are log10-transformed inside aes() and again by the
# log10 scales; kept exactly as written in the source - confirm intended.
p <- ggplot(df2, aes(x = log10(1 + docs), y = log10(1 + paper))) +
  geom_text(aes(label = publication_place), size = 3) +
  scale_x_log10() + scale_y_log10()

Where was history published ?

Scotland, Ireland, US comparison:

# Paper consumption and title count per country, restricted to Scotland,
# Ireland and the USA, ordered by decreasing title count.
# NOTE(review): the argument of the negated filter was lost in the source;
# is.na(publication_country) is assumed, matching the grouping variable.
df2 <- df %>%
  filter(!is.na(publication_country)) %>%
  group_by(publication_country) %>%
  summarize(paper = sum(paper, na.rm = TRUE),
            docs = n()) %>%
  arrange(desc(docs)) %>%
  filter(publication_country %in% c("Scotland", "Ireland", "USA"))

# Side-by-side bar charts: title count (left) and paper consumption (right).
p1 <- ggplot(df2, aes(x = publication_country, y = docs)) +
  geom_bar(stat = "identity") +
  ggtitle("Title count")
p2 <- ggplot(df2, aes(x = publication_country, y = paper)) +
  geom_bar(stat = "identity") +
  ggtitle("Paper consumption")
grid.arrange(p1, p2, nrow = 1)

Where was history published ?

#p1 <- ggplot(subset(melt(df2), variable == "paper"), aes(y = value, x = publication_country)) + geom_bar(stat = "identity") + ylab("Paper consumption")
#p2 <- ggplot(subset(melt(df2), variable == "docs"), aes(y = value, x = publication_country)) + geom_bar(stat = "identity") + ylab("Title count")
#grid.arrange(p1, p2, nrow = 1)

3. How does the publishing of history change over the early modern period?

What can we say about the nature of the documents? Pamphlets (<32 pages) vs. Books (>120 pages) ? Book size statistics and development over time

# Classify each document by page count: more than 32 pages is a "book",
# 32 or fewer a "pamphlet"; a missing page count stays NA.
# NOTE(review): the slide text mentions books as >120 pages, while the
# cut-off applied here is 32 for both classes - confirm.
df$document.type <- factor(ifelse(df$pagecount > 32, "book", "pamphlet"))

# Yearly paper consumption and title count per document type.
df2 <- df %>%
  group_by(publication_year, document.type) %>%
  summarize(paper = sum(paper, na.rm = TRUE), n = n())

# Paper consumption over time, one loess trend per document type.
# The title previously read "per document document.type" (duplicated word).
p <- ggplot(df2, aes(x = publication_year, y = paper,
                     group = document.type, color = document.type)) +
  geom_point() +
  geom_smooth(method = "loess") +
  ggtitle("Paper consumption per document type") +
  xlab("Year") +
  ylab("Paper consumption")

Nature of the documents


# Yearly totals: paper consumption and number of titles.
df2 <- df %>%
  group_by(publication_year) %>%
  summarize(paper = sum(paper, na.rm = TRUE), n = n())

# Left panel: title count per year.
p <- regression_plot(n ~ publication_year, df2) +
  ggtitle("Title count") +
  xlab("Year") +
  ylab("Documents (n)")
p1 <- p

# Right panel: paper consumption per year.
p <- regression_plot(paper ~ publication_year, df2) +
  ggtitle("Paper consumption") +
  xlab("Year") +
  ylab("Paper consumption")
p2 <- p

grid.arrange(p1, p2, nrow = 1)

Nature of the documents

Estimated paper consumption by document size

# Yearly paper consumption and title count per gatherings class.
df2 <- df %>%
  group_by(publication_year, gatherings) %>%
  summarize(paper = sum(paper, na.rm = TRUE), n = n())

# Keep the gatherings classes that occur in at least 50 rows of the summary.
gathering_counts <- table(df2$gatherings)
frequent_gatherings <- names(which(gathering_counts >= 50))
df2 <- filter(df2, gatherings %in% frequent_gatherings)

# Paper consumption over time with a loess trend per gatherings class.
p <- ggplot(df2, aes(y = paper, x = publication_year,
                     group = gatherings, color = gatherings)) +
  geom_point() +
  geom_smooth(method = "loess", size = 1) +
  ggtitle("Annual paper consumption by gatherings") +
  xlab("Year") +
  ylab("Paper consumption")

Nature of the documents

Document sizes over time

# Left panel: average paper consumption per document for each year.
df2 <- df %>%
  group_by(publication_year) %>%
  summarize(paper = sum(paper, na.rm = TRUE), n = n())
p <- ggplot(df2, aes(x = publication_year, y = paper / n)) +
  geom_point() +
  geom_line() +
  ggtitle("Average paper consumption per document") +
  xlab("Year") +
  ylab("Average paper consumption per document")
p1 <- p

# Right panel: mean original height (and width) per gatherings class and
# decade.
# NOTE(review): the three is.na() arguments of this filter were lost in the
# source (`! & (! | !`). The columns below are assumed from the grouping and
# summary variables - confirm.
df2 <- df %>%
  filter(!is.na(gatherings.original) &
           (!is.na(height.original) | !is.na(width.original))) %>%
  group_by(gatherings.original, publication_decade) %>%
  summarize(mean.height = mean(height.original, na.rm = TRUE),
            mean.width = mean(width.original, na.rm = TRUE),
            n = n())

# geom_line() has no `method` parameter; the former `method = "loess"` was
# ignored with a warning, so it is dropped without changing the plot.
p <- ggplot(df2, aes(x = publication_decade, y = mean.height,
                     group = gatherings.original,
                     color = gatherings.original)) +
  geom_point(aes(size = n)) +
  geom_line() +
  ggtitle("Height")
p2 <- p

grid.arrange(p1, p2, nrow = 1)

Serious statistical analysis (also in the Humanities)

Open science in (digital?) humanities


These slides are automatically generated as well


Barriers to open science in the humanities


Thomason tracts 1640-1660

# Histogram of publication years in 5-year bins.
p <- ggplot(df, aes(x = publication_year)) +
  geom_histogram(binwidth = 5) +
  ggtitle("Publication year")

Gatherings and page counts

# Document counts for each gatherings / area combination.
# NOTE(review): both is.na() arguments of this filter were lost in the source
# (`filter(! & !`); `area` and `gatherings` are assumed because they are the
# two columns tabulated below - confirm.
dfs <- df[, c("width", "height", "gatherings", "area")] %>%
  filter(!is.na(area) & !is.na(gatherings))
dfs <- dfs[, c("gatherings", "area")]
dfm <- melt(table(dfs))
names(dfm) <- c("gatherings", "area", "documents")
# Restore the level order of the original gatherings factor.
dfm$gatherings <- factor(dfm$gatherings, levels = levels(df$gatherings))

# Point size encodes the number of documents (log10 size scale); the area
# axis is on a log2 scale and the axes are flipped.
p <- ggplot(dfm, aes(x = gatherings, y = area)) +
  scale_y_continuous(trans = "log2") +
  geom_point(aes(size = documents)) +
  scale_size(trans = "log10") +
  ggtitle("Document size distribution: gatherings vs. area") +
  xlab("Size (gatherings)") +
  ylab("Size (area)") +
  coord_flip()

Page counts

Page count: distribution for documents with different sizes.


  # Single-volume docs
  # NOTE(review): the source was truncated after `volcount == 1 &`; the lost
  # second condition could not be recovered and is omitted here - confirm.
  dff <- filter(df, volcount == 1)

  # Number of documents for each gatherings / page count combination.
  # NOTE(review): the final pipeline step was missing in the source; tally()
  # is assumed because the plot below maps point size to `n`.
  dff2 <- dff %>%
    group_by(gatherings, pagecount) %>%
    tally()

  # Mean and median page count per gatherings class.
  dff3 <- dff %>%
    group_by(gatherings) %>%
    summarize(mean = mean(pagecount, na.rm = TRUE),
              median = median(pagecount, na.rm = TRUE))

  # Page count distribution per gatherings class (log10 x axis), with the
  # class mean in red and the class median in blue.
  p <- ggplot(dff2, aes(y = gatherings, x = pagecount)) +
    geom_point(aes(size = n)) +
    geom_point(data = dff3, aes(y = gatherings, x = mean),
               col = "red", size = 3) +
    geom_point(data = dff3, aes(y = gatherings, x = median),
               col = "blue", size = 3) +
    scale_x_log10(breaks = c(1, 10, 100, 1000)) +
    xlab("Total page count (blue: median; red: mean)") +
    ylab("Document size") +
    ggtitle(paste0("Pages: single-volume documents (n=", nrow(dff), ")"))
  p1 <- p

  # Multi-volume docs
  # NOTE(review): the source was truncated after `(volcount > 1 |`; the lost
  # alternative condition could not be recovered and is omitted - confirm.
  dff <- filter(df, volcount > 1)
       #(items == 1 & !
       #pagecount > 10

  # Same summaries as above, for the multi-volume subset (tally() assumed).
  dff2 <- dff %>%
    group_by(gatherings, pagecount) %>%
    tally()

  dff3 <- dff %>%
    group_by(gatherings) %>%
    summarize(mean = mean(pagecount, na.rm = TRUE),
              median = median(pagecount, na.rm = TRUE))

  p <- ggplot(dff2, aes(y = gatherings, x = pagecount)) +
    geom_point(aes(size = n)) +
    geom_point(data = dff3, aes(y = gatherings, x = mean),
               col = "red", size = 3) +
    geom_point(data = dff3, aes(y = gatherings, x = median),
               col = "blue", size = 3) +
    scale_x_log10(breaks = c(1, 10, 100, 1000)) +
    xlab("Total page count (blue: median; red: mean)") +
    ylab("Document size") +
    ggtitle(paste0("Pages: multi-volume documents (n=", nrow(dff), ")"))
  p2 <- p

grid.arrange(p1, p2, nrow = 1)

How does the publishing of history change over the early modern period?

# Yearly paper consumption and title count per document type.
df2 <- df %>%
  group_by(publication_year, document.type) %>%
  summarize(paper = sum(paper, na.rm = TRUE), n = n())

# Title count over time with a loess trend per document type.
p <- ggplot(df2, aes(x = publication_year, y = n,
                     group = document.type, color = document.type)) +
  geom_point() +
  geom_smooth(method = "loess") +
  ggtitle("Documents per document type") +
  xlab("Year") +
  ylab("Documents (n)")

Nature of the documents

Estimated title count by document size

# Yearly paper consumption and title count per gatherings class.
df2 <- df %>%
  group_by(publication_year, gatherings) %>%
  summarize(paper = sum(paper, na.rm = TRUE), n = n())

# Keep the gatherings classes that occur in at least 50 rows of the summary.
rows_per_class <- table(df2$gatherings)
kept_classes <- names(which(rows_per_class >= 50))
df2 <- filter(df2, gatherings %in% kept_classes)

# Title count over time with a loess trend per gatherings class.
p <- ggplot(df2, aes(y = n, x = publication_year,
                     group = gatherings, color = gatherings)) +
  geom_point() +
  geom_smooth(method = "loess", size = 1) +
  ggtitle("Annual document count by size") +
  xlab("Year") +
  ylab("Documents (n)")

Nature of the documents

Top authors

# The (up to) ten authors with the most titles. head() replaces `[1:10]`,
# which pads the result with NA when fewer than ten authors exist; those NA
# entries would then make `%in%` below match rows with a missing author.
top.authors <- names(head(rev(sort(table(df$author.unique))), 10))

# Yearly paper consumption and title count for each of those authors.
df2 <- df %>%
  filter(author.unique %in% top.authors) %>%
  group_by(publication_year, author.unique) %>%
  summarize(paper = sum(paper, na.rm = TRUE), n = n())

# Paper consumption over time, one line per author.
p <- ggplot(df2, aes(x = publication_year, y = paper,
                     group = author.unique, color = author.unique)) +
  geom_point() +
  geom_line() +
  #geom_smooth(method = "loess", size = 1) +
  ggtitle("Paper consumption per author") +
  xlab("Year") +
  ylab("Paper consumption")

Nature of the documents

Top authors title count

# Title count over time, one line per author, from the current `df2`.
p <- ggplot(df2, aes(x = publication_year, y = n,
                     group = author.unique, color = author.unique)) +
  geom_point() +
  geom_line() +
  ggtitle("Title count per author") +
  xlab("Year") +
  ylab("Documents (n)")


How does the publishing of history change over the early modern period?

How does the publishing of history change over the early modern period?

Top-4 places (title count), mean page count over time.

# The (up to) four publication places with the most titles; head() avoids
# the NA padding that `[1:4]` produces when fewer than four places exist.
df2 <- df %>%
  group_by(publication_place) %>%
  tally() %>%
  arrange(desc(n))
top.places <- head(df2$publication_place, 4)

# Paper consumption, title count and mean page count per decade for those
# places.
# NOTE(review): in the source this pipeline ended in a dangling `%>%` directly
# before `p <- ...`, which does not parse; the pipe is dropped here. A final
# step may have been lost - confirm.
df2 <- df %>%
  filter(publication_place %in% top.places) %>%
  group_by(publication_decade, publication_place) %>%
  summarize(paper = sum(paper, na.rm = TRUE),
            n = n(),
            mean.pagecount = mean(pagecount, na.rm = TRUE))

# Mean page count per decade, coloured by place, with a smoothed trend.
p <- ggplot(df2, aes(x = publication_decade, y = mean.pagecount,
                     color = publication_place)) +
  geom_point() +
  geom_smooth()

