# Compare page count info between ESTC and ECCO.
#
# Reads `df.preprocessed` from the calling environment and "df.raw.Rds" from
# the working directory, and uses read_ecco() (defined elsewhere) to load the
# ECCO dump. Produces:
#   - ~/tmp/estc_ecco_augmented.csv: tab-separated comparison table for the
#     records whose page count was estimated
#   - ~/tmp/estc_ecco_pagecount_mismatches.csv and comptab.Rds: comparison
#     table for the non-estimated records (only rebuilt when skip is FALSE)
#   - total.match, dfs, cc: summary objects used by the text and figures below

df <- df.preprocessed
# df.orig <- readRDS("data/unified/polished/df.raw.Rds")
df.orig <- readRDS("df.raw.Rds")

# Read ECCO dump unless it is already loaded.
# exists() takes the object *name* as a string; passing the bare symbol
# evaluates it first, which fails when ecco is undefined and is an invalid
# argument when it is defined.
if (!exists("ecco")) {
  ecco <- read_ecco(version = 2)
}

# Polish doc ID
df$id <- df$system_control_number
df$id <- gsub("\\(CU-RivES\\)", "", df$id)

# Add ECCO page counts to our data (first ECCO row matching each ID)
ecco.row <- match(df$id, as.character(ecco$id))
df$pagecount.ecco <- ecco[ecco.row, "totalPages"]
df$ecco.id <- ecco[ecco.row, "id"]

# Add info on whether the page count was estimated or not:
# a page count is present now but was missing in pagecount.orig
df$pagecount.estimated <- !is.na(df$pagecount) & is.na(df$pagecount.orig)

# -------------------------------------------------------------------

# Augmented pagecounts

o <- subset(df, pagecount.estimated)$original_row
df.row <- match(o, df$original_row)
comptab <- cbind(
  estc.id = as.character(df[df.row, "system_control_number"]),
  ecco.id = as.character(df[df.row, "ecco.id"]),
  gatherings = as.character(df[df.row, "gatherings"]),
  physical_extent = df.orig[match(o, df.orig$original_row), "physical_extent"],
  estc = df[df.row, "pagecount"],
  ecco = df[df.row, "pagecount.ecco"],
  estc.volcount = df[df.row, "volcount"],
  estc.volnumber = df[df.row, "volnumber"],
  ecco.hits = rep(NA, length(o)),
  year = df[df.row, "publication_year"],
  author = df[df.row, "author"],
  title = as.character(df[df.row, "title"])
)

# Exclude cases with no ECCO ID.
# drop = FALSE keeps the table two-dimensional when a single row remains.
comptab[which(comptab[, "ecco.id"] == "NA"), "ecco.id"] <- NA
comptab <- comptab[!is.na(comptab[, "ecco.id"]), , drop = FALSE]

# Sum up ECCO hits
# seq_len() makes the loop a no-op for an empty table (1:0 would run twice)
for (i in seq_len(nrow(comptab))) {

  myid <- comptab[i, "ecco.id"]
  dfs <- subset(df, id == myid)
  ecco.set <- subset(ecco, id == myid)

  # `if` needs a single TRUE/FALSE. Several df rows can share an ID, in which
  # case the first match decides (a length > 1 condition is an error from
  # R 4.2 onwards).
  one.volume.only <- is.na(dfs$volcount[1]) && !is.na(dfs$volnumber[1])

  if (one.volume.only) {
    # Pick the ecco pages just for this particular volume
    # (volcount NA; volnumber given),
    # assuming that the ECCO hits are in same order as the real volumes
    # (this can be assumed to be a good approximation anyway)
    comptab[i, "ecco"] <- sum(ecco.set$totalPages[dfs$volnumber], na.rm = TRUE)
  } else {
    comptab[i, "ecco"] <- sum(ecco.set$totalPages, na.rm = TRUE)
  }
  comptab[i, "ecco.hits"] <- nrow(ecco.set)
}

write.table(comptab,
  file = "~/tmp/estc_ecco_augmented.csv",
  quote = FALSE, row.names = FALSE, sep = "\t"
)

# -------------------------------------------------------------------

# The differing pagecounts in ESTC/ECCO
df2 <- subset(df, !pagecount.estimated)

# This takes long time to run, so only do once
skip <- TRUE
# TODO: should be streamlined, this whole Rmd doc
if (!skip) {

  comptab <- cbind(
    estc.id = as.character(df2[, "system_control_number"]),
    ecco.id = as.character(df2[, "ecco.id"]),
    gatherings = df2[, "gatherings"],
    physical_extent = df.orig[
      match(df2$original_row, df.orig$original_row), "physical_extent"
    ],
    estc = df2[, "pagecount"],
    ecco = df2[, "pagecount.ecco"],
    estc.volcount = df2[, "volcount"],
    estc.volnumber = df2[, "volnumber"],
    ecco.hits = rep(NA, nrow(df2)),
    year = df2[, "publication_year"],
    author = df2[, "author"],
    title = df2[, "title"]
  )

  # Exclude cases with no ECCO ID
  comptab[which(comptab[, "ecco.id"] == "NA"), "ecco.id"] <- NA
  comptab <- comptab[!is.na(comptab[, "ecco.id"]), , drop = FALSE]

  # Sum up all ECCO hits
  for (i in seq_len(nrow(comptab))) {

    myid <- comptab[i, "ecco.id"]
    dfs <- subset(df2, id == myid)
    ecco.set <- subset(ecco, id == myid)

    # Scalar condition; the first matching row decides (see the loop above)
    one.volume.only <- is.na(dfs$volcount[1]) && !is.na(dfs$volnumber[1])

    if (one.volume.only) {
      # Pick the ecco pages just for this particular volume
      # (volcount NA; volnumber given),
      # assuming that the ECCO hits are in same order as the real volumes
      # (this can be assumed to be a good approximation anyway).
      # If ECCO lists multiple volumes then try to pick the indicated volume
      comptab[i, "ecco"] <- sum(ecco.set$totalPages[dfs$volnumber], na.rm = TRUE)
    } else {
      comptab[i, "ecco"] <- sum(ecco.set$totalPages, na.rm = TRUE)
    }
    # Record the hit count in both branches, as the augmented table does
    comptab[i, "ecco.hits"] <- nrow(ecco.set)
  }

  # Keep only the rows where the two page counts differ, largest gap first.
  # cbind() yields a character matrix as soon as one column is character, so
  # the counts are converted back to numbers before comparing/subtracting.
  # which() drops rows whose comparison is NA instead of turning them into
  # all-NA rows.
  estc.pages <- as.numeric(comptab[, "estc"])
  ecco.pages <- as.numeric(comptab[, "ecco"])
  mismatch <- which(estc.pages != ecco.pages)
  comptab <- comptab[mismatch, , drop = FALSE]
  o <- order(abs(estc.pages[mismatch] - ecco.pages[mismatch]), decreasing = TRUE)
  comptab <- comptab[o, , drop = FALSE]

  write.table(comptab,
    file = "~/tmp/estc_ecco_pagecount_mismatches.csv",
    quote = FALSE, row.names = FALSE, sep = "\t"
  )
  saveRDS(comptab, file = "comptab.Rds")

} else {
  # Reuse the table cached by an earlier full run
  comptab <- readRDS("comptab.Rds")
}

#------------------------------------------------------------------------

# Percentage of total ESTC pages vs. ECCO pages
# across the shared docs
coms <- intersect(ecco$id, df$id)
total.match <- sum(subset(df, id %in% coms)$pagecount, na.rm = TRUE) /
  sum(subset(ecco, id %in% coms)$totalPages, na.rm = TRUE)

# Prepare for the figure below: records with both page counts available
dfs <- subset(df, !is.na(pagecount.ecco) & !is.na(pagecount))
cc <- cor(dfs$pagecount, dfs$pagecount.ecco, method = "spearman")
There are `r nrow(ecco)` ECCO documents and `r sum(!ecco$id == "", na.rm = TRUE)` have an ESTC ID. Some of the ECCO entries refer to the same ESTC ID; `r sum(as.character(ecco$id) %in% df$id)` ECCO documents have a direct match in ESTC. The number of unique ECCO documents that have an ESTC ID is `r length(unique(as.character(ecco$id)))` and `r round(100*mean(unique(as.character(ecco$id)) %in% df$id), 1)`% of these have a match in ESTC.
The total page count for the matched ESTC documents is `r round(100 * total.match, 1)`% of the total page count over the same ECCO documents. This quantifies the correspondence between ESTC and ECCO.
The Spearman correlation between the ESTC and ECCO page counts is `r round(cc, 2)`. This also includes the augmented page count information that is based on estimates.
Comparison between the page counts available in ESTC and ECCO helps to quantify the accuracy of our automated page count cleaning and estimation procedure. Some ESTC page counts are missing in the original data and have been augmented based on predefined estimates for single-volume documents, multi-volume documents and issues, calculated from those documents where original page count info is available. In the ESTC/ECCO comparison, where page counts are available for both data sets, `r round(100 * mean(dfs$pagecount.estimated),2)`% of the page counts are based on estimates; the page count for `r sum(df$pagecount.from.ecco, na.rm = TRUE)` ESTC documents has been replaced or augmented from ECCO, improving the correlation.
library(ggplot2)
theme_set(theme_bw(20))

# Scatter plot of ESTC vs. ECCO page counts on log-log axes.
# Points are coloured by whether the ESTC page count was estimated, and the
# diagonal marks where the two sources agree exactly.
p <- ggplot(dfs, aes(x = pagecount, y = pagecount.ecco)) +
  geom_point(aes(color = pagecount.estimated)) +
  geom_abline(intercept = 0, slope = 1) +
  scale_x_log10() +
  scale_y_log10() +
  scale_color_manual(values = c("darkblue", "darkred")) +
  labs(
    x = "Page count ESTC",
    y = "Page count ECCO",
    title = paste0("ECCO/ESTC page count comparison (n = ", nrow(dfs), ")")
  )
print(p)
Comparison without highlighting specific works. Showing the density of the points as well.
library(ggplot2)
theme_set(theme_bw(25))

# Restrict to records whose ESTC page count was not estimated and add the
# ESTC - ECCO difference as a column
dfss <- mutate(
  filter(dfs, !pagecount.estimated),
  diff = pagecount - pagecount.ecco
)

# Spearman correlation over the non-estimated records only
cc <- cor(dfss$pagecount, dfss$pagecount.ecco, method = "spearman")

# Plain log-log scatter with the identity line; the title carries only the
# sample size.
# Alternatives tried earlier and left disabled: geom_bin2d(bins = 50),
# geom_hex(bins = 70), geom_density_2d(), stat_density_2d() as polygon or
# raster, zero-expansion continuous scales, hiding the legend, and the longer
# "ECCO/ESTC page count comparison (n = ...)" title.
p <- ggplot(dfss, aes(x = pagecount, y = pagecount.ecco)) +
  geom_point(size = 1) +
  geom_abline(intercept = 0, slope = 1) +
  scale_x_log10() +
  scale_y_log10() +
  labs(
    x = "Page count (ESTC)",
    y = "Page count (ECCO)",
    title = paste0("n = ", nrow(dfss))
  )
print(p)
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.