#' This script tracks downloads of CenSoc Datasets from Harvard Dataverse.
#'
#' All downloaders of CenSoc data from Harvard Dataverse must sign a "guestbook"
#' where name, email, institution, and position are collected.
#' The guestbook can only be downloaded by admins of the dataverse.
#' To access: log in to HDV, go to the Berkeley CenSoc page, and select
#' 'Dataverse Guestbooks' from the dropdown edit menu in the upper right.
#'
#' NOTE: From mid-July to mid-October 2023, users were not required to provide information
#' when downloading the CenSoc-DMF and CenSoc-Numident. This was so that reviewers of our
#' article in Scientific Data (https://www.nature.com/articles/s41597-023-02713-y) could
#' remain anonymous while the paper was under review.
#' Downloads of the CenSoc-Numident and CenSoc-DMF during this time do appear to be tracked
#' in the guestbook, but no identifying information of the downloaders was recorded.
library(tidyverse)
library(data.table)
library(ggsci)
library(ggthemes)
# Read guestbook (note: read permissions limited -- the guestbook export can
# only be downloaded by admins of the dataverse).
gb <- fread("~/censoc/hdv_guestbooks/UC_Berkeley_CenSoc__GuestbookReponses_apr_29_2024.csv")
head(gb)

# Format date: split the "/"-separated `Date` column into three pieces, named
# month, day and year in that order, then convert each piece to integer so the
# year can be compared numerically below.
gb[, c("mo", "day", "yr") := tstrsplit(Date, "/", fixed = TRUE)]
gb <- gb %>%
  mutate(across(all_of(c("mo", "day", "yr")), as.integer))

# Remove downloads prior to 2023 (earlier were internal tests).
# Rows whose year could not be parsed (NA) are dropped by this filter as well.
gb <- gb %>%
  filter(yr >= 2023)
# Categorize downloads: first into groups....
# Inspect the raw dataset titles before mapping them to groups.
table(gb$Dataset)

# Rules are evaluated top to bottom and the first matching rule wins; titles
# matching none of the patterns are left as NA.
gb <- mutate(
  gb,
  dataset_group = case_when(
    str_detect(Dataset, "BUNMD") ~ "BUNMD",
    str_detect(Dataset, "Numident") ~ "CenSoc-Numident",
    str_detect(Dataset, "Demo") ~ "Demo File",
    str_detect(Dataset, "DMF") ~ "CenSoc-DMF",
    str_detect(Dataset, "Enlistment") ~ "Enlistment",
    TRUE ~ NA_character_
  )
)
# ... Then scrape individual datasets for exact stats
# Inspect the raw file names before mapping them to datasets.
table(gb$`File Name`)

# Label each row with the specific dataset its file name points to.
# Rules are evaluated top to bottom and the first matching rule wins, so the
# plain Numident/DMF rules explicitly exclude demo and enlistment files, which
# have their own labels further down. Unmatched file names are left as NA.
# `local()` keeps the helper objects out of the global environment.
gb <- local({
  file_name <- gb[["File Name"]]
  has <- function(fragment) str_detect(file_name, pattern = fragment)

  mutate(
    gb,
    dataset_downloaded = case_when(
      has("bunmd") & !has("supplement") & !has("names") ~ "BUNMD",
      has("numident") & !has("demo") & !has("enlistment") &
        !has("supplement") ~ "CenSoc-Numident",
      has("numident_demo") ~ "CenSoc-Numident Demo",
      has("dmf") & !has("demo") & !has("enlistment") ~ "CenSoc-DMF",
      has("dmf_demo") ~ "CenSoc-DMF Demo",
      has("censoc_wwii_army_enlistment") ~ "CenSoc WWII Enlistment",
      has("censoc_enlistment_census_1940") ~ "CenSoc-Census WWII Enlistment",
      has("enlistment_numident") ~ "CenSoc-Numident WWII Enlistment",
      has("enlistment_dmf") ~ "CenSoc-DMF WWII Enlistment",
      has("numident_enlistment") ~ "CenSoc-Numident WWII Enlistment",
      has("dmf_enlistment") ~ "CenSoc-DMF WWII Enlistment",
      has("supplement") ~ "Supplementary Files",
      has("names") ~ "Supplementary Files",
      TRUE ~ NA_character_
    )
  )
})

# Row counts per labelled dataset, largest first.
gb %>%
  group_by(dataset_downloaded) %>%
  tally(sort = TRUE)

# make sure this looks okay
gb %>%
  group_by(Dataset, dataset_group, dataset_downloaded) %>%
  tally()
# Flag codebook vs data downloads in case we care about that.
# Anything whose file name does not contain "codebook" (including a missing
# file name) counts as a dataset.
gb <- gb %>%
  mutate(
    data_or_codebook = if_else(
      str_detect(`File Name`, "codebook"),
      "codebook",
      "dataset",
      missing = "dataset"
    )
  )

# which datasets & codebooks are most downloaded?
gb %>%
  group_by(dataset_downloaded) %>%
  tally(sort = TRUE)
# Demo files, BUNMD, Enlistment files

# "Type" can be download, explore, read doc, or a few other options
table(gb$Type)

# For download metrics, we will only care about actual downloads of datasets themselves
gb_metrics <- gb %>%
  filter(Type == "Download", data_or_codebook == "dataset")
# Let's look at monthly download numbers.
# Build a real Date column from the integer year / month / day parts so that
# downloads can be sorted chronologically, and derive `year_mo`, a "YYYY-MM"
# character label used to bin downloads by month (such labels also sort
# chronologically when compared as strings).
# The explicit `format` makes rows with missing or invalid parts come out as
# NA instead of raising an error.
gb_metrics <- gb_metrics %>%
  mutate(
    date_obj = as.Date(sprintf("%d-%d-%d", yr, mo, day), format = "%Y-%m-%d"),
    year_mo = format(date_obj, "%Y-%m")
  )
# Bar plot of downloads by month since June 2023.
# `year_mo` is a "YYYY-MM" string, so the string comparison below keeps
# June 2023 and every later month.
gb_metrics %>%
  filter(year_mo >= "2023-06") %>%
  count(year_mo) %>%
  ggplot(aes(x = year_mo, y = n)) +
  geom_col() +
  theme_classic() +
  labs(x = "Month", y = "Downloads") +
  scale_y_continuous(labels = scales::comma)
# Monthly downloads categorized by dataset group (stacked bars; geom_bar()
# counts rows per month and stacks the groups by default).
gb_metrics %>%
  filter(year_mo >= "2023-06") %>%
  ggplot(aes(x = year_mo, fill = dataset_group)) +
  geom_bar() +
  theme_classic() +
  ggsci::scale_fill_locuszoom() +
  labs(title = "Dataset Downloads June 1 2023 - April 29, 2024") # change end date if needed
# Note: spike in demo downloads in 10/2023 are likely from a teaching demonstration
# Note: No user info for Numident & DMF downloads from about July - October 2023
# Let's plot downloads by dataset.
# Datasets are turned into a factor ordered by their download count so the
# bars appear in order of size on the flipped (horizontal) axis.
gb_metrics %>%
  count(dataset_downloaded, sort = TRUE) %>%
  mutate(dataset_downloaded = reorder(dataset_downloaded, n)) %>%
  ggplot(aes(x = dataset_downloaded, y = n)) +
  geom_col() +
  coord_flip() +
  theme_classic()
# Add the following code to your website.
# For more information on customizing the embed code, read Embedding Snippets.