knitr::opts_chunk$set(
  collapse = TRUE,
  comment = "#>"
)
# library(arcticreport) # use the installed package, or load the development version:
devtools::load_all()
library(EML)
library(dataone)
library(readr)
library(purrr)
library(DT)
library(dplyr)
library(tidyr)
library(jsonlite)
library(rt)
library(lubridate)
library(stringr)
library(ggplot2)
Sys.setenv("RT_BASE_URL"="https://support.nceas.ucsb.edu/rt/")
rt_login()

quarters_file <- "../inst/extdata/quarters.csv"
quarters <- read_csv(quarters_file, progress = FALSE)
quarters_period <- "y8q4" # CHANGE TO CURRENT QUARTER
quarter_start_date <- as.Date(quarters[quarters$period == quarters_period, ]$from)
quarter_end_date <- as.Date(quarters[quarters$period == quarters_period, ]$to)

Get a list of objects from the Metacat SOLR index to use for calculating metrics.

# make sure you set a token!
# Cache tolerance is the maximum acceptable age (in days) of cached results.
# If the cached results are older than the tolerance, a new dataset is cached
# and the old cache is deleted. To force a cache refresh, set the tolerance to zero.
objs <- query_objects(cache_tolerance = 1000000)

Next we get a list of the datasets that are stored only on the ADC production filesystem and not yet in Metacat. The function query_filesys_objects() runs a system bash command and assumes a particular directory structure is used for storing the data files. It also assumes the files are on a cephfs filesystem on datateam, which allows fast counting and size estimates for millions of files but makes the approach non-portable, though convenient for now.

# Get list of filesystem data package sizes and counts
# Runs a system bash command, so assumes running on `datateam`
datasets_add <- query_filesys_objects()

# Adds rows of data to objs table
objs <- bind_rows(objs, datasets_add)

This updates the list of tickets and the individual files of annual ticket information (stored in inst/extdata/). Previous years' ticket files are stored there, and the current year's file needs to be updated every quarter.

update_ticket_list()
df <- update_annual_tix(2023) # CHANGE the argument to the year you want to update

Now, calculate all of the actual metrics!

quarters$new_datasets <- map2_chr(quarters$from, quarters$to, .f = count_new_datasets, objects = objs)
quarters$new_changed_datasets <- map2_chr(quarters$from, quarters$to, .f = count_new_and_changed_datasets, objects = objs)
quarters$new_objects <- map2_chr(quarters$from, quarters$to, .f = count_data_objects, objects = objs)
quarters$volume <- map2_chr(quarters$from, quarters$to, .f = count_volume, objects = objs)
quarters$unique_creators <- map2_chr(quarters$from, quarters$to, .f = count_creators, objects = objs)
quarters$downloads <- map2_chr(quarters$from, quarters$to, .f = count_downloads)
quarters$citations <- map2_chr(quarters$from, quarters$to, .f = count_citations)
quarters$support_interactions <- map2_chr(quarters$from, quarters$to, .f = count_support_interactions)
datatable(quarters)

Unique Accessors

A special metric: this one is obtained from the Metacat PostgreSQL database. For now, update the dates in the query below and send it to someone with access to the production database to run. They will drop the resulting file on the server for you to read from.

COPY (
    SELECT * FROM access_log WHERE 
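    -- replace the dates below with the current quarter's start and end dates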
    date_logged > '2023-08-01 00:00' AND 
    date_logged < '2023-10-31 23:59' AND 
    lower(event) = 'read' 
   ORDER BY date_logged ASC
) 
TO '/tmp/access_log.csv' WITH CSV HEADER;
count_unique_accessors("~/arcticreport/access_log.csv", quarter_start_date, quarter_end_date)

Total Website Visitors

Ask Robyn or the lead MetacatUI developer to give you access to Google Analytics if you don't already have it. Navigate to analytics.google.com. On the home page you will see a "Home" section showing total users for the last 7 days. Find the "last 7 days" drop-down in the bottom left corner of the box with the time series figure, select "custom", and enter the date range. The "users" number is the total website visitors.
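If you would rather script this lookup, a minimal sketch using the googleAnalyticsR package (not loaded above) is shown below; it assumes you have API access to the ADC property and know its GA4 property ID, which appears here only as a placeholder.

# Sketch only: requires API access to the ADC Google Analytics property.
# The property ID below is a placeholder -- substitute the real GA4 property ID.
library(googleAnalyticsR)
ga_auth()
ga_data(propertyId = 123456789,
        metrics = "totalUsers",
        date_range = c(as.character(quarter_start_date), as.character(quarter_end_date)))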

Plots

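# The shaded rectangle in each plot highlights the current quarter; the hard-coded
# ymin/ymax values are manual estimates of each plot's y-range and will likely need
# adjusting as the cumulative totals grow.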
plot_cumulative_metric(objs, type = "metadata", metric = "count")  +
    annotate("rect",
             xmin = quarter_start_date,
             xmax = quarter_end_date,
             ymin = 2500,
             ymax = 7200,
             fill = "gray",
             alpha = 0.4)+
    xlim(c(as.Date("2016-03-01"), quarter_end_date))

ggsave("~/datasets.png", height = 4, width = 5)
plot_cumulative_metric(objs, type = "data", metric = "count")  +
    annotate("rect",
             xmin = quarter_start_date,
             xmax = quarter_end_date,
             ymin = 450000,
             ymax = 50000000,
             fill = "gray",
             alpha = 0.4)+
    xlim(c(as.Date("2016-03-01"), quarter_end_date))

ggsave("~/objs.png", height = 4, width = 5)
plot_cumulative_metric(objs, type = "data", metric = "size") +
    annotate("rect",
             xmin = quarter_start_date,
             xmax = quarter_end_date,
             ymin = 0,
             ymax = 110,
             fill = "gray",
             alpha = 0.4)+
    xlim(c(as.Date("2016-03-01"), quarter_end_date)) 

ggsave("~/size.png", height = 4, width = 5)
plot_cumulative_volume(objs) +
    annotate("rect",
             xmin = quarter_start_date,
             xmax = quarter_end_date,
             ymin = 0,
             ymax = 150,
             fill = "gray",
             alpha = 0.4)

ggsave("~/size-model.png", height = 4, width = 5)

NSF Programs

Another special metric, this one takes a while to run.

# get the latest version (helps us more accurately read the NSF award numbers)
get_latest_version <- function(mn, pid){
    ids <- arcticdatautils::get_all_versions(mn, pid)
    return(ids[length(ids)])
}
mn <- getMNode(CNode("PROD"), "urn:node:ARCTIC")

# filter down the list of metadata docs during the time period
m_q <- objs %>% 
    filter(formatType == "METADATA") %>% 
    filter(!grepl("*.dataone.org/portals|*.dataone.org/collections", formatId)) %>%
    filter(is.na(obsoletes)) %>%
    filter(dateUploaded >= quarter_start_date & dateUploaded <= quarter_end_date)

# get the most recent version (early versions might not have a valid funding number)
for (i in 1:nrow(m_q)){
    m_q$latest[i] <- get_latest_version(mn, m_q$id[i])
}
# alternatively: m_q$latest <- unlist(lapply(m_q$id, get_latest_version, mn = mn))
# extract award numbers from the latest version of each EML document
for (i in seq_along(m_q$latest)){
    doc <- read_eml(getObject(mn, m_q$latest[i]))
    if (!is.null(doc$dataset$project)){
        # TODO: only look for NSF awards first
        m_q$funding[i] <- paste(arcticdatautils::eml_get_simple(doc, "awardNumber"), collapse = ";")
    } else {
        m_q$funding[i] <- NA
    }
}
# clean up awards
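# NSF award numbers are 7 digits; split multi-award strings into one award per row
# and keep only 7-digit values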
funding <- m_q %>% 
    select(id, dateUploaded, funding) %>% 
    separate(funding, paste("funding", 1:5, sep="_"), sep=";", extra="drop") %>% 
    pivot_longer(cols = starts_with("funding"), names_to = "h", values_to = "funding") %>% 
    select(-h) %>% 
    filter(!is.na(funding) & funding != "") %>% 
    filter(nchar(funding) == 7)
# extract program names
for (i in 1:nrow(funding)){
    url <- paste0("https://api.nsf.gov/services/v1/awards.json?id=", funding$funding[i], "&printFields=fundProgramName")
    t <- fromJSON(url)
    if (!is.null(t$response$award$fundProgramName)){
        funding$programName[i] <- t$response$award$fundProgramName
    } else {
        funding$programName[i] <- "unknown"
    }
}
q <- funding %>% 
    group_by(programName) %>% 
    summarise(n = n())

DT::datatable(q, rownames = F)

Disciplines

Another special metric.

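# Look up the semantic annotations indexed for each latest metadata version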
res <- list()
for (i in 1:nrow(m_q)){
    q <- dataone::query(mn,
                        list(q = paste0('id:"', m_q$latest[i], '"'),
                             fl = 'id,sem_annotation',
                             sort = 'dateUploaded+desc',
                             rows = 1000),
                        as = "data.frame")

    if (nrow(q) > 0){
        q <- q %>%
            rename(latest = id)
    } else {
        q <- data.frame(id = m_q$id[i], sem_annotation = NA)
    }

    res[[i]] <- left_join(q, m_q[i, ])
}

res <- do.call(bind_rows, res) 

adc_disc <- read.csv("https://raw.githubusercontent.com/NCEAS/adc-disciplines/main/adc-disciplines.csv") %>% 
    mutate(an_uri = paste0("https://purl.dataone.org/odo/ADCAD_", stringr::str_pad(id, 5, "left", pad = "0")))

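# Map each ADC discipline (ADCAD) annotation URI to its human-readable discipline name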
res$category <- map(res$sem_annotation, function(x){
    t <- grep("ADCAD", x, value = TRUE)
    cats <- c()
    for (i in seq_along(t)){
        z <- which(adc_disc$an_uri == t[i])
        cats[i] <- adc_disc$discipline[z]
    }
    return(cats)
})

res_summ <- res %>% 
    unnest_wider(category, names_sep = "") %>% 
    select(-sem_annotation) %>% 
    pivot_longer(cols = starts_with("category"), names_to = "cat", values_to = "disc") %>% 
    filter(!is.na(disc)) %>% 
    group_by(disc) %>% 
    summarise(n = n())


res1 <- res_summ %>% 
    arrange(disc)

RT Plot

This plot only goes on the one-pager summary.

# generate plot
tickets_result <- rt_ticket_search("Queue='arcticdata'",
                         orderby = "+Created",
                         format = "l",
                         fields = "id,Created,Resolved,LastUpdated,Status")
tickets <- tickets_result # Copy so we don't have to re-run query when debugging
tickets$Status <- ordered(tickets$Status, c("rejected", "new", "open", "stalled", "resolved"))

# Make all datetime fields actual datetimes
parse_rt_datetime_pst <- function(x) {
  lubridate::parse_date_time(x, 
                  orders = c("a b d H:M:S Y", # RT default
                             "Y-m-d H:M:S"),       # My customized form
                  tz = "America/Los_Angeles")
}

tickets <- tickets %>% 
  mutate(Created = parse_rt_datetime_pst(Created),
         Resolved = parse_rt_datetime_pst(Resolved),
         LastUpdated = parse_rt_datetime_pst(LastUpdated)) %>% 
  mutate(id = str_replace(id, "ticket/", "")) %>% 
  mutate(DaysOpen = round(as.numeric(now() - Created, units = "days")),
         DaysSinceLastUpdated = round(as.numeric(now() - LastUpdated, units = "days")))

# Add in friendlier datetime fields mirroring the normal ones
nice_format <- "%Y/%m/%d %H:%M"

tickets <- tickets %>% 
  mutate(Created_nice = format(Created, nice_format),
         Resolved_nice = format(Resolved, nice_format),
         LastUpdated_nice = format(LastUpdated, nice_format))


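# Build a long table of Created/Resolved events and compute cumulative counts over time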
tot <- tickets %>% 
  select(id, Created, Resolved) %>% 
  gather(status, datetime, -id, na.rm = TRUE)

tot <- tot %>%
  group_by(status) %>% 
  arrange(datetime) %>% 
  mutate(count = 1, ccount = cumsum(count)) %>% 
  mutate(date = date(datetime))


ggplot(tot, aes(datetime, ccount, color = status)) + 
  geom_step() +
  labs(title = "Cumulative Tickets Created & Resolved Over Time", x = "Date", y = "Number of Tickets") +
  annotate("rect",
           xmin = as.POSIXct(quarter_start_date),
           xmax = as.POSIXct(quarter_end_date),
           ymin = 0,
           ymax = max(tot$ccount),
           fill = "gray",
           alpha = 0.4) +
  xlim(c(ymd_hms("2016-03-01 00:00:00"), as.POSIXct(quarter_end_date))) +
  theme_bw() +
  theme(legend.position = "bottom")

ggsave("~/tix_q2.png", height = 4, width = 5)

