knitr::opts_chunk$set( collapse = TRUE, comment = "#>" )
#library(arcticreport) devtools::load_all() library(EML) library(dataone) library(readr) library(purrr) library(DT) library(dplyr) library(tidyr) library(jsonlite) library(rt) library(lubridate) library(stringr)
Sys.setenv("RT_BASE_URL"="https://support.nceas.ucsb.edu/rt/") rt_login() quarters_file <- "../inst/extdata/quarters.csv" quarters <- read_csv(quarters_file, progress = FALSE) quarters_period <- "y8q4" # CHANGE TO CURRENT QUARTER quarter_start_date <- as.Date(quarters[quarters$period == quarters_period, ]$from) quarter_end_date <- as.Date(quarters[quarters$period == quarters_period, ]$to)
Get a list of objects from the Metacat SOLR index to use for calculating metrics.
# make sure you set a token! # Cache tolerance is maximum number of days tolerable for age of cached results. If the cached results are older than the cache tolerance, a new dataset will be cached and the old cache deleted. To force a cache refresh, set tolerance to zero. objs <- query_objects(cache_tolerance = 1000000)
Next we get a list of the datasets that are stored only on the ADC production filesystem, and not yet in Metacat. The function query_filesys_objects()
runs a system bash command, and assumes a particular directory structure is used for storing the data files. It also assumes files are on a cephfs
filesystem on datateam
, allowing for fast counting and size estimates for millions of files, but making the whole approach non-portable but convenient for now.
# Get list of filesystem data package sizes and counts # Runs a system bash command, so assumes running on `datateam` datasets_add <- query_filesys_objects() # Adds rows of data to objs table objs <- bind_rows(objs, datasets_add)
This updates the list of tickets, and individual files with annual ticket information (stored in inst/extdata/). Previous years ticket files are stored there, and the current year's needs to be updated every quarter.
update_ticket_list() df <- update_annual_tix(2023) # CHANGE argument to year to update
Now, calculate all of the actual metrics!
quarters$new_datasets <- map2_chr(quarters$from, quarters$to, .f = count_new_datasets, objects = objs) quarters$new_changed_datasets <- map2_chr(quarters$from, quarters$to, .f = count_new_and_changed_datasets, objects = objs) quarters$new_objects <- map2_chr(quarters$from, quarters$to, .f = count_data_objects, objects = objs) quarters$volume <- map2_chr(quarters$from, quarters$to, .f = count_volume, objects = objs) quarters$unique_creators <- map2_chr(quarters$from, quarters$to, .f = count_creators, objects = objs) quarters$downloads <- map2_chr(quarters$from, quarters$to, .f = count_downloads) quarters$citations <- map2_chr(quarters$from, quarters$to, .f = count_citations) quarters$support_interactions <- map2_chr(quarters$from, quarters$to, .f = count_support_interactions)
datatable(quarters)
A special metric, this one is obtained from the metacat postgres. For now, update the dates in the query below and sent it to someone with access to the production db to run. They will drop the file on the server for you to read from.
COPY ( SELECT * FROM access_log WHERE date_logged > '2023-08-01 00:00' AND date_logged < '2023-10-31 23:59' AND lower(event) = 'read' ORDER BY date_logged ASC ) TO '/tmp/access_log.csv' WITH CSV HEADER;
count_unique_accessors("~/arcticreport/access_log.csv", quarter_start_date, quarter_end_date)
Ask Robyn or the lead MetacatUI dev to give you access to Google Analytics if you don't already have it. Navigate to analytics.google.com. On the home page you will see a "Home" section that has total users for the last 7 days. Find the last 7 days drop down in the bottom left corner of the box with the timeseries figure on it, and select "custom" then enter the date range. The "users" number is the total website visitors.
plot_cumulative_metric(objs, type = "metadata", metric = "count") + annotate("rect", xmin = quarter_start_date, xmax = quarter_end_date, ymin = 2500, ymax = 7200, fill = "gray", alpha = 0.4)+ xlim(c(as.Date("2016-03-01"), quarter_end_date)) ggsave("~/datasets.png", height = 4, width = 5)
plot_cumulative_metric(objs, type = "data", metric = "count") + annotate("rect", xmin = quarter_start_date, xmax = quarter_end_date, ymin = 450000, ymax = 50000000, fill = "gray", alpha = 0.4)+ xlim(c(as.Date("2016-03-01"), quarter_end_date)) ggsave("~/objs.png", height = 4, width = 5)
plot_cumulative_metric(objs, type = "data", metric = "size") + annotate("rect", xmin = quarter_start_date, xmax = quarter_end_date, ymin = 0, ymax = 110, fill = "gray", alpha = 0.4)+ xlim(c(as.Date("2016-03-01"), quarter_end_date)) ggsave("~/size.png", height = 4, width = 5)
plot_cumulative_volume(objs) + annotate("rect", xmin = quarter_start_date, xmax = quarter_end_date, ymin = 0, ymax = 150, fill = "gray", alpha = 0.4) ggsave("~/size-model.png", height = 4, width = 5)
Another special metric, this one takes a while to run.
# get the latest version (helps us more accurately read the NSF award numbers) get_latest_version <- function(mn, pid){ ids <- get_all_versions(mn, pid) return(ids[length(ids)]) }
mn <- getMNode(CNode("PROD"), "urn:node:ARCTIC") # filter down the list of metadata docs during the time period m_q <- objs %>% filter(formatType == "METADATA") %>% filter(!grepl("*.dataone.org/portals|*.dataone.org/collections", formatId)) %>% filter(is.na(obsoletes)) %>% filter(dateUploaded >= quarter_start_date & dateUploaded <= quarter_end_date) # get the most recent version (early versions might not have a valid funding number) for (i in 1:nrow(m_q)){ m_q$latest[i] <- get_latest_version(mn, m_q$id[i]) } #m_q$latest <- lapply(m_q$id, get_latest_version, mn = mn) m_q$latest <- unlist(m_q$latest) # extract award numbers res <- c() for (i in seq_along(m_q$latest)){ doc <- read_eml(getObject(mn, m_q$latest[i])) if (!is.null(doc$dataset$project)){ # TODO: only look for NSF awards first m_q$funding[i] <- paste(arcticdatautils::eml_get_simple(doc, "awardNumber"), collapse = ";") } else { m_q$funding[i] <- NA } } # clean up awards funding <- m_q %>% select(id, dateUploaded, funding) %>% separate(funding, paste("funding", 1:5, sep="_"), sep=";", extra="drop") %>% pivot_longer(cols = starts_with("funding"), names_to = "h", values_to = "funding") %>% select(-h) %>% filter(!is.na(funding) & funding != "") %>% filter(nchar(funding) == 7) # extract program names for (i in 1:nrow(funding)){ url <- paste0("https://api.nsf.gov/services/v1/awards.json?id=",funding$funding[i],"&printFields=fundProgramName") t <- fromJSON(url) if (!is.null(t$response$award$fundProgramName)){ funding$programName[i] <- t$response$award$fundProgramName } else {funding$programName[i] <- "unknown"} }
q <- funding %>% group_by(programName) %>% summarise(n = n()) DT::datatable(q, rownames = F)
Another special metric.
res <- list() for (i in 1:nrow(m_q)){ q <- dataone::query(mn, list(q = paste0('id:"', m_q$latest[i], '"'), fl = 'id,sem_annotation', sort = 'dateUploaded+desc', rows = 1000), as = "data.frame") if (nrow(q) > 0){ q <- q %>% rename(latest = id) } else { q <- data.frame(id = m_q$id[i], sem_annotation = NA) } res[[i]] <- left_join(q, m_q[i, ]) } res <- do.call(bind_rows, res) adc_disc <- read.csv("https://raw.githubusercontent.com/NCEAS/adc-disciplines/main/adc-disciplines.csv") %>% mutate(an_uri = paste0("https://purl.dataone.org/odo/ADCAD_", stringr::str_pad(id, 5, "left", pad = "0"))) res$category <- map(res$sem_annotation, function(x){ t <- grep("*ADCAD*", x, value = TRUE) cats <- c() for (i in 1:length(t)){ z <- which(adc_disc$an_uri == t[i]) cats[i] <- adc_disc$discipline[z] } return(cats) }) res_summ <- res %>% unnest_wider(category, names_sep = "") %>% select(-sem_annotation) %>% pivot_longer(cols = starts_with("category"), names_to = "cat", values_to = "disc") %>% filter(!is.na(disc)) %>% group_by(disc) %>% summarise(n = n()) res1 <- res_summ %>% arrange(disc)
This plot only goes on the one pager summary.
# generate plot tickets_result <- rt_ticket_search("Queue='arcticdata'", orderby = "+Created", format = "l", fields = "id,Created,Resolved,LastUpdated,Status") tickets <- tickets_result # Copy so we don't have to re-run query when debugging tickets$Status <- ordered(tickets$Status, c("rejected", "new", "open", "stalled", "resolved")) # Make all datetime fields actual datetimes parse_rt_datetime_pst <- function(x) { lubridate::parse_date_time(x, orders = c("a b d H:M:S Y", # RT default "Y-m-d H:M:S"), # My customized form tz = "America/Los_Angeles") } tickets <- tickets %>% mutate(Created = parse_rt_datetime_pst(Created), Resolved = parse_rt_datetime_pst(Resolved), LastUpdated = parse_rt_datetime_pst(LastUpdated)) %>% mutate(id = str_replace(id, "ticket/", "")) %>% mutate(DaysOpen = round(as.numeric(now() - Created, units = "days")), DaysSinceLastUpdated = round(as.numeric(now() - LastUpdated, units = "days"))) # Add in friendlier datetime fields mirroring the normal ones nice_format <- "%Y/%m/%d %H:%M" tickets <- tickets %>% mutate(Created_nice = format(Created, nice_format), Resolved_nice = format(Resolved, nice_format), LastUpdated_nice = format(LastUpdated, nice_format)) tot <- tickets %>% select(id, Created, Resolved) %>% gather(status, datetime, -id, na.rm = TRUE) names(tot) <- c("id", "status", "datetime") tot <- tot %>% group_by(status) %>% arrange(datetime) %>% mutate(count = 1, ccount = cumsum(count)) %>% mutate(date = date(datetime)) ggplot(tot, aes(datetime, ccount, color = status)) + geom_step() + labs(title = "Cumulative Tickets Created & Resolved Over Time", x = "Date", y = "Number of Tickets") + annotate("rect", xmin = as.POSIXct(quarter_start_date), xmax = as.POSIXct(quarter_end_date), ymin = 0, ymax = max(tot$ccount), fill = "gray", alpha = 0.4) + xlim(c(ymd_hms("2016-03-01 00:00:00"), as.POSIXct(quarter_end_date))) + theme_bw() + theme(legend.position = "bottom") ggsave("~/tix_q2.png", height = 4, width = 5)
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.