R/crawl_slug.R

Defines functions abm_unit_stats abm_division_stats abm_pubs_summary abm_unit_pubs abm_divisions abm_researchers db_upsert_table db_upload_crawl abm_graph_divisions kth_catalog_crawl kth_divisions_crawl abm_slugs_departments kthids_from_slug

Documented in abm_divisions abm_division_stats abm_graph_divisions abm_pubs_summary abm_researchers abm_slugs_departments abm_unit_pubs abm_unit_stats db_upload_crawl db_upsert_table kth_catalog_crawl kth_divisions_crawl kthids_from_slug

#' Given a organizational unit slug, such as "j/jj/jjn", retrieves 
#' associated KTH researcher ids
#' @param slug character slug for an organizational unit
#' @return tibble with kthids, usernames, titles and a timestamp
#' @details this fcn belongs in the kthapi package
#' @examples 
#' \dontrun{
#' if(interactive()){
#'  kthids_from_slug("j/jj/jjn")
#'  kthids_from_slug("j/jh/jhs")
#'  }
#' }
#' @seealso 
#'  \code{\link[kthapi]{kth_catalog}},\code{\link[kthapi]{kth_profile}}
#'  \code{\link[progress]{progress_bar}}
#'  \code{\link[purrr]{safely}},\code{\link[purrr]{map}}
#'  \code{\link[lubridate]{now}}
#' @rdname kthids_from_slug
#' @export 
#' @import dplyr
#' @importFrom kthapi kth_catalog kth_profile
#' @importFrom progress progress_bar
#' @importFrom purrr possibly map_chr
#' @importFrom lubridate now
#' 
kthids_from_slug <- function(slug) {
  
  users <- kthapi::kth_catalog(slug = slug)$users
  
  pb <- progress::progress_bar$new(
    format = "resolving kthids [:bar] :percent eta: :eta",
    total = length(users$username))
  
  kthid_from_username <- function(x) {
    pb$tick()
    kthapi::kth_profile(username = x)$content$kthId
  } 
  
  kfu <- purrr::possibly(kthid_from_username, otherwise = NA_character_)
  
  ids <- purrr::map_chr(users$username, kfu)
  
  users %>% 
    mutate(slug = slug, kthid = ids) %>%
    select(slug, kthid, username, firstName, lastName, title.en) %>%
    #select(slug, kthid, username, title.en) %>%
    mutate(crawl_ts = lubridate::now())
  
}

#' Slugs or identifiers for departments at KTH used in ABM
#' @return character vector of slugs
#' @examples 
#' \dontrun{
#' if(interactive()){
#'  slugs <- abm_slugs_departments()
#'  }
#' }
#' @rdname abm_slugs_departments
#' @export 
abm_slugs_departments <- function() {
  unit_info() %>% 
    filter(org_level == 2) %>% 
    pull(slug)
}

#' Crawl kthapi for divisions
#' 
#' This function crawls the KTH Directory API for divisions, by default those used in ABM
#' @param include character vector of slugs to include, Default: abm_slugs_institutions()
#' @param exclude character vector of slugs to exclude, Default: NULL
#' @param quiet logical to indicate logging, Default: FALSE
#' @return OUTPUT_DESCRIPTION
#' @details DETAILS
#' @examples 
#' \dontrun{
#' if(interactive()){
#'  kth_divisions_crawl()
#'  }
#' }
#' @seealso 
#'  \code{\link[progress]{progress_bar}}
#'  \code{\link[purrr]{safely}},\code{\link[purrr]{map}}
#' @rdname kth_divisions_crawl
#' @export 
#' @importFrom progress progress_bar
#' @importFrom purrr possibly map_df
kth_divisions_crawl <- function(include = abm_slugs_departments(), 
  exclude = NULL, quiet = FALSE) {
  
  if (!missing(exclude) && any(! exclude %in% abm_slugs_departments()))
    stop("Please exclude only valid slugs, use abm_slugs_departments().")
  
  if (!quiet)
    message("Please use this fcn sparingly and cache results, offloading the API.\n")
  
  slugs <- setdiff(include, exclude)

  pb <- progress::progress_bar$new(
    total = length(slugs),
    format = "  processing [:what] [:bar] :percent eta: :eta"
  )
  
  address_from_slug <- function(x) 
    x %>% purrr::map_chr(function(y) as.character(kth_catalog(slug = y)$info$location))
  
  crawl <- function(slug) {
    if (!quiet) pb$tick(tokens = list(what = sprintf("%10s", slug)))
    kcc <- kth_catalog_crawl(slug)
    kcc %>% mutate(address = address_from_slug(id))
  }
  
  crawly <- purrr::possibly(crawl, otherwise = NULL, quiet = FALSE)

  purrr::map_df(slugs, crawly)
  
}

#' Given an organizational "slug", a depth first traversal is made
#' enumerating organizational units (descendants)
#' @param slug a string with the slug, for example "j/jj"
#' @return a tibble encoding descendants as a hierarchy (using parent_id, child_id tuples)
#' @details this fcn should be moved to the kthapi package... please avoid using this fcn too often as it descends recursively and
#' generates several API calls; instead please cache the results and try to avoid hammering the API
#' @examples 
#' \dontrun{
#' if(interactive()){
#'  kth_catalog_crawl("j/jj")
#'  }
#' }
#' @seealso 
#'  \code{\link[kthapi]{kth_catalog}}
#'  \code{\link[purrr]{map}}
#'  \code{\link[dplyr]{bind}}
#' @rdname kth_catalog_crawl
#' @export 
#' @importFrom kthapi kth_catalog
#' @importFrom purrr map_df
#' @importFrom dplyr bind_rows
kth_catalog_crawl <- function(slug) {
  
  if (missing(slug) || length(slug) != 1 || nzchar(slug) < 1)
    stop("Please provide one single valid slug, for example 'j/jj'")

  # immediate children for a given slug
  slug_children <- function(y) {
    lookup <- kthapi::kth_catalog(slug = y)
    children <- lookup$catalogs
    res <- NULL
    if (nrow(children) != 0)
      res <- tibble(
        pid = y,
        id = children$slug, 
        desc_parent = lookup$info$`description.en`,
        location_parent = as.character(lookup$info$location),
        desc = children$`description.en`
        )
    res
  } 
  
  # depth first traversal from a given slug
  dfs <- function(x) {
    descendants <- slug_children(x)
    crawl <- purrr::map_df(descendants$id, dfs)
    dplyr::bind_rows(descendants, crawl)
  }
  
  dfs(slug)
}

#' ABM divisions as force directed network
#' 
#' After crawling organizational data, this is used for displaying an 
#' interactive graph with a force directed network of the units.
#' 
#' @param base_url pattern to prefigate links from nodes with in JS click action
#' @param use_size boolean to indicate whether to size bubbles by publication
#' volume, default is FALSE
#' @param link_encoder function to use for encoding outgoing link identifiers,
#' default is NULL but can be set to for example 
#' \code{function(x) URLencode(x, reserved = TRUE)} to transform outgoing links
#' @param links_excluded a set of node ids for which links will be excluded, default NULL
#' @param link_404 relative url to use for excluded links
#' @param prune_graph boolean to indicate if certain non-research nodes should
#' be removed, default: FALSE
#' @return a force directed network object from NetworkD3
#' @examples 
#' \dontrun{
#' if(interactive()){
#'  abm_graph_divisions()
#'  }
#' }
#' @seealso 
#'  \code{\link[networkD3]{forceNetwork}},\code{\link[networkD3]{JS}}
#' @rdname abm_graph_divisions
#' @export 
#' @importFrom stringr str_split_fixed str_replace str_count
#' @importFrom jsonlite toJSON
#' @importFrom networkD3 forceNetwork JS
abm_graph_divisions <- function(base_url = "dash/", 
  use_size = FALSE, link_encoder = NULL, links_excluded = NULL, link_404 = "",
  prune_graph = FALSE) {
  
  # assemble org tree only with divisions used in ABM
  
  root <- tibble(from = "KTH", to = NA_character_, name = "root")
  
  # consider only these schools
  schools <- 
    abm_slugs_departments() %>% 
    stringr::str_split_fixed(pattern = "/", n = 2) %>% 
    .[,1]
  
  schools_name <- 
    tibble(slug = schools) %>% left_join(unit_info(), by = c("slug")) %>%
    select(slug, unit_short) %>% pull(unit_short)
  
  departments_name <- 
    tibble(slug = abm_slugs_departments()) %>% left_join(unit_info(), by = c("slug")) %>%
    select(slug, unit_long_en) %>% pull(unit_long_en)
  
  # "backfill" the tree with larger org units before adding divisions/subdivisions
  l0 <- tibble(from = schools, to = "KTH", name = schools_name)
  l1 <- tibble(from = abm_slugs_departments(), to = schools, name = departments_name)
  d <- abm_divisions() %>% select(from = id, to = pid, name = desc)
  
  if (prune_graph)
    d <- d %>% filter(str_count(from, "/") <= 2)
  
  tree <- bind_rows(l0, l1, d)
  #eert <- tree %>% select(to, from, name)
  
  # make a graph colored by organizational unit level
  
  tree2 <- 
    tree %>% left_join(bind_rows(
      tibble(id = "KTH", name = "KTH"), 
      tree %>% select(id = from, name = name)), by = c(to = "id")) %>%
    rename(to_name = `name.y`, from_name = `name.x`)
  
  src <- paste(tree2$from, ": ", tree2$from_name)
  target <- paste(tree2$to, ": ", tree2$to_name)
  
  # data prepared for NetworkD3  
  networkData <- data.frame(src, target, stringsAsFactors = FALSE)
  
  nodes <- data.frame(name = unique(c(src, target)), stringsAsFactors = FALSE)
  nodes$id <- 0:(nrow(nodes) - 1)
  
  edges <- networkData %>%
    left_join(nodes, by = c("src" = "name")) %>%
    select(-src) %>%
    rename(source = id) %>%
    left_join(nodes, by = c("target" = "name")) %>%
    select(-target) %>%
    rename(target = id)
  
  edges$width <- 1
  
  # make a grouping variable that will match to colours
  
  nodes$groupid <- 
    stringr::str_replace(nodes$name, "(.*?)\\s+:\\s+(.*?)$", "\\1")
  
  nodes$group <- 
    nodes$groupid %>%
    stringr::str_count("/") + 1
  
  nodes$group[which(nodes$groupid == "KTH")] <- 0
  labels <- c("KTH", "School", "Department", "Division", "Research Group")
  if (prune_graph)
    labels <- labels[1:4]
  
  nodes$fgroup <- ordered(as.character(nodes$group), labels = labels)
  groups <- as.character(sort(unique(nodes$fgroup)))
  
  domain <- jsonlite::toJSON(groups)
  range <- jsonlite::toJSON(palette_kth(length(groups)))
  ColourScale <- sprintf("d3.scaleOrdinal().domain(%s).range(%s);", domain, range)
  
  nodes$size <- 1
  
  divlevel <- abm_divisions() %>% 
    mutate(level = stringr::str_count(id, "/")) %>% 
    mutate(level = max(level) - level) %>%
    select(groupid = id, level) 
  
  if (use_size == TRUE) 
    nodes$size <- (1 + (max(nodes$group) - nodes$group)) ^ 4
    #nodes %>% left_join(abm_division_stats(), by = c(groupid = "id")) %>% 
    #pull(n_pubs) %>% recode(.missing = NA_integer_)
  
  fn <- networkD3::forceNetwork(
    Links = edges, Nodes = nodes, 
    Source = "source",
    Target = "target",
    NodeID ="name",
    Group = "fgroup",
    Value = "width",
    Nodesize = "size",
    opacity = 0.9,
    charge = -30,
    zoom = TRUE, legend = TRUE,
    #  opacityNoHover = TRUE,
    colourScale = networkD3::JS(ColourScale)
  )
  
  # fn$x$nodes$hyperlink <- 
  #   sprintf('https://www.kth.se/directory/%s', nodes$groupid)
  # fn$x$options$clickAction = 'window.open(d.hyperlink)'

  links <- nodes$groupid

  if (!is.null(link_encoder) && is.function(link_encoder))
    links <- purrr::map_chr(nodes$groupid, link_encoder)

  if (!is.null(links_excluded)) {
    links[which(nodes$groupid %in% links_excluded)] <- link_404
  }

  fn$x$nodes$hyperlink <- sprintf('%s%s', base_url, links)

  fn$x$options$clickAction = 'window.open(d.hyperlink)'
  
    
  fn
  
}

#' Crawl and persist organizational data from KTH API 
#' 
#' This function can be used to trigger a crawl for organizational data provided
#' by the KTH APIs. Data will be persisted using the provided database connection
#' @param con connection to database, Default: con_bib()
#' @param crawl logical flag confirming intention to perform crawl, Default: FALSE
#' @return invisible TRUE on success
#' @details The parameter crawl must be set to TRUE to run this function
#' @examples 
#' \dontrun{
#' if(interactive()){
#'  db_upload_crawl()
#'  }
#' }
#' @rdname db_upload_crawl
#' @export 
#' @importFrom purrr walk2
db_upload_crawl <- function(con = con_bib(), crawl = FALSE) {
  
  if (!crawl) {
    message("Please call this function with crawl = TRUE to trigger a full crawl")
    return(invisible(FALSE))
  } else {
    message("Crawling, pls use this crawl sparingly, to avoid live API lookups...")
  }
  
  divisions <- 
    kth_divisions_crawl()
  
  researchers <- 
    divisions$id %>% map_df(kthids_from_slug)
  
  unit_stats <- 
    abm_unit_stats()
  
  on.exit(dbDisconnect(con))
  
  data <- list(
    researchers = researchers,
    divisions = divisions,
    unit_stats = unit_stats
  )
  
  purrr::walk2(names(data), data, function(x, y) db_upsert_table(x, y, con))
  return(invisible(TRUE))
}

#' Create or overwrite table at connection source
#' 
#' Utility that can be used to sync data against a connection.
#' 
#' @param tbl table name
#' @param df data frame or tibble with the data
#' @param con database connection, Default: con_bib()
#' @return invisibly TRUE on success
#' @details not all backend drivers support overwrite and append, therefore
#' param overwrite for dbWriteTable (see docs) is not used
#' @examples 
#' \dontrun{
#' if(interactive()){
#'  db_upsert_table("divisions", abm_divisions(), con = con_bib(type = "mssql"))
#'  }
#' }
#' @seealso 
#'  \code{\link[DBI]{dbExistsTable}},\code{\link[DBI]{dbRemoveTable}},\code{\link[DBI]{dbWriteTable}}
#' @rdname db_upsert_table
#' @export 
#' @importFrom DBI dbExistsTable dbRemoveTable dbWriteTable
db_upsert_table <- function(tbl, df, con = con_bib()) {
  if (DBI::dbExistsTable(con, tbl)) DBI::dbRemoveTable(con, tbl)
  con %>% DBI::dbWriteTable(tbl, df) #, append = TRUE)
}

#' Researchers used in ABM
#' @param unit_slug character identifier for a unit
#' @param con database connection, Default: con_bib()
#' @return tibble with researchers
#' @examples 
#' \dontrun{
#' if(interactive()){
#'  abm_researchers()
#'  }
#' }
#' @rdname abm_researchers
#' @import dplyr
#' @importFrom DBI dbDisconnect
#' @export 
abm_researchers <- function(unit_slug, con) {
  
  if (missing(con)) {
    con <- con_bib()
    on.exit(DBI::dbDisconnect(con))
  }
  
  if (missing(unit_slug))
    unit_slug <- "*"
  
  t1 <- "researchers"
  
  if (t1 %in% (con %>% dbListTables())) {
    con %>% tbl(t1) %>%
      # TODO: shall we allow this (for "upper" levels, "inner nodes"):
      collect %>% filter(stringr::str_starts(slug, unit_slug)) %>% pull(kthid)
      #filter(slug == unit_slug) %>% pull(kthid)
  } else {
    message("Please run db_sync() or use db_upload_crawl()")
  }
  
}

#' Divisions used in ABM
#' @param con database connection, Default: con_bib()
#' @return tibble with divisions
#' @examples 
#' \dontrun{
#' if(interactive()){
#'  abm_divisions()
#'  }
#' }
#' @rdname abm_divisions
#' @importFrom DBI dbListTables
#' @import dplyr
#' @export 
abm_divisions <- function(con =  con_bib()) {
  
  on.exit(dbDisconnect(con))
  t1 <- "divisions"
  
  if (t1 %in% (con %>% DBI::dbListTables())) {
    con %>% tbl(t1) %>% collect()
  } else {
    message("Please run db_sync() or use db_upload_crawl()")
  }
  
}

#' Publications for ABM researchers belonging to a unit
#' 
#' Publications corresponding to researchers associated with a unit slug are 
#' returned.
#' @param unit_slug identifier for organizational unit
#' @return tibble with publications
#' @examples 
#' \dontrun{
#' if(interactive()){
#'  abm_unit_pubs("j/jj/jjn")
#'  }
#' }
#' @rdname abm_unit_pubs
#' @export 
abm_unit_pubs <- function(unit_slug) {
  con <- con_bib()
  on.exit(DBI::dbDisconnect(con))

  ids <- abm_researchers(unit_slug)
  
  abm_staff_data(con = con, kthids = ids)
}

#' Summary for a list of publications
#' 
#' A list of publications belonging to for example an organizational unit
#' is summarized.
#' @param unit_slug identifier for the organizational unit
#' @return tibble with summary
#' @examples 
#' \dontrun{
#' if(interactive()){
#'  abm_pubs_summary("j/jj/jjn")
#'  }
#' }
#' @rdname abm_pubs_summary
#' @import dplyr
#' @export 
abm_pubs_summary <- function(unit_slug) {
  
  ids <- abm_researchers(unit_slug)
  n_staff <- length(ids)
  
  pubs <- abm_unit_pubs(unit_slug)
  
  nd_researchers <- 
    con_bib() %>% tbl("masterfile") %>% filter(Unit_code %in% ids) %>%
    select(Unit_code) %>% distinct(Unit_code) %>% 
    collect() %>% nrow()
  
  n_pubs <- nrow(pubs)
  
  n_pubs_wos <- pubs %>% filter(Unit_Fraction_adj > 1e-4) %>% nrow()
  
  tibble(slug = unit_slug, n_staff, nd_researchers, n_pubs, n_pubs_wos)
}

#' Publication summary stats for ABM divisions
#' @param slugs a vector of unit slug identifiers, Default: abm_divisions()$id
#' @return tibble with summary data
#' @examples 
#' \dontrun{
#' if(interactive()){
#'  stats <- abm_division_stats() 
#'  # upload results to database
#'  db_upsert_table("division_stats", stats)
#'  }
#' }
#' @rdname abm_division_stats
#' @importFrom purrr map_df
#' @import dplyr
#' @export 
abm_division_stats <- function(slugs = abm_divisions()$id) {
  stats <- purrr::map_df(slugs, abm_pubs_summary)
  abm_divisions() %>% left_join(
    stats %>% arrange(n_pubs, nd_researchers, desc(n_staff)) %>% rename(id = slug))
}

#' Publication summary stats for ABM divisions and institutions and schools
#' @return tibble with summary data
#' @examples 
#' \dontrun{
#' if(interactive()){
#'  stats <- abm_unit_stats() 
#'  # upload results to database
#'  db_upsert_table("unit_stats", stats)
#'  }
#' }
#' @rdname abm_unit_stats
#' @importFrom purrr map_df
#' @import dplyr
#' @export 
abm_unit_stats <- function() {
  
  inners <- 
    unit_info() %>% collect() %>% 
    filter(org_level < 3, org_level > 0) %>% 
    pull(slug)
  
  stats <- 
    purrr::map_df(inners, abm_pubs_summary) %>% 
    arrange(n_pubs, nd_researchers, desc(n_staff)) %>% 
    rename(id = slug)
  
  parents <- 
    unit_info() %>% inner_join(
    unit_info() %>% 
     select(pidx = parent_org_id, pid = slug, desc = description_en), 
      by = c("Diva_org_id" = "pidx")) %>% 
    select(pid = slug, id = pid, desc_parent = description_en, desc)
  
  stats_inners <- stats %>% inner_join(parents, by = "id")
  
  stats_inners %>% bind_rows(abm_division_stats()) %>%
    arrange(desc(n_pubs), desc(nd_researchers), desc(n_staff))
}

# statz <- abm_unit_stats()
# db_upsert_table("unit_stats", statz)

# db_upsert_table("division_stats", statz)
# pa_slugs <- c("j/jj/jjn", "j/jh/jhs")
# con_bib() %>% tbl("division_stats") %>%
#   mutate(cov = as.double(n_pubs_wos) / as.double(n_pubs)) %>%
#   mutate(ppr = as.double(n_pubs / as.double(nd_researchers))) %>%
#   mutate(pps = as.double(n_pubs / as.double(n_staff))) %>%
#   arrange(desc(pps), desc(ppr), cov, n_staff, n_pubs, nd_researchers) %>%
#   filter(cov > 0) %>%
#   collect()
# # filter(n_pubs == 0 | nd_researchers == 0) %>%
# #  pull(slug)
#   filter(id %in% pa_slugs)
KTH-Library/bibliomatrix documentation built on Feb. 29, 2024, 5:54 a.m.