Katie's notes

As far as I can tell, the audit logs don't contain interesting information about content access, presumably because audit logs record server and user actions rather than content access.

library(connectapi)
library(flexdashboard)
library(tidyverse)
library(lubridate)
library(ggplot2)
library(rscview)
library(pins)
library(sunburstR)
library(d3r)
# Connection setup for the live Connect API calls used further down.
# NOTE(review): check_envvars() presumably validates CONNECT_SERVER and
# CONNECT_API_KEY before create_connection() uses them -- confirm in rscview.
check_envvars()
conn <- create_connection()


# Content metadata: content name plus deployment info, read from a pin.
# Alternative sources kept for reference:
# content <- get_content(conn, limit = Inf)
# content <- readr::read_rds(here::here("inst", "usage-report", "content_2021-06-14.rds"))
content_list <- pin_get("katie/content_list", board = "rsconnect")

# Keep only the columns the report uses.
content_info_tbl <- dplyr::select(
  content_list,
  guid, name, title, owner_username, url, app_mode, content_category,
  access_type, py_version, r_version
)

# Lookup from numeric app_mode to a human-readable content type.
# Codes taken from
# `https://github.com/rstudio/connectapi/blob/002f0a6a0a6d3cf2eadc57f3daf173dd13ca65ff/R/ptype.R#L56`
# (1=shiny, 2=shiny Rmd, 3=source Rmd, 4=static, 5=api, 6=tensorflow, 7=python, 8=flask, 9=dash, 10=streamlit)
# NOTE(review): assumes app_mode in the pinned data is numeric -- TODO confirm.
type_ref <- tribble(
  ~app_mode, ~content_type,
  1L,        "Shiny",
  2L,        "Shiny RMD",
  3L,        "Source RMD",
  4L,        "Static",
  5L,        "API",
  6L,        "Tensorflow",
  7L,        "Python",
  8L,        "Flask",
  9L,        "Dash",
  10L,       "Streamlit"
)

# Attach the content_type label; unmatched app_mode values get NA.
content_info_tbl <- left_join(content_info_tbl, type_ref, by = "app_mode")

# User table from the Connect API; joined on `guid` further down to attach
# usernames to usage rows.
user_info_tbl <- get_users_tbl(conn)
# days_back <- 90 #days
# Length of the reporting window, in months. Also the default look-back used
# by get_top_n_visited() and get_top_n_visitors().
months_back <- 3 #months
# ^^--- this needs to become a global setting: it also feeds the ETL that
# creates the shiny / non-shiny usage pins.
## TODO create a test to see how long each API call takes and advise whether a pin should be used or whether running the call live is okay.

# Probably not needed any more --v
# default_content_title <- "Unknown (Deleted Content?)"

# Start of the reporting window. Only referenced by the commented-out live
# API calls below; the pinned data is read as-is.
report_from <- lubridate::today() - lubridate::dmonths(months_back)

# shiny_raw <- get_usage_shiny(
#   conn,
#   from = report_from,
#   limit = Inf
# ) %>%
#   mutate(
#     started = lubridate::ymd_hms(started),
#     ended = lubridate::ymd_hms(ended),
#     session_duration = ended - started
#     ) %>%
#   filter(session_duration > lubridate::dseconds(5))

# shiny_raw <- read_rds(here::here("inst", "usage-report", "shiny_2021-06-14.rds"))
# Shiny usage, read from a pin. `ended` is dropped so the columns line up
# with the non-Shiny usage table when the two are stacked.
shiny_raw <- pin_get("katie/shiny_raw_usage", board = "rsconnect")
shiny <- select(shiny_raw, -ended)

# Live alternative for the non-Shiny usage, with timing instrumentation:
# start_time <- Sys.time()
# other_raw <- get_usage_static(
#   conn,
#   from = report_from,
#   limit = Inf
# )
# end_time <- Sys.time()
# duration_get_other <- end_time - start_time

# Non-Shiny usage, read from a pin.
other_raw <- pin_get("katie/nonshiny_raw_usage", board = "rsconnect")
# other_raw <- read_rds(here::here("inst", "usage-report", "static_2021-06-14.rds"))

# Rename `time` to `started` to match the Shiny table and drop the
# rendering/bundle identifiers that the report does not use.
other <- other_raw %>%
  rename(started = time) %>%
  select(-c(variant_key, rendering_id, bundle_id))

## TODO - ask: do Flask and other interactive content in the "other" category have no session duration?

# all_users <- get_users(conn, page_size = 500)


# Stack Shiny and non-Shiny usage into one table and replace session_duration
# with its numeric value divided by 60.
# NOTE(review): as.numeric() on a difftime returns the value in the object's
# own units -- confirm session_duration is in seconds before trusting "mins".
access_data <- bind_rows(shiny, other) %>%
  mutate(session_duration_mins = as.numeric(session_duration) / 60) %>%
  select(-session_duration)
# readr::write_rds(data, file = here::here("inst","usage-report", "data.rds"))


# Enrich each usage row with content metadata (by content GUID) and with the
# visiting user's username (by user GUID).
access_details <- left_join(
  access_data,
  content_info_tbl,
  by = c(content_guid = "guid")
) %>%
  left_join(
    user_info_tbl %>% select(username, guid),
    by = c(user_guid = "guid")
  )

# Note - worth looking into: some content with long durations has no metadata
# at all. Was it deleted since? Example content guid:
# a3a16278-fca0-4592-a547-e7087395571f

# Open questions:
# - What do the rows with NA access_type represent? They are the same rows
#   that have NA content_type.
# - How does an NA username access content that is acl / logged_in?
# Keep everything that is not open to all. filter() also drops rows where
# access_type is NA, because the comparison evaluates to NA for them.
authenticated_access <- filter(access_details, access_type != "all")
## ^ check this.

This content summary may contain privileged information. The report is generated using the RStudio Connect Server API, and the source code is available online if you'd like to customize your analysis. Data is limited to the last `r months_back` months.

The report uses the environment variables CONNECT_SERVER and CONNECT_API_KEY to collect the data. To limit the results to a single publisher, use a publisher API key.

Row

What content is most popular?

# Count visits per month, content item and user, most visited first.
#
# Arguments:
#   access_details  usage table with `started`, `content_guid`, `user_guid`,
#                   `name`, `title`, `owner_username`, `access_type`,
#                   `content_type` and `username` columns.
#   monthsback      look-back window in months; defaults to the global
#                   `months_back`.
#   n               maximum number of rows to return (default: all rows).
#
# Returns an ungrouped table with one row per
# month / content / user combination and a `visits` count, sorted by
# descending `visits` and truncated to `n` rows.
get_top_n_visited <- function(access_details, monthsback = months_back, n = Inf) {
  access_details %>%
    # Use the caller-supplied window. The argument used to be ignored in
    # favour of the global `months_back`.
    filter(started > today() - dmonths(monthsback)) %>%
    # NOTE(review): round_date() rounds to the *nearest* month boundary, so
    # late-month visits are attributed to the following month -- confirm
    # whether floor_date() was intended.
    mutate(month = round_date(started, "month")) %>%
    group_by(
      month, content_guid, user_guid, name, title, owner_username,
      access_type, content_type, username
    ) %>%
    summarise(visits = n(), .groups = "drop") %>%
    arrange(desc(visits)) %>%
    head(n)
}

R vs Python content

#the vision: sunburst plot of R vs Python and version of each used. <- wonder if we should color special the reticulated items :)

# Number of content items per language/version, shaped for the sunburst
# widgets: `path` is "<language>-<version>" and `count` is the number of
# content items. Content that records both an R and a Python version is
# counted once under each language.
content_tbl_lang <- content_info_tbl %>%
  select(guid, name, title, r_version, py_version, content_category) %>%
  rename(R = r_version, Python = py_version) %>%
  # One row per (content, language); values_drop_na removes languages a
  # content item does not use, including items with neither version set.
  pivot_longer(
    cols = c(R, Python),
    names_to = "language",
    values_to = "version",
    values_drop_na = TRUE
  ) %>%
  group_by(language, version) %>%
  # Drop the grouping explicitly so nothing stays grouped downstream and no
  # regrouping message is emitted.
  summarise(count = n(), .groups = "drop") %>%
  mutate(path = paste(language, version, sep = "-")) %>%
  select(path, count)

# Colour scale (range = colours, domain = labels) meant for the `colors`
# argument of sund2b(); that argument is currently commented out in the call.
# NOTE(review): calls_account_color and opps_color are not defined anywhere in
# the visible source -- this looks carried over from another analysis and will
# fail with "object not found" unless they are created elsewhere.
# TODO confirm or remove.
my_colors <- list(range = c(calls_account_color$acct_color, opps_color$opp_color),
                  domain = c(calls_account_color$account_name_days, opps_color$opp_name_days))


# Interactive sunburst of content by language and version, sized by the
# `count` column of content_tbl_lang.
# NOTE(review): assumes sund2b() accepts a two-column (path, count) data frame
# with "-" separating the hierarchy levels -- confirm against sunburstR docs.
sund2b(content_tbl_lang, 
       valueField = "count",
       # colors = my_colors, 
       rootLabel = "All Content", 
       width = "100%",
       # breadcrumbs = sund2bBreadcrumb(enabled = FALSE),
       showLabels = TRUE
       )

# Same data rendered with the classic sunburst widget, for comparison.
sunburst(content_tbl_lang)
# Visits per month / content / user over the reporting window.
summary_access <- get_top_n_visited(access_details, months_back)

## NEED HELP with this plot -- can't get it in pareto sort and want to filter results

# Monthly totals per content type:
#   n      = number of month/content/user rows in the group
#   visits = total visits in the group (the value that is plotted)
# The pipeline used to end in a dangling `dplyr::`, and the plot mapped
# `y = visits` although the summary only produced `n`.
visits_by_month_tbl <-
  summary_access %>%
  dplyr::select(month, visits, content_type) %>%
  dplyr::group_by(month, content_type) %>%
  dplyr::summarize(n = n(), visits = sum(visits), .groups = "drop") %>%
  dplyr::arrange(desc(month), desc(visits))

# One panel per content type with free scales, showing total visits per month.
# NOTE(review): scale_x_datetime() assumes `month` is a date-time (POSIXct);
# use scale_x_date() if `started` turns out to be a Date.
ggplot(
  data = visits_by_month_tbl,
  aes(x = month, y = visits)
) +
  geom_col() +
  labs(
    x = NULL,
    y = "Number of Visits"
  ) +
  facet_wrap(~ content_type, scales = "free") +
  scale_x_datetime(date_labels = "%m %Y")


# Count visits per month, user and content item for identified users only,
# most active first.
#
# Arguments:
#   access_details  usage table with `started`, `user_guid`, `name`, `title`,
#                   `owner_username`, `access_type`, `content_type` and
#                   `username` columns.
#   monthsback      look-back window in months; defaults to the global
#                   `months_back`.
#   n               maximum number of rows to return (default: all rows).
#
# Returns an ungrouped table with a `visits` count per group, sorted by
# descending `visits` and truncated to `n` rows.
get_top_n_visitors <- function(access_details, monthsback = months_back, n = Inf) {
  access_details %>%
    # Rows without a user GUID cannot be attributed to a visitor.
    filter(!is.na(user_guid)) %>%
    # Use the caller-supplied window. The argument used to be ignored in
    # favour of the global `months_back`.
    filter(started > today() - dmonths(monthsback)) %>%
    # NOTE(review): round_date() rounds to the *nearest* month boundary --
    # confirm whether floor_date() was intended.
    mutate(month = round_date(started, "month")) %>%
    group_by(
      month, user_guid, name, title, owner_username,
      access_type, content_type, username
    ) %>%
    # Drop the grouping so the result matches get_top_n_visited().
    summarise(visits = n(), .groups = "drop") %>%
    arrange(desc(visits)) %>%
    head(n)
}




# TODO: create a top-N chart of the most frequently visited pieces of content,
# add a filter by publisher, and add a summary of the number of items
# published per username.
#
# Ad-hoc check: visits per content item for the user "katie", most visited
# first.
access_details %>%
  filter(username == "katie") %>%
  group_by(content_guid, title, name, owner_username, content_type) %>%
  summarise(visits = n()) %>%
  arrange(desc(visits))
#what content has a user visited?
# Return the rows of `access_details` whose `user_guid` column equals the
# supplied `user_guid` value.
get_user_access_activity <- function(access_details, user_guid) {
  # Copy the argument to a distinct local name; `!!` injects its value so the
  # left-hand `user_guid` refers to the column.
  wanted_guid <- user_guid
  filter(access_details, user_guid == !!wanted_guid)
}

# Example: visit counts for a single user.
# NOTE(review): `my_guid` is not defined anywhere in the visible source; it
# must be set to a user GUID before this runs.
get_user_access_activity(access_details, my_guid) %>% 
  get_top_n_visited()

#what users visit a piece of content most?
# Return the rows of `access_details` whose `content_guid` column equals the
# supplied `content_guid` value.
get_content_access_activity <- function(access_details, content_guid) {
  # Copy the argument to a distinct local name; `!!` injects its value so the
  # left-hand `content_guid` refers to the column.
  wanted_guid <- content_guid
  filter(access_details, content_guid == !!wanted_guid)
}


# Example: visitors of the content item found in the third row of the
# visit ranking for authenticated content.
authenticated_access %>%
  get_content_access_activity(
    get_top_n_visited(authenticated_access)$content_guid[3]
  ) %>%
  get_top_n_visitors()

# What apps have the largest number of users?
# NOTE(review): this ranks visit counts per user and content item rather than
# counting distinct users per app -- confirm it answers the question.
get_top_n_visitors(authenticated_access)


kmasiello/rscview documentation built on Jan. 3, 2023, 2:58 p.m.