Pull in the IP addresses and keep only those with more than 11 page views (I assume any address with 11 or fewer views is a bot, since such low counts are so common).

eyeIntegration.stats is generated as follows:

  1. Select all Daily_eyeinegration_frequent_IP_access_list emails in Outlook
  2. Drag all emails to empty folder
  3. Run cat *eml | grep "^[[:digit:]]" > eyeIntegration.stats
library(tidyverse)
library(rworldmap)
library(ipstack)
library(lubridate)
library(patchwork)

# Collect every .eml file, searching recursively, under the export folder.
# `pattern` is a regular expression rather than a shell glob, so match the
# literal ".eml" extension at the end of the name instead of using '*eml'.
eml_files <- list.files(
  '~/data/eyeIntegration_eml_emails/x/',
  pattern = '\\.eml$',
  full.names = TRUE,
  recursive = TRUE
)
# Parse one access-list e-mail (.eml) into a tibble of per-IP page views.
#
# eml: path to a single .eml file.
#
# Returns a tibble holding the rows whose first space-delimited field starts
# with a digit. Columns are `ip`, `Page Views` (character, non-numeric values
# become NA), `date` (parsed from the e-mail's Date header) and whatever
# extra columns read_delim() produces for additional fields on a line.
eml_reader <- function(eml){
  raw <- read_lines(eml)

  tsv <- read_delim(eml, col_names = c('ip','Page Views'), delim = ' ')
  # Remove the trailing HTML '<br>' BEFORE converting to integer: converting
  # first turned values such as '12<br>' into NA, so the later gsub() had
  # nothing left to clean. The column is handed back as character, as it was
  # after the original gsub() call, so callers see the same column type.
  tsv <- tsv %>%
    filter(grepl('^\\d', ip)) %>%
    mutate(`Page Views` = gsub('<br>', '', `Page Views`, fixed = TRUE) %>%
             trimws() %>%
             as.integer() %>%
             as.character())

  # Only the first line that begins with the 'Date:' header is used. An
  # unanchored match on 'Date' can return several lines, which makes the
  # chunk[...] indexing below pick pieces from different lines.
  date_line <- grep('^Date:', raw, value = TRUE)[1]
  # Characters 12-22 are expected to hold '<day> <month> <year>'
  # (assumes a 'Date: Www, ' prefix -- TODO confirm against the mail export).
  chunk <- substr(date_line, 12, 22) %>% str_split(' ', simplify = TRUE)
  date <- lubridate::mdy(paste(chunk[2], chunk[1], chunk[3]))

  tsv$date <- date

  tsv

}

# Parse every e-mail; the resulting list is keyed by file path.
eml_list <- lapply(setNames(nm = eml_files), eml_reader)

# Stack all e-mails, drop exact duplicate rows, take the first non-missing
# count among `Page Views` and the overflow columns X3..X6, and keep only
# addresses with more than 11 page views.
ipaddresses <- eml_list %>%
  bind_rows() %>%
  unique() %>%
  mutate(
    `Page Views` = as.numeric(
      case_when(
        !is.na(`Page Views`) ~ `Page Views`,
        !is.na(X3) ~ X3,
        !is.na(X4) ~ X4,
        !is.na(X5) ~ X5,
        !is.na(X6) ~ X6
      )
    )
  ) %>%
  filter(`Page Views` > 11)

# Build one row per calendar day from the earliest observed date through
# today (presumably for spotting days with no data).
earliest_date <- head(sort(ipaddresses$date), 1)
date_span <- Sys.Date() - earliest_date

date_tib <- tibble::enframe(earliest_date + days(0:date_span)) %>%
  transmute(date = value)

# if loading previous work
#load('visitor_stats_2023_04.Rdata')

#save(ipaddresses, eml_list, file = 'visitor_stats_2023_04.Rdata')

Total page views from May 2017 to present

# Grand total of page views across all retained IP addresses.
ipaddresses %>% pull(`Page Views`) %>% sum()
#1718117

Chart of cumulative visits (unique IP addresses per day) over time

# Count the distinct IPs seen on each date, accumulate those daily counts,
# and draw the running total as a line.
a <- ipaddresses %>%
  filter(`Page Views` > 11) %>%
  group_by(date) %>%
  summarise(`Visitor Count` = n_distinct(ip)) %>%
  mutate(`Visitor Count` = cumsum(`Visitor Count`)) %>%
  ggplot(aes(x = date, y = `Visitor Count`)) +
  geom_line() +
  cowplot::theme_cowplot() +
  labs(x = 'Date', y = 'eyeIntegration\nCumulative\nVisits') +
  # Force the x axis to cover this fixed date window.
  expand_limits(x = as.Date(c("2017-05-01", "2023-05-01")))

a

Get latitude and longitude for all unique IPs (takes a few minutes to query)

unique_IP <- ipaddresses$ip %>% unique()
ip_info <- list()

# ipstack
# The API key can now be supplied through the IPSTACK_API_KEY environment
# variable, so it no longer has to live in the script.
# NOTE(review): the literal below is kept only as a backward-compatible
# fallback; a key committed to source should be rotated and then removed.
ipstack_key <- Sys.getenv('IPSTACK_API_KEY',
                          unset = '7d06f26c4f0f61bd9edd20df0219f874')

# Query each IP once; addresses already present in ip_info are skipped.
for (i in unique_IP){
  if (!(i %in% names(ip_info))){
    ip_info[[i]] <- ip_lookup(i, ipstack_api_key = ipstack_key, hostname = TRUE)
  }
}

# ip_info_ipapi <- list()
# for (i in unique_IP){
#   if (!(i %in% names(ip_info_ipapi))){
#     print(i)
#     ip_info_ipapi[[i]] <- ipapi::geolocate(i, .progress = FALSE)
#     Sys.sleep(2)
#   }
# }

# Flatten each lookup result into a small character tibble. cbind() silently
# drops NULL / zero-length fields, so a result that lacks a field simply
# lacks that column and bind_rows() fills it with NA afterwards.
processed_ip <- lapply(ip_info, function(hit) {
  as_tibble(cbind(
    ip = hit$ip,
    isp = as.character(hit$connection$isp),
    lat = as.character(hit$latitude),
    lon = as.character(hit$longitude),
    country = hit$country_name,
    city = as.character(hit$city)
  ))
})

ip_tibble <- bind_rows(processed_ip)

# ISP-name fragments used to pick out research / academic / pharma providers.
research_isp_regex <- 'Insti|Univ|Hosp|Med|Scie|Tech|Coll|Health|Verein|State|Labora|State|Riken|Biome|Agenc|China Educa|Merck'

# Show the looked-up addresses whose ISP matches the pattern.
ip_tibble %>%
  filter(grepl(research_isp_regex, isp))

# Total page views (Usage) and row count (Visits) per matching ISP, largest
# first, with two ISPs excluded by name and one long name replaced.
usage_by_academia_pharma <- ipaddresses %>%
  left_join(ip_tibble) %>%
  group_by(isp) %>%
  summarise(Usage = sum(`Page Views`), Visits = n()) %>%
  arrange(desc(Usage)) %>%
  filter(
    grepl(research_isp_regex, isp),
    !isp %in% c('Nus Information Technology', 'Virgin Media Limited')
  ) %>%
  mutate(isp = if_else(grepl("Verein", isp),
                       'German National\nResearch and Education Network',
                       isp)) %>%
  data.frame()

# Names of the 15 heaviest users.
top15 <- head(usage_by_academia_pharma$isp, 15)

# Names of the 25 matching ISPs with the most page views, using the raw ISP
# names (no renaming here).
top25 <- ipaddresses %>%
  left_join(ip_tibble) %>%
  group_by(isp) %>%
  summarise(Usage = sum(`Page Views`)) %>%
  arrange(desc(Usage)) %>%
  filter(
    grepl('Insti|Univ|Hosp|Med|Scie|Tech|Coll|Health|Verein|State|Labora|State|Riken|Biome|Agenc|China Educa|Merck', isp),
    !isp %in% c('Nus Information Technology', 'Virgin Media Limited')
  ) %>%
  pull(isp) %>%
  head(25)
# Cumulative page views over time for each of the top 25 ISPs, one facet per
# ISP. The plotted value is the running sum of `Page Views`, so the y axis is
# labelled as page views; it previously said "Visits", which this script uses
# for row counts rather than page-view totals.
ipaddresses %>%
    left_join(ip_tibble %>% group_by(ip, isp) %>% summarise(Count = n())) %>%
    filter(`Page Views` > 11) %>%
    filter(isp %in% top25) %>%
    group_by(isp, date) %>%
    summarise(`Usage` = sum(`Page Views`)) %>%
    # Still grouped by isp here, so the running total restarts for each ISP.
    mutate(`Usage` = cumsum(`Usage`)) %>%
    mutate(isp = case_when(grepl("Verein", isp)  ~ 'German National\nResearch and Education Network',
                           TRUE ~ isp)) %>%
    ggplot(aes(x=date,y=Usage)) +
    geom_line() +
    cowplot::theme_cowplot() +
    xlab('Date') + ylab('eyeIntegration\nCumulative\nPage Views') +
    expand_limits(x = as.Date(c("2017-05-01", "2023-05-01"))) +
    facet_wrap(~isp, scales = 'free', labeller = label_wrap_gen(20))

Save

# Persist the lookup results and the filtered access table for later reuse.
save(
  list = c('ip_tibble', 'ipaddresses', 'ip_info'),
  file = '~/git/eyeIntegration_app/visitor_stats_2023_04.Rdata'
)

Interactive Map

# library(ggiraph)
# world <- map_data("world")
# world <- world[world$region != "Antarctica",]

#world <- c(geom_polygon(aes(long,lat,group=group), size = 0.1, colour= "#090D2A", fill="white", alpha=0.8, data=worldmap))
# code <- ggplot() + geom_map(data=world, map=world,
#                     aes(x=long, y=lat, map_id=region),
#                     color="#090D2A", fill="white", size=0.05, alpha=1/4) + 
#   geom_point_interactive(aes(x = geo_locate$longitudes, y = geo_locate$latitudes, tooltip=geo_locate$cities)) + theme_void() 
# ggiraph(code = print(code) )

Regular Map

# Mean coordinates per top-15 ISP; lat/lon are stored as text in ip_tibble,
# so convert them to numbers before averaging.
top15_coords <- ip_tibble %>%
  filter(isp %in% top15) %>%
  mutate(lon = as.numeric(lon), lat = as.numeric(lat)) %>%
  group_by(isp) %>%
  summarise(lon = mean(lon), lat = mean(lat))

# World outline data with Antarctica removed.
world <- map_data("world") %>%
  { .[.$region != "Antarctica", ] }

# Every looked-up IP as a black dot, ISPs matching the research pattern
# over-plotted in red, with the world outline drawn as the last layer.
ggplot(ip_tibble, aes(x = as.numeric(lon), y = as.numeric(lat))) +
  geom_point(colour = 'black', size = 0.4) +
  geom_point(
    data = filter(
      ip_tibble,
      grepl('Insti|Univ|Hosp|Med|Scie|Tech|Coll|Health|Verein|State|Labora|State|Riken|Biome|Agenc|China Educa|Merck', isp)
    ),
    colour = 'Red',
    size = 0.5
  ) +
  geom_map(
    data = world %>% mutate(Institution = 'placeholder'),
    map = world,
    aes(x = long, y = lat, map_id = region),
    colour = "#090D2A", fill = "lightgray", size = 0.05, alpha = 1/4
  ) +
  theme_void() +
  ggsci::scale_color_aaas() +
  ggtitle('eyeIntegration has been used across 84 countries,\n250 research institutions, and 1465 cities')
# Every looked-up IP as a black dot over the world outline, with repelled
# text labels placed at the mean coordinates of the top 15 ISPs.
ggplot(ip_tibble, aes(x = as.numeric(lon), y = as.numeric(lat))) +
  geom_point(colour = 'black', size = 0.4) +
  geom_map(
    data = world,
    map = world,
    aes(x = long, y = lat, map_id = region),
    colour = "#090D2A", fill = "lightgray", size = 0.05, alpha = 1/4
  ) +
  ggrepel::geom_label_repel(
    data = top15_coords,
    aes(x = lon, y = lat, label = isp),
    max.overlaps = Inf,
    force = 15,
    force_pull = 0.2
  ) +
  theme_void() +
  scale_color_viridis_d() +
  ggtitle("Top 15 Research Institution Users of eyeIntegration")

Figure

# Total page views and mean coordinates per ISP.
# The lookup table built in this script keys addresses by `ip` and has no
# `query` column, so joining on 'query' fails; `query` is still honoured when
# present (e.g. a table from a different lookup service). lat/lon are stored
# as text, so convert them before averaging -- a no-op if already numeric.
ip_key <- if ('query' %in% names(ip_tibble)) 'query' else 'ip'
isp_location <- ipaddresses %>%
  left_join(ip_tibble, by = c('ip' = ip_key)) %>%
  group_by(isp) %>%
  summarise(Usage = sum(`Page Views`),
            lat = mean(as.numeric(lat)),
            lon = mean(as.numeric(lon)))

# Keep ISPs whose name matches the academic / research pattern (excluding
# 'BT Public'), ordered from most to fewest page views.
isp_location_academic <- isp_location %>%
  filter(
    grepl('Agency|Acad|Scie|Centr|Center|Schoo|Univ|Hosp|Stat|Colle|Resea|Publ|Nation|Instit|Medici', isp) &
      !grepl('BT Public', isp)
  ) %>%
  arrange(desc(Usage))

# Horizontal bar chart of page views for the 25 heaviest academic ISPs.
# Factor levels are the reversed usage ranking, so after coord_flip() the
# largest bar is at the top; the value axis is broken between 35000 and
# 240000.
c <- isp_location_academic %>%
  head(25) %>%
  mutate(isp = factor(isp, levels = rev(isp_location_academic$isp))) %>%
  ggplot(aes(x = isp, y = Usage)) +
  geom_bar(stat = 'identity', fill = 'darkblue') +
  ggbreak::scale_y_break(c(35000, 240000)) +
  scale_y_continuous(breaks = c(5000, 15000, 25000, 240000)) +
  coord_flip() +
  cowplot::theme_cowplot() +
  labs(x = 'Top 25\nResearch Users', y = 'Page Views') +
  theme(text = element_text(size = 20))



# World map with one translucent black point per ISP location and larger
# dark-blue points for the academic subset drawn on top.
b <- ggplot(isp_location, aes(x = lon, y = lat)) +
  geom_map(
    data = world,
    map = world,
    aes(x = long, y = lat, map_id = region),
    colour = "#090D2A", fill = "lightgray", size = 0.05, alpha = 1/4
  ) +
  geom_point(colour = 'black', size = 4, alpha = 0.5) +
  geom_point(
    data = isp_location_academic,
    colour = 'darkblue', size = 5, alpha = 0.7
  ) +
  theme_void() +
  scale_color_viridis_c()

# Top row: cumulative-visits line chart next to the top-25 bar chart, with
# the bar chart given twice the width.
first <- (a + c)  +
  plot_layout(widths = c(1, 2)) +
  plot_annotation(theme = theme(text = element_text(size = 25)))

# Full figure: top row stacked over the world map. The subtitle counts are
# computed from the data at render time. The caption typo "instituations"
# is corrected.
first / b +
  plot_layout(heights = c(1, 1.4))  +
  plot_annotation(
    title = 'eyeIntegration Worldwide Usage',
    subtitle = glue::glue('Users from {ip_tibble$country %>% unique() %>% length()} countries, {ip_tibble$city %>% unique() %>% length()} cities, and {isp_location_academic %>% nrow} research institutions.'),
    caption = 'Blue points are research institutions',
    theme = theme(text = element_text(size = 25))
  )


davemcg/eyeIntegration_app documentation built on May 18, 2024, 1:37 p.m.