Assumptions

Processing steps

knitr::opts_chunk$set(echo = F)

library(tidyverse)
library(here)
library(data.table)

dat <- fread(params$input, encoding = 'UTF-8') %>% as_tibble() %>%
  set_names(make.names(colnames(.))) %>%
  select(author = Author.name, contains("affiliation.")) %>%
  mutate_all(~ str_squish(.x)) %>%
  mutate(author = str_replace_all(author, "\\b([[:alpha:]])\\b(?!\\.)", "\\1\\.")) %>%
  mutate(author_rank = 1:nrow(.)) %>%
  pivot_longer(cols = contains("affiliation."), 
               names_to = "affil_rank_author",
               values_to = "affil") %>%
  mutate(affil_rank_author = as.integer(str_extract(affil_rank_author, "\\d+"))) %>%
  filter(affil != "") %>%
  filter(affil != "C") %>% 
  mutate(affil = str_replace(affil, "\\bUSA?$", "USA"),
         affil = str_replace(affil, "\\bGB$", "UK")) %>% 
  mutate(affil_rank = as.numeric(factor(affil, levels = unique(affil))))

List of similar affiliations (to look for errors)

# unique affiliations in data
uaffil <- unique(dat$affil)

# fuzzy matching to other listed affiliations (only return imperfect matches)
affilmatch <- lapply(seq_along(uaffil), function(i) {
  agrep(uaffil[i], uaffil[-(1:i)], ignore.case = T, value = T)
}) %>% 
  set_names(uaffil) %>% 
  .[sapply(., function(x) length(x) > 0)]

r sapply(seq_along(affilmatch), function(i) { paste0("- ", names(affilmatch)[i], "\n", paste0(" - ", affilmatch[[i]], "\n", collapse = "")) }) %>% paste0(collapse = "")

Authors

# compose formatted string of authors
authors <- dat %>%
  group_by(author_rank, author) %>%
  arrange(affil_rank) %>% 
  summarize(aff_ranks = paste0(affil_rank, collapse = ","),
            string = paste0(unique(author), "^", aff_ranks, "^")) %>% 
  ungroup() %>%
  summarize(out = paste0(string, collapse = ", ")) %>% .$out

r authors

Affiliations

# compose formatted string of affiliations
affiliations <- dat %>% 
  select(affil, affil_rank) %>% 
  filter(!duplicated(affil)) %>% 
  summarize(out = paste0("^", affil_rank, "^", affil, ".", 
                         collapse = " ")) %>% 
  .$out

r affiliations



friedav/fdhelpers documentation built on July 7, 2020, 1:16 a.m.