# Chunk option: keep source and output in separate blocks when knitting.
knitr::opts_chunk$set(collapse = FALSE)

# Packages attached for the whole document (pipes: %>% and %$%).
library(tidyverse)
library(magrittr)

# PubMed IDs of the publications to be assessed, read from a local RDS file.
pmid <- readRDS(file = "~/starsurg_impact/data/impact_pub.rds")$pmid

# Extract publication records for those PMIDs, then derive `journal_edit`,
# a copy of `journal_full` with two journal names rewritten.
data <- impactr::extract_pmid(pmid = pmid)
data <- dplyr::mutate(
  data,
  journal_edit = ifelse(journal_full == "The Lancet",
                        "Lancet",
                        journal_full),
  journal_edit = ifelse(journal_edit == "British Medical Journal",
                        "BMJ-British Medical Journal",
                        journal_edit)
)

Impact from Traditional Metrics

Citations are a common metric for determining the impact of research on the wider academic field. Both extract_pmid() and extract_doi() will provide data on citations (based on PubMed/Crossref repositories) and journal impact factor (based on Scimago Scientific Journal Rankings).

impact_cite()

The function impact_cite() can also produce several metrics that can be used for the purposes of assessment of the more traditional research impact of publications. This includes:

# Collect citation data for the extracted publications.
# NOTE(review): the `dimentions` spelling is kept exactly as written; confirm it
# matches the argument name expected by impactr::impact_cite().
data_cite <- impactr::impact_cite(data = data, var_id = "pmid",
                                  crossref = TRUE, dimentions = TRUE, scopus = FALSE, oc = TRUE,
                                  gscholar = "Ol5uNSwAAAAJ&hl", metric = TRUE)

# Build the display table as a data frame first. The position of the "title"
# column must be looked up here: once knitr::kable() has run, the object is a
# character vector and colnames() on it is NULL, so an index computed from the
# kable object would be empty and the minimum width would never be applied.
table_cite <- data_cite$df %>%
  dplyr::select(doi, title, journal_edit, journal_if,
                cite_dim, cite_gs) %>%
  # Replace underscores with spaces for nicer column headers.
  magrittr::set_colnames(gsub("_", " ", names(.))) %>%
  # Shorten two long journal names for display.
  dplyr::mutate(`journal edit` = ifelse(`journal edit` == "The British journal of surgery", "BJS", `journal edit`),
                `journal edit` = ifelse(`journal edit` == "Colorectal disease : the official journal of the Association of Coloproctology of Great Britain and Ireland", "Colorectal Disease", `journal edit`))

table_cite %>%
  knitr::kable(format = "html") %>%
  kableExtra::column_spec(which(names(table_cite) == "title"), width_min = "7in") %>%
  kableExtra::kable_styling(bootstrap_options = "striped", full_width = FALSE) %>%
  kableExtra::scroll_box(height = "400px", width = "1000px")

 

Impact Metrics

Citations

Citations (e.g. cite_max) can be used to either provide summary statistics or visualisations.

Note: Google Scholar is typically more sensitive but less specific in estimating citation counts than the Crossref / PubMed repositories.

# Largest Google Scholar citation count; used as the upper end of the axis
# break sequences on both axes below.
cite_max <- with(data_cite$df, max(cite_gs, na.rm = TRUE))

# Scatter plot comparing per-publication citation counts from Crossref (x)
# against Google Scholar (y). Each axis is limited to its own source's maximum.
plot_cite_difference <- data_cite$df %>%
  dplyr::select(cite_gs, cite_cr) %>%
  ggplot() +
  geom_point(aes(x = cite_cr, y = cite_gs)) +
  scale_x_continuous(
    breaks = seq(0, cite_max, by = 5),
    limits = c(0, max(data_cite$df$cite_cr, na.rm = TRUE)),
    expand = c(0, 0.5)
  ) +
  scale_y_continuous(
    breaks = seq(0, cite_max, by = 5),
    limits = c(0, max(data_cite$df$cite_gs, na.rm = TRUE)),
    expand = c(0, 0.5)
  ) +
  labs(x = "Citations per publication on CrossRef", y = "Citations per publication on Google Scholar") +
  theme_bw()

# Write the figure to the vignette plot directory.
ggsave(
  filename = "~/impactr/vignettes/plot/plot_cite_difference.png",
  plot = plot_cite_difference,
  height = 5.76 * 0.75,
  width = 9.60 * 0.75
)

Only Google Scholar and Open Citations allow exploration of citations over time. This will be included in the output of impact_cite() if either of those sources is selected.

This can be used to plot the longitudinal impact of the paper through citations over time.

# Largest Google Scholar citation count (same definition as used for the
# scatter plot); not referenced by the plot in this block.
cite_max <- data_cite$df %$% max(cite_gs, na.rm = TRUE)

# Stacked area chart of cumulative citations per publication over time,
# restricted to the Google Scholar ("gs") rows of the time series output.
plot_cite_time <- data_cite$time %>%
  dplyr::filter(source == "gs") %>%
  # Drop rows without a cumulative count.
  dplyr::filter(!is.na(cite_cumsum)) %>%
  # Remove unused year levels so they do not appear on the x axis.
  dplyr::mutate(cite_year = forcats::fct_drop(cite_year)) %>%
  ggplot() +
  aes(x = cite_year, y = cite_cumsum, group = doi, fill = doi) +
  geom_area(position = "stack", colour = "black") +
  labs(x = "Year of Citation", y = "Cumulative total of citations per publication") +
  # One fill colour per DOI: hide the legend. "none" replaces the deprecated
  # `fill = FALSE` form.
  guides(fill = "none") +
  theme_bw(base_size = 13)

# Write the figure to the vignette plot directory.
ggsave(plot_cite_time, filename = "~/impactr/vignettes/plot/plot_cite_time.png",
       height = 5.76 * 0.75, width = 9.60 * 0.75)

 

Journals

As journal-level information is also extracted using impact_cite() (e.g. impact factor / Eigenfactor) this can also allow some assessment of impact of the articles in relation to the journal benchmark.

For example, the plot below demonstrates the ratio of Paper Citations : Journal Impact Factor - any point above the horizontal line (ratio of 1:1) indicates the paper has gathered more citations than typical for that journal.

# Largest citations-to-impact-factor ratio across all rows; used only to decide
# how far the y-axis breaks extend.
# NOTE(review): inside `%$%`, `cite_max` presumably resolves to the `cite_max`
# column of data_cite$df (the same column used in mutate() below) rather than
# any global of the same name; confirm.
cite_if_max <- data_cite$df %$% max(cite_max / journal_if, na.rm = TRUE)

# Scatter plot of the ratio of paper citations to journal impact factor by
# year of publication, one colour per journal.
plot_cite_cif <- data_cite$df %>%
  dplyr::filter(!is.na(journal_if)) %>%
  # Citations per year elapsed since publication.
  dplyr::mutate(cite_year = cite_max / (lubridate::year(Sys.Date()) - year)) %>%
  dplyr::mutate(cite_if = cite_max / journal_if,
                cite_if_year = cite_year / journal_if) %>%
  dplyr::filter(grepl("Paper", type)) %>%
  dplyr::select("Journal" = journal_full, journal_if,
                cite_max, cite_year, cite_if, cite_if_year, year, altmetric) %>%
  # Shorten one long journal name for the legend.
  dplyr::mutate(Journal = ifelse(Journal == "Colorectal disease : the official journal of the Association of Coloproctology of Great Britain and Ireland", "Colorectal disease", Journal)) %>%
  # Collapse identical rows into one, with `n` recording how many there were.
  # count() returns an ungrouped result, unlike group_by() + summarise().
  dplyr::count(Journal, journal_if, cite_max,
               cite_year, cite_if, cite_if_year, year, altmetric) %>%
  ggplot() +
  aes(x = year, y = cite_if, group = Journal, colour = Journal, fill = Journal) +
  geom_point() +
  # max(5, ...) keeps seq() valid when the largest ratio is below 5; without
  # it, seq(5, <value below 5>, by = 5) raises an error.
  scale_y_continuous(breaks = c(0, 1, seq(5, max(5, cite_if_max), by = 5)), minor_breaks = NULL) +
  # Reference line at a 1:1 ratio of citations to impact factor.
  geom_hline(yintercept = 1, colour = "dark grey", linetype = "dashed") +
  labs(x = "Year of Publication", y = "Ratio of Paper Citations : Journal Impact Factor") +
  theme_bw(base_size = 13)

# Write the figure to the vignette plot directory.
ggsave(plot_cite_cif, filename = "~/impactr/vignettes/plot/plot_cite_cif.png",
       height = 5.76 * 0.75, width = 9.60 * 0.75)

 

Metrics

Citation metrics are author-level measures that attempt to assess both the productivity and the citation impact of publications. These include:

impact_cite() will produce common citation metrics automatically based on the papers included in the dataframe supplied. Alternatively this can be calculated directly using cite_metric().

# Metrics produced by impact_cite() itself.
data_cite$metric

# The same kind of metrics computed directly from citation counts and years.
impactr::cite_metric(data_cite$df$cite_max, data_cite$df$year)

 

Troubleshooting ($validation)

The Google Scholar ID is used to derive publications, and so all publications by an author (or authorship group) must be uploaded under one Google Scholar account.

As Google Scholar does not contain DOIs or PMIDs, papers must be matched by title, and can only be matched if there is a Google Scholar record for each paper in the supplied dataframe. impact_cite() provides several features to proactively identify issues via the $validation output.

The outcome will record either matched, or the following:



kamclean/impactr documentation built on Jan. 11, 2023, 2:51 p.m.