  collapse = TRUE,
  comment = "#>"

Import Endnote XML As Data Frame


# Path of the default EndNote XML file shipped with kwb.endnote.
endnote_xml <- kwb.endnote::default_xml()

# Convert the XML into a data frame, then drop every row whose reference
# type name is "Generic".
all_references <- kwb.endnote::create_df_from_endnote_xml(endnote_xml)
references_df <- dplyr::filter(
  all_references,
  .data$ref_type_name != "Generic"
)

# Number of distinct record ids that remain after filtering.
unique_record_ids <- unique(references_df$record_id)
n_publications <- length(unique_record_ids)

Analysis of Publications

The imported Endnote XML library `r basename(endnote_xml)` contains `r n_publications` publications, which are analysed in detail in the following.

By Type

# Count the references per reference type name.
# The pipeline must end after count(): a trailing pipe would make R treat
# whatever follows as the next pipeline step.
refs_by_type <- kwb.endnote::get_reference_type_names(endnote_xml) %>%
  dplyr::count(.data$ref_type_name)


By Year

# Parse the XML file into a list and derive the references data frame from
# that list.
endnote_list <- kwb.endnote::create_endnote_list(endnote_xml)
refs_df <- endnote_list %>%
  kwb.endnote::create_references_df()

# Result of plot_pubs_by_year() (presumably a plot object - TODO confirm).
p1 <- refs_df %>%
  kwb.endnote::plot_pubs_by_year()

By Language

# Count the references per language and add each language's share (in
# percent, one decimal place) of all rows in refs_df.
# Columns are addressed via the .data pronoun, as in the rest of this file.
refs_by_language <- refs_df %>%
  dplyr::count(.data$language) %>%
  dplyr::mutate(percent = round(100 * .data$n / nrow(refs_df), 1))


Has an Abstract?

# One row per combination of record_id, rec_number and ref_type_name that has
# at least one "abstract" row; column n holds the number of such rows.
abstract_rows <- dplyr::filter(references_df, .data$key1 == "abstract")
refs_with_abstract <- dplyr::count(
  abstract_rows,
  .data$record_id,
  .data$rec_number,
  .data$ref_type_name
)

n_pubs_with_abstract <- nrow(refs_with_abstract)

# Share of all publications that have an abstract, in percent (one decimal).
percent_pubs_with_abstracts <- round(
  100 * n_pubs_with_abstract / n_publications,
  1
)

r percent_pubs_with_abstracts percent (i.e. r sprintf("%d / %d", n_pubs_with_abstract, n_publications) publications) have an abstract.

# Records whose abstract is spread over more than one row (n > 1).
# The pipeline ends after filter(); a trailing pipe would pull the next
# assignment into this expression.
refs_with_abstract_and_linebreaks <- refs_with_abstract %>%
  dplyr::filter(.data$n > 1)

n_pubs_with_abstract_and_linebreaks <- nrow(refs_with_abstract_and_linebreaks)


However, the abstracts of `r n_pubs_with_abstract_and_linebreaks` publications in total contain line breaks, as shown in the table above.

These need to be corrected manually!

By Accessibility Level

Is the accessibility level for the reference defined in the field "caption" (confidential or not)?

# Count the references per value of the "caption" field and add each value's
# share (in percent, one decimal place) of all rows in refs_df.
refs_with_accessability_level <- refs_df %>%
  dplyr::count(.data$caption) %>%
  dplyr::mutate(percent = round(100 * .data$n / nrow(refs_df), 1))

# Rows of the count table that carry an explicit (non-missing) caption.
has_caption <- !is.na(refs_with_accessability_level$caption)

# Sum over all rows with a caption instead of the hard-coded rows 1:2, which
# is only correct when exactly two distinct captions exist.
n_pubs_with_accessability_level <- sum(
  refs_with_accessability_level$n[has_caption]
)

percent_with_accessibility_level <- round(
  100 * n_pubs_with_accessability_level / n_publications,
  1
)

# Logical index of the "confidential" row; combining with has_caption keeps
# the index free of NA for the row with a missing caption.
is_confidential <- has_caption &
  refs_with_accessability_level$caption == "confidential"


Only for r n_pubs_with_accessability_level (i.e. r percent_with_accessibility_level percent) meta information on the accessibility level is explicitly defined, out of which r refs_with_accessability_level$n[is_confidential] are defined as confidential (i.e. r refs_with_accessability_level$percent[is_confidential] of the publications with metadata on the accessibility level).


Fill the field caption for all publications with either:

By Project

Group references by "label" (i.e. "project names")

# Keep only the references with a non-missing "label" (project name).
refs_with_project <- refs_df[!is.na(refs_df$label), ]

n_pubs_with_project_meta <- nrow(refs_with_project)

# Share of all publications with project meta information, in percent.
percent_pubs_with_project_meta <- round(
  100 * n_pubs_with_project_meta / n_publications,
  1
)

# NOTE(review): the right-hand side of this pipeline was lost, leaving a
# dangling pipe. Counting the references per label yields one row per
# distinct project spelling - confirm against the original vignette.
unique_project_names <- refs_with_project %>%
  dplyr::count(.data$label)


In total r percent_pubs_with_project_meta percent (i.e. r sprintf("%d / %s", n_pubs_with_project_meta, n_publications) publications) contain meta-information on the project. However, the spelling of project names does not follow a controlled vocabulary yet as can be seen in the table above.

Thus it is recommended to establish a controlled vocabulary for project identifiers according to the best-practices defined in the FAKIN project (see here).

By Author


# Count the author entries per name exactly as entered (column "value").
# NOTE(review): the pipeline ended in a dangling pipe. Sorting by descending
# count (sort = TRUE) is assumed to be the lost step, because only the first
# 50 rows are plotted - confirm against the original vignette.
refs_by_author_lastfirst <- references_df %>%
  dplyr::filter(.data$key2 == "authors") %>%
  dplyr::count(.data$value, sort = TRUE)

# head() returns at most 50 rows, whereas [1:50, ] indexes past the end when
# fewer than 50 distinct authors exist.
p2 <- kwb.endnote::plot_pubs_by_author(
  utils::head(refs_by_author_lastfirst, 50)
)

The figure above is quite messy. Thus, names should be entered in Endnote in the following format:

Lastname, Firstname

Also note that `,` is also listed as an author! This is due to a wrongly entered semicolon in the Endnote database and needs to be fixed for at least the following entries:

# Author rows whose value is just ",", counted per record number.
# The pipeline ends after count(); a trailing pipe would pull the next
# assignment into this expression.
pubs_with_semicolon_as_author <- references_df %>%
  dplyr::filter(
    .data$key2 == "authors",
    .data$value == ","
  ) %>%
  dplyr::count(.data$rec_number)

n_pubs_with_semicolon_as_author <- nrow(pubs_with_semicolon_as_author)


In total r n_pubs_with_semicolon_as_author publications contain one or more semicolon authors. This needs to be fixed manually in the Endnote DB.


For now, data munging is needed to use just the authors' last names for aggregating the data.

# Count the author entries per (approximate) last name.
# NOTE(review): the pipeline ended in a dangling pipe. Sorting by descending
# count (sort = TRUE) is assumed to be the lost step, because only the first
# 50 rows are plotted - confirm against the original vignette.
refs_by_author_last <- references_df %>%
  dplyr::filter(.data$key2 == "authors") %>%
  # Remove everything from the first comma on.
  dplyr::mutate(value = stringr::str_remove_all(.data$value, ",.*")) %>%
  # Remove one leading word (optionally followed by ".") and the whitespace
  # after it.
  dplyr::mutate(
    value = stringr::str_remove_all(.data$value, "^\\w+\\.?\\s+")
  ) %>%
  dplyr::count(.data$value, sort = TRUE)

# head() returns at most 50 rows, whereas [1:50, ] indexes past the end when
# fewer than 50 distinct names exist.
p3 <- kwb.endnote::plot_pubs_by_author(
  utils::head(refs_by_author_last, 50)
)

Journal Articles

By Journal

Journal names need to be harmonised. It also needs to be checked what the differences between the full-title and the secondary-title of a journal are.

Journal full-title

Defined in: $record$periodical$full-title$style[[1]]

# All rows that belong to references of type "Journal Article".
pubs_in_journals <- dplyr::filter(
  references_df,
  .data$ref_type_name == "Journal Article"
)

# Number of distinct journal article records.
journal_record_ids <- unique(pubs_in_journals$record_id)
n_pubs_in_journals <- length(journal_record_ids)

# Frequency of each value stored under key2 "full-title"; source_field holds
# the "<key1>_<key2>" combination the value was read from.
journal_title_full <- pubs_in_journals %>%
  dplyr::filter(.data$key2 %in% "full-title") %>%
  dplyr::mutate(source_field = sprintf("%s_%s", .data$key1, .data$key2)) %>%
  dplyr::count(.data$value, .data$source_field)


Journal secondary-title

Defined in: $record$titles$secondary-title$style[[1]]

# Frequency of each value stored under key2 "secondary-title"; source_field
# holds the "<key1>_<key2>" combination the value was read from.
secondary_title_rows <- dplyr::filter(
  pubs_in_journals,
  .data$key2 %in% "secondary-title"
)
journal_title_secondary <- secondary_title_rows %>%
  dplyr::mutate(source_field = sprintf("%s_%s", .data$key1, .data$key2)) %>%
  dplyr::count(.data$value, .data$source_field)


Journal DOI

All `r n_pubs_in_journals` journal articles should have a DOI. Let's check for how many we have this metadata:

# Journal article rows with a non-empty "electronic-resource-num" value.
has_doi <- pubs_in_journals %>%
  dplyr::filter(
    .data$key1 == "electronic-resource-num",
    .data$value != ""
  )

# Count distinct records rather than rows, so that a record with several
# matching rows is counted once and the number is comparable to
# n_pubs_in_journals, which is also a distinct-record count.
n_pubs_in_journals_with_doi <- length(unique(has_doi$record_id))


Only r n_pubs_in_journals_with_doi (r round(100*n_pubs_in_journals_with_doi/n_pubs_in_journals,1) %) journal publications have a DOI which is encoded in the field electronic-resource-num.

All DOIs should be entered in the field electronic-resource-num in the following format, i.e.:


Export References To XLSX

Write references dataframe to xlsx with one sheet for each publication type to references.xlsx


A cleaned version, created by automatically running kwb.endnote::clean_references_df(), is also written to references_cleaned.xlsx.


Check Endnote References

# Run kwb.endnote's check for problematic entries on the parsed EndNote list
# and keep the result (its structure is not visible here - TODO confirm).
problematic_entries <- kwb.endnote::check_problematic_entries(endnote_list)

KWB-R/kwb.endnote documentation built on July 8, 2021, 4:18 p.m.