# Set the global knitr chunk options used by all chunks of this vignette:
# collapse = TRUE and "#>" as the prefix of printed output lines.
knitr::opts_chunk$set( collapse = TRUE, comment = "#>" )
library(kwb.endnote)

# Path to the Endnote XML library returned by kwb.endnote::default_xml()
endnote_xml <- kwb.endnote::default_xml()

# References imported from the XML file, without reference type "Generic"
references_df <- dplyr::filter(
  kwb.endnote::create_df_from_endnote_xml(endnote_xml),
  .data$ref_type_name != "Generic"
)

# Number of distinct record ids among the remaining references
n_publications <- length(unique(references_df[["record_id"]]))
The imported Endnote XML library r basename(endnote_xml)
contains
r n_publications
publications, which are analysed in detail in the following.
# Number of references per reference type, most frequent type first
refs_by_type <- dplyr::arrange(
  dplyr::count(
    kwb.endnote::get_reference_type_names(endnote_xml),
    .data$ref_type_name
  ),
  dplyr::desc(.data$n)
)

knitr::kable(refs_by_type)
# Read the XML file into a list and derive the references data frame from it
endnote_list <- endnote_xml %>%
  kwb.endnote::create_endnote_list()

refs_df <- endnote_list %>%
  kwb.endnote::create_references_df()

# Publications by year, rendered as an interactive plotly figure
p1 <- refs_df %>%
  kwb.endnote::plot_pubs_by_year()

plotly::ggplotly(p1)
# Number of references per language and their share (in percent, one decimal)
# of all rows in refs_df
refs_by_language <- dplyr::mutate(
  dplyr::count(refs_df, .data$language),
  percent = round(100 * .data$n / nrow(refs_df), 1)
)

knitr::kable(refs_by_language)
# One row per publication that has an abstract; column n holds the number of
# "abstract" rows found for that publication
refs_with_abstract <- dplyr::count(
  dplyr::filter(references_df, .data$key1 == "abstract"),
  .data$record_id,
  .data$rec_number,
  .data$ref_type_name
)

n_pubs_with_abstract <- nrow(refs_with_abstract)

### Percent of Publications with Abstracts
percent_pubs_with_abstracts <- round(
  100 * nrow(refs_with_abstract) / n_publications,
  1
)
r percent_pubs_with_abstracts
percent (i.e.
r sprintf("%d / %d", n_pubs_with_abstract, n_publications)
publications) have an abstract.
# Publications whose abstract is spread over more than one row, ordered by
# decreasing number of rows
refs_with_abstract_and_linebreaks <- dplyr::arrange(
  dplyr::filter(refs_with_abstract, .data$n > 1),
  dplyr::desc(.data$n)
)

n_pubs_with_abstract_and_linebreaks <- nrow(
  refs_with_abstract_and_linebreaks
)

knitr::kable(refs_with_abstract_and_linebreaks)
However, some of these abstracts contain line breaks; this affects in total
r n_pubs_with_abstract_and_linebreaks
publications as shown in the table above.
These need to be corrected manually!
Is the accessibility level for the reference defined in the field "caption" (confidential or not)?
# Number of references per accessibility level (field "caption") and their
# share (in percent, one decimal) of all rows in refs_df
refs_with_accessability_level <- refs_df %>%
  dplyr::count(.data$caption) %>%
  dplyr::mutate(percent = round(100 * .data$n / nrow(refs_df), 1))

# A level counts as explicitly defined when "caption" is not NA. The rows are
# selected by value instead of by position (formerly n[1:2]) so that the sum
# stays correct for any number of distinct caption values.
n_pubs_with_accessability_level <- sum(
  refs_with_accessability_level$n[
    !is.na(refs_with_accessability_level$caption)
  ]
)

# NOTE(review): n_publications is derived from references_df whereas the
# counts above are derived from refs_df - confirm that both cover the same
# set of publications.
percent_with_accessibility_level <- round(
  100 * n_pubs_with_accessability_level / n_publications,
  1
)

# %in% yields FALSE (not NA) for missing captions
is_confidential <- refs_with_accessability_level$caption %in% "confidential"

knitr::kable(refs_with_accessability_level)
Only for r n_pubs_with_accessability_level
(i.e. r percent_with_accessibility_level
percent) of the publications,
meta information on the accessibility level is explicitly defined, out of which
r refs_with_accessability_level$n[is_confidential]
are defined as confidential (i.e.
r refs_with_accessability_level$percent[is_confidential]
percent of all publications).
Recommendation:
Fill the field caption for all publications with either:
public or
confidential
Group references by "label" (i.e. "project names")
# References for which the field "label" (project name) is filled
refs_with_project <- refs_df[!is.na(refs_df[["label"]]), ]

n_pubs_with_project_meta <- nrow(refs_with_project)

### in percent
percent_pubs_with_project_meta <- round(
  100 * n_pubs_with_project_meta / n_publications,
  1
)

# Number of references per distinct spelling of the project name
unique_project_names <- dplyr::count(refs_with_project, .data$label)

knitr::kable(unique_project_names)
In total r percent_pubs_with_project_meta
percent (i.e.
r sprintf("%d / %s", n_pubs_with_project_meta, n_publications)
publications)
contain meta-information on the project. However, the spelling of project names
does not follow a controlled vocabulary yet as can be seen in the table above.
Thus it is recommended to establish a controlled vocabulary for project identifiers according to the best-practices defined in the FAKIN project (see here).
# Number of author entries per author name as entered in Endnote, most
# frequent name first
refs_by_author_lastfirst <- references_df %>%
  dplyr::filter(.data$key2 == "authors") %>%
  dplyr::count(.data$value) %>%
  dplyr::arrange(dplyr::desc(.data$n))

# Plot at most the 50 most frequent names. head() returns only the existing
# rows, whereas [1:50, ] would pad the data with NA rows if there are fewer
# than 50 distinct names.
p2 <- kwb.endnote::plot_pubs_by_author(head(refs_by_author_lastfirst, 50))

plotly::ggplotly(p2)
The figure above is quite messy. Thus, author names should be entered in Endnote in the following format:
Lastname, Firstname
Also note that "," is also listed as an author! This is caused by a wrongly entered semicolon in the Endnote database and needs to be fixed there for at least the following entries:
# Records that contain an author entry consisting only of ","; column n holds
# the number of such entries per record, highest count first
pubs_with_semicolon_as_author <- dplyr::arrange(
  dplyr::count(
    dplyr::filter(
      references_df,
      .data$key2 == "authors",
      .data$value == ","
    ),
    .data$rec_number
  ),
  dplyr::desc(.data$n)
)

n_pubs_with_semicolon_as_author <- nrow(pubs_with_semicolon_as_author)

knitr::kable(pubs_with_semicolon_as_author)
In total r n_pubs_with_semicolon_as_author
publications contain one or more
semicolon authors. This needs to be fixed manually in the Endnote DB.
For now, data munging is needed to use just the authors' last names for aggregating the data:
# Number of author entries per (approximate) last name, most frequent first
refs_by_author_last <- references_df %>%
  dplyr::filter(.data$key2 == "authors") %>%
  # Drop everything from the first comma on ("Lastname, Firstname")
  dplyr::mutate(value = stringr::str_remove_all(.data$value, ",.*")) %>%
  # Drop a leading word (optionally followed by a dot) and the whitespace
  # after it ("Firstname Lastname", "F. Lastname")
  dplyr::mutate(
    value = stringr::str_remove_all(.data$value, "^\\w+\\.?\\s+")
  ) %>%
  dplyr::count(.data$value) %>%
  dplyr::arrange(dplyr::desc(.data$n))

# Plot at most the 50 most frequent names. head() returns only the existing
# rows, whereas [1:50, ] would pad the data with NA rows if there are fewer
# than 50 distinct names.
p3 <- kwb.endnote::plot_pubs_by_author(head(refs_by_author_last, 50))

plotly::ggplotly(p3)
Journal names need to be harmonised. It also needs to be checked what the differences between the full-title and the secondary-title of a journal are.
Defined in: $record
$periodical$full-title
$style
[[1]]
# All rows that belong to references of type "Journal Article"
pubs_in_journals <- dplyr::filter(
  references_df,
  .data$ref_type_name == "Journal Article"
)

# Number of distinct journal article records
n_pubs_in_journals <- length(unique(pubs_in_journals[["record_id"]]))

# Number of occurrences per journal name taken from the "full-title" field;
# source_field documents the origin as "<key1>_<key2>"
journal_title_full <- dplyr::count(
  dplyr::mutate(
    dplyr::filter(pubs_in_journals, .data$key2 %in% "full-title"),
    source_field = sprintf("%s_%s", .data$key1, .data$key2)
  ),
  .data$value,
  .data$source_field
)

knitr::kable(journal_title_full)
Defined in: $record
$titles$secondary-title
$style
[[1]]
# Number of occurrences per journal name taken from the "secondary-title"
# field; source_field documents the origin as "<key1>_<key2>"
journal_title_secondary <- dplyr::count(
  dplyr::mutate(
    dplyr::filter(pubs_in_journals, .data$key2 %in% "secondary-title"),
    source_field = sprintf("%s_%s", .data$key1, .data$key2)
  ),
  .data$value,
  .data$source_field
)

knitr::kable(journal_title_secondary)
All r n_pubs_in_journals
journal articles should have a DOI. Let's check
for how many we have this metadata:
# Rows of journal articles with a non-empty "electronic-resource-num" field
has_doi <- pubs_in_journals %>%
  dplyr::filter(
    .data$key1 == "electronic-resource-num",
    .data$value != ""
  )

# Count distinct records instead of rows: n_pubs_in_journals, which this
# number is compared with, is also a count of distinct record ids, so a record
# contributing more than one row must not be counted more than once.
n_pubs_in_journals_with_doi <- length(unique(has_doi$record_id))

knitr::kable(has_doi)
Only r n_pubs_in_journals_with_doi
(r round(100*n_pubs_in_journals_with_doi/n_pubs_in_journals,1)
%) journal publications have a DOI which is encoded in the field electronic-resource-num.
All DOIs should be entered in the field electronic-resource-num in the following format, i.e.:
10.6084/m9.figshare.828487
Write references dataframe to xlsx with one sheet for each publication type
to references.xlsx
# Export the references contained in endnote_list to an xlsx file
# (according to the text of this vignette: references.xlsx)
kwb.endnote::write_references_df_to_xlsx(endnote_list)
Also write a cleaned version, after automatically running
kwb.endnote::clean_references_df()
to references_cleaned.xlsx
.
# Export the cleaned references contained in endnote_list to an xlsx file
# (according to the text of this vignette: references_cleaned.xlsx)
kwb.endnote::write_clean_references_df_to_xlsx(endnote_list)
# Entries of the Endnote library reported as problematic by kwb.endnote
problematic_entries <- kwb.endnote::check_problematic_entries(endnote_list)

# Show at most the first 100 entries. head() returns only the existing rows,
# whereas [1:100, ] would pad the table with NA rows if there are fewer than
# 100 entries.
knitr::kable(head(problematic_entries, 100))
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.