Nothing
#' @title Wrangle Cristin metadata into Zotero-type format
#' @description The little engine that could, at least try, to wrangle metadata
#' from Cristin into a Zotero-type matrix. References can be augmented, or
#' replaced with metadata gathered through ISBN or DOI identifiers, and
#' through Crossref queries if all other fails. Or, hope the best and place
#' your misguided faith in the ability of researchers to correctly register
#' their own publications.
#' @param data Tibble containing metadata from Cristin
#' @param use.identifiers Use ISBN/DOI identifiers if enabled, Default: TRUE
#' @param crossref.search Query Crossref database based on title, authors, and
#' date if enabled, Default: FALSE
#' @param autosearch Results could be automatically evaluated (based on some
#' logic) or you could inspect them manually if set to FALSE, Default: TRUE
#' @param override Put your faith in the algorithms and the identifiers (i.e.,
#' DOI/ISBN) and override what is reported in Cristin, Default: FALSE
#' @param silent Running silent, running deep, Default: FALSE
#' @param polite Please store your email in `.Renviron` to query Crossref,
#' Default: TRUE
#' @param log A list for storing log elements, Default: list()
#' @return A list with two elements: `results`, the Zotero-type matrix
#' (tibble) with one entry per wrangled reference (`NULL` if `data` has no
#' rows), and `log`, the accumulated log elements.
#' @details Please see
#' \href{https://oeysan.github.io/c2z/}{https://oeysan.github.io/c2z/}
#' @examples
#' \donttest{
#'   # Simple `Cristin` search by id with import set as FALSE
#'   example <- Cristin(id = "840998", zotero.import = FALSE)
#'
#'   if (any(nrow(example$results))) {
#'     # Check if results are supported using `CristinSupported`
#'     example.supported <- CristinSupported(example$results)
#'
#'     # Use `ZoteroIndex` to print `CristinWrangler`
#'     ZoteroIndex(CristinWrangler(example.supported$data)$results) |>
#'       dplyr::select(name) |>
#'       print(width = 80)
#'   }
#' }
#' @seealso
#'  \code{\link[tidyr]{unnest}}, \code{\link[tidyr]{nest}}
#'  \code{\link[dplyr]{filter}}, \code{\link[dplyr]{across}},
#'  \code{\link[dplyr]{reexports}}, \code{\link[dplyr]{bind}},
#'  \code{\link[dplyr]{mutate}}, \code{\link[dplyr]{distinct}}
#'  \code{\link[httr]{RETRY}}
#'  \code{\link[tibble]{tibble}}
#' @rdname CristinWrangler
#' @export
CristinWrangler <- \(data,
                     use.identifiers = TRUE,
                     crossref.search = FALSE,
                     autosearch = TRUE,
                     override = FALSE,
                     silent = FALSE,
                     polite = TRUE,
                     log = list()) {

  # Visible bindings
  ## NB: this also initialises `external.data` and `results` as NULL, which
  ## the code below relies on
  title <- date_published <- year_published <- firstName <- creatorType <-
    lastName <- filter.name <- external.data <- results <- id <- name <-
    key <- . <- NULL

  # Converting to Zotero-type format
  log <- LogCat(
    sprintf(
      "Converting %s from Cristin to Zotero format",
      Pluralis(nrow(data), "reference")
    ),
    silent = silent,
    log = log
  )

  # List of initial states
  ## Both values are modified inside the loop and reset for each reference
  initial.state <- list(
    external.data = external.data,
    crossref.search = crossref.search
  )

  # Start time for query
  query.start <- Sys.time()

  # Define ETA log up front so that empty input does not fail after the loop
  log.eta <- NULL

  # Cycle through references
  ## Should perhaps vectorize, but for loop seems more informative tbh
  for (i in seq_len(nrow(data))) {

    # Define data
    x <- data[i, ]
    meta <- list()
    external.data <- initial.state$external.data
    crossref.search <- initial.state$crossref.search

    # Set item type
    meta$itemType <- x$category

    # Fetch title
    ## Prefer the title in the original language, else any non-missing title
    meta$title <- GoFish(x$title[x$original_language])
    if (is.na(meta$title)) meta$title <- GoFish(x$title[!is.na(x$title)])
    meta$title <- CleanText(meta$title)

    # Get contributors from contributors url
    ## Cristin has for some reason placed them in a different table
    httr.get <- Online(
      httr::RETRY(
        "GET",
        sprintf(
          "https://api.cristin.no/v2/results/%s/contributors",
          x$cristin_result_id
        ),
        quiet = TRUE
      ),
      silent = TRUE,
      message = "Cristin contributors"
    )
    # Add to log
    log <- append(log, httr.get$log)

    # Create zotero type creator matrix
    if (!httr.get$error) {

      # Set creators matrix
      get.creators <- httr.get$data |>
        JsonToTibble()

      # Loop through rows and set creators
      if (any(nrow(get.creators))) {
        meta$creators <- ZoteroCreator(
          lapply(seq_len(nrow(get.creators)), \(i) {
            list(
              type = GoFish(
                get.creators[i, ]$affiliations[[1]]$role_code
              ),
              name = GoFish(c(
                get.creators[i, ]$surname,
                get.creators[i, ]$first_name
              ))
            )
          })
        )
      }
    }

    # Fetch abstract
    meta$abstractNote <- ToString(
      GoFish(x$summary[, x$original_language], ""),
      "\n"
    )
    # Fetch type of thesis
    meta$thesisType <- GoFish(x$type)
    # Fetch institution
    meta$institution <- GoFish(x$publisher$name)
    # Fetch series title
    meta$seriesTitle <- GoFish(x$series$name)
    # Fetch report number
    meta$reportNumber <- GoFish(x$issue)
    # Fetch presentation type
    meta$presentationType <- GoFish(x$type)
    # Fetch presentation place
    if (is.null(meta$place)) meta$place <- GoFish(x$event$location)
    # Fetch presentation type/name/event
    meta$meetingName <- GoFish(x$event$name)
    # Fetch university
    meta$university <- GoFish(x$publisher$name)
    # Fetch date (fall back to year of publication)
    meta$date <- GoFish(x$date_published)
    if (is.na(meta$date)) meta$date <- GoFish(x$year_published)
    # Fetch number of pages
    meta$numPages <- as.character(GoFish(x$number_of_pages))
    # Fetch language
    meta$language <- x$original_language
    # Fetch ISBN
    meta$ISBN <- GoFish(
      ToString(x$international_standard_numbers[[1]]$value)
    )
    # Fetch publisher
    meta$publisher <- GoFish(x$publisher$name)
    # Fetch publication place
    if (is.null(meta$place)) meta$place <- GoFish(x$publisher$place)
    # Fetch series name
    meta$series <- GoFish(x$series$name)
    # Fetch series number
    meta$seriesNumber <- GoFish(x$issue)
    # Fetch title of journal
    meta$publicationTitle <- GoFish(x$journal$name)
    # Fetch pages
    meta$pages <- GoFish(
      ToString(x$pages[1:2][!is.na(x$pages[1:2])], "-")
    )
    # Fetch volume of journal
    meta$volume <- GoFish(x$volume)
    # Fetch issue of journal
    meta$issue <- GoFish(x$issue)

    # Check NVI
    ## First non-missing level wins: journal, journal publisher, publisher,
    ## series
    nvi <- GoFish(x$journal$nvi_level)
    if (is.na(nvi)) nvi <- GoFish(x$journal$publisher$nvi_level)
    if (is.na(nvi)) nvi <- GoFish(x$publisher$nvi_level)
    if (is.na(nvi)) nvi <- GoFish(x$series$nvi_level)

    # Extract DOI
    ## Only the first comma-separated value of the first DOI link is used
    meta$DOI <- GoFish(
      strsplit(
        x$links[[1]][x$links[[1]]$url_type == "DOI", ]$url[[1]], ","
      )[[1]][1]
    )

    # Extract URLS (Fulltext preferred )
    urls <- c("FULLTEKST", "SAMMENDRAG", "DATA", "PROSJEKT", "OMTALE", "ARKIV")
    # Fetch url
    meta$url <- GoFish(
      x$links[[1]]$url[x$links[[1]]$url_type %in% urls][[1]]
    )
    # Take any url if empty
    if (is.na(meta$url)) meta$url <- GoFish(x$links[[1]]$url[[1]])

    # Try to scrape Cristin App if DOI is empty
    if (is.na(meta$DOI) & use.identifiers) {

      # Define result url
      cristin.url <- paste0(
        "https://app.cristin.no/results/show.jsf?id=", x$cristin_result_id
      )

      # Find results links
      ## `cristin.url` is already complete; request it once and directly
      httr.get <- Online(
        httr::RETRY("GET", cristin.url, quiet = TRUE),
        silent = TRUE,
        message = "Cristin external URLs"
      )
      # Add to log
      log <- append(log, httr.get$log)

      if (!httr.get$error) {
        # Text of the links listed in the fact box of the result page
        cristin.urls <- httr.get$data |>
          rvest::read_html() |>
          rvest::html_nodes(".fact-box-section .img-with-text-textbox a") |>
          rvest::html_text()

        # Filter out URLs
        if (length(cristin.urls)) {
          # Set DOI if exist
          if (any(grepl("^.*(10\\..*)", cristin.urls))) {
            meta$DOI <- cristin.urls[grepl("^.*(10\\..*)", cristin.urls)][[1]]
          }
        }
      }
    }

    # Check if url is in fact a DOI and set as DOI if DOI is empty
    if (is.na(meta$DOI)) {
      if (grepl("^.*(10\\..*)", meta$url)) meta$DOI <- meta$url
    }

    # Remove URL from DOI
    ## Keeps everything from the "10." prefix and onwards
    if (!is.na(meta$DOI)) {
      meta$DOI <- Trim(gsub(
        "^.*(10\\..*)",
        "\\1",
        meta$DOI
      ))
    }

    # Find tags
    ## What do you mean it's convoluted?
    tags <- GoFish(
      c(
        as.character(
          unlist(x$classification$scientific_disciplines[[1]]$name)
        ),
        as.character(
          unlist(x$classification$keywords[[1]]$name)
        )
      )
    )
    meta$tags <- if (any(!is.na(tags))) tibble::tibble(tag = tags) else NA

    # Check if items is part of a whole
    if (!is.na(GoFish(x$part_of$url))) {

      # Fetch reference book from Cristin
      external.data <- Cristin(
        id = basename(x$part_of$url),
        silent = TRUE,
        force.type = "book",
        remove.na = FALSE,
        zotero.check = FALSE,
        use.identifiers = use.identifiers,
        crossref.search = crossref.search,
        autosearch = autosearch,
        override = override
      )$results

      # Change itemType if book metadata is not empty
      if (any(nrow(external.data))) {

        # Change creator-type of external.data
        ## Authors of the containing book become editors of the section
        if (any(!is.na(external.data$creators[[1]]))) {
          external.data$creators[[1]] <- external.data$creators[[1]] |>
            dplyr::mutate(
              creatorType = dplyr::case_when(
                creatorType == "author" ~ "editor",
                TRUE ~ creatorType
              )
            )
        }

        # Set metadata as bookChapter
        meta$itemType <- "bookSection"
        # Append creators of book to zotero type creator matrix
        meta$creators <- AddAppend(
          meta$creators,
          external.data$creators[[1]]
        ) |>
          dplyr::distinct()
        # Set book title
        meta$bookTitle <- GoFish(external.data$title)
        # Fetch ISBN
        meta$ISBN <- GoFish(external.data$ISBN)
        # Fetch edition
        meta$edition <- GoFish(EditionFix(external.data$edition))
        # Set NVI if NVI is NA
        if (is.na(nvi) & "extra" %in% names(external.data)) {
          nvi <- ZoteroId("NVI", external.data$extra)
        }
        # Fetch publisher
        meta$publisher <- GoFish(external.data$publisher)
        # Fetch publication place
        meta$place <- GoFish(external.data$place)
        # Fetch url
        if (is.na(meta$url)) meta$url <- GoFish(external.data$url)
        # Set any DOI as extra
        if (!is.na(meta$DOI)) meta$extra <- sprintf(
          "DOI: %s", meta$DOI
        )
      }
    }

    # If NVI found add to extra
    if (!is.na(nvi)) meta$extra <- AddAppend(
      sprintf("NVI: %s", nvi), meta$extra, "\n"
    )

    # Set Cristin reference in extra
    meta$extra <- AddAppend(
      sprintf("Cristin: %s", x$cristin_result_id), meta$extra, "\n"
    )

    # Fix instances where creatorType does not match itemType
    if ("editor" %in% GoFish(meta$creators$creatorType)) {
      editor.items <- c("book", "bookSection", "conferencePaper",
                        "dictionaryEntry", "document", "encyclopediaArticle",
                        "journalArticle", "preprint")
      if (meta$itemType == "report" & is.na(GoFish(meta$bookTitle))) {
        meta$creators$creatorType[
          meta$creators$creatorType == "editor"] <- "serieseditor"
      } else if (!meta$itemType %in% editor.items &
                 !is.na(GoFish(meta$bookTitle))) {
        meta$itemType <- "bookSection"
      } else if (!meta$itemType %in% editor.items) {
        meta$itemType <- "book"
      }
    }

    # Check ISBN/DOI if use.identifiers is set to TRUE
    ## Omit if itemtype not supported (yet)
    ## Also skipped if external.data was set by the part_of lookup above
    not.supported <- c("thesis", "newspaperArticle", "presentation")
    if (use.identifiers &
        !meta$itemType %in% not.supported &
        is.null(external.data)) {

      # Use CrossRef if DOI is defined
      if (!is.na(meta$DOI)) {
        doi <- ZoteroDoi(meta$DOI)
        external.data <- doi$data
        log <- append(log, doi$log)
        # Stop check function for running search again
        if (!is.null(external.data)) crossref.search <- FALSE
      } # End DOI

      # Use MARC 21 if ISBN is defined and external data is NULL
      ## Only use if part_of is NA
      ## Only the first comma-separated ISBN is looked up
      if (!is.na(meta$ISBN) & is.null(external.data)) {
        isbn <- ZoteroIsbn(GoFish(strsplit(meta$ISBN, ",")[[1]][[1]]))
        external.data <- isbn$data
        log <- append(log, isbn$log)
      } # End ISBN

      # Search Crossref if enabled and external data is NULL
      # Items to search for
      meta.search <- c("book", "bookSection", "journalArticle", "report")
      if (crossref.search &
          is.null(external.data) &
          meta$itemType %in% meta.search &
          any(!is.na(meta$creators))) {

        zotero.match <- ZoteroMatch(
          title = meta$title,
          authors = meta$creators,
          date = meta$date,
          crossref.search = TRUE,
          autosearch = autosearch,
          cristin.data = meta,
          external.data = external.data,
          polite = polite,
          silent = TRUE,
          log = log
        )
        # Set external data
        external.data <- zotero.match$data
        # Add to log
        log <- append(log, zotero.match$log)
        # Stop check function for running search again
        crossref.search <- FALSE

      } # End search data

      # Test whether external data matches metadata from Cristin
      if (!is.null(external.data) &
          any(!is.na(meta$creators)) &
          !override) {

        # Set external.data as creator if nrow >= Cristin data
        ## Usually happens if Cristin metadata exclude editors
        if (nrow(external.data$creators[[1]]) >= nrow(meta$creators)) {
          creators <- external.data$creators[[1]]
        } else {
          creators <- meta$creators
        }

        zotero.match <- ZoteroMatch(
          title = meta$title,
          authors = creators,
          date = meta$date,
          haystack = list(
            title = list(GoFish(external.data$title)),
            authors = GoFish(external.data$creators),
            date = list(GoFish(external.data$date))
          ),
          crossref.search = crossref.search,
          autosearch = autosearch,
          cristin.data = meta,
          external.data = external.data,
          polite = polite,
          silent = TRUE,
          log = log
        )
        # Set external data
        external.data <- zotero.match$data
        # Add to log
        log <- append(log, zotero.match$log)

      } # End check data

      # Augment external data with Cristin data
      if (!is.null(external.data)) {

        # Fix any stray tags
        if ("tags" %in% names(external.data)) {
          if (is.character(external.data$tags)) {
            external.data$tags <- list(
              tibble::tibble(tag = external.data$tags)
            )
          }
        } else {
          external.data$tags <- NA
        }

        # Append if any tags
        if (is.data.frame(external.data$tags)) {
          meta$tags <- GoFish(AddAppend(
            external.data$tags, meta$tags
          ))
        } else if (is.data.frame(meta$tags)) {
          meta$tags <- GoFish(AddAppend(
            meta$tags, external.data$tags
          ))
        }

        # Create Zotero-type matrix from Cristin data
        meta <- GoFish(ZoteroFormat(meta), NULL)

        # Fill in any NA in external.data with cristin.data
        ## NB: `x` inside the function is a column name and only shadows the
        ## current reference `x` locally
        external.data <- dplyr::bind_cols(
          lapply(names(external.data), \(x) {
            if (is.na(external.data[, x])) {
              external.data[, x] <- GoFish(meta[, x])
            } else {
              external.data[, x]
            }
            return(external.data[, x])
          })
        )

        # Find unique creators
        external.data$creators[[1]] <- AddMissing( # Add names if missing
          external.data$creators[[1]],
          c("name",
            "firstName",
            "lastName"),
          na.type = NA_character_,
          location = NULL) |>
          # Coalesce firstName and name as filter.name
          ## Only the part before the first space is kept
          dplyr::mutate(id = dplyr::row_number(),
                        filter.name = gsub(
                          " .*$",
                          "",
                          dplyr::coalesce(firstName, name)
                        )) |>
          # Find unique rows based on type, lastName, filter.name and name
          dplyr::distinct(
            creatorType,
            lastName,
            filter.name,
            .keep_all = TRUE,
            name
          ) |>
          dplyr::select(-c(id, filter.name))

        # Add or append extra data
        external.data$extra <- AddAppend(
          meta$extra, external.data$extra, "\n"
        )

        # Set external data as metadata
        meta <- external.data

      } # End combine external and Cristin data

    } # End use.identifiers

    # Set meta as Cristin if not defined by external data
    if (!is.data.frame(meta)) meta <- GoFish(ZoteroFormat(meta), NULL)

    # Replace any keys with existing
    if (!any(is.na(GoFish(x$key)))) {
      meta$key <- x$key
      meta$version <- GoFish(x$version, 0)
      meta$collections <- GoFish(x$collections, NA)
    }

    # Set accessDate
    meta$accessDate <- format(Sys.time(), format = "%Y-%m-%dT%H:%M:%S%z")

    # Remove any missing creators
    ## Drops creator rows where every column is NA
    if (any(!is.na(GoFish(meta$creators[[1]])))) {
      meta$creators[[1]] <- meta$creators[[1]] |>
        dplyr::filter_all(dplyr::any_vars(!is.na(.)))
    }

    # Set creators as NA if no creators
    if (is.null(GoFish(meta$creators[[1]], NULL))) meta$creators <- NA

    # Remove hyphens from ISBN
    ## Strips everything except alphanumeric characters and spaces
    if (!is.na(GoFish(meta$ISBN))) meta$ISBN <- Trim(
      gsub('[^[:alnum:] ]', "", meta$ISBN)
    )

    # Add or append results
    results <- AddAppend(meta, results)

    # Add ID to ETA message
    ## There is no next reference on the last iteration; use NA rather than
    ## indexing past the last row of `data`
    next.id <- if (i < nrow(data)) data[i + 1, ]$cristin_result_id else NA
    eta.message <- paste("Next ID:", next.id)

    # Estimate time of arrival
    ## Overwritten each iteration; only the last one is added to the log
    log.eta <-
      LogCat(
        Eta(query.start, i, nrow(data), eta.message),
        silent = silent,
        flush = TRUE,
        log = log,
        append.log = FALSE
      )
  }

  # Add to log
  log <- append(log, log.eta)

  return(list(results = results, log = log))

}
Any scripts or data that you put into this service are public.
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.