### ctrdata package
### utility functions
#### variable definitions ####

# template for the value returned when nothing was imported
emptyReturn <- list(
  n = 0L,
  success = NULL,
  failed = NULL
)

# country codes used with EUCTR; the last element
# is the label for trials conducted in third countries
countriesEUCTR <- c(
  "AT", "BE", "BG", "HR", "CY", "CZ", "DK", "EE",
  "FI", "FR", "DE", "GR", "HU", "IE", "IT", "LV",
  "LT", "LU", "MT", "NL", "PL", "PT", "RO", "SK",
  "SE", "SI", "ES", "GB", "IS", "LI", "NO", "3RD"
)

# regular expressions

# - matches any character that is not permitted
#   in a query term or url
regQueryterm <- "[^-.a-zA-Z0-9=?+&#%_:\"/, {}]"

# - EUCTR identifier, e.g. 2010-022945-52
regEuctr <- "[0-9]{4}-[0-9]{6}-[0-9]{2}"

# - CTGOV identifier, NCT followed by eight digits
regCtgov <- "NCT[0-9]{8}"

# - CTGOV2 uses the same identifier as CTGOV
regCtgov2 <- regCtgov

# - ISRCTN identifier, eight digits
regIsrctn <- "[0-9][0-9]{7}"

# - CTIS identifier, e.g. 2022-501549-57-00
regCtis <- "[0-9]{4}-[0-9]{6}-[0-9]{2}-[0-9]{2}"

# labels of the supported registers
registerList <- c(
  "EUCTR",
  "CTGOV",
  "ISRCTN",
  "CTIS",
  "CTGOV2"
)
#### functions ####
#' ctgovVersion
#'
#' Checks for mismatch between label CTGOV and CTGOV2
#' and tries to guess the correct label
#'
#' @param url url or data frame with query term; if not atomic,
#'   its element `query-term` is used (as in output of dbQueryHistory)
#' @param register any of the register names
#'
#' @keywords internal
#' @noRd
#'
#' @returns string: "CTGOV" if the url has parameters specific to the
#'   classic website, "CTGOV2" if it has parameters specific to the
#'   current website, otherwise `register` unchanged
#'
#' @examples
#'
#' ctgovVersion("", "")
#' ctgovVersion("cond=cancer&age=1", "CTGOV2")
#' ctgovVersion("cond=cancer&country=DE", "CTGOV")
#' ctgovVersion(",%20NCT04368728", "")
#' ctgovVersion("term=NCT04412252,%20NCT04368728", "CTGOV2")
#'
ctgovVersion <- function(url, register) {

  # in case the input is from dbQueryHistory: extract the query
  # term; input that cannot be subsetted by name yields NULL
  # here so that the register label is returned unchanged
  if (!is.atomic(url)) {
    url <- tryCatch(
      url[["query-term"]],
      error = function(e) NULL)
  }

  # nothing to inspect (NULL or zero-length input)
  if (!length(url)) return(register)

  # logic 1, parameters that are classic-specific
  classicPattern <- paste0(
    "[?&]age=|[?&]cntry=|",
    "[?&][a-z]+_[a-z]+=") # e.g. strd_s
  if (grepl(classicPattern, url)) {
    message("* Appears specific for CTGOV Classic website")
    return("CTGOV")
  }

  # logic 2, clear identifiers of CTGOV2: parameter country,
  # or a colon (literal or url-encoded) that is not followed
  # by a slash, thus not part of "https://"
  currentPattern <- "[?&]country=|[:][^/]|%3[aA]"
  if (grepl(currentPattern, url)) {
    message("* Appears specific for CTGOV REST API 2.0")
    return("CTGOV2")
  }

  # default return
  message("Not overruling register label ", register)
  return(register)

} # end ctgovVersion
#' ctgovClassicToCurrent
#'
#' Fully translates a user's search query URL from the classic website
#' into a query for the current website, with all search parameters.
#' Added to accommodate classic website retirement as of 2024-06-25.
#' Note this function only handles search queries, but not display URLs.
#' The function is to be called by ctrGetQueryUrl(), which turns
#' search and display URLs into queries. See also
#' ./inst/tinytest/more_test_ctrdata_param_checks.R
#'
#' @param url url intended for a search in the classic CTGOV website
#' @param verbose if `TRUE` (default), prints a message that shows
#' the url that was input and the url it was translated into
#'
#' @keywords internal
#' @noRd
#'
#' @importFrom countrycode countrycode
#' @importFrom utils URLdecode
#'
#' @returns string url suitable for a search current CTGOV website
#'
#' @examples
#'
#' ctgovClassicToCurrent(",%20NCT04368728")
#' ctgovClassicToCurrent("")
#' ctgovClassicToCurrent("")
#' ctgovClassicToCurrent("")
#' ctgovClassicToCurrent("[MaximumAge]+RANGE[0+days,+28+days]")
#'
ctgovClassicToCurrent <- function(url, verbose = TRUE) {
# apiParams is a kind of dictionary for
# mapping classic to current params
# - not matched:
# CTGOV2 studyComp
# CTGOV dist
# CTGOV rsub
#
# each entry is a list with these elements:
# - "extract", patterns tested against each classic parameter
# - "replace", the corresponding replacements
# - "collapse", separator used when merging several matches
# - "out", collector for translated items, initially empty
#
# entries with a name ending in ":" are parts of aggFilters
#
# NOTE(review): in this view, most "extract" and "replace" vectors
# have no visible elements and the closing parentheses of the
# entries are not visible; confirm against the complete file
apiParams <- list(
# start aggFilters
"ages:" = list(
"extract" = c(
"replace" = c(
"collapse" = " ",
"out" = character()
"phase:" = list(
"extract" = c(
"replace" = c(
"collapse" = " ",
"out" = character()
"docs:" = list(
"extract" = c(
"replace" = c(
"collapse" = " ",
"out" = character()
"results:" = list(
"extract" = c(
"replace" = c(
"collapse" = " ",
"out" = character()
"funderType:" = list(
"extract" = c(
"replace" = c(
"industry", # 2
"nih", # 0
"fed", # 1
"other"), # 3
"collapse" = " ",
"out" = character()
"studyType:" = list(
"extract" = c(
"replace" = c(
"int", # Interventional
"obs", # Observational
"obs_patreg", # Patient registries
"exp", # Expanded access
"exp_treat", # Treatment IND/Protocol
"exp_indiv", # Individual patients
"exp_inter" # Intermediate-size population
"collapse" = " ",
"out" = character()
"sex:" = list(
"extract" = c(
"replace" = c(
"collapse" = " ",
"out" = character()
"healthy:" = list(
"extract" = "hlth=Y",
"replace" = "y",
"collapse" = " ",
"out" = character()
"violation:" = list(
"extract" = "f801=Yes",
"replace" = "y",
"collapse" = " ",
"out" = character()
"status:" = list(
"extract" = c(
"replace" = c(
"rec", # Recruiting
"act", # Active, not recruiting
"not", # Not yet recruiting
"com", # Completed
"ter", # Terminated
"enr", # Enrolling by invitation
"sus", # Suspended
"wit", # Withdrawn
"unk", # Unknown
"ava", # Available
"nla", # No longer available
"tna", # Temporarily not available
"afm"), # Approved for marketing
"collapse" = " ",
"out" = character()
# end aggFilters
# dates; items are collapsed with "@" so that they can be
# split again further below when merging date parameters
"dates" = list(
"extract" = list(
"replace" = list(
"collapse" = "@",
"out" = list()
# translate simple terms
"extract" = c(
"replace" = c(
"collapse" = "",
"out" = character()
) # apiParams
## now operate on the input
# mangle input: decode url and turn plus signs into spaces
queryterm <- utils::URLdecode(url)
queryterm <- gsub("[+]", " ", queryterm)
# some specifics found by chance: recr=Open and recr=Closed
# are each expanded into a series of recrs= parameters
queryterm <- sub("[?&]recr=Open", "&recrs=b&recrs=a&recrs=c", queryterm)
queryterm <- sub("[?&]recr=Closed", "&recrs=f&recrs=d&recrs=g&recrs=h&recrs=e&recrs=i&recrs=m&recrs=j&recrs=k&recrs=l", queryterm)
# split and focus on parameters: drop the part that
# starts with https:// and drop any empty parts
queryterm <- strsplit(queryterm, split = "[&?]")[[1]]
queryterm <- queryterm[!grepl("^https://", queryterm)]
queryterm <- queryterm[queryterm != ""]
# iterate over API terms: every parameter of the input is tested
# against every "extract" pattern of every dictionary entry
for (t in seq_along(queryterm)) {
for (a in seq_along(apiParams)) {
for (i in seq_along(apiParams[[a]][["extract"]])) {
if (grepl(apiParams[[a]][["extract"]][[i]], queryterm[t])) {
# NOTE(review): the expression assigned to item and the call
# that takes the collapse argument are not visible in this
# view; the visible part appends item to the "out" collector
item <-
apiParams[[a]][["out"]] <-
c(apiParams[[a]][["out"]], item),
collapse = apiParams[[a]][["collapse"]]
} # if extract
} # extract
} # apiParams
} # queryterm
# merge: keep only the "out" element of each
# dictionary entry and drop entries without content
apiParams <- sapply(apiParams, "[[", "out")
apiParams <- apiParams[lapply(apiParams, length) > 0L]
# handle two dates parameters into one: items are split at "@",
# and for each unique parameter name (the part before "=")
# the values of the items matching that name are combined
# using "_", with repeated underscores reduced to a single one
if (length(apiParams[["dates"]])) {
tmpSplit <- strsplit(apiParams[["dates"]], "@", fixed = TRUE)[[1]]
apiParams[["dates"]] <- ""
for (t in unique(sub("(.+)=.+", "\\1", tmpSplit))) {
apiParams[["dates"]] <- paste0(c(
apiParams[["dates"]], paste0(
t, "=", sub(
"_+", "_",
sub(".+=(.+)", "\\1", tmpSplit[grepl(t, tmpSplit)]),
collapse = "_")),
collapse = "")),
collapse = "&")
# handle parts within aggFilter: for entries with ":" in their
# name, prefix the name to the de-duplicated, space-separated values
for (t in seq_along(apiParams)) {
if (grepl(":", names(apiParams[t]))) apiParams[t] <- paste0(
names(apiParams[t]), paste0(
unique(strsplit(apiParams[[t]], " ")[[1]]), collapse = " ")
# merge other and aggFilter parts
# NOTE(review): the leading part of the url and the nested calls
# that take the collapse arguments are not visible in this view
apiParams <- paste0(
unique(apiParams[!grepl(":", names(apiParams))]),
collapse = ""),
unique(apiParams[grepl(":", names(apiParams))]),
collapse = ",")
# handle country: extract the upper-case code after country=
# and, if one was found, replace it using countrycode::countrycode()
# NOTE(review): the destination scheme given to countrycode() is an
# empty string here and the replacing call is not visible; confirm
if (grepl("[?&]country=[^$&]", apiParams)) {
countryCode <- sub(".+([?&]country=)([A-Z]+)([$&]).*", "\\2", apiParams)
if (countryCode != apiParams) apiParams <-
paste0("\\1", countrycode::countrycode(
countryCode, "iso2c", ""), "\\3"),
# prettify: collapse doubled ampersands, remove an empty
# trailing aggFilters parameter, remove "&" right after "search?"
apiParams <- gsub("&&", "&", apiParams)
apiParams <- gsub("&aggFilters=$", "", apiParams)
apiParams <- gsub("search[?]&", "search?", apiParams)
## inform user
# inform user
if (verbose) message(
"Since 2024-06-25, the classic CTGOV servers are no longer available. ",
"Package ctrdata has translated the classic CTGOV query URL from this ",
"call of function ctrLoadQueryIntoDb(queryterm = ...) into a query URL ",
"that works with the current CTGOV2. This is printed below and is also ",
"part of the return value of this function, ctrLoadQueryIntoDb(...)$url. ",
"This URL can be used with ctrdata functions. Note that the fields and ",
"data schema of trials differ between CTGOV and CTGOV2. "
# inform user
"\nReplace this URL:\n\n", url,
"\n\nwith this URL:\n\n", apiParams, "\n")
# return
# NOTE(review): no return statement is visible in this view
} # end ctgovClassicToCurrent
#' Check, write, read cache object for ctrdata
#'
#' With `xvalue` specified, the value is stored under the name `xname`
#' in the package's private environment and returned. Without `xvalue`,
#' the value stored under `xname` is looked up and returned.
#'
#' @param xname name of variable to read or write
#' @param xvalue value of variable to write
#' @param verbose set to `TRUE` to print debug info
#'
#' @keywords internal
#' @noRd
#'
#' @return value of variable or `NULL` if variable does not exist
#'
ctrCache <- function(xname, xvalue = NULL, verbose = FALSE) {

  # hidden environment .ctrdataenv created in zzz.R

  # write or overwrite and exit early
  if (!is.null(xvalue)) {
    assign(x = xname, value = xvalue, envir = .ctrdataenv)
    if (verbose) message("- Wrote ", xname, " to cache ")
    return(xvalue)
  }

  # check and read any value for xname variable
  if (verbose) message("- Checking cache...")
  if (exists(x = xname, envir = .ctrdataenv)) {
    tmp <- try(get(x = xname, envir = .ctrdataenv), silent = TRUE)
    # a value that cannot be retrieved is treated as not cached
    if (inherits(tmp, "try-error")) return(NULL)
    if (verbose) message("- Returning ", xname, " ")
    return(tmp)
  }

  # default
  return(NULL)

} # end ctrCache
#' Check and prepare nodbi connection object for ctrdata
#'
#' Depending on the class of `con`, checks that a collection was
#' specified, re-opens SQLite and DuckDB connections that can no
#' longer be listed, and adds elements that other ctrdata functions
#' read from the root of the connection object (`db` or `collection`,
#' and `ctrDb = TRUE`).
#'
#' @param con A connection object, see section
#' `Databases` in \link{ctrdata}.
#'
#' @keywords internal
#'
#' @importFrom nodbi src_sqlite src_duckdb docdb_list
#' @importFrom utils capture.output
#'
#' @return Connection object as list, with collection
#' element under root
#'
ctrDb <- function(con) {

  ## postgres
  if (inherits(con, "src_postgres")) {

    if (is.null(con$collection)) {
      stop("Specify attribute 'collection' with a table name, using ",
           "<nodbi src_postgres object>[[\"collection\"]] <- \"test\"), ",
           "for package ctrdata to work.",
           call. = FALSE)
    }

    # add database as element under root
    con <- c(con,
             "db" = con$dbname,
             "ctrDb" = TRUE)

    ## return; c() dropped the class, thus restore it
    return(structure(
      con,
      class = c("src_postgres", "docdb_src")))
  }

  ## sqlite
  if (inherits(con, "src_sqlite")) {

    if (is.null(con$collection)) {
      stop("Specify parameter 'collection' with a table name, ",
           "such as nodbi::src_sqlite(collection = 'test'), ",
           "for package ctrdata to work.",
           call. = FALSE)
    }

    # check; open a new connection if listing fails
    if (inherits(try(nodbi::docdb_list(con), silent = TRUE), "try-error")) {
      con <- nodbi::src_sqlite(dbname = con$dbname,
                               collection = con$collection)
    }

    # add database as element under root
    con <- c(con,
             "db" = con$dbname,
             "ctrDb" = TRUE)

    # print warning
    if (grepl(":memory:", con$dbname)) {
      warning("Database not persisting",
              call. = FALSE, noBreaks. = FALSE)
    }

    ## return; c() dropped the class, thus restore it
    return(structure(
      con,
      class = c("src_sqlite", "docdb_src")))
  }

  ## mongo
  if (inherits(con, "src_mongo")) {

    # rights may be insufficient to call info(),
    # hence this workaround that should always
    # work and be stable to retrieve name of
    # collection in the mongo connection
    # suppress... for reconnect info from mongolite
    coll <- suppressMessages(utils::capture.output(con$con)[1])
    coll <- sub("^.*'(.*)'.*$", "\\1", coll)

    # add collection as element under root
    con <- c(con,
             "collection" = coll,
             "ctrDb" = TRUE)

    ## return; c() dropped the class, thus restore it
    return(structure(
      con,
      class = c("src_mongo", "docdb_src")))
  }

  ## duckdb
  if (inherits(con, "src_duckdb")) {

    if (is.null(con$collection)) {
      stop("Specify parameter 'collection' with a table name, ",
           "such as nodbi::src_duckdb(collection = 'test'), ",
           "for package ctrdata to work.",
           call. = FALSE)
    }

    # check; open a new connection if listing fails
    if (inherits(try(nodbi::docdb_list(con), silent = TRUE), "try-error")) {
      con <- nodbi::src_duckdb(
        dbdir = attr(attr(con$con, "driver"), "dbdir"),
        collection = con$collection)
    }

    # add database as element under root
    con <- c(con,
             "db" = attr(attr(con$con, "driver"), "dbdir"),
             "ctrDb" = TRUE)

    # print warning about nodbi::src_duckdb()
    if (grepl(":memory:", attr(attr(con$con, "driver"), "dbdir"))) {
      warning("Database not persisting\n",
              call. = FALSE, noBreaks. = FALSE)
    }

    ## return; c() dropped the class, thus restore it
    return(structure(
      con,
      class = c("src_duckdb", "docdb_src")))
  }

  ## unprepared for other nodbi adapters so far
  stop("Please specify in parameter 'con' a database connection. ",
       "ctrdata supports src_mongo(), src_sqlite(), src_postgres() and src_duckdb().",
       call. = FALSE)

} # end ctrDb
#' Change type of field based on name of field
#'
#' Looks up the field name in `typeVars` to find the label of a typing
#' function. If no label is found, the values are cleaned up as text;
#' otherwise the typing function is applied to the values.
#'
#' @param dv a vector of character strings
#' @param fn a field name
#'
#' @return a typed vector, same length as dv
#'
#' @importFrom xml2 xml_text read_html
#' @importFrom lubridate duration ymd_hms dyears dmonths ddays
#'
#' @keywords internal
#' @noRd
#'
typeField <- function(dv, fn) {
# get function name
# (typeVars is not defined in this part of the file)
ft <- typeVars[[fn]]
# expand to function: each label corresponds to R code, kept as
# a string, that operates on a variable x; the string is later
# parsed and evaluated for each element of dv
# NOTE(review): the expression that switch() selects on is not
# visible in this view, and the string for "ctrYesNo" appears to
# be missing part of its condition; confirm against complete file
if (!is.null(ft)) ft <- switch(
"ctrInt" = "as.integer(x = x)",
"ctrIntList" = 'sapply(x, function(i) {i[i == "NA"] <- NA; as.integer(i)}, USE.NAMES = FALSE)',
"ctrYesNo" = 'sapply(x, function(i) if ( NA else
switch(i, "Yes" = TRUE, "No" = FALSE, NA), simplify = TRUE, USE.NAMES = FALSE)',
"ctrFalseTrue" = 'if (is.numeric(x)) as.logical(x) else
sapply(x, function(i) switch(i, "true" = TRUE, "false" = FALSE, NA), USE.NAMES = FALSE)',
"ctrDate" = 'as.Date(x, tryFormats =
c("%Y-%m-%d", "%Y-%m", "%Y-%m-%d %H:%M:%S", "%Y-%m-%dT%H:%M:%S",
"%d/%m/%Y", "%Y-%m-%dT%H:%M:%S%z"))',
"ctrDateUs" = 'as.Date(x, tryFormats = c("%b %e, %Y", "%Y-%m-%d", "%Y-%m"))',
"ctrDateTime" = "lubridate::ymd_hms(x)",
"ctrDifftime" = 'as.difftime(as.numeric(lubridate::duration(
tolower(x)), units = "days"), units = "days")',
"ctrDifftimeDays" = "lubridate::ddays(x = as.numeric(x))",
"ctrDifftimeMonths" = "lubridate::dmonths(x = as.numeric(x))",
"ctrDifftimeYears" = "lubridate::dyears(x = as.numeric(x))",
# clean up text, done only if no typing function
# was found for the field name
if (is.null(ft)) {
# - if NA as string, change to NA
# (this covers the strings NA, N/A and ND)
dv[grepl("^N/?A$|^ND$", dv)] <- NA
# - check if any html entities
htmlEnt <- grepl("&[#a-zA-Z]+;", dv)
# - convert html entities to text and symbols
# NOTE(review): the body of the innermost function, which
# does the conversion, is not visible in this view
if (any(htmlEnt) && all(sapply(dv, typeof) == "character")) {
dv[htmlEnt] <-
lapply(dv[htmlEnt], function(i) {
sapply(i, function(ii) {
# - check if possible and convert to numeric
# NOTE(review): the condition is incomplete in this view
if (all(is.numeric(dv) | dv <- as.numeric(dv)
# - collapse unless list structure is heterogenous:
# rowN1 is TRUE for elements of dv that have no names,
# rowType is the type of the unlisted element
rowN1 <- sapply(dv, function(i) is.null(names(i)))
rowN2 <- sapply(names(rowN1), function(i) is.null(i))
rowType <- sapply(dv, function(i) typeof(unlist(i, recursive = FALSE)))
if (all(rowN1) &&
all(rowN2) &&
length(unique(rowN1)) <= 1L &&
any(rowType == "character")) {
# within each element, turn carriage returns into line feeds,
# remove the leading EudraCT placeholder text, and merge
# several values into a single string using " / "
dv <- sapply(dv, function(i) {
i <- gsub("\r", "\n", i)
i <- sub("^Information not present in EudraCT", "", i)
if (length(i) > 1L) {
rowI <- paste0(i[!], collapse = " / ")
if (nchar(rowI)) rowI else NA
} else {
if (length(i) && ! i else NA
# early return
# early exit if already date or logical
if (all(sapply(dv, class) %in%
c("logical", "Date", "POSIXct", "POSIXt"))) return(dv)
# record length of input dv for NULL handling
lenDv <- length(dv)
# apply typing function, returning
# if possible a vector over list
# NOTE(review): expr, error and warning below look like arguments
# to a condition-handling call that is not visible in this view
expr = {
dv <- lapply(dv, function(x) {
# - text mangling
x <- ifelse(grepl("Information not present in EudraCT", x), NA, x)
# - give Month Year a Day to allow conversion
# (day 15 is added to values such as "May 2020" or "2020-05")
if (grepl("date", fn, = TRUE)) {
x <- sub("^ processed this data on ", "", x)
x <- sub("^([a-zA-Z]+) ([0-9]{4})$", "\\1 15, \\2", x)
x <- sub("^([0-9]{4}-[0-9]{2})$", "\\1-15", x)
# - apply function to x, by parsing and
# evaluating the code held as string in ft
eval(parse(text = ft))
# an error or a warning from typing is reported
# to the user with the field name and the values
error = function(e) {
message(fn, ": returning untyped values, as ",
ft, " raised an error when applied to ",
paste0(unlist(dv), collapse = " / "))
warning = function(w) {
message(fn, ": returning untyped values, as ",
ft, " raised a warning when applied to ",
paste0(unlist(dv), collapse = " / "))
# exceptional case inform user
# if typing resulted in NULL, fill with NA
# to the recorded length of the input
if (is.null(dv)) {
fn, " could not be typed, please report here: ",
dv <- rep_len(NA, lenDv)
# make original classes (e.g., Date) reappear
if (!is.list(dv)) dv <- as.list(dv)
if (all(sapply(dv, length) <= 1L)) {
return("c", dv))
# return
} # end typeField
#' Annotate ctrdata function return values
#'
#' Adds attributes to `x` that record the database, the collection
#' and the query history of the connection from which `x` originates.
#'
#' @param x object to be annotated
#'
#' @inheritParams ctrDb
#'
#' @return `x` with attributes `ctrdata-dbname`, `ctrdata-table`,
#'   `ctrdata-table-note`, `ctrdata-collection` and
#'   `ctrdata-dbqueryhistory` added
#'
#' @keywords internal
#' @noRd
#'
addMetaData <- function(x, con) {

  # add metadata
  attr(x, "ctrdata-dbname") <- con$db
  attr(x, "ctrdata-table") <- con$collection
  attr(x, "ctrdata-table-note") <- "^^^ attr ctrdata-table will be removed by end 2024"
  attr(x, "ctrdata-collection") <- con$collection
  attr(x, "ctrdata-dbqueryhistory") <- dbQueryHistory(
    con = con,
    verbose = FALSE)

  # return annotated object; needed explicitly since an
  # assignment as last expression would return the
  # assigned value and not x
  return(x)

} # end addMetaData
#' ctrMultiDownload
#'
#' Downloads files from urls into local files, skipping files that
#' already exist, and repeats the download for files that failed.
#'
#' @param urls Vector of urls to be downloaded
#' @param destfiles Vector of local file names into which to download
#' @param progress Set to \code{FALSE} to not print progress bar
#' @param resume Logical for dispatching to curl
#' @param multiplex Logical for using http/2
#' @param verbose For debug message printing
#'
#' @keywords internal
#' @noRd
#'
#' @return Data frame with columns such as status_code etc
#' (only rows of files that do not need downloading anymore)
#'
#' @importFrom curl multi_download
#' @importFrom utils URLencode
#' @importFrom jsonlite fromJSON
#'
# NOTE(review): parameters urls and destfiles are documented and
# used in the body but are not visible in the signature in this view
ctrMultiDownload <- function(
progress = TRUE,
resume = FALSE,
multiplex = TRUE,
verbose = TRUE) {
# one destination file is needed for each url;
# without urls, an empty data frame is returned
stopifnot(length(urls) == length(destfiles))
if (!length(urls)) return(data.frame())
# starting values: numI is the counter of attempts,
# canR indicates if resuming downloads is to be tried
numI <- 1L
canR <- resume
# do not again download files that already exist
# or that do not have an (arbitrary) minimal size.
# nchar("Request failed.") is 15L
# NOTE(review): the initial value of toDo and the first part of
# the size condition are not visible in this view
toDo <-, times = length(urls))
toDo[file.exists(destfiles) &
( |
file.size(destfiles) > 20L)] <- FALSE
# data frame with one row per url, which is updated
# with the information returned from the download
downloadValue <- data.frame(
"success" = !toDo,
"status_code" =, length(toDo)),
"resumefrom" = double(length(toDo)),
"url" = urls,
"destfile" = destfiles,
"error" = character(length(toDo)),
"type" = character(length(toDo)),
"modified" = double(length(toDo)),
"time" = double(length(toDo)),
"headers" = character(length(toDo))
# remove any duplicates
downloadValue <- unique(downloadValue)
# does not error in case any of the individual requests fail.
# inspect the return value to find out which were successful
# make no more than 3 attempts to complete downloading
while (any(toDo) && numI < 3L) {
# arguments for the download function, concerning
# only the files that still are to be downloaded
args <- c(
urls = list(utils::URLencode(downloadValue$url[toDo])),
destfiles = list(downloadValue$destfile[toDo]),
resume = canR,
progress = progress,
timeout = Inf,
multiplex = multiplex,
accept_encoding = "gzip,deflate,zstd,br")
# do download
# NOTE(review): the call that receives args is not visible
res <-, args)
# check if download successful and CDN is likely to be used:
# the destination file name does not end in .json while
# the response headers mention application/json
cdnCheck <- (res$status_code %in% c(200L, 206L, 416L)) &
!grepl("[.]json$", res$destfile) &
sapply(res$headers, function(x)
if (length(x) >= 1)
any(grepl("application/json", x))
else FALSE)
# replace url with CDN url
if (any(cdnCheck)) {
message("Redirecting to CDN...")
# get CDN url, from element url of the json that was received
# NOTE(review): the input to sapply is not visible in this view
res$url[cdnCheck] <- sapply(
function(x) jsonlite::fromJSON(x)$url,
simplify = TRUE)
# remove files containing CDN url
# reset status, so that these rows are
# regarded below as still to be downloaded
res$status_code[cdnCheck] <- NA
# update input
downloadValue[toDo, ] <- res
# do not try again to resume if any error message
# indicates that resuming is not possible
if (any(grepl(
"annot resume", downloadValue[toDo, "error", drop = TRUE]))) canR <- FALSE
if (inherits(downloadValue, "try-error")) {
stop("Download failed; last error: ", class(downloadValue), call. = FALSE)
# rows remain to be done if not successful
# or if the status code is not an accepted one
# NOTE(review): first part of the condition is not visible
toDoThis <-$success) |
!downloadValue$success |
!(downloadValue$status_code %in% c(200L, 206L, 416L))
# only count towards repeat attempts if
# the set of repeated urls is unchanged
if (identical(toDo, toDoThis)) numI <- numI + 1L
toDo <- toDoThis
if (any(toDo)) {
# remove any files from failed downloads
unlink(downloadValue[toDo, c("destfile"), drop = TRUE])
# inform user about status code and url of each failed download
if (verbose) {
"Download failed for: status code / url(s):"
downloadValue[toDo, c("status_code", "url"), drop = FALSE],
1, function(r) message(r[1], " / ", r[2], "\n", appendLF = FALSE)
# return only rows for files that do not remain to be done
return(downloadValue[!toDo, , drop = FALSE])
} # end ctrMultiDownload
#' ctrTempDir
#'
#' create empty temporary directory on localhost for
#' downloading from register into temporary directory
#'
#' @param verbose if `TRUE`, prints the path of the temporary
#'   directory and the number of files in it, and the temporary
#'   directory is not deleted when the session ends
#'
#' @return path to existing directory
#'
#' @keywords internal
#' @noRd
#'
ctrTempDir <- function(verbose = FALSE) {

  # get temporary space, re-using any directory
  # that was recorded in the option set below
  tempDir <- getOption(
    "ctrdata.tempdir",
    default = tempfile(pattern = "ctrDATA"))

  # create and normalise for OS
  dir.create(tempDir, showWarnings = FALSE, recursive = TRUE)
  tempDir <- normalizePath(tempDir, mustWork = TRUE)

  # retain tempdir for session to accelerate,
  # but only if session is user-interactive.
  # all intermediate files are deleted before
  # finalising a ctrLoadQueryIntoDb() call
  # (that is, only downloaded files are kept).
  if (interactive()) options(ctrdata.tempdir = tempDir)

  # register deleting tempDir when exiting session;
  # the flag in the private environment (see zzz.R)
  # tells the finalizer whether to keep the directory
  assign("keeptempdir", verbose, envir = .ctrdataenv)
  delCtrdataTempDir <- function(x) {
    if (length(.ctrdataenv$keeptempdir) &&
        !is.null(.ctrdataenv$keeptempdir)) {
      if (.ctrdataenv$keeptempdir) {
        message(
          'ctrdata: "verbose = TRUE", not deleting temporary directory ', tempDir, "\r")
      } else {
        unlink(tempDir, recursive = TRUE)
        message("ctrdata: deleted temporary directory\r")
      }
    }
    # reset flag so that other finalizers
    # registered earlier do not act again
    assign("keeptempdir", NULL, envir = .ctrdataenv)
  }
  reg.finalizer(
    e = .ctrdataenv,
    f = delCtrdataTempDir,
    onexit = TRUE
  )

  # inform user
  if (verbose) message(
    "\nDEBUG: ", tempDir,
    "\nUsing any previously downloaded files of the ",
    length(dir(path = tempDir)),
    " files existing in this folder.\n")

  # return
  return(tempDir)

} # end ctrTempDir
#' ctrDocsDownload
#'
#' download documents
#'
#' Creates a folder per trial under `documents.path` and then either
#' creates empty placeholder files (if `documents.regexp` is `NULL`)
#' or downloads the documents with file names that match
#' `documents.regexp` and that do not yet exist locally.
#'
#' @param dlFiles data frame with columns _id, filename, url
#' @param documents.path parameter from parent call
#' @param documents.regexp parameter from parent call
#' @param multiplex use http/2 or not
#' @param verbose parameter from parent call
#'
#' @return number of documents
#'
#' @keywords internal
#' @noRd
#'
# NOTE(review): parameters dlFiles, documents.path and
# documents.regexp are documented and used in the body but
# are not visible in the signature in this view
ctrDocsDownload <- function(
multiplex = TRUE,
verbose) {
# check and create directory
createdDir <- try(
dir.create(documents.path, recursive = TRUE, showWarnings = FALSE),
silent = TRUE)
# early return
# NOTE(review): the class tested is spelled "try-errror", whereas
# try() returns an object of class "try-error"; as written, this
# condition cannot be TRUE for a value returned by try() - confirm
if (inherits(createdDir, "try-errror")) {
warning("Directory could not be created for 'documents.path' ",
documents.path, ", cannot download files", call. = FALSE)
# continue after if
message("- Downloading documents into 'documents.path' = ", documents.path)
# canonical directory path
documents.path <- normalizePath(documents.path, mustWork = TRUE)
if (createdDir) message("- Created directory ", documents.path)
# documents download
message("- Creating subfolder for each trial")
# add destination file directory path,
# which is a folder named with the _id of the trial
dlFiles$filepath <- file.path(documents.path, dlFiles$`_id`)
# create subdirectories by trial
# NOTE(review): the iterating call is not visible in this view
unique(dlFiles$filepath), function(i) if (!dir.exists(i))
dir.create(i, showWarnings = FALSE, recursive = TRUE)
# check if destination document exists; files of
# no more than 20 bytes are regarded as not existing
dlFiles$filepathname <- file.path(dlFiles$filepath, dlFiles$filename)
dlFiles$fileexists <- file.exists(dlFiles$filepathname) &
file.size(dlFiles$filepathname) > 20L
# placeholder or files
if (is.null(documents.regexp)) {
message("- Creating empty document placeholders (max. ", nrow(dlFiles), ")")
# create empty files, but only where no file exists yet;
# tmp becomes the number of newly created placeholders
# NOTE(review): the iterating call is not visible in this view
tmp <-
function(i) if (!file.exists(i))
file.create(i, showWarnings = TRUE),
tmp <- sum(unlist(tmp), na.rm = TRUE)
} else {
# inform
message("- Applying 'documents.regexp' to ",
nrow(dlFiles), " missing documents")
# apply regexp, keeping only rows with matching file name
dlFiles <- dlFiles[
grepl(documents.regexp, dlFiles$filename, = TRUE), ,
drop = FALSE]
# inform
message("- Downloading ",
nrow(dlFiles[!dlFiles$fileexists, , drop = FALSE]),
" missing documents")
# do download, only for files that do not yet exist
tmp <- ctrMultiDownload(
urls = dlFiles$url[!dlFiles$fileexists],
destfiles = dlFiles$filepathname[!dlFiles$fileexists],
multiplex = multiplex,
verbose = verbose)
# check results; tmp becomes the number of
# documents downloaded with status code 200
if (!nrow(tmp)) tmp <- 0L else {
# handle failures despite success is true
tmp[tmp$status_code != 200L, "destfile", drop = TRUE],
# delete but only micro files, possible remnants
function(f) if (file.size(f) < 20L) unlink(f)
tmp <- nrow(tmp[tmp$status_code == 200L, , drop = FALSE])
} # is.null(documents.regexp)
# inform user
# NOTE(review): the call that uses this format string and the
# values for its placeholders are not visible in this view
"= Newly saved %i ",
ifelse(is.null(documents.regexp), "placeholder ", ""),
"document(s) for %i trial(s); ",
"%i of such document(s) for %i trial(s) already existed in %s"),
# return
} # end ctrDocsDownload
#' initTranformers
#'
#' Creates a V8 javascript context, loads into it the javascript
#' files that come with the package and stores the context under
#' the name "ct" in the private environment of the package.
#'
#' @importFrom V8 v8 JS
#' @importFrom readr read_file
#'
#' @keywords internal
#' @noRd
#'
initTranformers <- function() {
# prepare V8, see ./inst/js/
ct <- V8::v8()
# get javascript for xml to ndjson
ct$source(system.file("js/bundle.js", package = "ctrdata"))
# function for xml to ndjson conversion; the javascript function
# calls injs.parseString and returns its result as a JSON string
# NOTE(review): the call that receives this V8::JS() value
# is not visible in this view
V8::JS("function(xml, opts) {injs.parseString(xml, opts, function (err, result)
{ out = result; }); return JSON.stringify(out); }"))
# native javascript function for euctr txt to ndjson conversion
ct$eval(readr::read_file(system.file("js/euctr2ndjson.js", package = "ctrdata")))
# assign into package private environment, see zzz.R
assign("ct", ct, envir = .ctrdataenv)
# NOTE(review): the closing brace of this function is not visible
#' dbCTRLoadJSONFiles
#'
#' Imports into the database collection the records in the ndjson
#' files found in a directory. For each file, any existing records
#' with the same _id's are deleted before the file is imported; if
#' importing the file as a whole does not succeed, its records are
#' imported one by one.
#'
#' @param dir Path to local directory with JSON files
#' from downloading and converting
#'
#' @importFrom jsonlite validate
#' @importFrom nodbi docdb_create
#' @importFrom stats na.omit
#' @importFrom jqr jq
#'
#' @inheritParams ctrDb
#' @inheritParams ctrLoadQueryIntoDb
#'
#' @return List with elements n (number of imported trials),
#' _id's of successfully imported trials,
#' _id's of trials that failed to import and
#' annotations of the successfully imported trials
#'
#' @keywords internal
#' @noRd
#'
dbCTRLoadJSONFiles <- function(dir, con, verbose) {
# find files
tempFiles <- dir(path = dir,
pattern = "^.+_trials_.*.ndjson$",
full.names = TRUE)
# check
if (!length(tempFiles)) stop("no .+_trials_.*.ndjson files found in ", dir)
# initialise counters
fc <- length(tempFiles)
## iterate ndjson files -----------------------------------------------------------------
# tempFile is the index into tempFiles;
# a list is returned for each file
retimp <- lapply(
X = seq_along(tempFiles),
function(tempFile) {
## initialise output
idSuccess <- NULL
idFailed <- NULL
idAnnotation <- NULL
nImported <- 0
ids <- NULL
## get _id's
# main function for fast reading,
# switching off warning about final EOL missing
fd <- file(description = tempFiles[tempFile],
open = "rt", blocking = TRUE)
# the connection is closed when this function exits
on.exit(try(close(fd), silent = TRUE), add = TRUE)
# inform user
# NOTE(review): the call that prints this is not visible
"JSON file #: ", tempFile, " / ", fc,
" \r",
appendLF = FALSE)
# get all ids using jq, safer than regex;
# double quotes are removed from the output of jq
ids <- gsub("\"", "", as.vector(jqr::jq(file(tempFiles[tempFile]), " ._id ")))
## existing annotations -------------------------------------------------
# get annotations, that is fields _id and annotation
# of the records in the collection with these ids
# NOTE(review): the name of the function called is not visible
annoDf <- try({
src = con,
key = con$collection,
query = paste0(
'{"_id": {"$in": [',
paste0('"', ids, '"', collapse = ","), "]}}"),
fields = '{"_id": 1, "annotation": 1}')
}, silent = TRUE)
# annoDf is to have one row per id in the file,
# also if no record exists yet for an id
if (!inherits(annoDf, "try-error") && length(annoDf[["_id"]])) {
annoDf <- merge(
data.frame("_id" = ids, check.names = FALSE, stringsAsFactors = FALSE),
annoDf, all.x = TRUE) # only need input ids, do not need all.y
} else {
annoDf <-
data.frame("_id" = ids, check.names = FALSE, stringsAsFactors = FALSE)
# ensure that there is a column for the annotation
if (is.null(annoDf[["annotation"]]))
annoDf[["annotation"]] <- rep(NA, length(ids))
## delete and import ----------------------------------------------------
# delete any existing records
# NOTE(review): the name of the function called is not visible
src = con,
key = con$collection,
query = paste0(
'{"_id": {"$in": [',
paste0('"', ids, '"', collapse = ","), ']}}'))
}, silent = TRUE)
## import
# NOTE(review): the name of the function called is not visible;
# the result tmp is compared below with the number of records
tmp <- try({
src = con,
key = con$collection,
value = tempFiles[tempFile]
)))}, silent = TRUE)
## return values for lapply
# importing the file as a whole is regarded as failed if there was
# an error, or if the number returned is zero or differs from the
# number of ids; then, records are imported one by one
if (inherits(tmp, "try-error") || tmp == 0L || tmp != nrow(annoDf)) {
# step into line by line mode
fdLines <- file(tempFiles[tempFile], open = "rt", blocking = TRUE)
# temporary file that holds the single line to be imported
fLineOut <- tempfile(pattern = "tmpOneLine", tmpdir = dir, fileext = ".ndjson")
on.exit(unlink(fLineOut), add = TRUE)
# read until no more lines or until an empty line
while (TRUE) {
tmpOneLine <- readLines(con = fdLines, n = 1L, warn = FALSE)
if (length(tmpOneLine) == 0L || !nchar(tmpOneLine)) break
# get the _id from the line, write the line to file and import
id <- sub(".*\"_id\":[ ]*\"(.*?)\".*", "\\1", tmpOneLine)
cat(tmpOneLine, file = fLineOut)
tmp <- suppressWarnings(suppressMessages(nodbi::docdb_create(
src = con, key = con$collection, value = fLineOut)))
# record the outcome for this _id
nImported <- nImported + tmp
if (tmp) idSuccess <- c(idSuccess, id)
if (!tmp) idFailed <- c(idFailed, id)
if (!tmp) warning("Failed to load: ", id, call. = FALSE)
if (tmp) idAnnotation <- c(idAnnotation, annoDf[
annoDf[["_id"]] == id, "annotation", drop = TRUE][1])
} else {
# the file was imported as a whole,
# thus all ids are recorded as success
nImported <- nImported + tmp
idSuccess <- c(idSuccess, annoDf[, "_id", drop = TRUE])
idAnnotation <- c(idAnnotation, annoDf[, "annotation", drop = TRUE])
# close this file
# return values
list(success = idSuccess,
failed = idFailed,
n = nImported,
annotations = idAnnotation)
}) # lapply tempFiles
# prepare return values, n is successful only;
# the values from all files are combined
n <- sum(sapply(retimp, "[[", "n"), na.rm = TRUE)
success <- as.vector(unlist(sapply(retimp, "[[", "success")))
failed <- as.vector(unlist(sapply(retimp, "[[", "failed")))
annotations <- as.vector(unlist(sapply(retimp, "[[", "annotations")))
# return
return(list(n = n,
success = success,
failed = failed,
annotations = annotations))
} # end dbCTRLoadJSONFiles
#' dbCTRAnnotateQueryRecords
#'
#' Composes the annotation for each record from any existing
#' annotation and the annotation text, and updates the
#' records in the database collection with the annotations.
#'
#' @inheritParams ctrLoadQueryIntoDb
#'
#' @keywords internal
#' @noRd
#'
#' @importFrom jsonlite toJSON
#' @importFrom nodbi docdb_update
#'
# NOTE(review): the body uses recordnumbers, recordannotations,
# annotation.text, annotation.mode and con, which are not
# visible in the signature in this view
dbCTRAnnotateQueryRecords <- function(
verbose) {
# debug
if (verbose) message("Annotating records...")
if (verbose) message(recordnumbers)
if (verbose) message(annotation.mode)
# df from existing annotations; without existing
# annotations, an empty string is used for every record
if (is.null(recordannotations)) recordannotations <- ""
annotations <- data.frame(
"_id" = recordnumbers,
"annotation" = recordannotations,
stringsAsFactors = FALSE,
check.names = FALSE)
# check if dataframe is as expected: columns _id and annotation
# dataframe could be empty if _ids not yet imported
if (nrow(annotations) == 0) {
annotations <- data.frame("_id" = recordnumbers,
"annotation" = "",
stringsAsFactors = FALSE,
check.names = FALSE)
# modify the annotations: with "replace", the annotation text is
# used on its own; with "prepend", the text is put in front of any
# existing annotation; otherwise it is put after any existing one
# NOTE(review): the selecting call and the test applied
# to the existing annotation are not visible in this view
annotations[["annotation"]] <- trimws(
"replace" = paste0(annotation.text),
"prepend" = paste0(annotation.text, " ", ifelse([["annotation"]]), "", annotations[["annotation"]])),
paste0(ifelse([["annotation"]]), "", annotations[["annotation"]]),
" ", annotation.text)
# ensure columns including order
annotations <- annotations[, c("_id", "annotation"), drop = FALSE]
# debug
if (verbose) message(annotations)
# update the database; the empty query does not restrict
# the records, presumably records are identified by the
# _id column of the value - TODO confirm with nodbi
result <- nodbi::docdb_update(
src = con,
key = con$collection,
value = annotations,
query = "{}")
# inform user
message("= Annotated retrieved records (", result, " records)")
} # end dbCTRAnnotateQueryRecords
#' dbCTRUpdateQueryHistory
#'
#' Adds an entry for the current query to the query history, which
#' is kept in the document with _id "meta-info" in the collection.
#'
#' @inheritParams ctrLoadQueryIntoDb
#'
#' @keywords internal
#' @noRd
#'
#' @importFrom jsonlite toJSON
#' @importFrom nodbi docdb_delete docdb_create docdb_update
#'
# NOTE(review): the body uses register, queryterm, recordnumber
# and con, which are not visible in the signature in this view
dbCTRUpdateQueryHistory <- function(
verbose) {
## check database connection
con <- ctrDb(con)
# debug
if (verbose) message("Running dbCTRUpdateQueryHistory...")
# compose history entry from current search
# default for format methods is "%Y-%m-%d %H:%M:%S"
newHist <- data.frame(
"query-timestamp" = format(Sys.time(), "%Y-%m-%d %H:%M:%S"),
"query-register" = register,
"query-records" = recordnumber,
"query-term" = queryterm,
check.names = FALSE,
stringsAsFactors = FALSE)
# retrieve existing history data
hist <- dbQueryHistory(con, verbose)
# append current search
# default for format methods is "%Y-%m-%d %H:%M:%S"
# if there is a history, the new entry is added as last row and
# the document with _id "meta-info" is addressed with a query
# NOTE(review): the name of the function called is not visible
if (!is.null(hist) &&
nrow(hist)) {
newHist <- rbind(hist, newHist)
newHist <- list("queries" = newHist)
tmp <- suppressMessages(
src = con,
key = con$collection,
value = newHist,
query = '{"_id": "meta-info"}'
} else {
# to list, so that the document has
# the elements _id and queries
newHist <- list(list(
"_id" = "meta-info",
"queries" = newHist))
# write new document
# NOTE(review): the name of the function called is not visible
tmp <- suppressMessages(
src = con,
key = con$collection,
value = newHist
# inform user; exactly one document is
# expected to have been written or updated
if (tmp == 1L) {
message('Updated history ("meta-info" in "', con$collection, '")')
} else {
warning('Could not update history ("meta-info" in "', con$collection,
'")', call. = FALSE, immediate. = FALSE)
} # end dbCTRUpdateQueryHistory
