### ctrdata package
#' ctrLoadQueryIntoDbEuctr
#'
#' @inheritParams ctrLoadQueryIntoDb
#'
#' @keywords internal
#' @noRd
#'
#' @importFrom httr content GET status_code config
#' @importFrom curl new_handle handle_data handle_setopt parse_headers
#' @importFrom curl multi_run curl_fetch_memory multi_add new_pool
#' @importFrom nodbi docdb_query docdb_update
#' @importFrom zip unzip
#' @importFrom stringi stri_replace_all_fixed stri_detect_fixed
#' @importFrom readr write_file read_file
#' @importFrom digest digest
#' @importFrom jqr jq
#' @importFrom V8 JS
#'
ctrLoadQueryIntoDbEuctr <- function(
    queryterm = queryterm,
    register,
    euctrresults,
    euctrresultshistory,
    ctgov2history,
    documents.path,
    documents.regexp,
    annotation.text,
    annotation.mode,
    only.count,
    con, verbose,
    queryupdateterm) {

  # Downloads protocol- (and optionally result-) related information for
  # trials matching queryterm from the EU Clinical Trials Register,
  # converts it to NDJSON and imports it into the database connection con.
  # Returns the list produced by dbCTRLoadJSONFiles() (n, success, failed),
  # or emptyReturn when nothing could be retrieved.

  ## sanity correction for naked terms
  # otherwise all trials would be retrieved
  # see also ctrGetQueryUrl
  queryterm <- sub(
    "(^|&|[&]?\\w+=\\w+&)([ a-zA-Z0-9+-]+)($|&\\w+=\\w+)",
    "\\1query=\\2\\3",
    queryterm)

  # inform user
  message("* Checking trials in EUCTR...")

  ## euctr api ----------------------------------------------------------------
  queryEuRoot <- "https://www.clinicaltrialsregister.eu/"
  queryEuType1 <- "ctr-search/search?"
  queryEuType3 <- "ctr-search/rest/download/full?"
  queryEuType4 <- "ctr-search/rest/download/result/zip/xml/"
  queryEuPost <- paste0(
    "&mode=current_page&format=text&dContent=full",
    "&number=current_page&submit-download=Download")

  # get first result page
  q <- utils::URLencode(paste0(queryEuRoot, queryEuType1, queryterm))
  if (verbose) message("DEBUG: queryterm is ", q)
  #
  resultsEuPages <- try(
    httr::GET(url = q),
    silent = TRUE)
  #
  if (inherits(resultsEuPages, "try-error") ||
      httr::status_code(resultsEuPages) != 200L) {
    # any(): a non-200 response object deparses to a character vector of
    # length > 1, and a length > 1 if() condition errors under R >= 4.2
    if (any(grepl("SSL certificate.*local issuer certificate", resultsEuPages))) {
      stop("Host ", queryEuRoot, " cannot be queried as expected, error:\n",
           trimws(resultsEuPages), "\nFor a potential workaround, check ",
           "https://github.com/rfhb/ctrdata/issues/19#issuecomment-820127139",
           call. = FALSE)
    } else {
      stop("Host ", queryEuRoot, " not working as expected, ",
           "cannot continue: ", resultsEuPages[[1]], call. = FALSE)
    }
  }

  # - get content of response
  resultsEuPages <- httr::content(resultsEuPages, as = "text")

  # get number of trials identified by query
  resultsEuNumTrials <- sub(
    ".*Trials with a EudraCT protocol \\(([0-9,.]*)\\).*",
    "\\1",
    resultsEuPages)
  #
  resultsEuNumTrials <- suppressWarnings(
    as.integer(gsub("[,.]", "", resultsEuNumTrials)))
  #
  # no trials found even though host may have been online;
  # as.integer() yields NA_integer_ (which is still integer) on a
  # failed parse, so test is.na(), not !is.integer()
  if (is.na(resultsEuNumTrials)) {
    message("ctrLoadQueryIntoDb(): register does not deliver ",
            "search results as expected, check if working with ",
            "'browseURL(\"", q, "\")'")
    return(invisible(emptyReturn))
  }

  # calculate number of results pages (register serves 20 trials per page)
  resultsEuNumPages <- ceiling(resultsEuNumTrials / 20)

  # check for plausibility and stop function without error
  if (is.na(resultsEuNumPages) ||
      is.na(resultsEuNumTrials) ||
      (resultsEuNumTrials == 0)) {
    message("First result page empty - no (new) trials found?")
    return(invisible(emptyReturn))
  }

  # inform user
  message("Retrieved overview, multiple records of ",
          resultsEuNumTrials, " trial(s) from ",
          resultsEuNumPages, " page(s) to be downloaded ",
          "(estimate: ", signif(resultsEuNumTrials * 0.13, 1L), " MB)")

  # only count?
  if (only.count) {
    # return
    return(list(n = resultsEuNumTrials,
                success = NULL,
                failed = NULL))
  }

  # suggest chunking if large number of trials found
  if (resultsEuNumTrials > 10000L) {
    stop("These are ", resultsEuNumTrials, " (more than 10,000) trials; ",
         "consider correcting or splitting into separate queries")
  }

  ## protocol-related information ---------------------------------------------

  ## create empty temporary directory
  tempDir <- ctrTempDir(verbose)

  # check results parameters
  if (is.null(documents.path)) {
    documents.path <- tempDir
  }
  if (euctrresults &&
      (is.na(file.info(documents.path)[["isdir"]]) ||
       !file.info(documents.path)[["isdir"]])) {
    createdDir <- try(
      dir.create(documents.path, recursive = TRUE, showWarnings = FALSE),
      silent = TRUE)
    # "try-error" (typo "try-errror" previously defeated this check)
    if (!inherits(createdDir, "try-error") && createdDir) {
      message("Created directory ", documents.path)
    } else {
      warning("Directory could not be created for 'documents.path' ",
              documents.path, ", ignored", call. = FALSE)
      documents.path <- tempDir
    }
    # canonical directory path
    documents.path <- normalizePath(documents.path, mustWork = TRUE)
  }

  ## download all text files from pages
  # inform user
  message("(1/3) Downloading trials...")

  # new handle
  h <- do.call(
    curl::new_handle,
    c(accept_encoding = "gzip,deflate,zstd,br",
      getOption("httr_config")[["options"]])
  )

  # test fetch against a known trial to probe server capabilities
  tmp <- curl::curl_fetch_memory(
    url = paste0(
      queryEuRoot, queryEuType3,
      "query=2008-003606-33", "&page=1", queryEuPost),
    handle = h
  )

  # inform user about capabilities
  stime <- curl::handle_data(h)[["times"]][["total"]]
  sgzip <- curl::parse_headers(tmp$headers)
  sgzip <- sgzip[grepl("Transfer-Encoding", sgzip)]
  sgzip <- grepl("gzip|deflate", sgzip)
  # !any(): guard against length > 1 condition if several header lines match
  if (length(sgzip) && !any(sgzip)) {
    message("Note: register server cannot compress data, ",
            "transfer takes longer (estimate: ",
            signif(resultsEuNumTrials * 1.2, 1L), " s)")
  }

  # generate vector with URLs of all pages
  urls <- vapply(
    paste0(queryEuRoot, queryEuType3,
           queryterm, "&page=", 1:resultsEuNumPages, queryEuPost),
    utils::URLencode, character(1L))

  # generate vector with file names for saving pages
  fp <- file.path(
    tempDir, paste0(
      "euctr_trials_",
      # appending hash of query for re-download
      vapply(urls, digest::digest, character(1L), algo = "crc32"),
      ".txt"
    ))

  # do download and saving
  tmp <- ctrMultiDownload(urls, fp, verbose = verbose)
  if (nrow(tmp) != resultsEuNumPages) {
    message("Download from EUCTR failed; incorrect number of records")
    return(invisible(emptyReturn))
  }

  ## convert euctr to ndjson -----------------------------------------------
  if (length(.ctrdataenv$ct) == 0L) initTranformers()

  # run conversion (~12s for 563 trials)
  message("(2/3) Converting to NDJSON (estimate: ",
          signif(resultsEuNumTrials * 0.02, 1L), " s)...")
  tmp$ndjsonfile <- sub("[.]txt$", ".ndjson", tmp$destfile)
  # for loop instead of sapply: iteration is for its side effect only
  for (r in seq_len(nrow(tmp))) {
    readr::write_file(
      .ctrdataenv$ct$call(
        "euctr2ndjson", readr::read_file(tmp$destfile[r]),
        format(Sys.time(), "%Y-%m-%d %H:%M:%S")),
      file = tmp$ndjsonfile[r])
  }

  ## import into database -----------------------------------------------
  # run import into database from json files
  message("(3/3) Importing records into database...")
  if (verbose) message("DEBUG: ", tempDir)
  imported <- dbCTRLoadJSONFiles(dir = tempDir,
                                 con = con,
                                 verbose = verbose)

  # inform user on final import outcome
  message("= Imported or updated ",
          imported$n, " records on ",
          resultsEuNumTrials, " trial(s)")

  # read in the eudract numbers of the
  # trials just retrieved and imported
  eudractnumbersimported <- imported$success

  # remove calculated ndjson files in case of re-download import
  # because dbCTRLoadJSONFiles() imports all ndjson in the folder
  try(unlink(tmp$ndjsonfile), silent = TRUE)

  ## result-related information -----------------------------------------------
  if (euctrresults) {

    # results are available only one-by-one for
    # each trial as just retrieved and imported
    # transform eudract numbers
    # with country info ("2010-024264-18-3RD")
    # into eudract numbers ("2010-024264-18")
    eudractnumbersimported <- unique(
      substring(text = eudractnumbersimported,
                first = 1,
                last = 14))

    # inform user
    message("* Checking results if available from EUCTR for ",
            length(eudractnumbersimported), " trials: ")

    ## parallel download and unzipping into temporary directory
    # "https://www.clinicaltrialsregister.eu/ctr-search/rest/
    # download/result/zip/xml/..."
    # first version: "2007-000371-42/1"
    # second version: "2007-000371-42/2"
    # latest version: "2007-000371-42"

    # inform user
    message("(1/4) Downloading results...")

    # prepare download and save
    # urls
    urls <- vapply(
      paste0(queryEuRoot, queryEuType4,
             eudractnumbersimported),
      utils::URLencode, character(1L), USE.NAMES = FALSE)

    # destfiles
    fp <- file.path(
      tempDir, paste0(
        "euctr_results_",
        eudractnumbersimported,
        ".zip"
      ))

    # do download and save
    tmp <- ctrMultiDownload(urls, fp, verbose = verbose)

    # work only on successful downloads
    tmp <- tmp[tmp[["status_code"]] == 200L, , drop = FALSE]

    # inform user
    message(
      "- extracting results (. = data, F = file[s] and data, x = none):")

    # unzip downloaded files and move non-XML extracted files
    tmp <- lapply(
      tmp[["destfile"]], function(f) {
        if (file.exists(f) &&
            file.size(f) != 0L) {
          # this unzip does not handle special
          # characters under windows, thus only
          # obtain file names and extract with
          # extra package
          tmp <- utils::unzip(
            zipfile = f,
            list = TRUE)$Name
          if (is.null(tmp)) return(NULL)
          try(zip::unzip(
            zipfile = f,
            exdir = tempDir),
            silent = TRUE)
          # results in files such as
          # EU-CTR 2008-003606-33 v1 - Results.xml
          nonXmlFiles <- tmp[!grepl("Results[.]xml$", tmp)]
          euctrnr <- gsub(paste0(".*(", regEuctr, ").*"),
                          "\\1", tmp[grepl("Results[.]xml$", tmp)])[1]
          # any non-XML file
          if (length(nonXmlFiles)) {
            message("F ", appendLF = FALSE)
            if (documents.path != tempDir) {
              # move results file(s) to user specified directory
              saved <- try(suppressWarnings(
                file.rename(
                  from = file.path(tempDir, nonXmlFiles),
                  to = file.path(documents.path,
                                 paste0(euctrnr, "--", basename(nonXmlFiles))
                  ))), silent = TRUE)
              # inform user
              if (inherits(saved, "try-error")) {
                warning("Could not save ", nonXmlFiles, "; ", trimws(saved),
                        call. = FALSE, immediate. = TRUE)
                if (grepl("expanded 'from' name too long", saved)) {
                  message("Set options(ctrdata.tempdir = <dir>) to a ",
                          "short absolute path name for a directory.")
                }
              }
              if (!inherits(saved, "try-error") && any(!saved)) {
                warning("Could not save ", nonXmlFiles[!saved],
                        call. = FALSE, immediate. = TRUE)
              }
            } # if paths
          } else {
            # only XML data file
            if (any(grepl("Results[.]xml$", tmp)))
              message(". ", appendLF = FALSE)
          }
        } else {
          # unsuccessful
          message("x ", appendLF = FALSE)
        }
      }) # lapply fp

    # line break
    message("", appendLF = TRUE)

    ## convert xml to ndjson -----------------------------------------------
    if (length(.ctrdataenv$ct) == 0L) initTranformers()

    # for each file of an imported trial create new ndjson file
    xmlFileList <- dir(path = tempDir, pattern = "EU-CTR.+Results.xml", full.names = TRUE)
    xmlFileList <- xmlFileList[vapply(xmlFileList, function(i) any(
      stringi::stri_detect_fixed(i, eudractnumbersimported)), logical(1L))]
    on.exit(try(unlink(xmlFileList), silent = TRUE), add = TRUE)
    jsonFileList <- file.path(tempDir, paste0(
      "EU_Results_", sub(".+ ([0-9]{4}-[0-9]{6}-[0-9]{2}) .+", "\\1.ndjson", xmlFileList)))
    on.exit(try(unlink(jsonFileList), silent = TRUE), add = TRUE)

    # run conversion (~2 s for 19 records)
    message("(2/4) Converting to NDJSON (estimate: ",
            signif(length(xmlFileList) * 2 / 19, 1L), " s)...")
    # for loop instead of sapply: iteration is for its side effect only
    for (f in seq_along(xmlFileList)) {
      cat(stringi::stri_replace_all_fixed(
        .ctrdataenv$ct$call(
          "parsexml",
          # read source xml file
          paste0(readLines(xmlFileList[f], warn = FALSE), collapse = ""),
          # important parameters
          V8::JS('{trim: true, ignoreAttrs: false, mergeAttrs: true,
             explicitRoot: false, explicitArray: false, xmlns: false}')),
        # remove conversion remnants and conformity breaking characters
        c('"xmlns:ns0":"http://eudract.ema.europa.eu/schema/clinical_trial_result",',
          '"xmlns:xsi":"http://www.w3.org/2001/XMLSchema-instance","xsi:nil":"true"',
          "&amp;", "&apos;", "\n", "\r", "\t"),
        c("", "", "&", "'", " ", " ", " "),
        vectorize_all = FALSE),
        file = jsonFileList[f]
      )
    } # for xmlFileList

    ## delete for any re-downloads
    try(unlink(xmlFileList), silent = TRUE)

    # iterate over results files
    message("(3/4) Importing results into database (may take some time)...")

    # initiate counter
    importedresults <- 0L

    # import results data from ndjson files
    for (f in jsonFileList) {
      if (!file.exists(f) || file.size(f) == 0L) next
      eudractNumber <- gsub(
        "\"", "", as.character(jqr::jq(file(f), " .eudractNumber ")))
      # update database with results
      tmp <- try({
        tmpnodbi <-
          nodbi::docdb_update(
            src = con,
            key = con$collection,
            value = f,
            query = paste0('{"a2_eudract_number": "', eudractNumber, '"}')
          )
        max(tmpnodbi, na.rm = TRUE)
      },
      silent = TRUE)
      # inform user on failed trial
      if (inherits(tmp, "try-error")) {
        warning(paste0(
          "Import of results failed for trial ", eudractNumber), immediate. = TRUE)
        tmp <- 0L
      }
      # count only trials whose records were actually updated
      # (tmp is the number of records updated, 0L on failure)
      if (tmp > 0L) importedresults <- importedresults + 1L
      # inform user on records
      message(
        importedresults,
        " trials' records updated with results\r",
        appendLF = FALSE)
    } # for f

    ## delete for any re-downloads
    try(unlink(jsonFileList), silent = TRUE)

    ## result history information ---------------------------------------------
    # get result history from result webpage, section Results information
    importedresultshistory <- NULL
    if (euctrresultshistory) {

      # for date time conversion
      lct <- Sys.getlocale("LC_TIME")
      Sys.setlocale("LC_TIME", "C")

      # helper function: extract results publication dates and
      # change reasons from the top of a trial's results webpage
      extractResultsInformation <- function(t) {
        # get content of partial webpage
        wpText <- rawToChar(t[["content"]])
        eudractNumber <- t[["url"]]
        eudractNumber <- sub(
          paste0(".*(", regEuctr, ").*"),
          "\\1", eudractNumber)
        # extract information about results
        tmpFirstDate <- as.Date(trimws(
          sub(".+First version publication date</div>.*?<div>(.+?)</div>.*",
              "\\1", ifelse(grepl("First version publication date", wpText),
                            wpText, ""))),
          format = "%d %b %Y")
        tmpThisDate <- as.Date(trimws(
          sub(".+This version publication date</div>.*?<div>(.+?)</div>.*",
              "\\1", ifelse(grepl("This version publication date", wpText),
                            wpText, ""))),
          format = "%d %b %Y")
        tmpChanges <- trimws(
          gsub("[ ]+", " ", gsub("[\n\r]", "", gsub("<[a-z/]+>", "", sub(
            ".+Version creation reas.*?<td class=\"valueColumn\">(.+?)</td>.+",
            "\\1", ifelse(grepl("Version creation reas", wpText), wpText, ""))
          ))))
        # return
        return(list("eudractNumber" = eudractNumber,
                    "tmpFirstDate" = tmpFirstDate,
                    "tmpThisDate" = tmpThisDate,
                    "tmpChanges" = tmpChanges))
      }

      # this does not include the retrieval of information
      # about amendment to the study, as presented at the bottom
      # of the webpage for the respective trial results
      message("(4/4) Retrieving results history:")

      # prepare download and save
      pool <- curl::new_pool(
        multiplex = TRUE)
      #
      pc <- 0L
      curlSuccess <- function(res) {
        pc <<- pc + 1L
        # incomplete data is 206L but some results pages are complete
        if (res$status_code %in% c(200L, 206L)) {
          retdat <<- c(retdat, list(extractResultsInformation(res)))
          message("\r", pc, " downloaded", appendLF = FALSE)
        }}

      # compose urls to access results page
      urls <- vapply(paste0(
        "https://www.clinicaltrialsregister.eu/ctr-search/trial/",
        eudractnumbersimported, "/results"),
        utils::URLencode, character(1L))

      # add urls to pool
      tmp <- lapply(
        seq_along(urls),
        function(i) {
          h <- curl::new_handle(
            url = urls[i],
            range = "0-30000", # only top of page needed
            accept_encoding = "identity")
          curl::handle_setopt(h, .list = getOption("httr_config")[["options"]])
          curl::multi_add(
            handle = h,
            done = curlSuccess,
            pool = pool)
        })

      # do download and save into batchresults
      retdat <- list()
      tmp <- try(curl::multi_run(
        pool = pool), silent = TRUE)

      # check plausibility
      if (inherits(tmp, "try-error") || tmp[["error"]] || !length(retdat)) {
        stop("Download from EUCTR failed; last error with one or more of:\n",
             paste0(urls, collapse = "\n"), call. = FALSE
        )
      }

      # combine results
      resultHistory <- do.call(
        rbind,
        c(lapply(retdat, as.data.frame),
          stringsAsFactors = FALSE,
          make.row.names = FALSE))

      # apply to store in database
      message(", updating records ", appendLF = FALSE)
      importedresultshistory <- apply(
        resultHistory, 1,
        function(r) {
          r <- as.list(r)
          # check, add default, inform user;
          # && not &: this is a scalar condition
          if (is.na(r$tmpFirstDate) &&
              is.na(r$tmpThisDate) &&
              r$tmpChanges == "") {
            message("x ", appendLF = FALSE)
          } else {
            # update record
            message(". ", appendLF = FALSE)
            tmp <- nodbi::docdb_update(
              src = con,
              key = con$collection,
              value = list(
                "firstreceived_results_date" = as.character(r[["tmpFirstDate"]]),
                "this_results_date" = as.character(r[["tmpThisDate"]]),
                "version_results_history" = r[["tmpChanges"]]),
              query = paste0('{"a2_eudract_number":"', r[["eudractNumber"]], '"}')
            )
            # return if successful
            if (inherits(tmp, "try-error")) 0L else 1L
          }
        }) # apply resultsHistory

      # reset date time
      Sys.setlocale("LC_TIME", lct)

      # sum up successful downloads
      importedresultshistory <- sum(unlist(
        importedresultshistory, use.names = FALSE), na.rm = TRUE)

    } else {
      message("(4/4) Results history: not retrieved ",
              "(euctrresultshistory = FALSE)",
              appendLF = FALSE)
    } # if euctrresultshistory

    ## inform user on final import outcomes
    message("\n= Imported or updated results for ",
            importedresults, " trials")
    if (!is.null(importedresultshistory) &&
        importedresultshistory > 0L) {
      message("= Imported or updated results history for ",
              importedresultshistory, " trials")
    }
    if (documents.path != tempDir) {
      message("= documents saved in '", documents.path, "'")
    }

  } # if euctrresults

  # return
  return(imported)
}
# end ctrLoadQueryIntoDbEuctr
# (removed stray non-R text introduced by a documentation website scrape;
# it was not part of the package source and would break parsing)