R/ctrRerunQuery.R

Defines functions ctrRerunQuery

### ctrdata package

#' ctrRerunQuery
#'
#' @inheritParams ctrLoadQueryIntoDb
#'
#' @keywords internal
#' @noRd
#'
#' @importFrom stringi stri_extract_all_regex
#' @importFrom jqr jq
#' @importFrom jsonlite toJSON
#' @importFrom nodbi docdb_query docdb_update
#' @importFrom httr2 req_perform req_body_json request req_user_agent
#'
ctrRerunQuery <- function(
    querytoupdate = querytoupdate,
    forcetoupdate = forcetoupdate,
    ctishistory = ctishistory,
    only.count = only.count,
    con = con,
    verbose = verbose,
    queryupdateterm = queryupdateterm) {

  ## check database connection
  con <- ctrDb(con)

  ## prepare
  failed <- NULL

  ## handle query history -----------------------------------------------------
  rerunquery <- dbQueryHistory(con = con, verbose = verbose)

  # check parameters
  if (is.null(rerunquery) || !nrow(rerunquery))
    stop("'querytoupdate': no previous queries found in collection, ",
         "aborting query update", call. = FALSE)

  # select last query if specified
  if (querytoupdate == "last")
    querytoupdate <- nrow(rerunquery)

  # check parameters
  if (!(as.integer(querytoupdate) == querytoupdate))
    stop("'querytoupdate' needs to be an integer number", call. = FALSE)
  querytoupdate <- as.integer(querytoupdate)

  # try to select the query to be updated
  if (querytoupdate > nrow(rerunquery) ||
      querytoupdate < 1L) {
    stop("'querytoupdate': specified query number ", querytoupdate,
         " not found, check 'dbQueryHistory()'", call. = FALSE)
  }

  # set query values as retrieved
  queryterm  <- rerunquery[querytoupdate, "query-term", drop = TRUE]
  register   <- rerunquery[querytoupdate, "query-register", drop = TRUE]

  # secondary check parameters in case history queries need
  # to be translated or otherwise manipulated as for new queries
  query <- ctrGetQueryUrl(url = queryterm, register = register)
  queryterm <- query$`query-term`
  register <- query$`query-register`

  # when was this query last run?
  #
  # - dates of all the same queries
  initialday <- rerunquery[["query-timestamp"]][
    rerunquery[querytoupdate, "query-term", drop = TRUE] ==
      rerunquery[["query-term"]]]
  #
  # - remove time, keep date
  initialday <- substr(initialday, start = 1L, stop = 10L)
  #
  # - change to Date class and get
  #   index of latest (max) date,
  initialdayindex <- try(which.max(as.Date(initialday, format = "%Y-%m-%d")))
  if (!inherits(initialdayindex, "try-error")) {
    # - keep initial (reference) date of this query
    initialday <- initialday[initialdayindex]
  } else {
    # - fallback to number (querytoupdate) as specified by user
    initialday <- rerunquery[querytoupdate, "query-timestamp", drop = TRUE]
  }
  message("* Query last run: ", initialday)

  ## adapt updating procedure to respective register
  querytermoriginal <- queryterm

  # mangle parameter only if not forcetoupdate,
  # which just returns parameters of original query
  if (!forcetoupdate) {

    # inform user
    message("* Checking for new or updated trials...")

    # ctgov --------------------------------------------------------------------

    # no more needed because previous CTGOV query is
    # translated to CTGOV2 by ctrGetQueryUrl above

    # ctgov2 -------------------------------------------------------------------
    if (register == "CTGOV2") {

      # ctgov2: specify last update start / end:
      # https://www.clinicaltrials.gov/search?cond=Cancer&lastUpdPost=2022-01-01_2023-12-31

      # if "lastUpdPost" is already in query term, just re-run full query
      # to avoid multiple queries in history that differ only in timestamp:
      if (grepl("&lastUpdPost=[0-9]{2}", queryterm)) {

        # remove queryupdateterm, thus running full again
        queryupdateterm <- ""
        warning("Query has date(s) for start or end of last update ",
                "('&lastUpdPost'); running again with these limits",
                call. = FALSE, immediate. = TRUE)

      } else {

        queryupdateterm <- strftime(strptime(
          initialday, format = "%Y-%m-%d"), format = "%Y-%m-%d")

        queryupdateterm <- paste0("&lastUpdPost=", queryupdateterm, "_")

        if (verbose) message("DEBUG: additional term: ", queryupdateterm)

      }

      message("- Rerunning query: ", queryterm,
              "\n- Last run: ", initialday)
    } # end ctgov2

    # euctr -------------------------------------------------------------------
    if (register == "EUCTR") {

      # euctr: studies added or updated in the last 7 days:
      # "https://www.clinicaltrialsregister.eu/ctr-search/rest/feed/
      # bydates?query=cancer&age=children"

      # check if update request is in time window of the register (7 days)
      if (difftime(Sys.Date(), initialday, units = "days") > 7L) {

        warning("'querytoupdate=", querytoupdate, "' not possible because ",
                "it was last run more than 7 days ago and the register ",
                "provides information on changes only for the last 7 days. ",
                "Reverting to normal download. ",
                call. = FALSE, immediate. = TRUE)

        message("- Rerunning query: ", queryterm,
                "\n- Last run: ", initialday)

      } else {

        # obtain rss feed with list of recently updated trials
        rssquery <- utils::URLencode(
          paste0("https://www.clinicaltrialsregister.eu/ctr-search/",
                 "rest/feed/bydates?", queryterm))

        if (verbose) message("DEBUG (rss url): ", rssquery)

        resultsRss <- try(rawToChar(
          httr2::req_perform(
            httr2::req_user_agent(
              httr2::request(
                rssquery),
              ctrdataUseragent
            ))$body), silent = TRUE)

        # check plausibility
        if (inherits(resultsRss, "try-error")) {
          stop("Download from EUCTR failed; last error: ",
               class(resultsRss), call. = FALSE)
        }

        # inform user
        if (verbose) message("DEBUG (rss content): ", resultsRss)

        # attempt to extract euctr number(s)
        resultsRssTrials <- gregexpr(
          "eudract_number:[0-9]{4}-[0-9]{6}-[0-9]{2}</link>",
          resultsRss)[[1]]

        if (length(resultsRssTrials) == 1L &&
            resultsRssTrials == -1L) {

          # inform user
          message("First result page empty - no (new) trials found?")

          # only for EUCTR, update history here because
          # for EUCTR the query to be used with function
          # ctrLoadQueryIntoDb cannot be specified to only
          # query for updated trials
          dbCTRUpdateQueryHistory(
            register = register,
            queryterm = queryterm,
            recordnumber = 0L,
            con = con,
            verbose = verbose)

          # set indicator
          failed <- emptyReturn

        } else {

          # new trials found, construct
          # differential query to run
          resultsRssTrials <- vapply(
            resultsRssTrials, FUN = function(x) {
              substr(resultsRss, x + 15, x + 28)
            }, character(1L))

          resultsRssTrials <- paste0(
            "query=",
            paste(
              resultsRssTrials,
              collapse = "+OR+"))

          if (verbose) message("DEBUG (rss trials): ", resultsRssTrials)

          # run query for extracted euctr number(s)
          # store original query in update term
          queryupdateterm <- queryterm
          queryterm <- resultsRssTrials

          if (verbose) {
            message("DEBUG: additional term: ",
                    queryupdateterm)
          }

          message("- Rerunning query: ", queryupdateterm,
                  "\n- Last run: ", initialday)
        }
        #
      }
    } # register euctr

    # isrctn ------------------------------------------------------------------
    if (register == "ISRCTN") {

      # isrctn last edited:
      # "&filters=condition:Cancer,
      #  GT+lastEdited:2021-04-01T00:00:00.000Z,
      #  LE+lastEdited:2021-04-25T00:00:00.000Z&"

      # if already in query term, just re-run full query to avoid
      # multiple queries in history that only differ in timestamp:
      if (grepl("lastEdited:", queryterm)) {

        # remove queryupdateterm, thus running full again
        queryupdateterm <- ""
        warning("Query has date(s) for start or end of last update ",
                "('lastEdited'); running again with these limits",
                immediate. = TRUE)

      } else {

        queryupdateterm <- strftime(
          strptime(initialday,
                   format = "%Y-%m-%d"),
          format = "%Y-%m-%d")

        queryupdateterm <- paste0(" AND lastEdited GE ",
                                  queryupdateterm,
                                  "T00:00:00.000Z")

        if (verbose) {
          message("DEBUG: additional term: ",
                  queryupdateterm)
        }

      }

      message("- Rerunning query: ", queryterm,
              "\n- Last run: ", initialday)
    } # end isrctn

    # ctis ------------------------------------------------------------------
    if (register == "CTIS") {

      # principles:
      # - historic ctis versions are only created in ctrRerunQuery
      #   because the user needs to "trigger" creating a version,
      #   since ctis does not on its own offer an API for retrieving
      #   versions of a record
      # - in ctrRerunQuery, updating trials identified in the last
      #   seven days is straightforward and can readily include
      #   creating historic versions
      # - for rerunning older queries,
      #   we need to get the concerned ids and this is only possible
      #   with a full ctrLoadQueryIntoDb but into a separate database
      #   so that we do not overwrite existing records;
      #   over the resulting $success trial identifiers,
      #   then iterate

      # helper function
      getIdsFromRss <- function(queryterm) {

        # ctis: studies added or updated in the last 7 days:
        # "https://euclinicaltrials.eu/ctis-public-api/rss/updates.rss?
        # search_criteria={"ageGroupCode":[2],"therapeuticAreaCode":[4]}"

        # obtain rss feed with list of recently updated trials
        rssquery <- paste0(
          "https://euclinicaltrials.eu/ctis-public-api/rss/updates.rss?search_criteria=",
          utils::URLencode(sub("searchCriteria=", "", queryterm)))

        if (verbose) message("DEBUG (rss url): ", utils::URLdecode(rssquery))

        resultsRss <- try(rawToChar(
          httr2::req_perform(
            httr2::req_user_agent(
              httr2::request(
                rssquery),
              ctrdataUseragent
            ))$body), silent = TRUE)

        idsUpdatedTrials <- stringi::stri_extract_all_regex(
          # <link>https://euclinicaltrials.eu/search-for-clinical-trials/?lang=en&amp;EUCT=2024-516838-35-00</link>
          resultsRss, "EUCT=[-0-9]+</link>")[[1]]
        idsUpdatedTrials <- na.omit(stringi::stri_replace_all_regex(
          idsUpdatedTrials, "EUCT=([-0-9]+)</link>", "$1"))

        return(idsUpdatedTrials)

      } # end getIdsFromRss

      # helper function
      updateOrLoadTrial <- function(trialId, con, ctishistory) {

        message(". ", appendLF = FALSE)

        # check if exists
        recExists <- nodbi::docdb_query(
          src = con,
          key = con$collection,
          query = paste0('{"_id":"', trialId, '"}'),
          fields = '{"_id":1}')

        # get existing data in collection
        if (ctishistory) {
          exstJson <- nodbi::docdb_query(
            src = con,
            key = con$collection,
            query = paste0('{"_id":"', trialId, '"}'))
        }

        # get new data
        result <- suppressWarnings(
          suppressMessages(
            ctrLoadQueryIntoDbCtis(
              queryterm = paste0('searchCriteria={"number":"', trialId, '"}'),
              con = con,
              documents.path = NULL,
              only.count = FALSE,
              verbose = FALSE
            )))
        result$updated <- nrow(recExists)

        # if historical version is to be created
        if (ctishistory && nrow(exstJson)) {

          # move existing data into historical version
          # - lastUpdated is part of ctisEndpoints[1],
          #   the /search overview obtained from CTIS;
          #   there is no reference information about
          #   the meaning of this field, however
          # - versionNumber incremented every time
          exstJson <- jsonlite::toJSON(exstJson)
          exstJson <- jqr::jq(
            exstJson, paste0(
              '( .[] | ( if has("history") then
                  ( [ .history[] | .history_version | .version_number ] | max + 1 )
                  else 1 end )
                ) as $n |
                { _id: .[] | ._id,
                 history: .[] | [
                    ( del(.history) | .history_version = {
                         version_date: .lastUpdated,
                         version_number: $n }),
                    if has("history") then .history[] else empty end
                  ]
                 }'
            ))

          # temporary file and cleanup
          tfname <- tempfile()
          on.exit(try(unlink(tfname), silent = TRUE), add = TRUE)
          writeLines(exstJson, con = file(tfname))

          # update record, adding historical versions
          # avoid SQL issues by using file-based json
          result$updated <- nodbi::docdb_update(
            src = con,
            key = con$collection,
            value = tfname,
            query = "{}"
          )

        } # if historical version

        # default return
        return(result)

      } # end updateOrLoadTrial

      # helper function
      histCreateRet <- function(res) {

        # querytermoriginal, queryupdateterm, queryterm, register

        # construct return object
        ret <- NULL
        ret$n <- sum(sapply(res, "[[", "n"))
        ret$success <- unlist(sapply(res, "[[", "success"), use.names = FALSE)
        ret$failed <- NULL
        ret$queryterm <- querytermoriginal
        ret$updated <- sum(sapply(res, "[[", "updated"))

        # annotate
        if (ret$n > 0L) {
          dbCTRUpdateQueryHistory(
            register = register,
            queryterm = querytermoriginal,
            recordnumber = ret$n,
            con = con,
            verbose = verbose
          )
        }

        # add meta-data
        ret <- addMetaData(x = ret, con = con)

        # failed is indicator to not run main function
        return(list(
          "querytermoriginal" = querytermoriginal,
          "queryupdateterm"   = queryupdateterm,
          "queryterm"         = queryterm,
          "register"          = register,
          "failed"            = ret))

      }

      #### . dispatch ####

      # separate out decision alternatives for best overview

      useRss <- difftime(Sys.Date(), initialday, units = "days") <= 7L

      # - alternative 1
      if (!ctishistory & !useRss) {

        # rerunning original query
        warning(
          "'querytoupdate=", querytoupdate, "' not possible because no ",
          "effcient way was found so far to query CTIS for data only from ",
          "recently changed trials (last checked 2025-08-21). Reverting to ",
          "normal download. ", call. = FALSE, immediate. = TRUE)

        # standard case in main function ctrLoadQueryIntoDb
        message("- Rerunning query: ", queryterm,
                "\n- Last run: ", initialday)

        # go to default return at bottom

      } # not ctishistory and not useRss

      # - alternative 2
      if (ctishistory | useRss) {

        # - get idsUpdatedTrials for processing
        if (useRss) {

          idsUpdatedTrials <- getIdsFromRss(queryterm)

          # prepare for early exit if only.count
          if (only.count) message(
            "= Not done (only.count = TRUE): Imported ",
            length(idsUpdatedTrials), " trial(s)")

        } else {

          # rerunning original query
          warning(
            "'querytoupdate=", querytoupdate, "' not possible because no ",
            "effcient way was found so far to query CTIS for data only from ",
            "recently changed trials (last checked 2025-08-21). Need to ",
            "download iteratively. ", call. = FALSE, immediate. = TRUE)

          res <- ctisApi1(
            queryterm, only.count,
            "https://euclinicaltrials.eu/ctis-public-api/search",
            ctrTempDir(verbose), verbose)

          # prepare for early exit if only.count
          if (only.count) {
            idsUpdatedTrials <- rep.int(1L, res$n)
          } else {
            idsUpdatedTrials <- res$idsTrials
          }

        }

        # early exit if only.count
        if (only.count || !length(idsUpdatedTrials)) {
          res <- NULL
          res$n <- length(idsUpdatedTrials)
          res$queryterm <- querytermoriginal
          # failed is indicator to not run main function
          return(list(failed = res))
        }

        # user interim info
        message(
          ifelse(
            ctishistory,
            "- Creating historic versions when updating trials\n", ""
          ),
          "- Loading and updating ", length(idsUpdatedTrials),
          " trials one-by-one (estimate: ",
          ifelse(
            ctishistory,
            signif(length(idsUpdatedTrials) * 25 / 40, 2L),
            signif(length(idsUpdatedTrials) * 20 / 40, 2L)
          ), " s)...")

        # iterate
        res <- list()
        # print(system.time(
          for (trialId in idsUpdatedTrials) res <- c(
            res, list(updateOrLoadTrial(trialId, con, ctishistory)))
        # ))

        # info
        message(
          "\r= ",
          sum(sapply(res, "[[", "updated")), " updated, ",
          sum(sapply(res, "[[", "n")) - sum(sapply(res, "[[", "updated")),
          " new records")

        # return and signal to ctrLoadQueryIntoDb to exit early
        return(histCreateRet(res))

      } # ctishistory or useRss

    } # end ctis

  } # forcetoupdate

  #### return ####

  ## return main parameters needed
  return(list(
    "querytermoriginal" = querytermoriginal,
    "queryupdateterm"   = queryupdateterm,
    "queryterm"         = queryterm,
    "register"          = register,
    "failed"            = failed))

} # end ctrRerunQuery

Try the ctrdata package in your browser

Any scripts or data that you put into this service are public.

ctrdata documentation built on Jan. 18, 2026, 9:07 a.m.