R/ctrGetQueryUrl.R

Defines functions ctrGetQueryUrl

Documented in ctrGetQueryUrl

### ctrdata package

#' Get query details
#'
#' Extracts query parameters and register name from parameter `url` or
#' from the clipboard, into which the URL of a register search was copied.
#'
#' @param url URL such as from the browser address bar.
#' If not specified, clipboard contents will be checked for
#' a suitable URL.
#' For automatically copying the user's query of a register
#' in a web browser to the clipboard, see
#' \ifelse{latex}{\out{\href{https://github.com/rfhb/ctrdata\#3-script-to-automatically-copy-users-query-from-web-browser}{here}}}{\href{https://github.com/rfhb/ctrdata#3-script-to-automatically-copy-users-query-from-web-browser}{here}}.
#' Can also contain a query term such as from
#' \link{dbQueryHistory}()["query-term"].
#'
#' @param register Optional name of register (one of "EUCTR", "CTGOV",
#' "ISRCTN" or "CTIS") in case `url` is a query term
#'
#' @export
#'
#' @return A data frame (or tibble, if \code{tibble} is loaded)
#' with column names `query-term` and `query-register`.
#' The data frame (or tibble) can be passed as such as parameter
#' `query-term` to \link{ctrLoadQueryIntoDb} and as parameter
#' `url` to \link{ctrOpenSearchPagesInBrowser}.
#'
#' @importFrom clipr read_clip
#' @importFrom tibble as_tibble
#'
#' @examples
#'
#' # user copied into the clipboard the URL from
#' # the address bar of the browser that shows results
#' # from a query in one of the trial registers
#' try(ctrGetQueryUrl(), silent = TRUE)
#'
#' # extract query parameters from search result URL
#' # (URL was cut for the purpose of formatting only)
#' ctrGetQueryUrl(
#'     url = paste0(
#'         "https://classic.clinicaltrials.gov/ct2/results?",
#'         "cond=&term=AREA%5BMaximumAge%5D+RANGE%5B0+days%2C+28+days%5D",
#'         "&type=Intr&rslt=&age_v=&gndr=&intr=Drugs%2C+Investigational",
#'         "&titles=&outc=&spons=&lead=&id=&cntry=&state=&city=&dist=",
#'         "&locn=&phase=2&rsub=&strd_s=01%2F01%2F2015&strd_e=01%2F01%2F2016",
#'         "&prcd_s=&prcd_e=&sfpd_s=&sfpd_e=&rfpd_s=&rfpd_e=&lupd_s=&lupd_e=&sort="
#'     )
#' )
#'
#' ctrGetQueryUrl("https://www.clinicaltrialsregister.eu/ctr-search/trial/2007-000371-42/results")
#' ctrGetQueryUrl("https://euclinicaltrials.eu/app/#/view/2022-500041-24-00")
#' ctrGetQueryUrl("https://euclinicaltrials.eu/app/#/search?sponsorTypeCode=1")
#' ctrGetQueryUrl("https://classic.clinicaltrials.gov/ct2/show/NCT01492673?cond=neuroblastoma")
#' ctrGetQueryUrl("https://clinicaltrials.gov/ct2/show/NCT01492673?cond=neuroblastoma")
#' ctrGetQueryUrl("https://www.clinicaltrials.gov/study/NCT01467986?aggFilters=ages:child")
#' ctrGetQueryUrl("https://www.isrctn.com/ISRCTN70039829")
#'
ctrGetQueryUrl <- function(
    url = "",
    register = "") {
  # check parameters expectations
  if (!is.atomic(url) || !is.atomic(register) ||
      is.null(url) || is.null(register) ||
      !inherits(url, "character") || !inherits(register, "character") ||
      length(url) != 1L || length(register) != 1L ||
      is.na(url) || is.na(register)) {
    stop("ctrGetQueryUrl(): 'url' and / or 'register' ",
         "is not a single character string, url: '",
         url, "', register: '", register, "'",
         call. = FALSE
    )
  }

  # if no parameter specified,
  # check clipboard contents
  if (nchar(url) == 0L && register != "CTIS") {
    url <- try(
      suppressWarnings(
        clipr::read_clip(
          allow_non_interactive = TRUE
        )
      ),
      silent = TRUE
    )
    if (inherits(url, "try-error")) url <- ""
    if (is.null(url) || (length(url) != 1L) || (nchar(url) == 0L) ||
        grepl(regQueryterm, url) ||
        !startsWith(url, "https://")) {
      stop("ctrGetQueryUrl(): no clinical trial register ",
           "search URL found in parameter 'url' or in clipboard.",
           call. = FALSE
      )
    }
    message("* Using clipboard content as register query URL: ", url)
  }

  # check parameter combination
  if (register != "" && startsWith(url, "http")) {
    warning("Full URL but also 'register' specified; ",
            "continuing with register = ''",
            immediate. = TRUE
    )
    register <- ""
  }

  # identify domain and register short name
  registerFromUrl <- switch(sub("^https://([a-zA-Z.]+?)/.*", "\\1", url),
                            "classic.clinicaltrials.gov" = "CTGOV",
                            "www.clinicaltrials.gov" = "CTGOV2",
                            "clinicaltrials.gov" = "CTGOV2",
                            "euclinicaltrials.eu" = "CTIS",
                            "www.clinicaltrialsregister.eu" = "EUCTR",
                            "www.isrctn.com" = "ISRCTN",
                            "isrctn.com" = "ISRCTN",
                            "NONE"
  )

  # check parameters expectations
  if (register != "" && registerFromUrl != "NONE" && register != registerFromUrl) {
    stop("ctrGetQueryUrl(): 'url' and / or 'register' mismatch, url: '",
         deparse(url), "', register: '", deparse(register), "'",
         call. = FALSE
    )
  } else {
    if (registerFromUrl != "NONE") register <- registerFromUrl
  }

  # handle any mismatch of ctgov label with expected parameters
  if (grepl("^CTGOV[2]?$", register)) register <- ctgovVersion(url, register)

  # output value for return
  outdf <- function(qt, reg) {
    qt <- utils::URLdecode(qt)
    message("* Found search query from ", reg, ": ", qt)
    out <- data.frame(
      `query-term` = qt,
      `query-register` = reg,
      check.names = FALSE,
      stringsAsFactors = FALSE
    )
    if (any("tibble" == .packages())) out <- tibble::as_tibble(out)
    return(out)
  }

  ## identify query term per register

  # euctr
  if (register == "EUCTR") {
    # search result page
    queryterm <- sub(".*/ctr-search/search[?](.*)", "\\1", url)
    # single trial page
    queryterm <- sub(
      paste0(".*/ctr-search/trial/(", regEuctr, ")/.*"),
      "\\1", queryterm
    )
    # remove any intrapage anchor, e.g. #tableTop
    queryterm <- sub("#.+$", "", queryterm)
    # sanity correction for naked terms
    queryterm <- sub(
      "(^|&|[&]?\\w+=\\w+&)([ a-zA-Z0-9+-]+)($|&\\w+=\\w+)",
      "\\1query=\\2\\3", queryterm
    )

    # check if url was for results of single trial
    if (endsWith(url, ".*/results")) {
      queryterm <- paste0(queryterm)
    }

    # return
    return(outdf(queryterm, register))
  }

  # ctgov classic
  if (register == "CTGOV") {
    # mangle query term
    queryterm <- sub(
      paste0(".*/ct2/show/[recodsult/]*(", regCtgov, ")([?][a-z]+.*|$)"),
      "\\1", url
    )
    # single trial page
    if (grepl("[?][a-z]+=\\w+", url, perl = TRUE) &&
        grepl(paste0("^", regCtgov, "$"), queryterm)) {
      message(
        "* Note: 'url' shows a single trial (and is returned by the ",
        "function) but also had search parameters: If interested in ",
        "search results, click 'Return to List' in browser and use ",
        "this as 'url'."
      )
    }

    # expert search page
    queryterm <- sub(".*/ct2/results/refine[?](.*)", "\\1", queryterm)
    # search results page
    queryterm <- sub(".*/ct2/results[?](.*)", "\\1", queryterm)
    # other results page
    queryterm <- sub(
      "(.*)&Search[a-zA-Z]*=(Search|Find)[a-zA-Z+]*",
      "\\1", queryterm
    )

    # remove empty parameters
    queryterm <- gsub("[a-z_0-9]+=&", "", queryterm)
    queryterm <- sub("&[a-z_0-9]+=$", "", queryterm)

    # correct naked terms
    queryterm <- sub(
      "(^|&|[&]?\\w+=\\w+&)(\\w+|[a-zA-z0-9+-.:]+)($|&\\w+=\\w+)",
      "\\1term=\\2\\3", queryterm
    )

    # return
    return(outdf(queryterm, register))
  }

  # iscrtn
  if (register == "ISRCTN") {
    # single trial page
    queryterm <- sub(
      paste0("^.*/ISRCTN(", regIsrctn, ")([&?].+|$)"),
      "ISRCTN\\1", url
    )
    # search results page
    queryterm <- sub(".*/search[?](.*)", "\\1", queryterm)
    # remove unnecessary parameter
    queryterm <- sub("&searchType=[a-z]+-search", "", queryterm)
    # correct naked terms
    queryterm <- sub(
      "(^|&|[&]?\\w+=\\w+&)(\\w+|[ a-zA-Z0-9+-]+)($|&\\w+=\\w+)",
      "\\1q=\\2\\3", queryterm
    )

    # single trial
    if (nchar(queryterm) && grepl(regIsrctn, queryterm) &&
        grepl("[?&].+=[^&]+", url)) {
      message(
        "* Note: 'url' shows a single trial (and is returned by the ",
        "function) but also had search parameters: If interested in ",
        "search results, click 'Back to results' in browser and use ",
        "this as 'url'."
      )
    }

    # return
    return(outdf(queryterm, register))
  }

  # ctgov new 2023
  if (register == "CTGOV2") {
    # extract search query
    queryterm <- sub(
      paste0("(.*/study/", regCtgov, "/?[?]|.*/search/?[?][&]?)([a-z]+.*$)"),
      "\\2", url
    )

    # single trial page
    if (nchar(queryterm) && queryterm != url && grepl(regCtgov2, url)) {
      message(
        "* Note: 'url' shows a single trial (and is returned by the ",
        "function) but also had search parameters: If interested in ",
        "search results, click on 'Search Results' in browser and use ",
        "this as 'url'."
      )
    }
    if (grepl(paste0("study/", regCtgov2), url)) {
      queryterm <-
        paste0(sub(paste0(".*study/(", regCtgov2, ").*"), "id=\\1", url))
    }

    # remove empty parameters, rank, sort
    queryterm <- gsub("[a-z_0-9]+=&", "", queryterm)
    queryterm <- sub("&[a-z_0-9]+=$", "", queryterm)
    queryterm <- sub("&rank=[0-9]+", "", queryterm)
    queryterm <- sub("&sort=[a-zA-Z%23,:]+(&|$)", "\\1", queryterm)
    queryterm <- sub("&viewType=[a-zA-Z]+(&|$)", "\\1", queryterm)

    # correct naked terms
    queryterm <- sub(
      "(^|&|[&]?\\w+=\\w+&)(\\w+|[a-zA-z0-9+-.:]+)($|&\\w+=\\w+)",
      "\\1term=\\2\\3", queryterm
    )

    # return
    return(outdf(queryterm, register))
  }

  # ctis
  if (register == "CTIS") {
    # some seem to use this
    queryterm <- sub(
      "https://euclinicaltrials.eu/ct-public-api-services/services/ct/publiclookup[?]", "", url
    )
    # or https://euclinicaltrials.eu/app/#/search?status=Ended
    queryterm <- sub(
      "https://euclinicaltrials.eu/app/#/search[?]?", "", queryterm
    )
    queryterm <- sub(
      "https://euclinicaltrials.eu/app/#/view/", "", queryterm
    )

    # remove unnecessary components
    queryterm <- sub("&?paging=[-,0-9]+", "", queryterm)
    queryterm <- sub("&?sorting=[-a-zA-Z]+", "", queryterm)
    queryterm <- sub("&?isEeaOnly=false", "", queryterm)
    queryterm <- sub("&?isNonEeaOnly=false", "", queryterm)
    queryterm <- sub("&?isBothEeaNonEea=false", "", queryterm)

    # url lists single trial
    if (grepl(paste0("^", regCtis, "$"), queryterm)) {
      queryterm <- paste0(
        "number=", queryterm
      )
    }

    # return
    return(outdf(queryterm, register))
  }

  # default / NONE
  warning("ctrGetQueryUrl(): no clinical trial register ",
          "search URL found in parameter 'url' or in clipboard.",
          call. = FALSE, immediate. = TRUE
  )
  return(invisible(NULL))
}
# end ctrGetQueryUrl

Try the ctrdata package in your browser

Any scripts or data that you put into this service are public.

ctrdata documentation built on Nov. 24, 2023, 5:11 p.m.