R/utils.R

Defines functions checkBinary installFindBinary installCygwinWindowsTest installCygwinWindowsDoInstall setProxy addMetaData typeField dfFindUniqueEuctrRecord dfMergeTwoVariablesRelevel dfListExtractKey dfTrials2Long dfName2Value dbGetFieldsIntoDf dbFindIdsUniqueTrials dbFindFields dbQueryHistory ctrFindActiveSubstanceSynonyms ctrGetQueryUrlFromBrowser ctrGetQueryUrl ctrOpenSearchPagesInBrowser ctrDb

Documented in ctrDb ctrFindActiveSubstanceSynonyms ctrGetQueryUrl ctrGetQueryUrlFromBrowser ctrOpenSearchPagesInBrowser dbFindFields dbFindIdsUniqueTrials dbGetFieldsIntoDf dbQueryHistory dfFindUniqueEuctrRecord dfListExtractKey dfMergeTwoVariablesRelevel dfName2Value dfTrials2Long installCygwinWindowsDoInstall

### ctrdata package
### utility functions

## variable definitions
#
# EUCTR definitions
countriesEUCTR <- c(
  "AT", "BE", "BG", "HR", "CY", "CZ", "DK", "EE", "FI", "FR",
  "DE", "GR", "HU", "IE", "IT", "LV", "LT", "LU", "MT", "NL",
  "PL", "PT", "RO", "SK", "SE", "SI", "ES", "GB", "IS", "LI",
  "NO", "3RD")
#
# regexpr
# - EudraCT e.g. 2010-022945-52
regEuctr <- "[0-9]{4}-[0-9]{6}-[0-9]{2}"
# - CTGOV
regCtgov <- "NCT[0-9]{8}"
# - regIsrctn
# FIXME check if first digit is always non zero
regIsrctn <- "[1-9][0-9]{7}"
#
# register list
registerList <- c("EUCTR", "CTGOV", "ISRCTN")


#' Check and prepare nodbi connection object for ctrdata
#'
#' @param con A \link[nodbi]{src} connection object, as obtained with
#'  nodbi::\link[nodbi]{src_mongo}() or nodbi::\link[nodbi]{src_sqlite}()
#'
#' @keywords internal
#'
#' @importFrom nodbi src_sqlite
#' @importFrom utils capture.output
#'
#' @return Connection object as list, with collection
#'  element under root
#'
ctrDb <- function(
  con = nodbi::src_sqlite(
    collection = "ctrdata_auto_generated")) {

  ## sqlite
  if (inherits(con, "src_sqlite")) {

    if (is.null(con$collection)) {
      stop("In src_sqlite(), a parameter 'collection' needs to specify ",
           "the name of a table, such as src_sqlite(collection = 'test'), ",
           "for package ctrdata to work with other nosql databases.",
           call. = FALSE)
    }

    # check
    if (!RSQLite::dbIsValid(con$con)) {
      con <- nodbi::src_sqlite(dbname = con$dbname,
                               collection = con$collection)
    }

    # add database as element under root
    con <- c(con,
             "db" = con$dbname,
             "ctrDb" = TRUE)

    # print warning from nodbi::src_sqlite()
    if (grepl(":memory:", con$dbname)) {
      warning("Database not persisting,\ncopy to persistant database like ",
              "this:\n\nRSQLite::sqliteCopyDatabase(",
              "\n  from = <your in-memory-database-object>$con,",
              "\n  to = RSQLite::dbConnect(RSQLite::SQLite(),",
              "\n                          dbname = 'local_file.db'))\n",
              call. = FALSE,
              noBreaks. = FALSE,
              immediate. = TRUE)
    }

    ## return
    return(structure(con,
                     class = c("src_sqlite", "docdb_src")))
  }

  ## mongo
  if (inherits(con, "src_mongo")) {

    # rights may be insufficient to call info(),
    # hence this workaround that should always
    # work and be stable to retrieve name of
    # collection in the mongo connection
    # suppress... for reconnect info from mongolite
    coll <- suppressMessages(utils::capture.output(con$con)[1])
    coll <- sub("^.*'(.*)'.*$", "\\1", coll)

    # add collection as element under root
    con <- c(con,
             "collection" = coll,
             "ctrDb" = TRUE)

    ## return
    return(structure(con,
                     class = c("src_mongo", "docdb_src")))
  }

  ## unprepared for other nodbi adapters so far
  stop("Please specify in parameter 'con' a database connection. ",
       "crdata supports so far only src_mongo() and src_sqlite().",
       call. = FALSE)

} # end ctrDb


#' Open advanced search pages of register(s) or execute search in browser
#'
#' @param url of search results page to show in the browser.
#'   May be the output of \link{ctrGetQueryUrl} or from \link{dbQueryHistory}.
#'
#' @param register Register(s) to open. Either "EUCTR" or "CTGOV" or a vector of
#'   both. Default is to open both registers' advanced search pages. To open the
#'   browser with a previous search, the output of \link{ctrGetQueryUrl}
#'   or one row from \link{dbQueryHistory} can be used.
#'
#' @param copyright (Optional) If set to \code{TRUE}, opens copyright pages of
#'   register(s).
#'
#' @param ... May include the deprecated \code{input} parameter.
#'
#' @export
#'
#' @return Is always true, invisibly.
#'
#' @examples
#' \dontrun{
#' ctrOpenSearchPagesInBrowser(
#'  "https://www.clinicaltrialsregister.eu/ctr-search/search?query=cancer")
#'
#' # for this example, the clipboard has to
#' # contain the URL from a search in a register
#' ctrOpenSearchPagesInBrowser(
#'  ctrGetQueryUrl())
#'
#' # open the last query that was
#' # loaded into the database
#' db <- nodbi::src_sqlite(
#'   collection = "previously_created"
#' )
#' ctrOpenSearchPagesInBrowser(
#'   dbQueryHistory(con = db))
#' }
#'
ctrOpenSearchPagesInBrowser <- function(
  url = "",
  register = "",
  copyright = FALSE,
  ...) {

  ## FIXME migrate from previously used parameter "input"
  tmp <- list(...)
  tmp <- tmp[["input"]]
  if (length(tmp)) {
    url <- tmp
    warning("Parameter 'input' is deprecated, use 'url' instead.",
            call. = FALSE)
  }

  ## check combination of arguments to select action

  # - open all registers if no parameter is specified
  if (all(register == "") && all(url == "")) {
    sapply(
      c("https://www.clinicaltrialsregister.eu/ctr-search/search",
        "https://clinicaltrials.gov/ct2/search/advanced",
        "https://www.isrctn.com/editAdvancedSearch"),
      function(u) utils::browseURL(u))
  }

  # - open copyright or similar pages
  if (copyright) {
    sapply(
      c("https://www.clinicaltrialsregister.eu/disclaimer.html",
        "https://clinicaltrials.gov/ct2/about-site/terms-conditions#Use",
        "https://www.isrctn.com/page/faqs#usingISRCTN"),
      function(u) utils::browseURL(u))
  }

  # - open from url, or query and register
  if (is.atomic(url) && url != "") {
    url <- ctrGetQueryUrl(url = url, register = register)
  }

  # - get from a data frame, such as from
  #   ctrQueryHistoryInDb() or ctrGetQueryUrl()
  if (is.data.frame(url) &&
      all(substr(names(url), 1, 6) == "query-")) {
    nr <- nrow(url)
    if (nr > 1L) warning("Using last query",
                         call. = FALSE, immediate. = TRUE)
    register  <- url[nr, "query-register"]
    url <- url[nr, "query-term"]
  }

  # - open from url and register
  if (is.atomic(url) && url != "" && register != "") {
    url <- switch(
      register,
      "EUCTR" = paste0("https://www.clinicaltrialsregister.eu/ctr-search/search?", url),
      "CTGOV" = paste0("https://clinicaltrials.gov/ct2/results?", url),
      "ISRCTN" = paste0("https://www.isrctn.com/search?", url))
    utils::browseURL(url = url)
    return(url)
  }

  # return
  invisible(NULL)
}
# end ctrOpenSearchPagesInBrowser


#' Extract query parameters and register name from input or from
#' clipboard into which the URL of a register search was copied
#'
#' @param url URL such as from the browser address bar.
#' If not specified, clipboard contents will be checked for
#' a suitable URL. Can also contain a query term such as from
#' \link{dbQueryHistory}()["query-term"]
#'
#' @param register Optional name of register (i.e., "EUCTR" or
#' "CTGOV") in case url is a query term
#'
#' @return A string of query parameters that can be used to retrieve data
#' from the register.
#'
#' @export
#'
#' @return A data frame with column names query term and register name
#' that can directly be used in \link{ctrLoadQueryIntoDb} and in
#' \link{ctrOpenSearchPagesInBrowser}
#'
#' @examples
#'
#' \dontrun{
#' db <- nodbi::src_sqlite(
#'   collection = "my_collection"
#' )
#'
#' # user now copies into the clipboard the URL from
#' # the address bar of the browser that shows results
#' # from a query in one of the trial registers
#' ctrLoadQueryIntoDb(
#'   ctrGetQueryUrl(),
#'   con = db
#' )
#' }
#'
#' @importFrom clipr read_clip
#'
ctrGetQueryUrl <- function(
  url = "",
  register = "") {
  #
  # check parameters expectations
  if (!is.atomic(url) || !is.atomic(register) ||
      is.null(url) || is.null(register) ||
      !inherits(url, "character") || !inherits(register, "character") ||
      length(url) != 1L || length(register) != 1L ||
      is.na(url) || is.na(register)) {
    stop("ctrGetQueryUrl(): 'url' and / or 'register' ",
         "is not a single character string, url: '",
         deparse(url), "', register: '", deparse(register), "'",
         call. = FALSE)
  }
  #
  # if no parameter specified,
  # check clipboard contents
  if (nchar(url) == 0L) {
    url <- suppressWarnings(
      clipr::read_clip(
        allow_non_interactive = TRUE)
    )
    if (is.null(url) || (length(url) != 1L) || (nchar(url) == 0L)) {
      stop("ctrGetQueryUrl(): no clinical trial register ",
           "search URL found in parameter 'url' or in clipboard.",
           call. = FALSE)
    }
    message("* Using clipboard content as register query URL: ", url)
  }
  #
  #
  if (register != "" && grepl("^http", url)) {
    warning("Full URL but also 'register' specified; ",
            "continuing with register = ''", immediate. = TRUE)
    register <- ""
  }
  #
  # identify domain and register short name
  if (register == "") {
    register <- switch(
      sub("^https://[w]{0,3}[.]?([a-zA-Z.]+)/.*", "\\1", url),
      "clinicaltrialsregister.eu" = "EUCTR",
      "clinicaltrials.gov" = "CTGOV",
      "isrctn.com" = "ISRCTN",
      "NONE")
  }
  #
  outdf <- function(qt, reg) {
    qt <- utils::URLdecode(qt)
    message("* Found search query from ", reg, ": ", qt)
    data.frame(
      `query-term` = qt,
      `query-register` = reg,
      check.names = FALSE,
      stringsAsFactors = FALSE)
  }
  # identify query term per register
  #
  if (register == "EUCTR") {
    # search result page
    queryterm <- sub(".*/ctr-search/search[?](.*)", "\\1", url)
    # single trial page
    queryterm <- sub(paste0(".*/ctr-search/trial/(", regEuctr, ")/.*"),
                     "\\1", queryterm)
    # remove any intrapage anchor, e.g. #tableTop
    queryterm <- sub("#.+$", "", queryterm)
    # sanity correction for naked terms
    queryterm <- sub(
      "(^|&|[&]?\\w+=\\w+&)([ a-zA-Z0-9+-]+)($|&\\w+=\\w+)",
      "\\1query=\\2\\3", queryterm)
    # check if url was for results of single trial
    if (grepl(".*/results$", url)) {
      queryterm <- paste0(queryterm, "&resultsstatus=trials-with-results")
    }
    #
    return(outdf(queryterm, register))
  }
  #
  if (register == "CTGOV") {
    # single trial page
    queryterm <- sub(paste0(".*/ct2/show/(", regCtgov, ")([?][a-z]+.*|$)"),
                     "\\1", url)
    # inform user
    if (grepl("[?][a-z]+=\\w+", url, perl = TRUE) &
        grepl(paste0("^", regCtgov, "$"), queryterm)) {
      message("* Note: 'url' shows a single trial (and is returned by the ",
              "function) but also had search parameters: If interested in ",
              "search results, click 'Return to List' in browser and use ",
              "this as 'url'.")
    }
    # search results page
    queryterm <- sub(".*/ct2/results[?](.*)", "\\1", queryterm)
    # other results page
    queryterm <- sub("(.*)&Search[a-zA-Z]*=(Search|Find)[a-zA-Z+]*",
                     "\\1", queryterm)
    # remove empty parameters
    queryterm <- gsub("[a-z_0-9]+=&", "", queryterm)
    queryterm <- sub("&[a-z_0-9]+=$", "", queryterm)
    # correct naked terms
    queryterm <- sub(
      # "(^|&|[&]?\\w+=\\w+&)(\\w+|[NCT0-9-]+)($|&\\w+=\\w+)",
      "(^|&|[&]?\\w+=\\w+&)(\\w+|[a-zA-z0-9+-.:]+)($|&\\w+=\\w+)",
      "\\1term=\\2\\3", queryterm)
    #
    return(outdf(queryterm, register))
  }
  #
  if (register == "ISRCTN") {
    # single trial page
    queryterm <- sub(paste0("^.*/ISRCTN(", regIsrctn, ")$"),
                     "ISRCTN\\1", url)
    # search results page
    queryterm <- sub(".*/search[?](.*)", "\\1", queryterm)
    # remove unnecessary parameter
    queryterm <- sub("&searchType=[a-z]+-search", "", queryterm)
    # correct naked terms
    queryterm <- sub(
      "(^|&|[&]?\\w+=\\w+&)(\\w+|[ a-zA-Z0-9+-]+)($|&\\w+=\\w+)",
      "\\1q=\\2\\3", queryterm)
    #
    return(outdf(queryterm, register))
  }
  #
  # default / NONE
  warning("ctrGetQueryUrl(): no clinical trial register ",
          "search URL found in parameter 'url' or in clipboard.",
          call. = FALSE, immediate. = TRUE)
  #
  return(invisible(NULL))
}
# end ctrGetQueryUrl


#' Import from clipboard the URL of a search in one of the registers
#'
#' @inheritParams ctrGetQueryUrl
#'
#' @return A string of query parameters that can be used to retrieve data
#' from the register.
#'
#' @export
#'
#' @return A data frame with column names query term and register name
#' that can directly be used in \link{ctrLoadQueryIntoDb} and in
#' \link{ctrOpenSearchPagesInBrowser}
#'
#' @examples
#'
#' \dontrun{
#' db <- nodbi::src_sqlite(
#'   collection = "my_collection"
#' )
#'
#' # user now copies into the clipboard the URL from
#' # the address bar of the browser that shows results
#' # from a query in one of the trial registers
#' ctrLoadQueryIntoDb(
#'   ctrGetQueryUrlFromBrowser(),
#'   con = db
#' )
#' }
#'
#' @importFrom clipr read_clip
#'
ctrGetQueryUrlFromBrowser <- function(
  url = "",
  register = "") {

  # deprecate
  .Deprecated(new = "ctrGetQueryUrl")

  # defer call
  ctrGetQueryUrl(url = url, register = register)

}
# end ctrGetQueryUrlFromBrowser


#' Find synonyms of an active substance
#'
#' An active substance can be identified by a recommended international
#' nonproprietary name, a trade or product name, or a company code(s).
#'
#' At this time, this function uses the register ClinicalTrials.Gov to
#' detect which substances were also searched for.
#'
#' @param activesubstance An active substance, in an atomic character vector
#'
#' @return A character vector of the active substance (input parameter) and
#'  synonyms, if any were found
#'
#' @importFrom xml2 read_html
#' @importFrom rvest html_node html_table
#'
#' @export
#'
#' @examples
#'
#' \dontrun{
#' ctrFindActiveSubstanceSynonyms(
#'   activesubstance = "imatinib"
#' )
#' }
#'
ctrFindActiveSubstanceSynonyms <- function(activesubstance = "") {

  # check parameters
  if ((length(activesubstance) != 1) ||
      !is.character(activesubstance) ||
      (nchar(activesubstance) == 0)) {
    stop("ctrFindActiveSubstanceSynonyms(): ",
         "activesubstance should be a single string.",
         call. = FALSE)
  }

  # initialise output variable
  as <- activesubstance

  # check and set proxy if needed to access internet
  setProxy()

  # getting synonyms
  ctgovfirstpageurl <-
    paste0("https://clinicaltrials.gov/ct2/results/details?term=",
           activesubstance)
  tmp <- xml2::read_html(x = utils::URLencode(ctgovfirstpageurl))

  # extract from table "Terms and Synonyms Searched:"
  tmp <- rvest::html_node(
    tmp, xpath =
      '//*[@id="searchdetail"]//table[1]')
  tmp <- rvest::html_table(tmp, fill = TRUE)
  asx <- tmp[[1]]
  asx <- asx[!grepl(
    paste0("(more|synonyms|terms|", as, "|",
           paste0(unlist(strsplit(as, " "), use.names = FALSE),
                  collapse = "|"), ")"), asx,
    ignore.case = TRUE)]

  # prepare and return output
  as <- c(as, asx)
  as <- unique(as)
  return(as)
}
# end ctrFindActiveSubstanceSynonyms


#' Show the history of queries that were loaded into a database
#'
#' @inheritParams ctrDb
#'
#' @return A data frame with columns: query-timestamp, query-register,
#'  query-records (note: this is the number of records loaded when last
#'  executing \link{ctrLoadQueryIntoDb}, not the total record number) and
#'  query-term, and with one row for each \link{ctrLoadQueryIntoDb}
#'  loading trial records in this collection.
#'
#' @param verbose If \code{TRUE}, prints additional information
#' (default \code{FALSE}).
#'
#' @importFrom nodbi docdb_query
#'
#' @export
#'
#' @examples
#'
#' \dontrun{
#' db <- nodbi::src_sqlite(
#'   collection = "my_collection"
#' )
#' dbQueryHistory(
#'   con = db
#' )
#' }
#'
dbQueryHistory <- function(con, verbose = FALSE) {

  ## check database connection
  if (is.null(con$ctrDb)) con <- ctrDb(con = con)

  # debug
  if (verbose) message("Running dbQueryHistory ...")

  hist <- nodbi::docdb_query(
    src = con,
    key = con$collection,
    query = '{"_id": {"$eq": "meta-info"}}',
    fields = '{"queries": 1}')

  # check if meeting expectations
  if (is.null(hist) ||
      nrow(hist) == 0L) {
    #
    message("No history found in expected format.")
    #
    # return (class data.frame is expected)
    return(invisible(data.frame(NULL)))
    #
  }

  # access data frame of queries
  hist <- hist[["queries"]][[1]]

  # inform user
  if (verbose) {

    message("Number of queries in history of \"",
            con$collection, "\": ", nrow(hist))

    # total number of records in collection to inform user
    countall <- length(nodbi::docdb_query(
      src = con,
      key = con$collection,
      query =  '{"_id": {"$ne": "meta-info"}}',
      fields = '{"_id": 1}')[["_id"]])

    message("Number of records in collection \"",
            con$collection, "\": ", countall)

  }

  # return
  return(hist)

}
# end ctrQueryHistoryInDb


#' Find names of fields in the database collection
#'
#' Given part of the name of a field of interest to the user, this
#' function returns the full field names as found in the database.
#'
#' For fields in EUCTR (protocol- and results-related information),
#' see also the register's documentation at
#' \url{https://eudract.ema.europa.eu/result.html}.
#'
#' For fields in CTGOV (protocol-related information), see also
#' the register's definitions at
#' \url{https://prsinfo.clinicaltrials.gov/definitions.html}.
#'
#' Note: Generating a list of fields with this function may take
#' some time, and may involve running a mapreduce function if using
#' a MongoDB server. If the user is not not authorized to run
#' such a function, random documents are sampled to generate a
#' list of fields.
#'
#' @param namepart A plain string (can include a regular expression,
#' including Perl-style) to be searched for among all field names
#' (keys) in the database.
#'
#' @param verbose If \code{TRUE}, prints additional information
#' (default \code{FALSE}).
#'
#' @importFrom nodbi docdb_query
#'
#' @inheritParams ctrDb
#'
#' @return Vector of names of found field(s) in alphabetical
#' order (that is, not by register or field frequency)
#'
#' @export
#'
#' @examples
#'
#' \dontrun{
#' db <- nodbi::src_sqlite(
#'   collection = "my_collection"
#' )
#' dbFindFields(
#'   nampepart = "date",
#'   con = db
#' )
#' }
#'
dbFindFields <- function(namepart = "",
                         con,
                         verbose = FALSE) {

  ## sanity checks
  if (!is.atomic(namepart)) stop("'namepart' should be atomic.", call. = FALSE)
  if (length(namepart) > 1) stop("'namepart' should have one element.", call. = FALSE)
  if (namepart == "")       stop("Empty 'namepart' parameter.", call. = FALSE)

  ## check database connection
  if (is.null(con$ctrDb)) con <- ctrDb(con = con)

  ## check if cache for list of keys in collection exists,
  # otherwise create new environment as session cache
  if (!exists(".dbffenv")) {
    .dbffenv <- new.env(parent = emptyenv())
  }

  ## check if cache environment has entry for the database
  if (exists(x = paste0(con$db, "/", con$collection),
             envir = .dbffenv)) {

    # if true, get keys list from cache
    keyslist <- get(x = paste0(con$db, "/", con$collection),
                    envir = .dbffenv)

    # informing user
    message("Using cache of fields.")

  } else {

    # inform user
    message("Finding fields in database (may take some time)")

    ## using storage backend- specific methods, since
    ## no canonical way was found yet to retrieve
    ## field / key names

    ## - method for mongodb
    if (inherits(con, "src_mongo")) {

      # try mapreduce to get all keys
      keyslist <- try({
        con$con$mapreduce(
          map = "function() {
      obj = this;
      return searchInObj(obj, '');
      function searchInObj(obj, pth) {
         for(var key in obj) {
            if(typeof obj[key] == 'object' && obj[key] !== null) {
               if(pth != '') {pth = pth + '.'}
                  searchInObj(obj[key], pth + key);
            }else{
               key = pth + '.' + key;
               key = key.replace(/[.][0-9]+[.]/g, '.');
               key = key.replace(/[.][0-9]+$/, '');
               key = key.replace(/[.][.]+/g, '.');
               key = key.replace(/[.]$/g, '');
               key = key.replace(/^[.]/, '');
               emit(key, 1);
      }}}}",
      reduce = "function(id, counts) {return Array.sum(counts)}"
      # extract and keep only "_id" = first column, with keys
        )[["_id"]]},
      silent = TRUE)

      # if mapreduce does not work or is not permitted, revert to guessing
      if (inherits(keyslist, "try-error")) {

        warning("Mongo server returned: ", as.character(keyslist),
                "Using alternative method (extracting keys from ",
                "sample documents, may be incomplete).",
                call. = FALSE, immediate. = TRUE)

        # get 2 random documents, one for each register EUCTR and CTGOV,
        # if in collection, and retrieve keys from documents
        keyslist <- c(
          "", # avoid empty vector
          names(con$con$find(
            query = '{"_id": { "$regex": "^NCT[0-9]{8}", "$options": ""} }',
            limit = 1L)),
          names(con$con$find(
            query = '{"_id": { "$regex": "^[0-9]{4}-[0-9]{6}", "$options": ""} }',
            limit = 1L)))

      } # end if error with mapreduce
    } # end if src_mongo

    ## - method for sqlite
    if (inherits(con, "src_sqlite")) {

      # uses special function parameter for
      # src_sqlite query method: listfields
      keyslist <- c("", # avoid empty vector
                    nodbi::docdb_query(
                      src = con,
                      key = con$collection,
                      query = "",
                      listfields = TRUE))

    }

    ## store keyslist to environment (cache)
    if (length(keyslist) > 1) {
      assign(x = paste0(con$db, "/", con$collection),
             value = keyslist,
             envir = .dbffenv)
      message("Field names cached for this session.")
    }

  } # end get cached list or generate new list

  ## inform user of unexpected situation
  if ((length(keyslist) == 0) || all(keyslist == "")) {
    warning("No keys could be extracted, please check database ",
            "and contents: ", con$db, "/", con$collection, call. = FALSE)
  }

  ## now do the actual search and find for key name parts
  fields <- keyslist[grepl(pattern = namepart, x = keyslist,
                           ignore.case = TRUE, perl = TRUE)]

  # clean empty entries and exclude _id for consistency
  # since different approaches above return _id or not
  fields <- fields[fields != "_id" & fields != ""]
  if (!length(fields)) fields <- ""

  # return the match(es)
  return(sort(fields))

} # end dbFindFields


#' Deduplicate records to provide unique clinical trial identifiers
#'
#' If records for a clinical trial are found from more than one register, the
#' record from EUCTR is returned. The function currently relies on CTGOV
#' recording other identifiers such as the EudraCT number in the field "Other
#' IDs".
#'
#' @param preferregister A vector of the sequence of preference for registers
#' from which to generate unique _id's, default
#' \code{c("EUCTR", "CTGOV", "ISRCTN")}
#'
#' @inheritParams dfFindUniqueEuctrRecord
#'
#' @param verbose If set to \code{TRUE}, prints out information about numbers
#' of records found at subsequent steps when searching for duplicates
#'
#' @importFrom nodbi docdb_query
#'
#' @inheritParams ctrDb
#'
#' @return A vector with strings of keys ("_id" in the database) that
#'   represent non-duplicate trials.
#'
#' @export
#'
#' @examples
#'
#' \dontrun{
#' db <- nodbi::src_sqlite(
#'   collection = "my_collection"
#' )
#' dbFindIdsUniqueTrials(
#'   con = db
#' )
#' }
#'
dbFindIdsUniqueTrials <- function(
  preferregister = c("EUCTR", "CTGOV", "ISRCTN"),
  prefermemberstate = "GB",
  include3rdcountrytrials = TRUE,
  con,
  verbose = TRUE) {

  # parameter checks
  if (!all(preferregister %in% registerList)) {
    stop("'preferregister' not known: ", preferregister, call. = FALSE)
  }
  if (length(prefermemberstate) != 1L |
      !any(prefermemberstate == countriesEUCTR)) {
    stop("'prefermemberstate' not known: ", prefermemberstate, call. = FALSE)
  }
  # complete if preferregister does not have all
  preferregister <- unique(preferregister)
  preferregister <- union(preferregister, registerList)

  # objective: create a vector of database record identifiers (_id)
  # that represent unique records of clinical trials, based on user's
  # preferences for selecting the preferred from any multiple records

  ## check database connection
  if (is.null(con$ctrDb)) con <- ctrDb(con = con)

  # inform user
  message("Searching for duplicate trials... ")
  message(" - Getting trial ids...", appendLF = FALSE)

  # fields for database query
  fields <- c(
    "ctrname",
    # euctr
    "a2_eudract_number",
    "a52_us_nct_clinicaltrialsgov_registry_number",
    "trialInformation.usctnIdentifier",
    "a51_isrctn_international_standard_randomised_controlled_trial_number",
    "trialInformation.isrctnIdentifier",
    "a41_sponsors_protocol_code_number",
    # ctgov
    "id_info",
    # isrctn
    "externalRefs",
    "isrctn"
  )

  # get identifiers
  listofIds <- try(suppressMessages(suppressWarnings(
    dbGetFieldsIntoDf(
      fields = fields,
      con = con,
      verbose = FALSE,
      stopifnodata = FALSE)
  )),
  silent = TRUE
  )

  # error check
  if (inherits(listofIds, "try-error") ||
      !length(listofIds) || !nrow(listofIds)) {
    stop("No records found, check collection '", con$collection, "'",
         call. = FALSE)
  }

  # inform user
  message("\b\b\b, ", nrow(listofIds), " found in collection")

  # copy attributes
  attribsids <- attributes(listofIds)

  # target fields for further steps in this function
  fields <- c(
    "_id",
    "ctrname",
    # euctr
    "a2_eudract_number",
    "a52_us_nct_clinicaltrialsgov_registry_number",
    "trialInformation.usctnIdentifier",
    "a51_isrctn_international_standard_randomised_controlled_trial_number",
    "trialInformation.isrctnIdentifier",
    "a41_sponsors_protocol_code_number",
    # ctgov
    "id_info.secondary_id",
    "id_info.org_study_id",
    "id_info.nct_id",
    "id_info.nct_alias",
    "id_info.secondary_id",
    "id_info.secondary_id",
    "id_info.org_study_id",
    # isrctn
    "externalRefs.eudraCTNumber",
    "externalRefs.clinicalTrialsGovNumber",
    "isrctn",
    "externalRefs.protocolSerialNumber"
  )

  # add any missing columns
  missFields <- setdiff(fields, names(listofIds))
  if (length(missFields)) {
    missCols <- matrix(nrow = nrow(listofIds), ncol = length(missFields))
    missCols <- data.frame(missCols)
    names(missCols) <- missFields
    listofIds <- cbind(listofIds, missCols)
  }

  # replicate columns to make data frame fit subsequent steps
  listofIds <- listofIds[, fields, drop = FALSE]

  # rename columns for content mangling, needs to
  # correspond to columns and sequence in fields
  # for mapping identifiers across registers
  names(listofIds) <- c(
    "_id", "ctrname",
    # euctr
    "euctr.1", "ctgov.1a", "ctgov.1b", "isrctn.1a", "isrctn.1b", "sponsor.1",
    # ctgov
    "euctr.2a", "euctr.2b", "ctgov.2a", "ctgov.2b", "isrctn.2",
    "sponsor.2a", "sponsor.2b",
    # isrctn
    "euctr.3", "ctgov.3", "isrctn.3", "sponsor.3"
  )

  # keep only relevant content
  # - in certain raw value columns
  colsToMangle <- list(
    c("ctgov.1a", regCtgov),
    c("ctgov.1b", regCtgov),
    c("ctgov.2a", regCtgov),
    c("ctgov.2b", regCtgov),
    c("isrctn.1a", regIsrctn),
    c("isrctn.1b", regIsrctn),
    c("isrctn.2", regIsrctn),
    c("isrctn.3", regIsrctn),
    c("euctr.1", regEuctr),
    c("euctr.2a", regEuctr),
    c("euctr.2b", regEuctr),
    c("euctr.3", regEuctr)
  )
  # - do mangling; prerequisite is
  #   that each of the columns holds
  #   a single character vector,
  #   possibly collapsed with " / "
  invisible(sapply(
    colsToMangle,
    function(ctm) {
      colMangled <- regmatches(
        listofIds[[ ctm[[1]] ]],
        regexec(ctm[[2]], listofIds[[ ctm[[1]] ]]))
      colMangled[!lengths(colMangled)] <- ""
      listofIds[[ ctm[[1]] ]] <<- unlist(colMangled)
    }))
  # - merge columns for register ids and sponsor ids
  for (reg in c(registerList, "SPONSOR")) {
    listofIds[[reg]] <- apply(listofIds[
      , grepl(paste0("^", reg, "[.][0-9]"), names(listofIds),
              ignore.case = TRUE), drop = FALSE], MARGIN = 1,
      function(r) gsub("^ ?/ | / ?$", "",
                       paste0(na.omit(unique(r)), collapse = " / ")))
  }
  # - delete raw columns
  listofIds <- listofIds[
    , c("_id", "ctrname", registerList, "SPONSOR"), drop = FALSE]

  # inform user
  message(" - Finding duplicates among registers' and sponsor ids...")

  # find duplicates
  colsToCheck <- match(c(preferregister, "SPONSOR"), names(listofIds))
  outSet <- NULL
  for (i in seq_along(preferregister)) {

    # to be added
    tmp <- listofIds[
      listofIds[["ctrname"]] == preferregister[i], , drop = FALSE]
    row.names(tmp) <- NULL

    # check if second etc. set has identifiers
    # in the previously rbind'ed sets
    if (i > 1L && nrow(tmp)) {

      # check for duplicates
      dupes <- mapply(
        function(c1, c2) {
          tmpIs <- intersect(
            unlist(strsplit(c1, " / ")),
            unlist(strsplit(c2, " / ")))
          if (length(tmpIs)) {
            # map found intersecting names back
            # to the rows of the input data frame
            grepl(paste0(tmpIs, collapse = "|"), c1)
          } else {
            rep(FALSE, times = length(c1))
          }},
        tmp[, colsToCheck, drop = FALSE],
        outSet[, colsToCheck, drop = FALSE]
      )
      # keep uniques
      tmp <- tmp[rowSums(dupes) == 0L, , drop = FALSE]
      rm(dupes)
    }

    # add to output set
    outSet <- rbind(outSet, tmp,
                    make.row.names = FALSE,
                    stringsAsFactors = FALSE)
  }
  rm(tmp)

  # keep necessary columns
  listofIds <- outSet[, c("_id", "EUCTR", "ctrname")]
  names(listofIds)[2] <- "a2_eudract_number"
  rm(outSet)

  # find unique, preferred country version of euctr
  listofIds <- dfFindUniqueEuctrRecord(
    df = listofIds,
    prefermemberstate = prefermemberstate,
    include3rdcountrytrials = include3rdcountrytrials)

  # count
  countIds <- table(listofIds[["ctrname"]])

  # prepare output
  listofIds <- listofIds[["_id"]]

  # copy attributes
  attributes(listofIds) <- attribsids[grepl("^ctrdata-", names(attribsids))]

  # avoid returning list() if none found
  if (length(listofIds) == 0) {
    listofIds <- character()
  }

  # inform user
  message(" - Keeping ", paste0(countIds, collapse = " / "), " records",
          " from ", paste0(names(countIds), collapse = " / "))
  message(
    "= Returning keys (_id) of ", length(listofIds),
    " records in collection \"", con$collection, "\"")

  # return
  return(listofIds)

}
# end dbFindIdsUniqueTrials


#' Create data frame by extracting specified fields from database collection
#'
#' With this convenience function, fields in the database are retrieved
#' into an R data frame. Note that fields within the record of a trial
#' can be hierarchical and structured, that is, nested.
#'
#' With both src_sqlite and src_mongo, the function returns a list of data
#' for a field that includes nested content; use function
#' \link{dfTrials2Long} followed by \link{dfName2Value} to
#' extract desired nested variables.
#'
#' For more sophisticated data retrieval from the database, see vignette
#' examples and other packages to query mongodb such as mongolite.
#'
#' @param fields Vector of one or more strings, with names of the sought fields.
#' See function \link{dbFindFields} for how to find names of fields.
#' Regular expressions are possible. "item.subitem" notation is supported.
#'
#' @param stopifnodata Stops with an error (\code{TRUE}, default) or with
#' a warning (\code{FALSE}) if the sought field is empty in all,
#' or not available in any of the records in the database collection.
#'
#' @param verbose Printing additional information if set to \code{TRUE};
#' default is \code{FALSE}.
#'
#' @inheritParams ctrDb
#'
#' @return A data frame with columns corresponding to the sought fields.
#' Note: a column for the record _id will always be included.
#' Each column can be either a simple data type (numeric, character, date)
#' or a list (see example below): For complicated lists, use function
#' \link{dfTrials2Long} followed by function \link{dfName2Value} to
#' extract values for nested variables.
#' The maximum number of rows of the returned data frame is equal to,
#' or less than the number of records of trials in the database.
#'
#' @importFrom nodbi docdb_query
#' @importFrom stats na.omit
#'
#' @export
#'
#' @examples
#'
#' \dontrun{
#' db <- nodbi::src_sqlite(
#'   collection = "my_collection"
#' )
#'
#' # access fields that are nested within another field
#' # and can have multiple values with the other field
#' dbGetFieldsIntoDf(
#'   "b1_sponsor.b31_and_b32_status_of_the_sponsor",
#'   con = db
#' )[1,]
#' #                 _id b1_sponsor.b31_and_b32_status_of_the_sponsor
#' # 1 2004-000015-25-GB                  Non-commercial / Commercial
#'
#' # access fields that include a list of values
#' # which are printed as comma separated values
#' dbGetFieldsIntoDf(
#'   "keyword",
#'   con = db
#' )[1,]
#'
#' #           _id                                 keyword
#' # 1 NCT00129259 T1D, type 1 diabetes, juvenile diabetes
#'
#' str(.Last.value)
#' # 'data.frame':	1 obs. of  2 variables:
#' # $ _id    : chr "NCT00129259"
#' # $ keyword:List of 1
#' # ..$ : chr  "T1D" "type 1 diabetes" "juvenile diabetes"
#'
#' }
#'
dbGetFieldsIntoDf <- function(fields = "",
                              con, verbose = FALSE,
                              stopifnodata = TRUE) {

  # check parameters
  if (!is.vector(fields) |
      class(fields) != "character") {
    stop("Input should be a vector of strings of field names.", call. = FALSE)
  }

  # remove NA, NULL if included in fields
  fields <- fields[!is.null(fields) & !is.na(fields)]

  # remove _id if included in fields
  fields <- fields["_id" != fields]

  # check if valid fields
  if (any(fields == "") | (length(fields) == 0)) {
    stop("'fields' contains empty elements; ",
         "please provide a vector of strings of field names. ",
         "Function dbFindFields() can be used to find field names. ",
         call. = FALSE)
  }

  ## check database connection
  if (is.null(con$ctrDb)) con <- ctrDb(con = con)

  # get all ids to enable Reduce which would fail
  # due to holes from NULLs from the merge step
  dft <- nodbi::docdb_query(
    src = con,
    key = con$collection,
    query = '{}',
    fields = paste0('{"_id": 1}'))
  dft <- dft[dft[["_id"]] != "meta-info", "_id", drop = FALSE]

  # initialise output
  nFields <- length(fields)

  # iterate over fields so that we can
  # use a custom function to merge results
  result <- lapply(
    seq_len(nFields),
    function(i) {
      #
      item <- fields[i]
      message("     \r", i, appendLF = FALSE)
      #
      query <- '{"_id": {"$ne": "meta-info"}}'
      if (verbose) message("DEBUG: field: ", item)
      #
      tmpItem <- try({

        # execute query
        dfi <- nodbi::docdb_query(
          src = con,
          key = con$collection,
          query = query,
          fields = paste0('{"_id": 1, "', item, '": 1}'))

        # leave try() early if no results
        if (!nrow(dfi) || ncol(dfi) == 1L) simpleError(message = "")

        # remove any rows without index variable
        dfi <- dfi[!is.na(dfi[["_id"]]), , drop = FALSE]

        # simplify by extracting recursively any requested subitem
        itemSegments <- strsplit(item, "[.]")[[1]]
        itemSegments <- setdiff(itemSegments, names(dfi))
        for (iS in itemSegments) {
          if ((length(names(dfi[[2]])) == 1L) && (iS == names(dfi[[2]]))) {
            dfi[[2]] <- dfi[[2]][[iS]]
          } else {
            tn <- sapply(dfi[[2]], names)
            if (length(unique(tn)) == 1L && (iS == tn[1]))
              dfi[[2]] <- sapply(dfi[[2]], "[[", 1)
          }
        }

        # simplify by expanding a resulting data frame
        if (length(unique(names(dfi[[2]]))) > 1L) {
          item <- paste0(item, ".", names(dfi[[2]]))
          dfi <- cbind("_id" = dfi[["_id"]], as.data.frame(dfi[[2]]))
          emptyCols <- sapply(dfi, function(c) all(is.na(c)))
          emptyCols <- seq_along(emptyCols)[emptyCols]
          if (length(emptyCols)) dfi <- dfi[, -emptyCols, drop = FALSE]
          if (length(emptyCols)) item <- item[-(emptyCols - 1L)]
        }

        # name result set
        names(dfi) <- c("_id", item)

        # create NA output from template
        dfo <- dft

        # simplify by processing columns
        for (c in seq_len(ncol(dfi))[-1]) {

          # special case: column is one-column data frame
          if (is.data.frame(dfi[[c]]) && (ncol(dfi[[c]]) == 1L) &&
              (nrow(dfi[[c]]) == nrow(dfi))) dfi[[c]] <-
              dfi[[c]][, 1, drop = TRUE]

          # simplify at row level, replaces NULL with NA
          if (!is.data.frame(dfi[[c]]) &&
              !any(sapply(dfi[[c]], class) == "data.frame")) {
            dfi[[c]] <- sapply(dfi[[c]], function(i) {
              l <- length(i)
              if (l == 0L) i <- NA
              if (l == 1L) i <- i[1]
              if (l >= 2L) {
                if (all(sapply(i, is.character))) {
                  i } else {i <- list(i) }}
              i}, USE.NAMES = FALSE, simplify = TRUE)}

          # simplify vectors in cells by collapsing
          # (compatibility with previous version)
          if (all(sapply(dfi[[c]], function(r) is.na(r)[1] | is.character(r))) &&
              any(sapply(dfi[[c]], function(r) length(r) > 1L))) {
            dfi[[c]] <- sapply(dfi[[c]], function(i) paste0(i, collapse = " / "))
          }

          # type results
          if (typeof(dfi[[c]]) == "character") dfi[[c]] <-
              typeField(dfi[, c(1, c), drop = FALSE])[, 2, drop = TRUE]

          # add a column into copy of NA template
          dfo[[c]] <- switch(
            class(dfi[[c]]),
            "Date" = as.Date(NA),
            "numeric" = as.numeric(NA),
            "character" = as.character(NA),
            "data.frame" = NA,
            "integer" = as.integer(NA),
            "list" = NA,
            "logical" = as.logical(NA),
            NA
          )

        } # for process

        # add NA where dfi has no data to avoid NULL when merge'ing
        names(dfo) <- names(dfi)
        dfi <- rbind(dfo[!(dfo[["_id"]] %in% dfi[["_id"]]), , drop = FALSE], dfi)

      },
      silent = TRUE) # tmpItem try

      # inform user
      if (inherits(tmpItem, "try-error") ||
          !nrow(dfi) || (ncol(dfi) == 1L)) {

        # try-error occurred or no data retrieved
        if (stopifnodata) {
          stop("No data could be extracted for '", item,
               "'. \nUse dbGetFieldsIntoDf(stopifnodata = ",
               "FALSE) to ignore this. ", call. = FALSE)
        } else {
          message("* No data: '", item, "'")
          # create empty data set
          dfi <- cbind(dft, NA)
          names(dfi) <- c("_id", fields[i])
        } # stopifnodata
      } # if

      # add to result
      dfi

    }) # end lapply
  message("")

  # bring result lists into data frame, by record _id
  result <- Reduce(function(...) merge(..., all = TRUE, by = "_id"), result)

  # prune rows without _id
  result <- result[!is.na(result[["_id"]]), , drop = FALSE]

  # remove rows with only NAs; try because
  # is.na may fail for complex cells
  onlyNas <- try({apply(result[, -1, drop = FALSE], 1,
                  function(r) all(is.na(r)))}, silent = TRUE)
  if (!inherits(onlyNas, "try-error")) {
    result <- result[!onlyNas, , drop = FALSE]
  } else {
    message("Could not remove rows with only NAs")
  }

  # inform user
  if (is.null(result) || !nrow(result)) {
    warning("No records with values for any specified field. ",
            call. = FALSE)
    return(NULL)
  }

  # sort, add meta data and return
  return(
    addMetaData(
      result[order(result[["_id"]]), , drop = FALSE],
      con = con))
}
# end dbGetFieldsIntoDf


#' Extract information of interest (e.g., endpoint)
#' from long data frame of protocol- or result-related
#' trial information as returned by \link{dfTrials2Long}
#'
#' @param df A data frame with four columns (_id,
#'  identifier, name, value) as returned by
#'  \link{dfTrials2Long}
#'
#' @param valuename A character string for the name of the variable
#'  from which to extract information for the variable of interest
#'
#' @param wherename A character string to identify the variable
#'  of interest
#'
#' @param wherevalue A character string with the value of interest
#'  for the variable of interest
#'
#' @return A data frame with columns _id, identifier,
#'  name, value that only includes the values of interest,
#'  where value are strings unless all value elements
#'  are numbers.
#'
#' @export
#' @examples
#' \dontrun{
#' db <- nodbi::src_sqlite(
#'   collection = "my_collection"
#' )
#' df <- ctrdata::dbGetFieldsIntoDf(
#' fields = c(
#'   # ctgov - typical results fields
#'   "clinical_results.baseline.analyzed_list.analyzed.count_list.count",
#'   "clinical_results.baseline.group_list.group",
#'   "clinical_results.baseline.analyzed_list.analyzed.units",
#'   "clinical_results.outcome_list.outcome",
#'   "study_design_info.allocation",
#'   # euctr - typical results fields
#'   "trialInformation.fullTitle",
#'   "subjectDisposition.recruitmentDetails",
#'   "baselineCharacteristics.baselineReportingGroups.baselineReportingGroup",
#'   "endPoints.endPoint",
#'   "trialChanges.hasGlobalInterruptions",
#'   "subjectAnalysisSets",
#'   "adverseEvents.seriousAdverseEvents.seriousAdverseEvent"
#'   ), con = dbc
#' )
#' # convert to long
#' reslong <- ctrdata::dfTrials2Long(
#'   df = df
#' )
#' # get values for endpoint of interest, duration of response
#' ctrdata::dfValue2Name(
#'   df = df,
#'   valuename = paste0(
#'     "endPoints.endPoint.*armReportingGroup.tendencyValues.tendencyValue.value|",
#'     "clinical_results.*category.measurement_list.measurement.value|",
#'     "clinical_results.*outcome.measure.units|endPoints.endPoint.unit"
#'   ),
#'   wherename = "clinical_results.*outcome.measure.title|endPoints.endPoint.title",
#'   wherevalue = "duration of response"
#' )
#' }
#'
dfName2Value <- function(df, valuename = "",
                         wherename = "", wherevalue = "") {

  # check parameters
  if (valuename == "") {
    stop("'valuename' must be specified.",
         call. = FALSE)
  }
  if (!identical(names(df),
                 c("_id", "identifier", "name", "value"))) {
    stop("'df' does not seem to come from dfTrials2Long()",
         call. = FALSE)
  }

  # indices of valuename
  indexVnames <- which(grepl(valuename, df[["name"]],
                             perl = TRUE, ignore.case = TRUE))
  if (!length(indexVnames)) stop("No rows found for 'valuename' = ", valuename)

  # if no where... are specified, just
  # return rows where name corresponds
  # to valuename
  if (wherename == "" & wherevalue == "") {

    # get relevant rows
    out <- df[indexVnames, , drop = FALSE]

  } else {# if where... are specified, continue

    # get where... indices per trial
    indexRows <- which(
      grepl(wherename, df[["name"]], perl = TRUE, ignore.case = TRUE) &
        grepl(wherevalue, df[["value"]], perl = TRUE, ignore.case = TRUE))
    if (!length(indexRows)) stop("No rows found for 'wherename' and 'wherevalue'")

    # get trial ids and identifiers for where...
    indexCases <- df[indexRows, c("_id", "identifier"), drop = FALSE]

    # get output iterate over trials
    out <- apply(
      indexCases, 1,
      function(i) {
        ids <- Reduce(
          intersect, list(
            # trial id
            which(grepl(i[["_id"]], df[["_id"]], fixed = TRUE)),
            # identifier to match starting from left and
            # do not match e.g. 22 for identifier 2
            which(grepl(paste0("^", i[["identifier"]], "([.]|$)"),
                        df[["identifier"]])),
            # indices of sought valuename
            indexVnames
          ))
        # return value
        if (length(ids)) df[ids, ]
      }
    )

    # bind into data frame
    out <- do.call(
      rbind,
      c(out, stringsAsFactors = FALSE, make.row.names = FALSE))

  } # if where...

  # value column is character
  # try to convert it to numeric
  tmp <- suppressWarnings(
    as.numeric(out[["value"]])
  )
  # use if converted ok
  if (all(is.na(tmp) == is.na(out[["value"]]))) {
    out["value"] <- tmp
  }
  # remove any duplicates such as
  # from duplicate where... criteria
  out <- unique(out)

  # return
  return(out)

} # end dfName2Value


#' Extract trial information into long format
#'
#' The function works with procotol- and results-
#' related information. It converts lists and other
#' values into individual rows of a long data frame.
#' From the resulting data frame, values of interest
#' can then be selected (e.g. select an outcome
#' and its analysis by the identifier of the measure
#' which has "Hazard Ratio" in its name, see
#' \link{dfName2Value}).
#'
#' @param df Data frame with columns including
#'  the trial identifier (\code{_id}) and
#'  one or more variables as obtained from
#'  \link{dbGetFieldsIntoDf}
#'
#' @return A data frame with the four columns:
#'  _id, identifier, name, value
#'
#' @importFrom stringi stri_extract_all_charclass
#' @importFrom stringi stri_extract_first
#' @importFrom stringi stri_replace_first
#'
#' @export
#'
#' @examples
#' \dontrun{
#' db <- nodbi::src_sqlite(
#'   collection = "my_collection"
#' )
#' df <- dbGetFieldsIntoDf(
#'   fields = c(
#'     "clinical_results.outcome_list.outcome"),
#'   con = db
#' )
#' dfTrials2Long(
#'   df = df
#' )
#' }
dfTrials2Long <- function(df) {

  # check parameters
  if (!any("_id" == names(df)) ||
      ncol(df) == 1L) stop(
        "Missing _id column or other variables in 'df'",
        call. = FALSE
      )
  if (any(c("identifier", "name", "value") %in% names(df))) stop(
    "Unexpected columns; 'df' should not come from dfTrials2Long",
    call. = FALSE
  )

  # helper function
  flattenDf <- function(x) {
    while (any(vapply(x, is.list, logical(1L)))) {
      x <- lapply(x, function(x) if (is.list(x)) x else list(x))
      x <- unlist(x, recursive = FALSE, use.names = TRUE)
    }
    x
  }

  # to add a first row in the next step,
  # columns that are not compatible with
  # adding a row are converted to character
  conv <- sapply(df, class) == "Date"
  conv <- seq_len(ncol(df))[conv]
  for (c in conv) df[, c] <- as.character(df[, c, drop = TRUE])

  # add a first row to df to hold item name
  # which otherwise is not available in apply
  df <- rbind(
    names(df),
    df)

  # iterative unnesting, by column
  out <- lapply(
    df[, -match("_id", names(df)), drop = FALSE],
    function(cc) {
      message(". ", appendLF = FALSE)
      # get item name as added in first row
      tn <- cc[[1]]
      # and by element in column
      lapply(cc[-1], function(c) {
        x <- unlist(flattenDf(c))
        if (!is.null(names(x))) tn <- names(x)
        if (is.null(x)) x <- NA
        data.frame(
          "name" = tn,
          "value" = x,
          check.names = FALSE,
          stringsAsFactors = FALSE,
          row.names = NULL)
      })})

  # add _id to list elements and
  # simplify into data frames
  tmpNames <- df[-1, "_id", drop = TRUE]
  out <- lapply(
    out, function(e) {
      message(". ", appendLF = FALSE)
      names(e) <- tmpNames
      # duplicate e to force generating
      # names in the later rbind step
      do.call(rbind, c(e, e, stringsAsFactors = FALSE))
    })

  # combine lists into data frame
  out <- do.call(rbind, c(out, stringsAsFactors = FALSE))
  message(". ", appendLF = FALSE)

  # remove rows where value is NA
  out <- out[!is.na(out[["value"]]), , drop = FALSE]

  # process row.names such as "clinical_results.NCT00082758.73"
  # to to obtain "clinical_results" as part of variable name
  names <- stringi::stri_replace_first(
    str = row.names(out), replacement = "",
    regex = c(paste0(".(", regCtgov, "|", regEuctr, "-[3A-Z]+)[.0-9]*")))

  # generate new data frame with target columns and order
  out <- data.frame(
    # process row.names to obtain trial id
    "_id" = stringi::stri_extract_first(
      str = row.names(out),
      regex = c(paste0(regCtgov, "|", regEuctr, "-[3A-Z]+"))),
    "identifier" = NA,
    "name" = out[["name"]],
    "value" = out[["value"]],
    check.names = FALSE,
    row.names = NULL,
    stringsAsFactors = FALSE)
  message(". ", appendLF = FALSE)

  # generate variable names from processed row names
  # and name unless the same is as already in name
  out[["name"]] <- ifelse(
    out[["name"]] == names,
    out[["name"]],
    paste0(names, ".0.", out[["name"]]))

  # name can have from 0 to about 6 number groups, get all
  # and concatenate to oid-like string such as "1.2.2.1.4"
  out[["identifier"]] <- vapply(
    stringi::stri_extract_all_regex(out[["name"]], "[0-9]+([.]|$)"),
    function(i) paste0(gsub("[.]", "", i), collapse = "."), character(1L))
  out[["identifier"]] [out[["identifier"]] == "NA"] <- "0"
  message(". ", appendLF = FALSE)

  # remove numbers from variable name
  out[["name"]] <- gsub(
    "[0-9]+([.])|[0-9]+$|[.]?@attributes", "\\1", out[["name"]], perl = TRUE)

  # remove any double separators
  out[["name"]] <- gsub("[.]+", ".", out[["name"]], perl = TRUE)

  # remove double rows from duplicating e above
  out <- unique(out)

  # inform
  message("\nTotal ", nrow(out), " rows, ",
          length(unique(out[["name"]])),
          " unique names of variables")

  # output
  return(out)

} # end dfTrials2Long


#' Extract named element(s) from list(s) into long-format
#' data frame
#'
#' The function uses a name (key) to extract an element
#' from a list in a data.frame such as obtained with
#' \link{dbGetFieldsIntoDf}. This helps to simplify
#' working with nested lists and with complex structures.
#'
#' @param df A data frame
#' @param list.key A list of pairs of list names and
#'  key names, where the list name corresponds to the
#'  name of a column in \code{df} that holds a list and
#'  the name of the key identifies the element to be
#'  extracted. See example.
#'
#' @return A data frame in long format with columns
#'  name (identifying the full path in the data frame,
#'  "<list>.<key>"), _id (of the trial record), value
#'  (of name per _id), item (number of value of name
#'  per _id).
#'
#' @export
#'
#' @examples
#'
#' \dontrun{
#' db <- nodbi::src_sqlite(
#'   collection = "my_collection"
#' )
#' df <- dbGetFieldsIntoDf(
#'   fields = c(
#'     "endPoints.endPoint",
#'     "subjectDisposition.postAssignmentPeriods"),
#'   con = db
#' )
#' dfListExtractKey(
#'   df = df,
#'   list.key = list(
#'       c("endPoints.endPoint",
#'         "^title"),
#'       c("subjectDisposition.postAssignmentPeriods",
#'         "arms.arm.type.value")
#' ))
#' }
#'
dfListExtractKey <- function(
  df,
  list.key =
    list(c("endPoints.endPoint", "^title")
    )) {

  # deprecate
  .Deprecated(new = "dfName2Value")

  # check
  if (!any("_id" == names(df))) {
    stop("Data frame 'df' lacks '_id' column.",
         call. = FALSE)
  }

  # helper function to extract from
  # a named vector elements by name
  extractKey <- function(flattenedList, key) {

    # find element by key
    selected <- grepl(key,
                      names(flattenedList),
                      ignore.case = TRUE)


    # extract value for key
    extracted <- flattenedList[selected]

    # if key is not found, return a value
    # e.g. missing value (NA) or empty string ("")
    # please change as wanted for later processing
    if (length(extracted) == 0) extracted <- NA

    # return
    return(extracted)
  }

  # dots needs to be defined because passing
  # it in .Internal(mapply()) is not enough
  out <- lapply(
    list.key,
    function(k)
      lapply(df[[k[1]]],
             # k[1] = "endPoints.endPoint" identifies
             # the column in data frame with the list
             function(l) extractKey(
               unlist(l, recursive = TRUE, use.names = TRUE), k[2])
             # k[2] = "^title" identifies the key in the sublist
      ))

  out <- sapply(seq_along(list.key), function(li) {

    tmp <- out[[li]]

    tmp <- sapply(

      seq_along(tmp),
      function(ii) {

        data.frame(
          name = gsub("[-0-9]*$", "", # trailing number
                      gsub("[^a-zA-Z0-9_.-]", "",
                           paste0(list.key[[li]], collapse = "."))),
          "_id" = df[["_id"]][[ii]],
          value = tmp[[ii]],
          item = seq_along(tmp[[ii]]),
          row.names = NULL,
          stringsAsFactors = FALSE,
          check.names = FALSE)
      }, simplify = FALSE)

    do.call(rbind, tmp)

  }, simplify = FALSE)

  # return
  do.call(rbind, c(out, stringsAsFactors = FALSE, make.row.names = FALSE))

} # end dfListExtractKey


#' Merge two variables into one, optionally map values to new levels
#'
#' @param df A \link{data.frame} in which there are two variables (columns)
#' to be merged into one.
#' @param colnames A vector of length two with names of the two columns
#' that hold the variables to be merged. See \link{colnames} for how to
#' obtain the names of columns of a data frame.
#' @param levelslist A list with one slice each for a new value to be
#' used for a vector of old values (optional).
#' @param ... for deprecated \code{varnames} parameter (will be removed)
#'
#' @return A vector of strings
#'
#' @export
#'
#' @examples
#'
#' \dontrun{
#' statusvalues <- list(
#'   "ongoing" = c("Recruiting", "Active", "Ongoing",
#'                 "Active, not recruiting", "Enrolling by invitation"),
#'   "completed" = c("Completed", "Prematurely Ended", "Terminated"),
#'   "other" = c("Withdrawn", "Suspended",
#'               "No longer available", "Not yet recruiting"))
#'
#' dfMergeTwoVariablesRelevel(
#'   df = result,
#'   colnames = c("Recruitment", "x5_trial_status"),
#'   levelslist = statusvalues)
#' }
#'
dfMergeTwoVariablesRelevel <- function(
  df = NULL,
  colnames = "",
  levelslist = NULL,
  ...) {

  # check parameters

  # FIXME migrate from previously
  # used parameter "varnames"
  tmp <- match.call()
  tmp <- tmp["varnames"]
  tmp <- as.list(tmp)[[1]]
  if (length(tmp) == 3 && colnames == "") {
    colnames <- unlist(as.list(tmp[-1], use.names = FALSE))
    warning("Parameter varnames is deprecated, use colnames instead.",
            call. = FALSE)
  }

  # other checks
  if (class(df) != "data.frame") {
    stop("Need a data frame as input.", call. = FALSE)
  }
  if (length(colnames) != 2) {
    stop("Please provide exactly two column names.", call. = FALSE)
  }

  # find variables in data frame and merge
  tmp <- match(colnames, names(df))
  df <- df[, tmp, drop = FALSE]

  # bind as ...
  if (class(df[, 1]) == class(df[, 2]) &&
      class(df[, 1]) != "character") {
    # check
    if (nrow(na.omit(df[!vapply(df[, 1, drop = TRUE], is.null, logical(1L)) &
                        !vapply(df[, 2, drop = TRUE], is.null, logical(1L)), ,
                        drop = FALSE]))) {
      warning("Some rows had values for both columns, used first",
              noBreaks. = TRUE, immediate. = TRUE)
    }
    # values, with first having
    # priority over the second
    tmp <- ifelse(is.na(tt <- df[, 1]), df[, 2], df[, 1])
  } else {
    # check
    if (nrow(df[(!is.na(df[, 1]) & df[, 1] != "") &
                (!is.na(df[, 2]) & df[, 2] != ""), , drop = FALSE])) {
      warning("Some rows had values for both columns, concatenated",
              noBreaks. = TRUE, immediate. = TRUE)
    }
    # strings, concatenated
    tmp <- paste0(
      ifelse(is.na(tt <- as.character(df[, 1])), "", tt),
      ifelse(is.na(tt <- as.character(df[, 2])), "", tt))
  }

  # type where possible
  if (class(df[, 1]) == class(df[, 2]) &&
      class(df[, 1]) != "character") {
    mode(tmp) <- mode(df[, 1])
    class(tmp) <- class(df[, 1])
  }

  # relevel if specified
  if (!is.null(levelslist)) {

    # check
    if (class(levelslist) != "list") {
      stop("Need list for parameter 'levelslist'.", call. = FALSE)
    }

    # helper function to collapse factor levels into the first
    refactor <- function(x, collapselevels, levelgroupname) {
      levels(x) [match(collapselevels, levels(x))] <- levelgroupname
      return(x)
    }

    # convert result to factor as this is needed for helper function
    tmp <- as.factor(tmp)

    # apply helperfunction to elements of the list
    for (i in seq_len(length(levelslist))) {
      tmp <- refactor(tmp, unlist(levelslist[i], use.names = FALSE),
                      attr(levelslist[i], "names"))
    }

    # convert factor back into string vector
    tmp <- as.character(tmp)

  }

  # check and inform user
  if (length(tt <- unique(tmp)) > 3L) {
    message("Unique values returned (first three): ",
            paste(tt[1L:3L], collapse = ", "))
  } else {
    message("Unique values returned: ",
            paste(tt, collapse = ", "))
  }

  # return
  return(tmp)
}
# end dfMergeTwoVariablesRelevel


#' Select single trial record from records of different EU Member States
#'
#' The EUCTR provides one record per trial per EU Member State in which the
#' trial is conducted. For all trials conducted in more than one Member State,
#' this function returns only one record per trial.
#'
#' Note: To deduplicate trials from different registers (EUCTR and CTGOV),
#' please first use function \code{\link{dbFindIdsUniqueTrials}}.
#'
#' @param df A data frame created from the database that includes the columns
#'   "_id" and "a2_eudract_number", for example created with function
#'   dbGetFieldsIntoDf(c("_id", "a2_eudract_number")).
#' @param prefermemberstate Code of single EU Member State for which records
#' should returned. If not available, a record for GB or lacking this, any
#' other record for the trial will be returned. For a list of codes of EU
#'   Member States, please see vector \code{countriesEUCTR}. Alternatively,
#'   "3RD" will lead to return a Third Country record of a trial, if available.
#' @param include3rdcountrytrials A logical value if trials should be retained
#'   that are conducted exclusively in third countries, that is, outside
#'   the European Union.
#'
#' @return A data frame as subset of \code{df} corresponding to the sought
#'   records.
#'
#' @keywords internal
#
dfFindUniqueEuctrRecord <- function(
  df = NULL,
  prefermemberstate = "GB",
  include3rdcountrytrials = TRUE) {

  # check parameters
  if (class(df) != "data.frame") {
    stop("Parameter df is not a data frame.", call. = FALSE)
  }
  #
  if (is.null(df[["_id"]]) ||
      is.null(df["a2_eudract_number"])) {
    stop('Data frame does not include "_id"',
         ' and "a2_eudract_number" columns.',
         call. = FALSE)
  }
  #
  if (nrow(df) == 0) {
    stop("Data frame does not contain records (0 rows).",
         call. = FALSE)
  }
  #
  if (!(prefermemberstate %in% countriesEUCTR)) {
    stop("Value specified for prefermemberstate does not match",
         " one of the recognised codes: ",
         paste(sort(countriesEUCTR), collapse = ", "),
         call. = FALSE)
  }

  # notify it mismatching parameters
  if (prefermemberstate == "3RD" & !include3rdcountrytrials) {
    warning("Preferred EUCTR version set to 3RD country trials, but ",
            "'include3rdcountrytrials' was FALSE, setting it to TRUE.",
            call. = FALSE,
            noBreaks. = FALSE,
            immediate. = FALSE)
    include3rdcountrytrials <- TRUE
  }

  # count total
  totalEuctr <- unique(df[["a2_eudract_number"]])
  totalEuctr <- na.omit(totalEuctr[totalEuctr != ""])
  totalEuctr <- length(totalEuctr)

  # as a first step, handle 3rd country trials e.g. 2010-022945-52-3RD
  # if retained, these trials would count as record for a trial
  if (!include3rdcountrytrials) {
    df <- df[!grepl("-3RD", df[["_id"]]), ]
  }

  # count number of records by eudract number
  tbl <- table(df[["_id"]], df[["a2_eudract_number"]])
  tbl <- as.matrix(tbl)
  # nms has names of all records
  nms <- dimnames(tbl)[[1]]

  # nrs has eudract numbers for which is there more than 1 record
  nrs <- colSums(tbl)
  nrs <- nrs[nrs > 1]
  nrs <- names(nrs)

  # nst is a list of nrs trials of a logical vector along nms
  # that indicates if the indexed record belongs to the trial
  nms2 <- substr(nms, 1, 14)
  nst <- lapply(nrs, function(x) nms2 %in% x)

  # helper function to find the Member State version
  removeMSversions <- function(indexofrecords) {
    # given a vector of records (nnnn-nnnnnnn-nn-MS) of a single trial, this
    # returns all those _ids of records that do not correspond to the preferred
    # Member State record, based on the user's choices and defaults.
    # Function uses prefermemberstate, nms from the caller environment
    recordnames <- nms[indexofrecords]
    #
    # fnd should be only a single string, may need to be checked
    if (sum(fnd <- grepl(prefermemberstate, recordnames)) != 0) {
      result <- recordnames[!fnd]
      return(result)
    }
    #
    if (sum(fnd <- grepl("GB", recordnames)) != 0) {
      result <- recordnames[!fnd]
      return(result)
    }
    #
    # default is to list all but first record
    # the listed records are the duplicates
    # 3RD country trials would be listed first
    # hence selected, which is not desirable
    # unless chosen as prefermemberstate
    return(rev(sort(recordnames))[-1])
  }

  # finds per trial the desired record;
  # uses prefermemberstate and nms
  result <- lapply(nst,
                   function(x) removeMSversions(x))
  result <- unlist(result, use.names = FALSE)

  # eleminate the unwanted EUCTR records
  df <- df[!(df[["_id"]] %in% result), ]

  # also eliminate the meta-info record
  df <- df[!(df[["_id"]] == "meta-info"), ]

  # inform user about changes to data frame
  if (length(nms) > (tmp <- length(result))) {
    message(
      " - ", tmp,
      " EUCTR _id were not preferred EU Member State record for ",
      totalEuctr, " trials")
  }

  # return
  return(df)

}
# end dfFindUniqueEuctrRecord


#' Change type of field based on name of field
#'
#' @param dfi a data frame of columns _id, fieldname
#'
#' @keywords internal
#' @noRd
#'
typeField <- function(dfi) {

  # check
  if (ncol(dfi) != 2L) {
    stop("Expect data frame with two columns, _id and a field.",
         call. = FALSE)
  }

  # clean up input
  # - if NA as string, change to NA
  dfi[grepl("^N/?A$|^ND$", dfi[, 2]), 2] <- NA
  # - give Month Year also a Day to work with as.Date
  dfi[, 2] <- sub("^([a-zA-Z]+) ([0-9]{4})$", "\\1 15, \\2", dfi[, 2])
  # - convert html entities
  if (any(grepl("&[a-z]+;", dfi[, 2]))) dfi[, 2] <- sapply(
    dfi[, 2], function(i) xml2::xml_text(xml2::read_html(charToRaw(i))))

  # for date time conversion
  lct <- Sys.getlocale("LC_TIME")
  Sys.setlocale("LC_TIME", "C")
  on.exit(Sys.setlocale("LC_TIME", lct))

  # main typing functions
  ctrDate      <- function() as.Date(dfi[, 2], format = "%Y-%m-%d")
  ctrDateUs    <- function() as.Date(dfi[, 2], format = "%b %e, %Y")
  ctrDateCtr   <- function() as.Date(dfi[, 2], format = "%Y-%m-%d %H:%M:%S")
  ctrDateTime  <- function() as.Date(dfi[, 2], format = "%Y-%m-%dT%H:%M:%S")
  ctrYesNo     <- function() vapply(dfi[, 2], FUN = function(x) switch(x, "Yes" = TRUE, "No" = FALSE, NA), logical(1L))
  ctrFalseTrue <- function() vapply(dfi[, 2], FUN = function(x) switch(x, "true" = TRUE, "false" = FALSE, NA), logical(1L))
  ctrInt       <- function() vapply(dfi[, 2], FUN = function(x) as.integer(x = x), integer(1L))

  # selective typing
  tmp <- try({
    switch(
      EXPR = names(dfi)[2],
      #
      #
      # dates
      #
      # - intern
      "record_last_import" = ctrDateCtr(),
      # - EUCTR
      "n_date_of_ethics_committee_opinion"     = ctrDate(),
      "n_date_of_competent_authority_decision" = ctrDate(),
      "p_date_of_the_global_end_of_the_trial"  = ctrDate(),
      "x6_date_on_which_this_record_was_first_entered_in_the_eudract_database" = ctrDate(),
      "firstreceived_results_date"             = ctrDate(),
      "trialInformation.primaryCompletionDate" = ctrDate(),
      "trialInformation.globalEndOfTrialDate"  = ctrDateTime(),
      "trialInformation.recruitmentStartDate"  = ctrDateTime(),
      # - CTGOV
      "start_date"              = ctrDateUs(),
      "primary_completion_date" = ctrDateUs(),
      "completion_date"         = ctrDateUs(),
      "study_first_posted"      = ctrDateUs(),
      "results_first_posted"    = ctrDateUs(),
      "last_update_posted"      = ctrDateUs(),
      # - ISRCTN
      "participants.recruitmentStart" = ctrDateTime(),
      "participants.recruitmentEnd"   = ctrDateTime(),
      "trialDesign.overallStartDate"  = ctrDateTime(),
      "trialDesign.overallEndDate"    = ctrDateTime(),
      #
      # factors
      #
      # - EUCTR Yes / No / Information not present in EudraCT
      "d21_imp_to_be_used_in_the_trial_has_a_marketing_authorisation" = ctrYesNo(),
      "e13_condition_being_studied_is_a_rare_disease" = ctrYesNo(),
      #
      "e61_diagnosis"         = ctrYesNo(),
      "e62_prophylaxis"       = ctrYesNo(),
      "e63_therapy"           = ctrYesNo(),
      "e64_safety"            = ctrYesNo(),
      "e65_efficacy"          = ctrYesNo(),
      "e66_pharmacokinetic"   = ctrYesNo(),
      "e67_pharmacodynamic"   = ctrYesNo(),
      "e68_bioequivalence"    = ctrYesNo(),
      "e69_dose_response"     = ctrYesNo(),
      "e610_pharmacogenetic"  = ctrYesNo(),
      "e611_pharmacogenomic"  = ctrYesNo(),
      "e612_pharmacoeconomic" = ctrYesNo(),
      "e613_others"           = ctrYesNo(),
      #
      "e71_human_pharmacology_phase_i"         = ctrYesNo(),
      "e711_first_administration_to_humans"    = ctrYesNo(),
      "e712_bioequivalence_study"              = ctrYesNo(),
      "e713_other"                             = ctrYesNo(),
      "e72_therapeutic_exploratory_phase_ii"   = ctrYesNo(),
      "e73_therapeutic_confirmatory_phase_iii" = ctrYesNo(),
      "e74_therapeutic_use_phase_iv"           = ctrYesNo(),
      #
      "e81_controlled"      = ctrYesNo(),
      "e811_randomised"     = ctrYesNo(),
      "e812_open"           = ctrYesNo(),
      "e813_single_blind"   = ctrYesNo(),
      "e814_double_blind"   = ctrYesNo(),
      "e815_parallel_group" = ctrYesNo(),
      "e816_cross_over"     = ctrYesNo(),
      "e817_other"          = ctrYesNo(),
      #
      "e83_the_trial_involves_single_site_in_the_member_state_concerned"    = ctrYesNo(),
      "e84_the_trial_involves_multiple_sites_in_the_member_state_concerned" = ctrYesNo(),
      "e85_the_trial_involves_multiple_member_states"                       = ctrYesNo(),
      "e861_trial_being_conducted_both_within_and_outside_the_eea"          = ctrYesNo(),
      "e862_trial_being_conducted_completely_outside_of_the_eea"            = ctrYesNo(),
      "e87_trial_has_a_data_monitoring_committee"                           = ctrYesNo(),
      #
      "f11_trial_has_subjects_under_18"            = ctrYesNo(),
      "f111_in_utero"                              = ctrYesNo(),
      "f112_preterm_newborn_infants_up_to_gestational_age__37_weeks" = ctrYesNo(),
      "f113_newborns_027_days"                     = ctrYesNo(),
      "f114_infants_and_toddlers_28_days23_months" = ctrYesNo(),
      "f115_children_211years"                     = ctrYesNo(),
      "f116_adolescents_1217_years"                = ctrYesNo(),
      "f12_adults_1864_years"                      = ctrYesNo(),
      "f13_elderly_65_years"                       = ctrYesNo(),
      "f21_female"                                 = ctrYesNo(),
      "f22_male"                                   = ctrYesNo(),
      "f31_healthy_volunteers"                     = ctrYesNo(),
      "f32_patients"                               = ctrYesNo(),
      "f33_specific_vulnerable_populations"        = ctrYesNo(),
      "f331_women_of_childbearing_potential_not_using_contraception_" = ctrYesNo(),
      "f332_women_of_childbearing_potential_using_contraception"      = ctrYesNo(),
      "f333_pregnant_women"      = ctrYesNo(),
      "f334_nursing_women"       = ctrYesNo(),
      "f335_emergency_situation" = ctrYesNo(),
      "f336_subjects_incapable_of_giving_consent_personally" = ctrYesNo(),
      #
      # - CTGOV
      "has_expanded_access"            = ctrYesNo(),
      "oversight_info.has_dmc"         = ctrYesNo(),
      "eligibility.healthy_volunteers" = ctrYesNo(),
      #
      # - ISRCTN
      "trialDescription.acknowledgment" = ctrFalseTrue(),
      "results.biomedRelated"           = ctrFalseTrue(),
      #
      # numbers
      #
      # - EUCTR
      "e824_number_of_treatment_arms_in_the_trial" = ctrInt(),
      "e891_in_the_member_state_concerned_years"   = ctrInt(),
      "e891_in_the_member_state_concerned_months"  = ctrInt(),
      "e891_in_the_member_state_concerned_days"    = ctrInt(),
      "e892_in_all_countries_concerned_by_the_trial_years"  = ctrInt(),
      "e892_in_all_countries_concerned_by_the_trial_months" = ctrInt(),
      "e892_in_all_countries_concerned_by_the_trial_days"   = ctrInt(),
      "e841_number_of_sites_anticipated_in_member_state_concerned" = ctrInt(),
      "f11_number_of_subjects_for_this_age_range"   = ctrInt(),
      "f1111_number_of_subjects_for_this_age_range" = ctrInt(),
      "f1121_number_of_subjects_for_this_age_range" = ctrInt(),
      "f1131_number_of_subjects_for_this_age_range" = ctrInt(),
      "f1141_number_of_subjects_for_this_age_range" = ctrInt(),
      "f1151_number_of_subjects_for_this_age_range" = ctrInt(),
      "f1161_number_of_subjects_for_this_age_range" = ctrInt(),
      "f121_number_of_subjects_for_this_age_range"  = ctrInt(),
      "f131_number_of_subjects_for_this_age_range"  = ctrInt(),
      "f41_in_the_member_state"          = ctrInt(),
      "f421_in_the_eea"                  = ctrInt(),
      "f422_in_the_whole_clinical_trial" = ctrInt(),
      #
      # - CTGOV
      "number_of_arms" = ctrInt(),
      "enrollment"     = ctrInt(),
      #
      # - ISRCTN
      "participants.targetEnrolment"      = ctrInt(),
      "participants.totalTarget"          = ctrInt(),
      "participants.totalFinalEnrolment"  = ctrInt(),
      "externalRefs.protocolSerialNumber" = ctrInt(),
      #
      # TODO results-related variables
      "trialInformation.analysisForPrimaryCompletion" = ctrFalseTrue()
      #
    )
  },
  silent = TRUE)

  # prepare output
  if (!inherits(tmp, "try-error") &&
      !is.null(unlist(tmp, use.names = FALSE))) {

    # need to construct new data frame, because
    # replacing columns with posixct does not work
    dfn <- names(dfi)
    dfi <- data.frame(dfi[["_id"]],
                      tmp,
                      stringsAsFactors = FALSE)
    names(dfi) <- dfn

  }

  # return
  return(dfi)

} # end typeField


#' Annotate ctrdata function return values
#'
#' @param x object to be annotated
#'
#' @inheritParams ctrDb
#'
#' @keywords internal
#' @noRd
#'
addMetaData <- function(x, con) {

  # add metadata
  attr(x, "ctrdata-dbname")         <- con$db
  attr(x, "ctrdata-table")          <- con$collection
  attr(x, "ctrdata-dbqueryhistory") <- dbQueryHistory(
    con = con,
    verbose = FALSE)

  # return annotated object
  return(x)

} # end addMetaData


#' Function to set proxy
#'
#' @importFrom curl ie_proxy_info
#'
#' @keywords internal
#' @noRd
#'
setProxy <- function() {

  # only act if environment
  # variable is not already set
  if (Sys.getenv("https_proxy") == "") {

    # works under windows only
    p <- curl::ie_proxy_info()$Proxy

    if (!is.null(p)) {

      # used by httr and curl
      Sys.setenv(https_proxy = p)

    }
  }
} # end setproxy


#' Convenience function to install a minimal cygwin environment under MS
#' Windows, including perl, sed and php
#'
#' Alternatively and in case of difficulties, download and run the cygwin
#' setup yourself as follows: \code{cygwinsetup.exe --no-admin --quiet-mode
#' --verbose --upgrade-also --root c:/cygwin --site
#' http://www.mirrorservice.org/sites/sourceware.org/pub/cygwin/ --packages
#' perl,php-jsonc,php-simplexml}
#'
#' @export
#'
#' @param force Set to \code{TRUE} to force updating and overwriting an existing
#'   installation in \code{c:\\cygwin}
#' @param proxy Specify any proxy to be used for downloading via http, e.g.
#'   "host_or_ip:port". \code{installCygwinWindowsDoInstall} may detect and use
#'   the proxy configuration used in MS Windows to use an automatic proxy
#'   configuration script. Authenticated proxies are not supported at this time.
#'
installCygwinWindowsDoInstall <- function(
  force = FALSE,
  proxy = "") {

  # checks
  if (.Platform$OS.type != "windows") {
    stop(
      "This function is only for MS Windows operating systems.",
      call. = FALSE)
  }
  #
  if (!force & dir.exists("c:\\cygwin")) {
    message("cygwin is already installed in c:\\cygwin. ",
            "To re-install, use force = TRUE.")
    # exit function after testing
    return(installCygwinWindowsTest(verbose = TRUE))
  }

  # define installation command
  installcmd <- paste0(
    "--no-admin --quiet-mode --upgrade-also --no-shortcuts --prune-install ",
    "--root c:/cygwin ",
    "--site http://www.mirrorservice.org/sites/sourceware.org/pub/cygwin/ ",
    "--packages perl,php-simplexml,php-json")

  # create R session temporary directory
  tmpfile <- paste0(tempdir(), "/cygwin_inst")
  dir.create(tmpfile)
  dstfile <- paste0(tmpfile, "/cygwinsetup.exe")

  # generate download url
  tmpurl <- ifelse(
    grepl("x64", utils::win.version()),
    "setup-x86_64.exe",
    "setup-x86.exe")
  tmpurl <- paste0("https://cygwin.org/", tmpurl)

  # inform user
  message("Attempting cygwin download using ",
          tmpurl, " ...")

  # check and set proxy if needed to access internet
  setProxy()

  # download.file uses the proxy configured in the system
  tmpdl <- try({
    utils::download.file(
      url = tmpurl,
      destfile = dstfile,
      quiet = FALSE,
      mode = "wb")
  }, silent = TRUE)

  # check
  if (!file.exists(dstfile) ||
      file.size(dstfile) < (5 * 10 ^ 5) ||
      (inherits(tmpdl, "try-error"))) {
    stop("Failed, please download manually and install with:\n",
         tmpurl, " ", installcmd,
         call. = FALSE)
  }

  # proxy handling
  if (proxy != "") {
    # manual setting overriding
    proxy <- paste0(" --proxy ", proxy)
    message("Setting cygwin proxy install argument to: ",
            proxy, ", based on provided parameter.")
  } else {
    # detect proxy
    proxy <- curl::ie_proxy_info()$Proxy
    if (!is.null(proxy)) {
      message("Setting cygwin proxy install argument to: ",
              proxy, ", based on system settings.")
      proxy <- paste0(" --proxy ", proxy)
    }
  }

  # execute cygwin setup command
  system(paste0(dstfile, " ", installcmd,
                " --local-package-dir ", tmpfile, " ", proxy))

  # return cygwin installation test
  return(installCygwinWindowsTest(verbose = TRUE))

}
# end installCygwinWindowsDoInstall


#' Convenience function to test for working cygwin installation
#'
#' @param verbose If \code{TRUE}, prints confirmatory
#'  message (default \code{FALSE})
#'
#' @return Information if cygwin can be used, \code{TRUE}
#'  or \code{FALSE}, or NULL if not under MS Windows
#'
#' @keywords internal
#' @noRd
#
installCygwinWindowsTest <- function(verbose = FALSE) {
  #
  if (.Platform$OS.type != "windows") {
    message("Function installCygwinWindowsTest() is ",
            "only for MS Windows operating systems.")
    return(invisible(NULL))
  }
  #
  tmpcygwin <- try({
    suppressWarnings(
      system(
        paste0("cmd.exe /c ",
               rev(Sys.glob("c:\\cygw*\\bin\\bash.exe"))[1],
               " --version"),
        intern = TRUE,
        ignore.stderr = TRUE
      ))},
    silent = TRUE)
  #
  if (!inherits(tmpcygwin, "try-error") &
      (length(tmpcygwin) > 5L)) {
    if (verbose) message("cygwin seems to work correctly")
    return(invisible(TRUE))
  } else {
    message(
      "cygwin is not available, ctrLoadQueryIntoDb() will not work.\n",
      "Consider calling ctrdata::installCygwinWindowsDoInstall()")
    return(invisible(FALSE))
  }
}
# end installCygwinWindowsTest


#' Check availability of binaries installed locally
#'
#' @param commandtest Command to be used for testing
#' the availability of the binary, e.g. "php -v".
#' Note internal quotes need to be escaped, e.g.
#' \code{installFindBinary('php -r
#' \"simplexml_load_string(\'\');\"')}.
#' See R/onload.R for tested binaries.
#'
#' @param verbose Set to \code{TRUE} to see printed
#' return value of \code{commandtest}
#'
#' @return A logical if executing commandtest
#' returned an error or not
#'
#' @keywords internal
#' @noRd
#
installFindBinary <- function(commandtest = NULL, verbose = FALSE) {
  #
  if (is.null(commandtest)) {
    stop("Empty argument: commandtest",
         call. = FALSE)
  }
  #
  if (.Platform$OS.type == "windows") {
    commandtest <-
      paste0(rev(Sys.glob("c:\\cygw*\\bin\\bash.exe"))[1],
             " --login -c ",
             shQuote(commandtest))
  }
  #
  if (verbose) print(commandtest)
  #
  commandresult <- try(
    suppressWarnings(
      system(commandtest,
             intern = TRUE,
             ignore.stderr =
               ifelse(.Platform$OS.type == "windows",
                      FALSE, TRUE))),
    silent = TRUE
  )
  #
  commandreturn <- ifelse(
    inherits(commandresult, "try-error") ||
      grepl("error|not found", tolower(paste(commandresult, collapse = " "))) ||
      (!is.null(attr(commandresult, "status")) &&
         (attr(commandresult, "status") != 0)),
    FALSE, TRUE)
  #
  if (!commandreturn) {
    # warning(commandtest, " not found.",
    #         call. = FALSE,
    #         immediate. = FALSE)
  } else {
    if (interactive()) {
      message(". ", appendLF = FALSE)
    }
  }
  #
  if (verbose) {
    print(commandresult)
  }
  #
  return(commandreturn)
  #
}
# end installFindBinary


#' checkBinary
#'
#' @param b Vector of pre-defined binaries to be tested
#'
#' @keywords internal
#' @noRd
#'
#' @return Logical, \code{TRUE} if all binaries ok
#'
checkBinary <- function(b = NULL) {

  # check actions and user infos
  actionsInfos <- list(
    "notworking" = c("nonexistingbinarytested",
                     "nonexistingbinarytested not found"),
    "php" = c("php --version",
              "php not found, ctrLoadQueryIntoDb() will not work "),
    "phpxml" = c("php -r 'simplexml_load_string(\"\");'",
                 "php xml not found, ctrLoadQueryIntoDb() will not work "),
    "phpjson" = c("php -r 'json_encode(\"<foo>\");'",
                  "php json not found, ctrLoadQueryIntoDb() will not work "),
    "sed" = c("echo x | sed s/x/y/",
              "sed not found, ctrLoadQueryIntoDb() will not work "),
    "perl" = c("perl -V:osname",
               "perl not found, ctrLoadQueryIntoDb() will not work ")
  )

  # check private environment and create if not found
  if (!exists(x = ".dbffenv", mode = "environment")) {
    .dbffenv <- new.env(parent = emptyenv())
  }

  # if input empty, just check all except test
  if (is.null(b)) b <- names(actionsInfos)[-1]

  # do check
  out <- sapply(X = b, function(bi) {

    # check input
    actionsInfo <- actionsInfos[[bi]]
    if (is.null(actionsInfo)) stop("Unknown binary to check: ", bi, call. = FALSE)

    # previously checked and successful?
    checked <- exists(x = paste0("bin_check_", bi), envir = .dbffenv)
    if (checked) checked <- get(x = paste0("bin_check_", bi), envir = .dbffenv)
    if (checked) return(TRUE)

    # continue to check binary
    ok <- installFindBinary(commandtest = actionsInfo[1])
    if (!ok) message("\n", actionsInfo[2], appendLF = FALSE)

    # store check to private environment
    assign(x = paste0("bin_check_", bi), value = ok, envir = .dbffenv)

    # return
    ok

  })

  # inform user
  if (!all(out)) message(
    "\nTo install command line binaries needed for the function ",
    "ctrLoadQueryIntoDb() of package ctrdata, see recommendations at ",
    "https://github.com/rfhb/ctrdata#",
    "2-command-line-tools-perl-sed-cat-and-php-52-or-higher",
    "\nAfter installation, detach and load package ctrdata again, ",
    "or restart the R session.\n")

  # return single value since
  # all tests need to be ok
  invisible(all(out))

}

Try the ctrdata package in your browser

Any scripts or data that you put into this service are public.

ctrdata documentation built on Nov. 22, 2021, 1:06 a.m.