R/dbFindIdsUniqueTrials.R

Defines functions dfFindUniqueEuctrRecord dbFindIdsUniqueTrials

Documented in dbFindIdsUniqueTrials

### ctrdata package

#' Get identifiers of deduplicated trial records
#'
#' Records for a clinical trial can be loaded from more than one
#' register into a collection. This function returns deduplicated
#' identifiers for all trials in the collection, respecting the
#' register(s) preferred by the user. All registers are recording
#' identifiers also from other registers, which are used by this
#' function to provide a vector of identifiers of deduplicated trials.
#'
#' Note that the content of records may differ between registers
#' (and, for "EUCTR", between records for different Member States).
#' Such differences are not considered by this function.
#'
#' @param preferregister A vector of the order of preference for
#' registers from which to generate unique _id's, default
#' \code{c("EUCTR", "CTGOV", "CTGOV2", "ISRCTN", "CTIS")}
#'
#' @param prefermemberstate Code of single EU Member State for which records
#' should returned. If not available, a record for DE or lacking this, any
#' random Member State's record for the trial will be returned.
#' For a list of codes of EU  Member States, please see vector
#' \code{countriesEUCTR}. Specifying "3RD" will return the Third Country
#' record of trials, where available.
#'
#' @param include3rdcountrytrials A logical value if trials should be retained
#' that are conducted exclusively in third countries, that is, outside
#' the European Union. Ignored if \code{prefermemberstate} is set to "3RD".
#'
#' @param verbose If \code{TRUE}, prints out the fields of registers used to
#' find corresponding trial records
#'
#' @importFrom nodbi docdb_query
#' @importFrom stats setNames
#'
#' @inheritParams ctrDb
#'
#' @return A named vector with strings of keys (field "_id") of
#' records in the collection that represent unique trials, where
#' names correspond to the register of the record.
#'
#' @export
#'
#' @examples
#'
#' dbc <- nodbi::src_sqlite(
#'     dbname = system.file("extdata", "demo.sqlite", package = "ctrdata"),
#'     collection = "my_trials"
#' )
#'
#' dbFindIdsUniqueTrials(con = dbc)
#'
dbFindIdsUniqueTrials <- function(
    preferregister = c("EUCTR", "CTGOV", "CTGOV2", "ISRCTN", "CTIS"),
    prefermemberstate = "DE",
    include3rdcountrytrials = TRUE,
    con,
    verbose = FALSE) {
  # parameter checks
  if (!all(preferregister %in% registerList)) {
    stop("'preferregister' unknown: ", preferregister, call. = FALSE)
  }
  if (length(prefermemberstate) != 1L ||
      !any(prefermemberstate == countriesEUCTR)) {
    stop("'prefermemberstate' unknown: ", prefermemberstate, call. = FALSE)
  }
  # complete if preferregister does not have all
  preferregister <- unique(preferregister)
  preferregister <- union(preferregister, registerList)

  # objective: create a vector of database record identifiers (_id)
  # that represent unique records of clinical trials, based on user's
  # preferences for selecting the preferred from any multiple records

  ## check database connection
  if (is.null(con$ctrDb)) con <- ctrDb(con = con)

  # inform user
  message("Searching for duplicate trials... ")

  # fields for database query
  fields <- c(
    "ctrname",
    # euctr
    "a2_eudract_number",
    "a52_us_nct_clinicaltrialsgov_registry_number",
    "trialInformation.usctnIdentifier",
    "a51_isrctn_international_standard_randomised_controlled_trial_number",
    "trialInformation.isrctnIdentifier",
    "a41_sponsors_protocol_code_number",
    # ctgov
    "id_info",
    # isrctn
    "externalRefs",
    "isrctn",
    # ctis
    "ctNumber",
    "eudraCtInfo.eudraCtCode",
    "authorizedPartI.trialDetails.clinicalTrialIdentifiers.secondaryIdentifyingNumbers.nctNumber.number",
    # ctgov2
    "protocolSection.identificationModule.nctId",
    "protocolSection.identificationModule.secondaryIdInfos.id",
    "protocolSection.identificationModule.nctIdAliases",
    "protocolSection.identificationModule.orgStudyIdInfo.id"
  )

  # check if cache environment has entry for the database
  listofIds <- ctrCache(
    xname = paste0("listofids_", con$db, "/", con$collection),
    verbose = FALSE
  )

  # get cache reference value
  cacheRef <- as.character(rev(unlist(try(nodbi::docdb_query(
    src = con, key = con$collection, query = '{"_id": "meta-info"}',
    fields = '{"queries.query-timestamp": 1}'
  ), silent = TRUE)))[1])

  # cache validity
  cacheOutdated <- is.null(listofIds) || (cacheRef != ctrCache(
    xname = paste0("listofids_", con$db, "/", con$collection, "_timestamp"),
    verbose = FALSE
  ))

  # inform user
  message("- Getting all trial identifiers...", appendLF = FALSE)

  # cache outdated
  if (cacheOutdated) {
    # inform user
    message("\b\b\b (may take some time)...", appendLF = FALSE)

    # get identifiers
    listofIds <- try(
      suppressMessages(suppressWarnings(
        dbGetFieldsIntoDf(
          fields = fields,
          con = con,
          verbose = FALSE
        )
      )),
      silent = TRUE
    )

    # error check
    if (inherits(listofIds, "try-error") ||
        !length(listofIds) || !nrow(listofIds)) {
      stop("No records found, check collection '", con$collection, "'",
           call. = FALSE
      )
    }

    # write cache entries
    ctrCache(
      xname = paste0("listofids_", con$db, "/", con$collection),
      xvalue = listofIds, verbose = FALSE
    )
    ctrCache(
      xname = paste0("listofids_", con$db, "/", con$collection, "_timestamp"),
      xvalue = cacheRef, verbose = FALSE
    )
  } # if outdated

  # inform user
  message("\b\b\b, ", nrow(listofIds), " found in collection")

  # copy attributes
  attribsids <- attributes(listofIds)

  # target fields for adding cols for mangling below
  fields <- c(
    "_id",
    "ctrname",
    # euctr
    "a2_eudract_number",
    "a52_us_nct_clinicaltrialsgov_registry_number",
    "trialInformation.usctnIdentifier",
    "a52_us_nct_clinicaltrialsgov_registry_number",
    "trialInformation.usctnIdentifier",
    "a51_isrctn_international_standard_randomised_controlled_trial_number",
    "trialInformation.isrctnIdentifier",
    "a41_sponsors_protocol_code_number",
    # ctgov
    "id_info.secondary_id",
    "id_info.org_study_id",
    "id_info.nct_id",
    "id_info.nct_id",
    "id_info.nct_alias",
    "id_info.secondary_id",
    "id_info.secondary_id",
    "id_info.org_study_id",
    # isrctn
    "externalRefs.eudraCTNumber",
    "externalRefs.clinicalTrialsGovNumber",
    "externalRefs.clinicalTrialsGovNumber",
    "isrctn",
    "externalRefs.protocolSerialNumber",
    # ctis
    "ctNumber",
    "eudraCtInfo.eudraCtCode",
    "authorizedPartI.trialDetails.clinicalTrialIdentifiers.secondaryIdentifyingNumbers.nctNumber.number",
    "authorizedPartI.trialDetails.clinicalTrialIdentifiers.secondaryIdentifyingNumbers.nctNumber.number",
    # ctgov2
    "protocolSection.identificationModule.nctId",
    "protocolSection.identificationModule.nctId",
    "protocolSection.identificationModule.secondaryIdInfos.id",
    "protocolSection.identificationModule.secondaryIdInfos.id",
    "protocolSection.identificationModule.nctIdAliases",
    "protocolSection.identificationModule.orgStudyIdInfo.id"
  )
  if (verbose) {
    message(
      "\nFields used for finding corresponding register records of trials: ",
      "\n\n", paste0(fields, collapse = ", "), "\n"
    )
  }

  # add any missing columns
  missFields <- setdiff(fields, names(listofIds))
  if (length(missFields)) {
    missCols <- matrix(nrow = nrow(listofIds), ncol = length(missFields))
    missCols <- data.frame(missCols)
    names(missCols) <- missFields
    listofIds <- cbind(listofIds, missCols)
  }

  # replicate columns to make data frame fit subsequent steps
  listofIds <- listofIds[, fields, drop = FALSE]

  # rename columns for content mangling, needs to
  # correspond to columns and sequence in "fields"
  # for mapping identifiers across registers
  names(listofIds) <- c(
    "_id", "ctrname",
    # euctr
    "euctr.1", "ctgov.1a", "ctgov.1b", "ctgov2.1a", "ctgov2.1b", "isrctn.1a", "isrctn.1b", "sponsor.1",
    # ctgov
    "euctr.2a", "euctr.2b", "ctgov.2a", "ctgov2.2", "ctgov.2b", "isrctn.2",
    "sponsor.2a", "sponsor.2b",
    # isrctn
    "euctr.3", "ctgov.3", "ctgov2.3", "isrctn.3", "sponsor.3",
    # ctis
    "ctis.1", "euctr.4", "ctgov.4", "ctgov2.4",
    # ctgov2
    "ctgov2.5", "ctgov.5a", "euctr.5", "sponsor.4a", "ctgov.5b", "sponsor.4b"
  )

  # keep only relevant content
  # - in certain raw value columns
  colsToMangle <- list(
    c("ctgov.1a", regCtgov),
    c("ctgov.1b", regCtgov),
    c("ctgov.2a", regCtgov),
    c("ctgov.2b", regCtgov),
    c("ctgov.3", regCtgov),
    c("ctgov.4", regCtgov),
    c("ctgov.5a", regCtgov),
    c("ctgov.5b", regCtgov),
    c("ctgov2.1", regCtgov2),
    c("ctgov2.2", regCtgov2),
    c("ctgov2.3", regCtgov2),
    c("ctgov2.4", regCtgov2),
    c("ctgov2.5", regCtgov2),
    c("isrctn.1a", regIsrctn),
    c("isrctn.1b", regIsrctn),
    c("isrctn.2", regIsrctn),
    c("isrctn.3", regIsrctn),
    c("euctr.1", regEuctr),
    c("euctr.2a", regEuctr),
    c("euctr.2b", regEuctr),
    c("euctr.3", regEuctr),
    c("euctr.4", regEuctr),
    c("euctr.5", regEuctr),
    c("ctis.1", regCtis)
  )

  # - inconsistency:
  #   isrctn.3 = 12345678, but isrctn.1a
  #   and isrctn.1b have ISRCTN12345678
  listofIds["isrctn.1a"] <- sub("^ISRCTN", "", listofIds[["isrctn.1a"]])
  listofIds["isrctn.1b"] <- sub("^ISRCTN", "", listofIds[["isrctn.1b"]])

  # - do mangling; prerequisite is
  #   that each of the columns holds
  #   a single character vector,
  #   possibly collapsed with " / "
  invisible(sapply(
    colsToMangle,
    function(ctm) {
      colMangled <- regmatches(
        listofIds[[ctm[[1]]]],
        regexec(ctm[[2]], listofIds[[ctm[[1]]]])
      )
      colMangled[!lengths(colMangled)] <- ""
      listofIds[[ctm[[1]]]] <<- unlist(colMangled)
    }
  ))
  # - merge columns for register ids and sponsor ids
  for (reg in c(registerList, "SPONSOR")) {
    listofIds[[reg]] <- apply(
      listofIds[
        , grepl(paste0("^", reg, "[.][0-9]"), names(listofIds),
                ignore.case = TRUE
        ),
        drop = FALSE
      ],
      MARGIN = 1,
      function(r) {
        gsub(
          "^ ?/ | / ?$", "",
          paste0(na.omit(unique(r)), collapse = " / ")
        )
      }
    )
  }
  # - delete raw columns
  listofIds <- listofIds[
    , c("_id", "ctrname", registerList, "SPONSOR"),
    drop = FALSE
  ]

  # inform user
  message("- Finding duplicates among registers' and sponsor ids...")

  # find duplicates
  colsToCheck <- match(c(preferregister, "SPONSOR"), names(listofIds))
  outSet <- NULL
  for (i in seq_along(preferregister)) {
    # to be added
    tmp <- listofIds[
      listofIds[["ctrname"]] == preferregister[i], ,
      drop = FALSE
    ]
    row.names(tmp) <- NULL

    # check if second etc. set has identifiers
    # in the previously rbind'ed sets
    if (i > 1L && nrow(tmp)) {
      # check for duplicates
      dupes <- mapply(
        function(c1, c2) {
          tmpIs <- intersect(
            unlist(strsplit(c1, " / ")),
            unlist(strsplit(c2, " / "))
          )
          if (length(tmpIs)) {
            # map found intersecting names back
            # to the rows of the input data frame
            grepl(paste0(tmpIs, collapse = "|"), c1)
          } else {
            rep(FALSE, times = length(c1))
          }
        },
        tmp[, colsToCheck, drop = FALSE],
        outSet[, colsToCheck, drop = FALSE],
        SIMPLIFY = FALSE
      )

      # mangle dupes for marginal cases, e.g. one record
      dupes <- do.call(cbind, dupes)
      dupes <- as.data.frame(dupes)

      # keep uniques
      tmp <- tmp[rowSums(dupes) == 0L, , drop = FALSE]
      rm(dupes)
    }

    # add to output set
    outSet <- rbind(outSet, tmp,
                    make.row.names = FALSE,
                    stringsAsFactors = FALSE
    )
  }
  rm(tmp)

  # keep necessary columns
  listofIds <- outSet[, c("_id", "EUCTR", "ctrname")]
  names(listofIds)[2] <- "a2_eudract_number"
  rm(outSet)

  # find unique, preferred country version of euctr
  listofIds <- dfFindUniqueEuctrRecord(
    df = listofIds,
    prefermemberstate = prefermemberstate,
    include3rdcountrytrials = include3rdcountrytrials
  )

  # prepare output
  listofIds <- setNames(
    object = listofIds[["_id"]],
    nm = listofIds[["ctrname"]]
  )

  listofIds <- sort(listofIds)

  # count
  countIds <- table(names(listofIds))

  # sort by user's input
  countIds <- countIds[preferregister]
  countIds[is.na(countIds)] <- 0L
  countIds <- setNames(countIds, preferregister)

  # append attributes
  attributes(listofIds) <- c(
    attributes(listofIds),
    attribsids[startsWith(names(attribsids), "ctrdata-")]
  )

  # avoid returning list() if none found
  if (length(listofIds) == 0) listofIds <- character()

  # inform user
  message(
    "- Keeping ", paste0(countIds, collapse = " / "), " records",
    " from ", paste0(names(countIds), collapse = " / ")
  )
  message(
    "= Returning keys (_id) of ", length(listofIds),
    " records in collection \"", con$collection, "\""
  )

  # return
  return(listofIds)
}
# end dbFindIdsUniqueTrials



#' Select a single trial record from records of different EU Member States
#'
#' The EUCTR provides one record per trial per EU Member State in which the
#' trial is conducted. For all trials conducted in more than one Member State,
#' this function returns only one record per trial.
#'
#' Note: To deduplicate trials from different registers,
#' please first use function \link{dbFindIdsUniqueTrials}.
#'
#' @param df A data frame created from the database collection that includes
#'   the columns "_id" and "a2_eudract_number", for example created with
#'   function dbGetFieldsIntoDf(c("_id", "a2_eudract_number")).
#'
#' @inheritParams dfFindIdsUniqueTrials
#'
#' @return A data frame as subset of \code{df} corresponding to the sought
#'   records.
#'
#' @keywords internal
#' @noRd
#
dfFindUniqueEuctrRecord <- function(
    df = NULL,
    prefermemberstate = "DE",
    include3rdcountrytrials = TRUE) {
  # check parameters
  if (!any(class(df) %in% "data.frame")) {
    stop("Parameter df is not a data frame.", call. = FALSE)
  }
  #
  if (is.null(df[["_id"]]) ||
      is.null(df[["a2_eudract_number"]])) {
    stop('Data frame does not include "_id"',
         ' and "a2_eudract_number" columns.',
         call. = FALSE
    )
  }
  #
  if (nrow(df) == 0) {
    stop("Data frame does not contain records (0 rows).",
         call. = FALSE
    )
  }
  #
  if (!(prefermemberstate %in% countriesEUCTR)) {
    stop("Value specified for prefermemberstate does not match",
         " one of the recognised codes: ",
         paste(sort(countriesEUCTR), collapse = ", "),
         call. = FALSE
    )
  }

  # notify it mismatching parameters
  if (prefermemberstate == "3RD" && !include3rdcountrytrials) {
    warning("Preferred EUCTR version set to 3RD country trials, but ",
            "'include3rdcountrytrials' was FALSE, setting it to TRUE.",
            call. = FALSE,
            noBreaks. = FALSE,
            immediate. = FALSE
    )
    include3rdcountrytrials <- TRUE
  }

  # count total
  totalEuctr <- unique(df[["a2_eudract_number"]])
  totalEuctr <- na.omit(totalEuctr[totalEuctr != ""])
  totalEuctr <- length(totalEuctr)

  # as a first step, handle 3rd country trials e.g. 2010-022945-52-3RD
  # if retained, these trials would count as record for a trial
  if (!include3rdcountrytrials) {
    df <- df[!grepl("-3RD", df[["_id"]]), , drop = FALSE]
  }

  # count number of records by eudract number
  tbl <- table(df[["_id"]], df[["a2_eudract_number"]])
  tbl <- as.matrix(tbl)
  # nms has names of all records
  nms <- dimnames(tbl)[[1]]

  # nrs has eudract numbers for which is there more than 1 record
  nrs <- colSums(tbl)
  nrs <- nrs[nrs > 1]
  nrs <- names(nrs)

  # nst is a list of nrs trials of a logical vector along nms
  # that indicates if the indexed record belongs to the trial
  nms2 <- substr(nms, 1, 14)
  nst <- lapply(nrs, function(x) nms2 %in% x)

  # helper function to find the Member State version
  removeMSversions <- function(indexofrecords) {
    # given a vector of records (nnnn-nnnnnnn-nn-MS) of a single trial, this
    # returns all those _ids of records that do not correspond to the preferred
    # Member State record, based on the user's choices and defaults.
    # Function uses prefermemberstate, nms from the caller environment
    recordnames <- nms[indexofrecords]
    #
    # fnd should be only a single string, may need to be checked
    if (sum(fnd <- grepl(prefermemberstate, recordnames)) != 0) {
      result <- recordnames[!fnd]
      return(result)
    }
    #
    if (sum(fnd <- grepl("DE", recordnames)) != 0) {
      result <- recordnames[!fnd]
      return(result)
    }
    #
    # default is to list all but first record
    # the listed records are the duplicates
    # 3RD country trials would be listed first
    # hence selected, which is not desirable
    # unless chosen as prefermemberstate
    return(rev(sort(recordnames))[-1])
  }

  # finds per trial the desired record;
  # uses prefermemberstate and nms
  result <- lapply(
    nst,
    function(x) removeMSversions(x)
  )
  result <- unlist(result, use.names = FALSE)

  # eleminate the unwanted EUCTR records
  df <- df[!(df[["_id"]] %in% result), , drop = FALSE]

  # also eliminate the meta-info record
  df <- df[!(df[["_id"]] == "meta-info"), , drop = FALSE]

  # inform user about changes to data frame
  if (length(nms) > (tmp <- length(result))) {
    message(
      "- ", tmp,
      " EUCTR _id were not preferred EU Member State record for ",
      totalEuctr, " trials"
    )
  }

  # return
  return(df)
}
# end dfFindUniqueEuctrRecord

Try the ctrdata package in your browser

Any scripts or data that you put into this service are public.

ctrdata documentation built on Sept. 30, 2024, 9:39 a.m.