#### history ####
# 2025-01-27 first partly working version
# 2025-02-08, 2025-05-31 improved
#' Calculate if study is likely a platform trial or not
#'
#' Trial concept calculated: platform trial, research platform.
#' As operational definition, at least one of these criteria is true:
#' a. trial has "platform", "basket", "umbrella", "multi.?arm", "multi.?stage"
#' or "master protocol" in its title or description (for ISRCTN, this is the
#' only criterion; some trials in EUCTR lack data in English),
#' b. trial has more than 2 active arms with different investigational medicines,
#' after excluding comparator, auxiliary and placebo medicines (calculated with
#' \link{f.numTestArmsSubstances}; not used for ISRCTN because it cannot be
#' calculated precisely),
#' c. trial has more than 2 periods, after excluding safety run-in, screening,
#' enrolling, extension and follow-up periods (for CTGOV and CTGOV2, this
#' criterion requires results-related data).
#'
#' For EUCTR, requires that results have been included in the collection, using
#' `ctrLoadQueryIntoDb(queryterm = ..., euctrresults = TRUE, con = ...)`.
#' Requires packages dplyr and stringdist to be installed; stringdist is used
#' for evaluating terms in brackets in the trial title, where trials may be
#' related if the term similarity is 0.77 or higher.
#'
#' Publication references considered:
#' \href{https://web.archive.org/web/20230314024441/https://eu-pearl.eu/wp-content/uploads/2020/06/EU-PEARL_D2.1_Report-on-Terminology-and-Scenarios-for-Platform-Trials-and-Masterprotocols.pdf}{EU-PEARL WP2 2020}
#' and Williams RJ et al. 2022, \doi{10.1136/bmj-2021-067745}
#'
#' @param df data frame such as from \link{dbGetFieldsIntoDf}. If `NULL`,
#' prints fields needed in `df` for calculating this trial concept, which can
#' be used with \link{dbGetFieldsIntoDf}.
#'
#' @return data frame with columns `_id` and `.likelyPlatformTrial`, a logical,
#' and two complementary columns, each with lists of identifiers:
#' `.likelyRelatedTrials` (based on other identifiers provided in the trial
#' record, including `associatedClinicalTrials` from CTIS; listing identifiers
#' whether or not the trial with the other identifier is in the database
#' collection) and `.maybeRelatedTrials` (based on similar short terms in the
#' first set of brackets or before a colon in the trial title; only listing
#' identifiers from the database collection).
#'
#' @export
#'
#' @importFrom dplyr mutate case_when pull left_join coalesce `%>%`
#' @importFrom stringi stri_count_fixed stri_detect_regex stri_split_fixed
#' @importFrom stringdist stringsimmatrix
#' @importFrom tidyr pivot_longer unnest
#' @importFrom rlang .data
#'
#' @examples
#' # fields needed
#' f.likelyPlatformTrial()
#'
#' # apply trial concept when creating data frame
#' dbc <- nodbi::src_sqlite(
#' dbname = system.file("extdata", "demo.sqlite", package = "ctrdata"),
#' collection = "my_trials", flags = RSQLite::SQLITE_RO)
#' trialsDf <- dbGetFieldsIntoDf(
#' calculate = "f.likelyPlatformTrial",
#' con = dbc)
#' trialsDf
#'
f.likelyPlatformTrial <- function(df = NULL) {
# check generic, do not edit
stopifnot(is.data.frame(df) || is.null(df))
#### fields ####
fldsHere <- list(
"euctr" = c(
# "a3_full_title_of_the_trial",
# "trialInformation.fullTitle",
"subjectDisposition.postAssignmentPeriods.postAssignmentPeriod.title"
),
"ctgov" = c(
# "official_title",
# "detailed_description.textblock",
"clinical_results.participant_flow.period_list.period.title"
),
"ctgov2" = c(
# "protocolSection.identificationModule.officialTitle",
# "protocolSection.descriptionModule.detailedDescription",
"resultsSection.participantFlowModule.periods.title"
),
"isrctn" = c(
# "trialDescription.title",
# "trialDescription.scientificTitle",
"interventions.intervention.interventionType"
),
"ctis" = c(
# CTIS1
# "applications.fullTitle",
# "authorizedPartI.trialDetails.clinicalTrialIdentifiers.fullTitle",
# "authorizedPartI.trialDetails.clinicalTrialIdentifiers.publicTitle",
"authorizedPartI.trialDetails.protocolInformation.studyDesign.periodDetails.title",
# "authorizedPartI.trialDetails.protocolInformation.studyDesign.periodDetails.armDetails.title",
# "authorizedPartI.trialDetails.associatedClinicalTrials.parentClinicalTrialId",
"authorizedPartI.trialDetails.associatedClinicalTrials.ctNumber",
# "authorizedPartsII.mscInfo.clinicalTrialId",
#
# CTIS2
# "shortTitle" is mostly an uninformative study code
# "authorizedApplication.authorizedPartI.trialDetails.clinicalTrialIdentifiers.fullTitle",
# "authorizedApplication.authorizedPartI.trialDetails.clinicalTrialIdentifiers.publicTitle",
"authorizedApplication.authorizedPartI.trialDetails.protocolInformation.studyDesign.periodDetails.title",
# "authorizedApplication.authorizedPartI.trialDetails.associatedClinicalTrials.parentClinicalTrialId",
"authorizedApplication.authorizedPartI.trialDetails.associatedClinicalTrials.ctNumber"
# "authorizedApplication.authorizedPartsII.mscInfo.clinicalTrialId"
))
# merge with fields needed for nested function
flst1 <- suppressMessages(f.numTestArmsSubstances())
flst2 <- suppressMessages(f.trialTitle())
keys <- unique(c(names(flst1), names(flst2)))
fldsAdded <- setNames(mapply(c, flst1[keys], flst2[keys]), keys)
#
fldsNeeded <- sapply(names(fldsHere), function(i) na.omit(c(
fldsHere[[i]], fldsAdded[[i]])), simplify = FALSE)
fldsNeeded <- c("ctrname", fldsNeeded)
#### describe ####
if (is.null(df)) {
# generic, do not edit
return(fldsNeeded)
} # end describe
#### calculate ####
# check generic, do not edit
fctChkFlds(names(df), fldsNeeded)
# helper definitions
indexThreshold <- 0.77
platformThreshold <- 0.3
minNumArmsDefPlatform <- 2L
minNumPeriodsDefPlatform <- 2L
periodExclPlatform <- "safe|enrol|screen|follow|extens"
titleDefPlatform <- paste0(
"basket|platform|umbrella|multi.?arm|multi.?stage|",
"master protocol|sub.?protocol|sub.?study"
)
# helper function
`%>%` <- dplyr::`%>%`
# apply nested function which provides values for each register
# therefore the following code needs to check against register
if (!any(names(df) == ".numTestArmsSubstances")) df <- dplyr::left_join(
df, f.numTestArmsSubstances(df = df), by = "_id")
# apply nested function which provides values for each register
# therefore the following code needs to check against register
if (!any(names(df) == ".trialTitle")) df <- dplyr::left_join(
df, f.trialTitle(df = df), by = "_id")
# remove columns needed exclusively for nested functions
df <- df[, -match(
setdiff(unlist(fldsAdded), unlist(fldsHere)),
names(df)), drop = FALSE]
# helper
rowColsList <- function(...) {
apply(..., 1, function(i) {
i <- na.omit(unlist(i))
if (!length(i)) return(NA_character_)
i <- stringi::stri_split_regex(
i, "(, | / )",
omit_empty = TRUE,
simplify = TRUE)
i <- sort(unique(na.omit(i)))
i <- i[i != ""]
return(i)
}, simplify = FALSE)
}
# get and mangle mapping table, add ids associated trials
df2 <- .dbMapIdsTrials(con = parent.frame()$con)
df2 <- dplyr::left_join(df2, df[, c("_id",
"authorizedPartI.trialDetails.associatedClinicalTrials.ctNumber",
"authorizedApplication.authorizedPartI.trialDetails.associatedClinicalTrials.ctNumber"
)], by = "_id")
# collapse into single column that is a list
df2$.likelyRelatedTrials <- rowColsList(
dplyr::select(df2, c(
registerList,
"authorizedPartI.trialDetails.associatedClinicalTrials.ctNumber",
"authorizedApplication.authorizedPartI.trialDetails.associatedClinicalTrials.ctNumber"
)))
# expand rows for each list item
df2 <- tidyr::unnest(
df2, cols = .data$.likelyRelatedTrials)
# mangle, revert to one list in one row per trial
df2 <- df2 %>%
dplyr::select(c("_id", "ctrname", ".likelyRelatedTrials")) %>%
tidyr::pivot_longer(cols = !"_id") %>%
# since value has no country suffix as needed for _id
dplyr::filter(.data$name != "EUCTR") %>%
dplyr::filter(.data$name != "ctrname") %>%
dplyr::filter(.data$value != "") %>%
dplyr::filter(.data$value != .data$`_id`) %>%
dplyr::select(!"name") %>%
unique() %>%
dplyr::rename(".likelyRelatedTrials" = .data$value) %>%
dplyr::summarise(
.likelyRelatedTrials = list(.data$.likelyRelatedTrials),
.by = .data$`_id`)
# merge column into df
tnc <- nrow(df)
df <- dplyr::left_join(
df, df2,
by = "_id")
# safety check
stopifnot(tnc == nrow(df))
# simplify and replace NULL with NA
df[[".likelyRelatedTrials"]][
!sapply(df[[".likelyRelatedTrials"]], length)] <- NA
# helper function, column vector
indexSimilarX <- function(x) {
# calculate similarities
t <- stringdist::stringsimmatrix(
x, x, method = c(
# help("stringdist-metrics")
"osa", "lv", "dl", "hamming", "lcs", "qgram",
"cosine", "jaccard", "jw", "soundex")[9])
# blank diagonal
diag(t) <- NA_integer_
# find indices above threshold similarity
apply(t, 1, function(r) {
r <- seq_along(r)[r >= indexThreshold]
r <- r[!is.na(r)]
if (!length(r)) NA_integer_ else r
}, simplify = FALSE)
} # indexSimilarX
# analyse trial title,
# number of arms and substances
df %>% dplyr::mutate(
#
# normalise to work with regex
.trialTitle = gsub("\n|\r| +", " ", .data$.trialTitle),
#
# is title relevant
analysis_titleRelevant = stringi::stri_detect_regex(
.data$.trialTitle, titleDefPlatform, case_insensitive = TRUE),
#
#
# TODO An ... (AG10) in ... Phase 3 ATTRibute-CM Trial (AG10-301)
# TODO Master:A ... Disease MACARONI-23 ISA:A Phase ...
#
# get from title bracketed references, e.g. as per Williams RJ
# Refining ..., TransPORTEC platform trials – p53abn - RED (RAINBO-p53abnRED) Randomized
# ... canceR patients with ... disease (UMBRELLA)
titleBracketedRefs = stringi::stri_replace_first_regex(
.data$.trialTitle, "^.+[(](.*?)[)].+$", "$1"
),
titleBracketedRefs = if_else(
.data$titleBracketedRefs == .data$.trialTitle |
nchar(.data$titleBracketedRefs) < 5L |
# TODO add more abbreviations that are not
# short names of (platform) clinical trials
grepl("NSCLC|mCRPC|SCCHN",
.data$titleBracketedRefs, ignore.case = TRUE),
NA_character_, gsub("[^a-zA-Z0-9]", "", .data$titleBracketedRefs)
),
#
# get delimited reference such as:
# RAINBO: Refining ... features, TransPORTEC platform trials (MMRd-GREEN)
# CARE1: FIRST LINE ... IN PATIENTS
# RECLAIM: an Adaptive Platform Trial ... Post-Acute ... (PASC)
# COSENSE-1: A ... precision medicine platform ...
# NeoART – A phase Ib/II platform ... adenocarcinoma
titleColonedRefs = stringi::stri_replace_first_regex(
.data$.trialTitle, "^(.+?)(: | - ).+$", "$1"
),
titleColonedRefs = if_else(
.data$titleColonedRefs == .data$.trialTitle |
nchar(.data$titleColonedRefs) < 5L |
nchar(.data$titleColonedRefs) > 35L,
NA_character_, gsub("[^a-zA-Z0-9]", "", .data$titleColonedRefs)
),
#
# select one of the terms found
titleRefs = dplyr::if_else(
!is.na(.data$titleBracketedRefs),
.data$titleBracketedRefs, .data$titleColonedRefs
),
#
# identify rows of possibly related trials
# 'contained in' e.g. RAINBO-p53abnRED, RAINBO
# is realised by using Jaro Winkler distance
.maybeRelatedTrials = indexSimilarX(.data$titleRefs),
#
# turn row indices to ids of possibly related trials
.maybeRelatedTrials = mapply(
function(a, b, c) if (all(is.na(a)))
NA_character_ else sort(unique(na.omit(c(.data$`_id`[a], b, c)))),
a = .data$.maybeRelatedTrials,
b = .data$.likelyRelatedTrials,
c = .data$`_id`,
USE.NAMES = FALSE,
SIMPLIFY = FALSE
),
#
#
# is number of test arms and substances relevant
analysis_numArmsSubstancesRelevant =
.data$.numTestArmsSubstances > minNumArmsDefPlatform
#
) -> df
#### . EUCTR ####
df %>% dplyr::mutate(
#
helper_periodTitle = stringi::stri_split_fixed(
.data$subjectDisposition.postAssignmentPeriods.postAssignmentPeriod.title,
" / "),
#
analysis_numberTestPeriods = sapply(
.data$helper_periodTitle,
function(i) {
i <- unique(tolower(na.omit(i)))
i <- i[!grepl(periodExclPlatform, i)]
i <- length(i)
if (i) i else NA_integer_
}),
#
.analysis_numberTestPeriods = dplyr::case_when(
ctrname == "EUCTR" ~ .data$analysis_numberTestPeriods,
.default = .data$analysis_numberTestPeriods)
#
) -> df
#### . CTGOV ####
df %>% dplyr::mutate(
#
helper_periodTitle = stringi::stri_split_fixed(
.data$clinical_results.participant_flow.period_list.period.title,
" / "),
#
analysis_numberTestPeriods = sapply(
.data$helper_periodTitle,
function(i) {
i <- unique(tolower(na.omit(i)))
i <- i[!grepl(periodExclPlatform, i)]
i <- length(i)
if (i) i else NA_integer_
}),
#
.analysis_numberTestPeriods = dplyr::case_when(
ctrname == "CTGOV" ~ .data$analysis_numberTestPeriods,
.default = .data$analysis_numberTestPeriods)
#
) -> df
#### . CTGOV2 ####
df %>% dplyr::mutate(
#
helper_periodTitle = stringi::stri_split_fixed(
.data$resultsSection.participantFlowModule.periods.title,
" / "),
#
analysis_numberTestPeriods = sapply(
.data$helper_periodTitle,
function(i) {
i <- unique(tolower(na.omit(i)))
i <- i[!grepl(periodExclPlatform, i)]
i <- length(i)
if (i) i else NA_integer_
}),
#
.analysis_numberTestPeriods = dplyr::case_when(
ctrname == "CTGOV2" ~ .data$analysis_numberTestPeriods,
.default = .data$analysis_numberTestPeriods)
#
) -> df
#### . ISRCTN ####
df %>% dplyr::mutate(
#
analysis_isDrugTrial =
stringi::stri_detect_regex(
.data$interventions.intervention.interventionType,
"drug|biological|vaccine",
case_insensitive = TRUE),
#
analysis_titleRelevant = dplyr::case_when(
ctrname == "ISRCTN" ~ .data$analysis_isDrugTrial &
.data$analysis_titleRelevant,
.default = .data$analysis_titleRelevant)
#
) -> df
#### . CTIS ####
df %>%
dplyr::mutate(
#
periodTitle = dplyr::coalesce(as.character(
.data$authorizedPartI.trialDetails.protocolInformation.studyDesign.periodDetails.title),
.data$authorizedApplication.authorizedPartI.trialDetails.protocolInformation.studyDesign.periodDetails.title
),
#
helper_periodTitle = stringi::stri_split_fixed(
.data$periodTitle, " / "),
#
analysis_numberTestPeriods = sapply(
.data$helper_periodTitle,
function(i) {
i <- unique(tolower(na.omit(i)))
i <- i[!grepl(periodExclPlatform, i)]
i <- length(i)
if (i) i else 1L
}),
#
.analysis_numberTestPeriods = dplyr::case_when(
ctrname == "CTIS" ~ .data$analysis_numberTestPeriods,
.default = .data$analysis_numberTestPeriods)
#
) -> df
# summarise
df$analysis_numPeriodsRelevant <-
df$.analysis_numberTestPeriods > minNumPeriodsDefPlatform
#
df$.likelyPlatformTrial <- rowSums(
df[, c(
"analysis_titleRelevant",
"analysis_numArmsSubstancesRelevant",
"analysis_numPeriodsRelevant"
), drop = FALSE], na.rm = TRUE
)
#
df$.likelyPlatformTrial <- (
df$.likelyPlatformTrial / 3L) >= platformThreshold
# keep only outcome columns
df <- df[, c(
"_id",
".likelyPlatformTrial",
".likelyRelatedTrials",
".maybeRelatedTrials"
), drop = FALSE]
#### checks ####
stopifnot(inherits(df[[".likelyPlatformTrial"]], "logical"))
stopifnot(inherits(df[[".likelyRelatedTrials"]], "list"))
stopifnot(inherits(df[[".maybeRelatedTrials"]], "list"))
stopifnot(ncol(df) == 4L)
# return
return(df)
} # end f.likelyPlatformTrial
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.