R/rt_register_pmc.R


#' Identify mentions of registration on ClinicalTrials.gov
#'
#' Extract the index of mentions such as: "The study is registered at
#'     www.clinicaltrials.gov (NCT01624883)."
#'
#' @param article A string or a list of strings.
#' @return Index of element with phrase of interest
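#' @examples
#' \dontrun{
#' # Hypothetical usage sketch (strings are made up): the first element pairs
#' # a "regist..." term with an NCT number within a short window, so the call
#' # should return 1; the second element is ignored.
#' .which_ct_1(c(
#'   "The trial was prospectively registered as NCT01234567.",
#'   "Patients provided written informed consent."
#' ))
#' }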
.which_ct_1 <- function(article) {

  # Just using the NCT was too sensitive
  # e.g. picked up references to protocols, mentions of trials underway, etc.
  grep("\\b(|pre|pre-)regist.{0,20}NCT[0-9]{8}", article, perl = T)

}


#' Identify mentions of registration on ClinicalTrials.gov
#'
#' Extract the index of mentions such as: "The study (EudraCT 2011‐001925‐26;
#'     ClinicalTrial.gov NCT01489592) was approved by the Ethics Committee of
#'     Rennes University Hospital."
#'
#' @param article A string or a list of strings.
#' @param dict A list of regular expressions for each concept.
#' @return Index of element with phrase of interest
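#' @examples
#' \dontrun{
#' # Hypothetical sketch: dict is assumed to come from the internal
#' # .create_synonyms() helper, whose $txt element supplies the connector
#' # regex joining the two concepts.
#' dict <- .create_synonyms()
#' .which_ct_2("This study (ClinicalTrials.gov NCT01489592) was approved.", dict)
#' }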
.which_ct_2 <- function(article, dict) {

  the_study <- "(This|The|It|There) ([a-zA-Z0-9]+\\s){0,2}(randomi(z|s)ed|study|trial|was a|is a|were)"
  ct_nct <- "([Cc]linical[Tt]rial.* NCT[0-9]{8}|ISRCTN[0-9]{8}|ChiCTR-IOR-[0-9]{5})"

  c(the_study, ct_nct) %>%
    paste(collapse = dict$txt) %>%
    grep(article, perl = T)

}


#' Identify mentions of registration on ClinicalTrials.gov
#'
#' Extract the index of mentions such as: "Registered on ClinicalTrials.gov (NCT12345678)."
#'
#' @param article A string or a list of strings.
#' @param dict A list of regular expressions for each concept.
#' @return Index of element with phrase of interest
.which_ct_3 <- function(article, dict) {

  ct_nct <- c(
    "(^|\\.)[a-zA-Z0-9\\s,]+[Cc]linical[Tt]rials",
    "NCT[0-9]{8}"
  )

  ct_nct %>%
    paste(collapse = dict$txt) %>%
    grep(article, perl = T)
}


#' Identify mentions of registration on PROSPERO
#'
#' Extract the index of mentions such as: "We registered the protocol for this
#'     meta-analysis with the PROSPERO database (www.crd.york.ac.uk/prosper
#'     o)—registration no. CRD42014015595."
#'
#' @param article A string or a list of strings.
#' @return Index of element with phrase of interest
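#' @examples
#' \dontrun{
#' # Hypothetical usage sketch (string is made up): should return 1, since the
#' # sentence contains both "PROSPERO" and a CRD number.
#' .which_prospero_1(
#'   "The protocol was registered with PROSPERO (CRD42014015595)."
#' )
#' }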
.which_prospero_1 <- function(article) {

  # Just using the registration number was too sensitive
  # e.g. picked up references to protocols, mentions of trials underway, etc.
  grep("\\bP(?i)ROSPERO\\b(?-i).*CRD\\s*[0-9]{5}", article, perl = T)

}


#' Identify generic mentions of registration
#'
#' Extract the index of mentions such as: "This study was approved by the local
#'     Scientific and Ethics Committees of IRCCS \"Saverio de Bellis\",
#'     Castellana Grotte (Ba), Italy, and it was part of a registered research
#'     on https://www.clinicaltrials.gov, reg. number: NCT01244945."
#'
#' @param article A string or a list of strings.
#' @param dict A list of regular expressions for each concept.
#' @return Index of element with phrase of interest
.which_registered_1 <- function(article, dict) {

  words <- c("this", "research_lower_strict", "and", "registered_registration")
  # Too generic without research & registered, or if including registration

  this_research <-
    dict %>%
    magrittr::extract(words[c(1, 2)]) %>%
    lapply(.bound) %>%
    lapply(.encase) %>%
    paste(collapse = .max_words(" ", 5, space_first = F))

  this_research_registered <-
    dict %>%
    magrittr::extract(words[c(1, 2, 4)]) %>%
    lapply(.bound) %>%
    lapply(.encase) %>%
    paste(collapse = .max_words(" ", 5, space_first = F))

  and_registered <-
    dict %>%
    magrittr::extract(words[c(3:4)]) %>%
    lapply(.bound) %>%
    lapply(.encase) %>%
    paste(collapse = .max_words(" ", 5, space_first = F))

  research_and_registered <-
    dict %>%
    magrittr::extract(words[2]) %>%
    lapply(.bound) %>%
    lapply(.encase) %>%
    paste(and_registered, sep = dict$txt)

  this_research_and_registered <-
    this_research %>%
    paste(and_registered, sep = dict$txt)

  # c(this_research_registered, research_and_registered) %>%
  #   lapply(.encase) %>%
  #   .encase() %>%
  #   paste("([A-Z]{2}\\s*[0-9]{2}|[0-9]{5})", sep = dict$txt) %>%
  #   grep(article, perl = T)

  c(this_research_registered, this_research_and_registered) %>%
    lapply(.encase) %>%
    .encase() %>%
    paste("([A-Z]{2}\\s*[0-9]{2}|[0-9]{5})", sep = dict$txt) %>%
    grep(article, perl = T)

}


#' Identify generic mentions of registration
#'
#' Extract the index of mentions such as: " EPGP is registered with
#'     clinicaltrials.gov (NCT00552045)."
#'
#' @param article A string or a list of strings.
#' @param dict A list of regular expressions for each concept.
#' @return Index of element with phrase of interest
.which_registered_2 <- function(article, dict) {

  c("(^|\\.\\s*)(Ethical|Approval|(|The )[A-Z][A-Z]+)",
    "(registered|registration)",
    "([Tt]rial|[Ss]tudy)"
  ) %>%
    paste(collapse = dict$txt) %>%
    paste("([A-Z]{2}\\s*[0-9]{2}|[0-9]{5})", sep = dict$txt) %>%
    grep(article, perl = T)

}


#' Identify generic mentions of registration
#'
#' Extract the index of mentions such as: "registered with Clinical Trials
#'     (ChiCTR-IOR-14005438)"
#'
#' @param article A string or a list of strings.
#' @param dict A list of regular expressions for each concept.
#' @return Index of element with phrase of interest
.which_registered_3 <- function(article, dict) {

  c("([Tt]rial|[Pp]rotocol) (registered|registration) (with\\b|under\\b|on\\b|at\\b|as\\b)") %>%
    paste("([A-Z]{2}\\s*[0-9]{2}|[0-9]{5})", sep = dict$txt) %>%
    grep(article, perl = T)

}


#' Identify generic mentions of registration
#'
#' Extract the index of mentions such as: "This registered study on
#'     www.clinicaltrials.gov (NCT01375270) was approved by the Ethics
#'     Committee of the Capital Region of Denmark (H-3-2010-127), and all
#'     subjects provided informed written consent to participate."
#'
#' @param article A string or a list of strings.
#' @param dict A list of regular expressions for each concept.
#' @return Index of element with phrase of interest
.which_registered_4 <- function(article, dict) {

  words <- c("this", "registered", "research_lower_strict")
  # Too generic without research & registered, or if including registration

  dict$this <- append(dict$this, "\\b[Ww]e")

  dict %>%
    magrittr::extract(words) %>%
    lapply(.bound) %>%
    lapply(.encase) %>%
    paste(collapse = .max_words(" ", 5, space_first = F)) %>%
    paste("([A-Z]{2}\\s*[0-9]{2}|[0-9]{5})", sep = dict$txt) %>%
    grep(article, perl = T)

}


#' Identify generic mentions of registration
#'
#' Extract the index of mentions such as: "The Régression de l'Albuminurie dans
#'     la Néphropathie Drépanocytaire (RAND) study design was approved by the
#'     local ethics committee (Ref: DGRI CCTIRS MG/CP09.503, 9 July 2009) and
#'     registered at ClinicalTrials.gov (NCT01195818)."
#'
#' @param article A string or a list of strings.
#' @param dict A list of regular expressions for each concept.
#' @return Index of element with phrase of interest
.which_registered_5 <- function(article, dict) {

  words <- c("this", "research_strict", "and", "registered_registration")
  SOME <- "\\([a-zA-Z]+\\)"
  # Too generic without research & registered, or if including registration

  SOME_research <-
    dict %>%
    magrittr::extract(words[2]) %>%
    lapply(.bound) %>%
    lapply(.encase) %>%
    paste(SOME, .)

  the_SOME_research <-
    dict %>%
    magrittr::extract(words[1]) %>%
    lapply(.bound) %>%
    lapply(.encase) %>%
    paste(SOME_research, sep = dict$txt)

  the_SOME_research_registered <-
    dict %>%
    magrittr::extract(words[4]) %>%
    lapply(.bound) %>%
    lapply(.encase) %>%
    paste(the_SOME_research, ., sep = .max_words(" ", 5, space_first = F))

  and_registered <-
    dict %>%
    magrittr::extract(words[c(3:4)]) %>%
    lapply(.bound) %>%
    lapply(.encase) %>%
    paste(collapse = .max_words(" ", 5, space_first = F))

  the_SOME_research_and_registered <-
    the_SOME_research %>%
    paste(and_registered, sep = dict$txt)


  c(the_SOME_research_registered, the_SOME_research_and_registered) %>%
    lapply(.encase) %>%
    .encase() %>%
    paste("([A-Z]{2}\\s*[0-9]{2}|[0-9]{5})", sep = dict$txt) %>%
    grep(article, perl = T)

}


#' Identify mentions of lack of registration
#'
#' Extract the index of mentions such as: "This trial and its protocol were not
#'     registered on a publicly accessible registry."
#'
#' @param article A string or a list of strings.
#' @param dict A list of regular expressions for each concept.
#' @return Index of element with phrase of interest
.which_not_registered_1 <- function(article, dict) {

  words <- c("this", "research_lower_strict")

  dict %>%
    magrittr::extract(words) %>%
    lapply(.bound, location = "both") %>%
    lapply(.encase) %>%
    paste(collapse = .max_words(" ", n_max = 4, space_first = F)) %>%
    paste(" not", " registered", sep = .max_words("", n_max = 6)) %>%
    grep(article, perl = T)
}


#' Identify generic mentions of registration
#'
#' Extract the index of mentions such as: "Trial registration number is
#'     TCTR20151021001."
#'
#' @param article A string or a list of strings.
#' @param dict A list of regular expressions for each concept.
#' @return Index of element with phrase of interest
.which_registration_1 <- function(article, dict) {

  words <- c("research_strict", "registration")  # Too generic without these

  research_registration <-
    dict %>%
    magrittr::extract(words) %>%
    lapply(.bound) %>%
    lapply(.encase) %>%
    paste(collapse = " ") %>%
    paste0("(:|-+)")

  c(research_registration, "([A-Z]{2}\\s*[0-9]{2}|[0-9]{5})") %>%
    paste(collapse = dict$txt) %>%
    grep(article, perl = T)

}


#' Identify generic mentions of registration
#'
#' Extract the index of mentions such as: "Trial registration number is
#'     TCTR20151021001."
#'
#' @param article A string or a list of strings.
#' @param dict A list of regular expressions for each concept.
#' @return Index of element with phrase of interest
.which_registration_2 <- function(article, dict) {

  words <- c("Research_strict", "registration")

  research_registration <-
    dict %>%
    magrittr::extract(words) %>%
    lapply(.bound) %>%
    lapply(.encase) %>%
    paste(collapse = " ")

  c(research_registration, "([A-Z]{2}\\s*[0-9]{2}|[0-9]{5})") %>%
    paste(collapse = dict$txt) %>%
    grep(article, perl = T)

}


#' Identify generic mentions of registration
#'
#' Extract the index of mentions such as: "Public clinical trial registration
#'     (www.clinicaltrials.gov) ID# NCT02720653"
#'
#' @param article A string or a list of strings.
#' @return Index of element with phrase of interest
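#' @examples
#' \dontrun{
#' # Hypothetical usage sketch: should return 1, since "registration" and the
#' # NCT number fall within the allowed character windows.
#' .which_registration_3(
#'   "Public clinical trial registration (www.clinicaltrials.gov) ID# NCT02720653"
#' )
#' }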
.which_registration_3 <- function(article) {

  grep("(^|\\.).{0,35}\\b(|pre|pre-)registration.{0,35}NCT[0-9]{8}",
       article, perl = T)

}


#' Identify generic mentions of registration
#'
#' Extract the index of mentions such as: "The RAPiD trial's International
#'     Standard Randomised Controlled Trial Number (ISRCTN) registration is
#'     ISRCTN 49204710."
#'
#' @param article A string or a list of strings.
#' @param dict A list of regular expressions for each concept.
#' @return Index of element with phrase of interest
.which_registration_4 <- function(article, dict) {

  words <- c("This", "research_lower_strict", "registration")
  # Too generic without research & registered, or if including registration

  this_research <-
    dict %>%
    magrittr::extract(words[c(1, 2)]) %>%
    lapply(.bound) %>%
    lapply(.encase) %>%
    paste(collapse = .max_words(" ", 5, space_first = F))

  c(this_research) %>%
    lapply(.encase) %>%
    .encase() %>%
    paste("registration", "([A-Z]{2}\\s*[0-9]{2}|[0-9]{5})", sep = dict$txt) %>%
    grep(article, perl = T)

}


#' Identify mentions of registry
#'
#' Extract the index of mentions such as: "Here, we describe a collaboration
#'     between an international group of patient organisations advocating for
#'     patients with atypical haemolytic uraemic syndrome (aHUS), the aHUS
#'     Alliance, and an international aHUS patient registry (ClinicalTrials.gov
#'     NCT01522183)."
#'
#' @param article A string or a list of strings.
#' @param dict A list of regular expressions for each concept.
#' @return Index of element with phrase of interest
.which_registry_1 <- function(article, dict) {

  words <- c("research_lower_strict", "registry")  # Too generic without these

  research_registry <-
    dict %>%
    magrittr::extract(words) %>%
    lapply(.bound) %>%
    lapply(.encase) %>%
    paste(collapse = dict$txt)

  a <-
    c(research_registry, "[\\s,;:]+([A-Z]{2,10}\\s*[0-9]{2}|[0-9]{5})") %>%
    paste(collapse = dict$txt) %>%
    grep(article, perl = T)

  if (!!length(a)) {

    is_false <- .negate_registry_1(article[a], dict)
    a <- a[!is_false]

  }

  return(a)

}




#' Identify registration titles - sensitive with negation
#'
#' Extract the index of mentions such as: "Study registration: ..."
#'
#' @param article A string or a list of strings.
#' @param dict A list of regular expressions for each concept.
#' @return Index of element with phrase of interest
.which_reg_title_1 <- function(article, dict) {

  b <- integer()
  d <- integer()

  words <- c("registration_title")

  a <-
    dict %>%
    magrittr::extract(words) %>%
    lapply(.bound) %>%
    lapply(.title_strict) %>%
    lapply(.encase) %>%
    paste() %>%
    grep(article, perl = T)

  if (!!length(a)) {

    for (i in seq_along(a)) {

      if (is.na(article[a[i] + 1])) {

        b <- c(b, a[i])

      } else {

        if (nchar(article[a[i] + 1]) == 0) {
          b <- c(b, a[i], a[i] + 2)
        } else {
          b <- c(b, a[i], a[i] + 1)
        }
      }

      is_true <- any(.negate_reg_title_1(article[b]))
      if (is_true) d <- c(d, b)

    }
    if (!!length(d)) return(d)
  }

  a <-
    dict %>%
    magrittr::extract(words) %>%
    lapply(.bound) %>%
    lapply(.title_strict, within_text = T) %>%
    lapply(.encase) %>%
    paste() %>%
    grep(article, perl = T)

  if (!!length(a)) {

    is_true <- .negate_reg_title_1(article[a])
    a <- a[is_true]

  }

  return(a)
}


#' Identify registration titles - specific with no negation
#'
#' Extract the index of mentions such as: "Trial registration: ..."
#'
#' @param article A string or a list of strings.
#' @return Index of element with phrase of interest
.which_reg_title_2 <- function(article) {

  b <- integer()
  d <- integer()

  reg_title_synonyms <- c(
    "R(?i)egistration info(|rmation)(?-i)",
    "R(?i)egistration detail(|s)(?-i)",
    "R(?i)egistration no(|s)(|\\.)(?-i)",
    "R(?i)egistration number(|s)(?-i)",
    "R(?i)egistration identifier(|s)(?-i)",
    "T(?i)rial(|s) identifier(|s)(?-i)",
    "C(?i)linical trial(|s) identifier(|s)(?-i)",
    "T(?i)rial(|s) registration(?-i)",
    "C(?i)linical trial(|s) registration(?-i)",
    "S(?i)tudy registration(?-i)"
  )

  a <-
    reg_title_synonyms %>%
    lapply(.bound) %>%
    lapply(.title_strict) %>%
    .encase %>%
    grep(article, perl = T)

  if (!!length(a)) {

    for (i in seq_along(a)) {

      if (is.na(article[a[i] + 1])) {

        b <- c(b, a[i])

      } else {

        if (nchar(article[a[i] + 1]) == 0) {
          b <- c(b, a[i], a[i] + 2)
        } else {
          b <- c(b, a[i], a[i] + 1)
        }
      }

      is_true <- any(.negate_reg_title_1(article[b]))
      if (is_true) d <- c(d, b)

    }
    if (!!length(d)) return(d)
  }


  a <-
    reg_title_synonyms %>%
    lapply(.bound) %>%
    lapply(.title_strict, within_text = T) %>%
    .encase %>%
    grep(article, perl = T)

  return(a)
}


#' Identify registration titles - specific
#'
#' Extract the index of mentions such as: "Retrospective clinical trial
#'     registration:"
#'
#' @param article A string or a list of strings.
#' @param dict A list of regular expressions for each concept.
#' @return Index of element with phrase of interest
.which_reg_title_3 <- function(article, dict) {

  b <- integer()
  d <- integer()

  # Protocol only contributed to FPs due to protocol ethical approval
  registration_synonyms <- c(
    "Registration",
    "Clinical [Tt]rial",
    "Trial"
  )

  number_synonyms <- c(
    "registration",
    "no(|s)(|\\.)",
    "number(|s)",
    "#",
    "ID(|s)",
    "identifier(|s)",
    "information",
    "details"
  )

  start <-  "^.{0,1}[A-Z](?i)\\w+"
  registration <- registration_synonyms %>% .bound %>% .encase
  number <- number_synonyms %>% .bound %>% .encase
  finish <- "(| \\w+)(\\.|:|-+)(?-i)"

  a <-
    paste(start, registration, number, finish, sep = "\\s*") %>%
    paste0("$") %>%
    grep(article, perl = T)


  if (!!length(a)) {

    for (i in seq_along(a)) {

      if (is.na(article[a[i] + 1])) {

        b <- c(b, a[i])

      } else {

        if (nchar(article[a[i] + 1]) == 0) {
          b <- c(b, a[i], a[i] + 2)
        } else {
          b <- c(b, a[i], a[i] + 1)
        }
      }

      is_true <- any(.negate_reg_title_1(article[b]))
      if (is_true) d <- c(d, b)

    }
    if (!!length(d)) return(d)
  }


  a <-
    paste(start, registration, number, finish, sep = "\\s*") %>%
    grep(article, perl = T)

  if (!!length(a)) {

    is_true <- .negate_reg_title_1(article[a])
    a <- a[is_true]

  }

  return(a)
}


#' Identify registration titles - specific
#'
#' Extract the index of mentions such as: "Clinical trial registration detail:"
#'
#' @param article A string or a list of strings.
#' @param dict A list of regular expressions for each concept.
#' @return Index of element with phrase of interest
.which_reg_title_4 <- function(article, dict) {

  b <- integer()
  d <- integer()

  # Protocol only contributed to FPs due to protocol ethical approval
  registration_synonyms <- c(
    "R(?i)egistration",
    "C(?i)linical [Tt]rial",
    "T(?i)rial"
  )

  number_synonyms <- c(
    "registration",
    "no(|s)(|\\.)",
    "number(|s)",
    "#",
    "ID(|s)",
    "identifier(|s)",
    "information",
    "details"
  )

  start <-  "^.{0,1}"
  registration <- registration_synonyms %>% .bound %>% .encase
  number <- number_synonyms %>% .bound %>% .encase
  finish <- "(| \\w+)(\\.|:|-+)(?-i)"

  a <-
    paste(start, registration, number, finish, sep = "\\s*") %>%
    paste0("$") %>%
    grep(article, perl = T)

  if (!!length(a)) {

    for (i in seq_along(a)) {

      if (is.na(article[a[i] + 1])) {

        b <- c(b, a[i])

      } else {

        if (nchar(article[a[i] + 1]) == 0) {
          b <- c(b, a[i], a[i] + 2)
        } else {
          b <- c(b, a[i], a[i] + 1)
        }
      }

      is_true <- any(.negate_reg_title_1(article[b]))
      if (is_true) d <- c(d, b)

    }
    if (!!length(d)) return(d)
  }

  a <-
    paste(start, registration, number, finish, sep = "\\s*") %>%
    grep(article, perl = T)

  if (!!length(a)) {

    is_true <- .negate_reg_title_1(article[a])
    a <- a[is_true]

  }

  return(a)
}


#' Identify registration titles - specific with no negation
#'
#' Extract the index of mentions such as: "Trial registration: ..."
#'
#' @return Index of element with phrase of interest
.create_register_pmc_title <- function() {

  title_starts <- c(
    "R(?i)egistration",
    "T(?i)rial",
    "C(?i)linical",
    "(?i)Systematic",
    "M(?i)eta-analysis"
  )

  title_ends <- c(
    "registration",
    "number",
    "\\bid(entifier)",
    "\\bno(|s)",
    "detail(|s)",
    "info(|rmation)"
  )

  list(title_starts, title_ends) %>%
    purrr::map(.encase) %>%
    paste(collapse = "(?i).{0,20}") %>%
    paste(".{0,4}", ., ".{0,1}", sep = "")
}


#' Identify mentions of protocol
#'
#' Extract the index of mentions such as: "The complete study protocol has been
#'     published previously (Supplement 1)"
#'
#' @param article A string or a list of strings.
#' @param dict A list of regular expressions for each concept.
#' @return Index of element with phrase of interest
.which_protocol_1 <- function(article, dict) {

  words <- c("study protocol", "published", "previously")

  dict %>%
    magrittr::extract(words) %>%
    lapply(.bound) %>%
    lapply(.encase) %>%
    paste(collapse = dict$txt) %>%
    grep(article, perl = T)

}


#' Identify mentions of protocol
#'
#' Extract the index of mentions such as: "Alliance for Clinical Trials in
#'     Oncology (formerly Cancer and Leukemia Group B) Protocol #369901"
#'
#' @param article A string or a list of strings.
#' @return Index of element with phrase of interest
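#' @examples
#' \dontrun{
#' # Hypothetical usage sketch: should return 1, since "Protocol" is followed
#' # within a few characters by a number of at least five digits.
#' .which_protocol_2("Cancer and Leukemia Group B Protocol #369901")
#' }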
.which_protocol_2 <- function(article) {

  grep("[Pp]rotocol .{0,5}(|[A-Z]+)[0-9]{5}", article, perl = T)

}


#' Identify mentions of funding followed by NCT
#'
#' Extract the index of mentions such as: "Funded by: the National Heart, Lung,
#'     and Blood Institute, the National Institute of Diabetes and Digestive and
#'     Kidney Disease, and others; SPECS ClinicalTrials.gov number, NCT00443599;
#'     Nutrition and Obesity Center at Harvard; NIH 5P30DK040561-17"
#'
#' @param article A string or a list of strings.
#' @param dict A list of regular expressions for each concept.
#' @return Index of element with phrase of interest
.which_funded_ct_1 <- function(article, dict) {


  # Anything more general contributed more false than true matches
  funded_by_SOME_ct_NCT <- paste0(
    "[Ff]unded by",
    dict$txt,
    "[A-Z]{2,7}.{0,20}[Cc]linical[Tt]rial.{0,20}NCT[0-9]{8}"
  )

  grep(funded_by_SOME_ct_NCT, article, perl = T)

}


#' Negate unwanted mentions of registry
#'
#' Negate mentions of registry such as: "Ethics approval and consent to
#'     participate This study was approved by the Institutional Review Board of
#'     Chang Gung Memorial Hospital under registry number 201601023B0."
#'
#' @param article A string or a list of strings.
#' @param dict A list of regular expressions for each concept.
#' @return Logical vector; TRUE for elements matching an unwanted registry mention
.negate_registry_1 <- function(article, dict) {

  research <- dict$research %>% .bound %>% .encase
  registry <- dict$registry %>% .bound %>% .encase
  code <- "[\\s,;:]+([A-Z]{2,10}\\s*[0-9]{2}|[0-9]{5})"

  paste(research, "approv", registry, code, sep = dict$txt) %>%
    grepl(article, perl = T)

}


#' Negate unwanted mentions of title
#'
#' Negate title mentions that refer to unwanted text.
#'
#' @param article A string or a list of strings.
#' @return Logical vector; TRUE for elements containing an identifier-like pattern
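#' @examples
#' \dontrun{
#' # Hypothetical usage sketch: should return c(TRUE, FALSE), as only the
#' # first element contains an identifier-like pattern.
#' .negate_reg_title_1(c("Trial registration: NCT01234567", "Not applicable"))
#' }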
.negate_reg_title_1 <- function(article) {

  article %>% stringr::str_detect("[A-Z]{2}\\s*[0-9]{2}|[0-9]{5}")

}


#' Negate titles that mention that there was no registration
#'
#' Negate mentions such as "Clinical Trial Registration: N/A"
#'
#' @param article A string or a list of strings.
#' @return Logical vector; TRUE for elements indicating that no registration exists
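#' @examples
#' \dontrun{
#' # Hypothetical usage sketch: should return c(TRUE, FALSE), as only the
#' # first element states that no registration exists.
#' .negate_reg_title_2(c(
#'   "Clinical Trial Registration: N/A",
#'   "Trial registration: NCT01234567"
#' ))
#' }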
.negate_reg_title_2 <- function(article) {

  article %>% stringr::str_detect("\\bNA\\b|\\bN/A\\b|not registered")

}


#' Remove mentions of previously reported registered studies
#'
#' Removes mentions such as: "An active observational cohort study was
#'     conducted as previously reported (ClinicalTrials.gov identifier
#'     NCT01280162) [16]."
#'
#' @param article A List with paragraphs of interest.
#' @return The list of paragraphs without references to previously reported registered studies.
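#' @examples
#' \dontrun{
#' # Hypothetical sketch, assuming upstream pre-processing has already
#' # replaced bracketed citations with the "REFFF" placeholder; the NCT
#' # number and trailing citation should be stripped from the paragraph.
#' .obliterate_refs_2(
#'   "As previously reported (ClinicalTrials.gov identifier NCT01280162 [REFFF])."
#' )
#' }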
.obliterate_refs_2 <- function(article) {

  # Good for finding, but not for substituting b/c it's a lookahead
  # words <- c(
  #   # positive lookahead makes these phrases interchangeable
  #   "(?=[a-zA-Z0-9\\s,()-]*(financial|support))",
  #   "(?=[a-zA-Z0-9\\s,()-]*(conflict|competing))"
  # )

  article %>%
    stringr::str_replace_all("NCT[0-9]{8}.{3}REFFF(|\\)|\\])", "") %>%
    stringr::str_replace_all("NCT[0-9]{8}[a-zA-Z0-9\\s,()\\[\\]/:-]*REFFF(|\\)|\\])", "")

  # TODO: This to be inserted only for .which_ct_2!

}


#' Remove references
#'
#' Removes the references section: from the start of the references onwards,
#'     paragraphs that begin with a numeric label (e.g. "12.") or contain
#'     "et al." are blanked.
#'
#' @param article A List with paragraphs of interest.
#' @return The list of paragraphs with the references removed.
.obliterate_references_1 <- function(article) {

  # If within References or under references and starts with 1. or contains et al. then remove.

  ref_from <- .where_refs_txt(article)

  if (!!length(ref_from)) {

    ref_to <- length(article)

    article[ref_from] <- ""
    article[ref_from:ref_to] <-
      gsub("^([0-9]{1,3}\\.\\s|.*et al\\.).*$", "",
           article[ref_from:ref_to], perl = T)

  }
  return(article)
}


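#' Extract a registration title statement from the PMC XML
#'
#' Searches title-like elements (title, bold, italic) in the back-matter
#'     footnotes, the abstract and the front-matter footnotes, in that order,
#'     for a registration-style heading and, if one is found, returns the text
#'     of the matching element's parent node.
#'
#' @param article_xml An xml2 document of the PMC XML.
#' @return A string with the registration statement, or an empty string if
#'     none was found.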
.get_register_pmc_title <- function(article_xml) {

  b <- ""
  title_regex <- .create_register_pmc_title()

  # If I had not stripped the d1 namespace:
  # "back//fn-group//*[self::d1:title or self::d1:bold or self::d1:italic]"
  back_footnote <-
    article_xml %>%
    xml2::xml_find_all("back//fn-group//*[self::title or self::bold or self::italic]")

  a <-
    back_footnote %>%
    xml2::xml_text() %>%
    stringr::str_which(title_regex)

  if (!!length(a)) {

    b <-
      back_footnote %>%
      magrittr::extract(a) %>%
      xml2::xml_parent() %>%
      xml2::xml_contents() %>%
      xml2::xml_text() %>%
      paste(collapse = " ")

    return(b)
  }


  abstract <-
    article_xml %>%
    xml2::xml_find_all("front//abstract//*[self::title or self::bold or self::italic]")

  a <-
    abstract %>%
    xml2::xml_text() %>%
    stringr::str_which(title_regex)

  if (!!length(a)) {

    b <-
      abstract %>%
      magrittr::extract(a) %>%
      xml2::xml_parent() %>%
      xml2::xml_contents() %>%
      xml2::xml_text() %>%
      paste(collapse = " ")

    return(b)
  }


  front_footnote <-
    article_xml %>%
    xml2::xml_find_all("front/article-meta//fn//*[self::title or self::bold or self::italic]")

  a <-
    front_footnote %>%
    xml2::xml_text() %>%
    stringr::str_which(title_regex)

  if (!!length(a)) {

    b <-
      front_footnote %>%
      magrittr::extract(a) %>%
      xml2::xml_parent() %>%
      xml2::xml_contents() %>%
      xml2::xml_text() %>%
      paste(collapse = " ")

    return(b)
  }

  return(b)
}



#' Identify and extract Registration statements in PMC XML files.
#'
#' Takes a PMC XML file as a list of strings and returns data related to the
#'     presence of a Registration statement, including whether such a statement
#'     exists. If a Registration statement exists, it extracts it. This is a
#'     modified version of the `rt_register_pmc` designed for integration with
#'     `rt_all_pmc`.
#'
#' @param article_ls A PMC XML as a list of strings.
#' @param pmc_reg_ls A list of results from the `.get_register_pmc` function.
#' @param dict A list of regular expressions for each concept.
#' @return A list of results.
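#' @examples
#' \dontrun{
#' # Hypothetical sketch: article_ls is assumed to be a named list of
#' # character vectors (e.g. "abstract", "methods", "footnotes", "ack"),
#' # pmc_reg_ls the output of .get_register_pmc() and dict the output of
#' # .create_synonyms().
#' .rt_register_pmc(article_ls, pmc_reg_ls, dict)
#' }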
.rt_register_pmc <- function(article_ls, pmc_reg_ls, dict) {

  # TODO Update to match format of rt_coi_pmc.

  # Creating and outputting these lists adds negligible time
  # Way faster than index_any[["reg_title_pmc"]] <- NA
  index_any <- list(
    prospero_1 = NA,
    registered_1 = NA,
    registered_2 = NA,
    registered_3 = NA,
    registered_4 = NA,
    registered_5 = NA,
    not_registered_1 = NA,
    registration_1 = NA,
    registration_2 = NA,
    registration_3 = NA,
    registration_4 = NA,
    registry_1  = NA,
    reg_title_1 = NA,
    reg_title_2 = NA,
    reg_title_3 = NA,
    reg_title_4 = NA,
    funded_ct_1 = NA
  )

  index_method <- list(
    ct_2 = NA,
    ct_3 = NA,
    protocol_1 = NA
  )

  out <- list(
    is_relevant_reg = NA,
    is_method = NA,
    is_NCT = NA,
    is_register_pred = FALSE,
    register_text = "",
    is_explicit_reg = NA
  )


  if (!pmc_reg_ls$is_research & !pmc_reg_ls$is_review) {

    return(c(out, index_any, index_method))

  }


  if (pmc_reg_ls$is_register_pred) {

    out$is_relevant_reg <- TRUE
    out$is_explicit_reg <- TRUE
    out$is_register_pred <- TRUE
    out$register_text <- pmc_reg_ls$register_text

    return(c(out, index_any, index_method))
  }



  # TODO Consider adding unique
  article <-
    article_ls[c("ack", "methods", "abstract", "footnotes")] %>%
    unlist()
  # unique()

  methods <- unlist(article_ls$methods)

  reg_regex <- "\\b(|-)([Rr]egist|(|[Cc]linical)[Tt]rial|NCT[0-9]{8}|ISRCTN|ChiCTR|PROSPERO)"
  article %<>% purrr::keep(stringr::str_detect, pattern = reg_regex)


  out$is_relevant_reg <- !!length(article)

  # Check for relevance
  if (!out$is_relevant_reg) {

    return(c(out, index_any, index_method))

  }

  # Check for methods
  out$is_method <- !!length(methods)

  if (!out$is_method) {

    return(c(out, index_any, index_method))

  }


  # Text pre-processing
  article_processed <-
    article %>%
    iconv(from = 'UTF-8', to = 'ASCII//TRANSLIT', sub = "") %>%   # keep first
    trimws() %>%
    .obliterate_fullstop_1() %>%
    .obliterate_semicolon_1() %>%  # adds minimal overhead
    .obliterate_comma_1() %>%   # adds minimal overhead
    .obliterate_apostrophe_1() %>%
    .obliterate_punct_1() %>%
    .obliterate_line_break_1() %>%
    .obliterate_refs_2()


  # Way faster than: out[["is_NCT"]] <- ...
  out$is_NCT <- purrr::some(article, stringr::str_detect, "NCT[0-9]{8}")

  index_any$prospero_1 <- .which_prospero_1(article_processed)
  index_any$registered_1 <- .which_registered_1(article_processed, dict)
  index_any$registered_2 <- .which_registered_2(article_processed, dict)
  index_any$registered_3 <- .which_registered_3(article_processed, dict)
  index_any$registered_4 <- .which_registered_4(article_processed, dict)
  index_any$registered_5 <- .which_registered_5(article_processed, dict)
  index_any$not_registered_1 <- .which_not_registered_1(article_processed, dict)
  index_any$registration_1 <- .which_registration_1(article_processed, dict)
  index_any$registration_2 <- .which_registration_2(article_processed, dict)
  index_any$registration_3 <- .which_registration_3(article_processed)
  index_any$registration_4 <- .which_registration_4(article_processed, dict)
  index_any$registry_1 <- .which_registry_1(article_processed, dict)
  index_any$reg_title_1 <- .which_reg_title_1(article_processed, dict)
  index_any$reg_title_2 <- .which_reg_title_2(article_processed)
  index_any$reg_title_3 <- .which_reg_title_3(article_processed, dict)
  index_any$reg_title_4 <- .which_reg_title_4(article_processed, dict)
  index_any$funded_ct_1 <- .which_funded_ct_1(article_processed, dict)
  index <- unlist(index_any) %>% unique() %>% sort()


  if (!!length(index)) {

    out$is_explicit_reg <- !!length(unlist(index_any))
    out$is_register_pred <- !!length(index)
    out$register_text <- article[index] %>% paste(collapse = " ")

    index_any %<>% purrr::map(function(x) !!length(x))

    return(c(out, index_any, index_method))
  }


  # Apply a more sensitive search in Methods
  if (out$is_method) {

    # x30 faster than obliterating methods again
    # article_processed %<>% purrr::keep(article %in% methods)
    i <- which(article %in% methods)

    # methods %<>%
    #   .obliterate_fullstop_1() %>%
    #   .obliterate_semicolon_1() %>%  # adds minimal overhead
    #   .obliterate_comma_1() %>%   # adds minimal overhead
    #   .obliterate_apostrophe_1() %>%
    #   .obliterate_hash_1() %>%
    #   .obliterate_backlash_1() %>%
    #   .obliterate_line_break_1()
    #
    # article %<>% purrr::keep(magrittr::is_in, methods)

    index_method$ct_2 <- .which_ct_2(article_processed[i], dict)
    index_method$ct_3 <- .which_ct_3(article_processed[i], dict)
    index_method$protocol_1 <- .which_protocol_1(article_processed[i], dict)

    index <- i[unlist(index_method) %>% unique() %>% sort()]
    index_method %<>% purrr::map(function(x) !!length(x))
  }

  out$is_register_pred <- !!length(index)
  out$register_text <- article[index] %>% paste(collapse = " ")

  index_any %<>% purrr::map(function(x) !!length(x))

  if (out$is_register_pred) {

    out$is_explicit_reg <- FALSE

  }

  return(c(out, index_any, index_method))
}



#' Identify and extract Registration statements in PMC XML files.
#'
#' Takes a PMC XML file and returns data related to the
#'     presence of a Registration statement, including whether a Registration
#'     statement exists. If a Registration statement exists, it extracts it.
#'
#' @param filename The name of the PMC XML as a string.
#' @param remove_ns TRUE if an XML namespace exists, else FALSE (default).
#' @return A dataframe of results. It returns the unique article identifiers,
#'     whether this article was deemed a research article, review or systematic review,
#'     whether the text was deemed relevant to registration (e.g. contained the
#'     word registration), whether a Methods section was identified, whether an
#'     NCT number was identified, whether a registration was explicitly
#'     identified (defunct), whether a registration statement was found, what
#'     the registration statement was, whether the registration was
#'     identified from the PMC XML (i.e. it was found within a dedicated
#'     registration tag) and whether each labeling function identified a
#'     relevant text or not. The labeling functions are returned to
#'     add flexibility in how this package is used; for example, future
#'     definitions of Registration may differ from the one we used.
#' @examples
#' \dontrun{
#' # Path to PMC XML.
#' filepath <- "../inst/extdata/00003-PMID26637448-PMC4737611.xml"
#'
#' # Identify and extract meta-data and indicators of transparency.
#' results_table <- rt_register_pmc(filepath, remove_ns = T)
#' }
#' @export
rt_register_pmc <- function(filename, remove_ns = F) {

  # TODO Update to match format of rt_coi_pmc.

  xpath <- c(
    "front/article-meta/article-id[@pub-id-type = 'pmid']",
    "front/article-meta/article-id[@pub-id-type = 'pmc']",
    "front/article-meta/article-id[@pub-id-type = 'pmc-uid']",
    "front/article-meta/article-id[@pub-id-type = 'doi']"
  )

  var_names <- c(
    "pmid",
    "pmcid_pmc",
    "pmcid_uid",
    "doi"
  )

  # Creating and outputting these lists adds negligible time
  # Way faster than index_any[["reg_title_pmc"]] <- NA
  index_any <- list(
    reg_title_pmc = NA,
    prospero_1 = NA,
    registered_1 = NA,
    registered_2 = NA,
    registered_3 = NA,
    registered_4 = NA,
    registered_5 = NA,
    not_registered_1 = NA,
    registration_1 = NA,
    registration_2 = NA,
    registration_3 = NA,
    registration_4 = NA,
    registry_1  = NA,
    reg_title_1 = NA,
    reg_title_2 = NA,
    reg_title_3 = NA,
    reg_title_4 = NA,
    funded_ct_1 = NA
  )

  index_method <- list(
    ct_2 = NA,
    ct_3 = NA,
    protocol_1 = NA
  )

  out <- list(  # do not change order of pmid:doi - id extraction depends on it
    pmid = NA,
    pmcid_pmc = NA,
    pmcid_uid = NA,
    doi = NA,
    is_research = NA,
    is_review = NA,
    is_slr = FALSE,
    is_relevant = NA,
    is_method = NA,
    is_NCT = NA,
    is_register_pred = FALSE,
    register_text = "",
    is_explicit = NA
  )


  # A lot of the PMC XML files are malformed
  article_xml <- tryCatch(.get_xml(filename, remove_ns), error = function(e) e)

  if (inherits(article_xml, "error")) {

    return(tibble::tibble(filename, is_success = F))

  }


  # Extract IDs
  out %<>% purrr::list_modify(!!!purrr::map(xpath, ~ .get_text(article_xml, .x, T)))
  # out <-
  #   xpath %>%
  #   lapply(.get_text, article_xml = article_xml, find_first = T) %>%
  #   {purrr::list_modify(out, !!!.)}


  # Check for type
  # Definitions at PMC -> Tagging Guidelines -> Document Objects
  research_types <- c(
    "research-article",
    "protocol",
    "letter",
    "brief-report",
    "data-paper"
  )

  review_types <- c(
    "review-article",  # SLRs can also be labelled as review-article...
    "systematic-review"
  )

  type <- article_xml %>% xml2::xml_attr("article-type")

  out$is_research <- magrittr::is_in(type, research_types)
  out$is_review <- magrittr::is_in(type, review_types)

  if (!out$is_research & !out$is_review) {

    return(tibble::as_tibble(c(out, index_any, index_method)))

  }

  # Check for SLR (code is marginally faster without this)
  # if (out$is_review) {
  #
  #   txt <- xml_text(article_xml)
  #   out$is_slr <- grepl("systematic review", txt, ignore.case = T)
  #
  #   if (!out$is_slr) {
  #
  #     return(tibble::as_tibble(c(out, index_any, index_method)))
  #
  #   }
  # }


  # Go through titles
  title_txt <- .get_register_pmc_title(article_xml)
  is_title <- nchar(title_txt) > 0

  if (is_title) {

    index_any$reg_title_pmc <- TRUE
    out$is_relevant <- TRUE
    out$is_explicit <- TRUE
    out$is_register_pred <- TRUE
    out$register_text <- title_txt

    return(tibble::as_tibble(c(out, index_any, index_method)))

  }


  # Check for relevance
  # txt <- xml_text(article_xml)
  # rel_regex <- "\\b(|-)([Rr]egist|(|[Cc]linical)[Tt]rial|NCT[0-9]{8}|PROSPERO)"
  # out$is_relevant <- grepl(rel_regex, txt)
  #
  # if (!out$is_relevant) {
  #
  #   return(tibble::as_tibble(c(out, index_any, index_method)))
  #
  # }


  # Check for methods (10x more costly than xml_text(article_xml))
  # methods <- .xml_methods(article_xml)
  # out$is_method <- !!length(methods)
  #
  # if (!out$is_method) {
  #
  #   return(tibble::as_tibble(c(out, index_any, index_method)))
  #
  # }


  # Extract article text into a list
  # article[["ack"]] <- .xml_ack(article_xml)
  # article[["methods"]] <- .xml_methods(article_xml)
  # article[["abstract"]] <- .xml_abstract(article_xml)
  # article[["footnotes"]] <- .xml_footnotes(article_xml)


  # Extract article text into a vector
  ack <- .xml_ack(article_xml)
  methods <- .xml_methods(article_xml)
  abstract <- .xml_abstract(article_xml)
  footnotes <- .xml_footnotes(article_xml)
  article <- c(abstract, methods, footnotes, ack)

  # Can use xpath, but x2 slower (8 vs 4ms) and cannot get refs
  # article_xml %>%
  #     xml_find_all("//text()[contains(translate(., 'REGIST', 'regist'), \
  #                  'regist') or contains(translate(., 'TRIAL', 'trial'),\
  #                  'trial') or contains(., 'NCT') or contains(., 'PROSPERO')\
  #                  ]") %>%
  #     xml_text()

  # Can use article as list (negligible change in performance)
  # article <-
  #   list(abstract, methods, footnotes, ack) %>%
  #   purrr::compact() %>%
  #   purrr::map(~ keep(.x, ~ grepl(relevant_regex, .x))) %>%
  #   purrr::compact()

  # Adding PROSPERO adds negligible overhead
  rel_regex <- "\\b(|-)([Rr]egist|(|[Cc]linical)[Tt]rial|NCT[0-9]{8}|ISRCTN|ChiCTR|PROSPERO)"
  article %<>% purrr::keep(stringr::str_detect, pattern = rel_regex)


  out$is_relevant <- !!length(article)

  # Check for relevance
  if (!out$is_relevant) {

    return(tibble::as_tibble(c(out, index_any, index_method)))

  }

  # Check for methods
  out$is_method <- !!length(methods)

  if (!out$is_method) {

    return(tibble::as_tibble(c(out, index_any, index_method)))

  }

  # Activate if I want to check for title here.
  # if (!!length(index_any$reg_title_pmc)) {
  #
  #   i <- index_any$reg_title_pmc[2]
  #   is_true <- .negate_reg_title_1(article[i])
  #
  #   if (is_true) {
  #
  #   index <- unlist(index_any)
  #   out[["is_register_pred"]] <- TRUE
  #   out[["register_text"]] <- article[index] %>% paste(collapse = " ")
  #
  #   return(tibble::as_tibble(c(out, index_any, index_method)))
  #   }
  # }


  # Text pre-processing
  # .xml_preprocess(article_xml)  # 5x faster to obliterate within each section
  article_processed <-
    article %>%
    iconv(from = 'UTF-8', to = 'ASCII//TRANSLIT', sub = "") %>%   # keep first
    .obliterate_fullstop_1() %>%
    .obliterate_semicolon_1() %>%  # adds minimal overhead
    .obliterate_comma_1() %>%   # adds minimal overhead
    .obliterate_apostrophe_1() %>%
    .obliterate_punct_1() %>%
    .obliterate_line_break_1() %>%
    .obliterate_refs_2()

  # Way faster than: out[["is_NCT"]] <- ...
  out$is_NCT <- purrr::some(article, stringr::str_detect, "NCT[0-9]{8}")

  dict <- .create_synonyms()
  index_any$reg_title_pmc <- integer()
  index_any$prospero_1 <- .which_prospero_1(article_processed)
  index_any$registered_1 <- .which_registered_1(article_processed, dict)
  index_any$registered_2 <- .which_registered_2(article_processed, dict)
  index_any$registered_3 <- .which_registered_3(article_processed, dict)
  index_any$registered_4 <- .which_registered_4(article_processed, dict)
  index_any$registered_5 <- .which_registered_5(article_processed, dict)
  index_any$not_registered_1 <- .which_not_registered_1(article_processed, dict)
  index_any$registration_1 <- .which_registration_1(article_processed, dict)
  index_any$registration_2 <- .which_registration_2(article_processed, dict)
  index_any$registration_3 <- .which_registration_3(article_processed)
  index_any$registration_4 <- .which_registration_4(article_processed, dict)
  index_any$registry_1 <- .which_registry_1(article_processed, dict)
  index_any$reg_title_1 <- .which_reg_title_1(article_processed, dict)
  index_any$reg_title_2 <- .which_reg_title_2(article_processed)
  index_any$reg_title_3 <- .which_reg_title_3(article_processed, dict)
  index_any$reg_title_4 <- .which_reg_title_4(article_processed, dict)
  index_any$funded_ct_1 <- .which_funded_ct_1(article_processed, dict)
  index <- unlist(index_any) %>% unique() %>% sort()

  # Tidier but takes a median 11.0 ms vs current, which takes 10.6 ms
#   index_any <- list(
#     prospero_1 = NA,
#     registered_1 = NA,
#     registered_2 = NA,
#     registered_3 = NA,
#     registered_4 = NA,
#     registered_5 = NA,
#     not_registered_1 = NA,
#     registration_1 = NA,
#     registration_2 = NA,
#     registration_3 = NA,
#     registration_4 = NA,
#     registry_1  = NA,
#     reg_title_1 = NA,
#     reg_title_2 = NA,
#     reg_title_3 = NA,
#     reg_title_4 = NA,
#     funded_ct_1 = NA
#   )
#
#   func <- list(
#     .which_prospero_1,
#     .which_registered_1,
#     .which_registered_2,
#     .which_registered_3,
#     .which_registered_4,
#     .which_registered_5,
#     .which_not_registered_1,
#     .which_registration_1,
#     .which_registration_2,
#     .which_registration_3,
#     .which_registration_4,
#     .which_registry_1,
#     .which_reg_title_1,
#     .which_reg_title_2,
#     .which_reg_title_3,
#     .which_reg_title_4,
#     .which_funded_ct_1
#   )
#
#   pepa <- article %>% purrr::invoke_map(func, .)
#   index_any %<>% list_modify(!!!pepa)
# }


  if (!!length(index)) {

    out$is_explicit <- !!length(unlist(index_any))
    out$is_register_pred <- !!length(index)
    out$register_text <- article[index] %>% paste(collapse = " ")

    index_any %<>% purrr::map(function(x) !!length(x))

    return(tibble::as_tibble(c(out, index_any, index_method)))
  }


  # Apply a more sensitive search in Methods
  if (out$is_method) {

    # x30 faster than obliterating methods again
    # article_processed %<>% purrr::keep(article %in% methods)
    i <- which(article %in% methods)

    # methods %<>%
    #   .obliterate_fullstop_1() %>%
    #   .obliterate_semicolon_1() %>%  # adds minimal overhead
    #   .obliterate_comma_1() %>%   # adds minimal overhead
    #   .obliterate_apostrophe_1() %>%
    #   .obliterate_hash_1() %>%
    #   .obliterate_backlash_1() %>%
    #   .obliterate_line_break_1()
    #
    # article %<>% purrr::keep(magrittr::is_in, methods)

    index_method$ct_2 <- .which_ct_2(article_processed[i], dict)
    index_method$ct_3 <- .which_ct_3(article_processed[i], dict)
    index_method$protocol_1 <- .which_protocol_1(article_processed[i], dict)

    index <- i[unlist(index_method) %>% unique() %>% sort()]
    index_method %<>% purrr::map(function(x) !!length(x))
  }

  out$is_register_pred <- !!length(index)
  out$register_text <- article[index] %>% paste(collapse = " ")

  index_any %<>% purrr::map(function(x) !!length(x))

  if (out$is_register_pred) {

    out$is_explicit <- FALSE

  }

  return(tibble::as_tibble(c(out, index_any, index_method)))
}