R/rt_register_pmc.R

Defines functions .which_registered_5 .which_registered_4 .which_registered_3 .which_registered_2 .which_registered_1 .which_prospero_2 .which_prospero_1 .which_ct_3 .which_ct_2 .which_ct_1

#' Identify mentions of registration on ClinicalTrials.gov
#'
#' Extract the index of mentions such as: "The study is registered at
#'     www.clinicaltrials.gov (NCT01624883)."
#'
#' @param article A string or a list of strings.
#' @return Index of element with phrase of interest
#' @noRd
.which_ct_1 <- function(article) {

  # Just using the NCT was too sensitive
  # e.g. picked up references to protocols, mentions of trials underway, etc.
  grep("\\b(|pre|pre-)regist.{0,20}NCT[0-9]{8}", article, perl = TRUE)

}


#' Identify mentions of registration on ClinicalTrials.gov
#'
#' Extract the index of mentions such as: "The study (EudraCT 2011‐001925‐26;
#'     ClinicalTrial.gov NCT01489592) was approved by the Ethics Committee of
#'     Rennes University Hospital."
#'
#' @param article A string or a list of strings.
#' @param dict A list of regular expressions for each concept.
#' @return Index of element with phrase of interest
#' @noRd
.which_ct_2 <- function(article, dict) {

  the_study <- "(This|The|It|There) ([a-zA-Z0-9]+\\s){0,2}(randomi(z|s)ed|study|trial|was a|is a|were)"
  ct_nct <- "([Cc]linical[Tt]rial.* NCT[0-9]{8}|ISRCTN[0-9]{8}|ChiCTR-IOR-[0-9]{5})"

  c(the_study, ct_nct) %>%
    paste(collapse = dict$txt) %>%
    grep(article, perl = TRUE)

}


#' Identify mentions of registration on ClinicalTrials.gov
#'
#' Extract the index of mentions such as: "Registered on ClinicalTrials.gov (NCT12345678)."
#'
#' @param article A string or a list of strings.
#' @param dict A list of regular expressions for each concept.
#' @return Index of element with phrase of interest
#' @noRd
.which_ct_3 <- function(article, dict) {

  ct_nct <- c(
    "(^|\\.)[a-zA-Z0-9\\s,]+[Cc]linical[Tt]rials",
    "NCT[0-9]{8}"
  )

  ct_nct %>%
    paste(collapse = dict$txt) %>%
    grep(article, perl = TRUE)
}


#' Identify mentions of registration on PROSPERO
#'
#' Extract the index of mentions such as: "We registered the protocol for this
#'     meta-analysis with the PROSPERO database (www.crd.york.ac.uk/prosper
#'     o)—registration no. CRD42014015595."
#'
#' @param article A string or a list of strings.
#' @return Index of element with phrase of interest
#' @noRd
.which_prospero_1 <- function(article) {

  # Just using the NCT was too sensitive
  # e.g. picked up references to protocols, mentions of trials underway, etc.
  grep("\\bP(?i)ROSPERO\\b(?-i).*CRD\\s*[0-9]{5,11}", article, perl = TRUE)

}


#' Identify PROSPERO registrations that do not quote the CRD number
#'
#' Some reviews report a PROSPERO registration without the CRD identifier, for
#'     example "the review protocol was registered in PROSPERO" or "Registered to
#'     PROSPERO". The past-tense verb "registered" is required next to PROSPERO,
#'     so the registry's own name ("International Prospective Register of
#'     Systematic Reviews") and generic recommendations to register are not
#'     matched. Non-availability wording ("not registered") is removed by the
#'     downstream false-statement guard.
#'
#' @param article A string or a list of strings.
#' @return Index of element with phrase of interest
#' @noRd
.which_prospero_2 <- function(article) {

  grep(paste0("[Rr]egistered[a-zA-Z ]{0,12}P(?i)ROSPERO(?-i)",
              "|P(?i)ROSPERO(?-i)[a-zA-Z (]{0,12}[Rr]egistered"),
       article, perl = TRUE)

}


#' Identify generic mentions of registration
#'
#' Extract the index of mentions such as: "This study was approved by the local
#'     Scientific and Ethics Committees of IRCCS \"Saverio de Bellis\",
#'     Castellana Grotte (Ba), Italy, and it was part of a registered research
#'     on https://www.clinicaltrials.gov, reg. number: NCT01244945."
#'
#' @param article A string or a list of strings.
#' @param dict A list of regular expressions for each concept.
#' @return Index of element with phrase of interest
#' @noRd
.which_registered_1 <- function(article, dict) {

  words <- c("this", "research_lower_strict", "and", "registered_registration")
  # Too generic without research & registered, or if including registration

  this_research <-
    dict %>%
    magrittr::extract(words[c(1, 2)]) %>%
    lapply(.bound) %>%
    lapply(.encase) %>%
    paste(collapse = .max_words(" ", 5, space_first = FALSE))

  this_research_registered <-
    dict %>%
    magrittr::extract(words[c(1, 2, 4)]) %>%
    lapply(.bound) %>%
    lapply(.encase) %>%
    paste(collapse = .max_words(" ", 5, space_first = FALSE))

  and_registered <-
    dict %>%
    magrittr::extract(words[c(3:4)]) %>%
    lapply(.bound) %>%
    lapply(.encase) %>%
    paste(collapse = .max_words(" ", 5, space_first = FALSE))

  research_and_registered <-
    dict %>%
    magrittr::extract(words[2]) %>%
    lapply(.bound) %>%
    lapply(.encase) %>%
    paste(and_registered, sep = dict$txt)

  this_research_and_registered <-
    this_research %>%
    paste(and_registered, sep = dict$txt)

  # c(this_research_registered, research_and_registered) %>%
  #   lapply(.encase) %>%
  #   .encase() %>%
  #   paste("([A-Z]{2}\\s*[0-9]{2}|[0-9]{5})", sep = dict$txt) %>%
  #   grep(article, perl = TRUE)

  c(this_research_registered, this_research_and_registered) %>%
    lapply(.encase) %>%
    .encase() %>%
    paste("([A-Z]{2}\\s*[0-9]{2}|[0-9]{5})", sep = dict$txt) %>%
    grep(article, perl = TRUE)

}


#' Identify generic mentions of registration
#'
#' Extract the index of mentions such as: " EPGP is registered with
#'     clinicaltrials.gov (NCT00552045)."
#'
#' @param article A string or a list of strings.
#' @param dict A list of regular expressions for each concept.
#' @return Index of element with phrase of interest
#' @noRd
.which_registered_2 <- function(article, dict) {

  c("(^|\\.\\s*)(Ethical|Approval|(|The )[A-Z][A-Z]+)",
    "(registered|registration)",
    "([Tt]rial|[Ss]tudy)"
  ) %>%
    paste(collapse = dict$txt) %>%
    paste("([A-Z]{2}\\s*[0-9]{2}|[0-9]{5})", sep = dict$txt) %>%
    grep(article, perl = TRUE)

}


#' Identify generic mentions of registration
#'
#' Extract the index of mentions such as: "registered with Clinical Trials
#'     (ChiCTR-IOR-14005438)"
#'
#' @param article A string or a list of strings.
#' @param dict A list of regular expressions for each concept.
#' @return Index of element with phrase of interest
#' @noRd
.which_registered_3 <- function(article, dict) {

  c("([Tt]rial|[Pp]rotocol) (registered|registration) (with\\b|under\\b|on\\b|at\\b|as\\b)") %>%
    paste("([A-Z]{2}\\s*[0-9]{2}|[0-9]{5})", sep = dict$txt) %>%
    grep(article, perl = TRUE)

}


#' Identify generic mentions of registration
#'
#' Extract the index of mentions such as: "This registered study on
#'     www.clinicaltrials.gov (NCT01375270) was approved by the Ethics
#'     Committee of the Capital Region of Denmark (H-3-2010-127), and all
#'     subjects provided informed written consent to participate."
#'
#' @param article A string or a list of strings.
#' @param dict A list of regular expressions for each concept.
#' @return Index of element with phrase of interest
#' @noRd
.which_registered_4 <- function(article, dict) {

  words <- c("this", "registered", "research_lower_strict")
  # Too generic without research & registered, or if including registration

  dict$this <- append(dict$this, "\\b[Ww]e")

  dict %>%
    magrittr::extract(words) %>%
    lapply(.bound) %>%
    lapply(.encase) %>%
    paste(collapse = .max_words(" ", 5, space_first = FALSE)) %>%
    paste("([A-Z]{2}\\s*[0-9]{2}|[0-9]{5})", sep = dict$txt) %>%
    grep(article, perl = TRUE)

}


#' Identify generic mentions of registration
#'
#' Extract the index of mentions such as: "The Régression de l'Albuminurie dans
#'     la Néphropathie Drépanocytaire (RAND) study design was approved by the
#'     local ethics committee (Ref: DGRI CCTIRS MG/CP09.503, 9 July 2009) and
#'     registered at ClinicalTrials.gov (NCT01195818)."
#'
#' @param article A string or a list of strings.
#' @param dict A list of regular expressions for each concept.
#' @return Index of element with phrase of interest
#' @noRd
.which_registered_5 <- function(article, dict) {

  words <- c("this", "research_strict", "and", "registered_registration")
  SOME <- "\\([a-zA-Z]+\\)"
  # Too generic without research & registered, or if including registration

  SOME_research <-
    dict %>%
    magrittr::extract(words[2]) %>%
    lapply(.bound) %>%
    lapply(.encase) %>%
    paste(SOME, .)

  the_SOME_research <-
    dict %>%
    magrittr::extract(words[1]) %>%
    lapply(.bound) %>%
    lapply(.encase) %>%
    paste(SOME_research, sep = dict$txt)

  the_SOME_research_registered <-
    dict %>%
    magrittr::extract(words[4]) %>%
    lapply(.bound) %>%
    lapply(.encase) %>%
    paste(the_SOME_research, ., sep = .max_words(" ", 5, space_first = FALSE))

  and_registered <-
    dict %>%
    magrittr::extract(words[c(3:4)]) %>%
    lapply(.bound) %>%
    lapply(.encase) %>%
    paste(collapse = .max_words(" ", 5, space_first = FALSE))

  the_SOME_research_and_registered <-
    the_SOME_research %>%
    paste(and_registered, sep = dict$txt)


  c(the_SOME_research_registered, the_SOME_research_and_registered) %>%
    lapply(.encase) %>%
    .encase() %>%
    paste("([A-Z]{2}\\s*[0-9]{2}|[0-9]{5})", sep = dict$txt) %>%
    grep(article, perl = TRUE)

}


#' Identify mentions of lack of registration
#'
#' Extract the index of mentions such as: "This trial and its protocol were not
#'     registered on a publicly accessible registry."
#'
#' @param article A string or a list of strings.
#' @param dict A list of regular expressions for each concept.
#' @return Index of element with phrase of interest
#' @noRd
.which_not_registered_1 <- function(article, dict) {

  words <- c("this", "research_lower_strict")

  dict %>%
    magrittr::extract(words) %>%
    lapply(.bound, location = "both") %>%
    lapply(.encase) %>%
    paste(collapse = .max_words(" ", n_max = 4, space_first = FALSE)) %>%
    paste(" not", " registered", sep = .max_words("", n_max = 6)) %>%
    grep(article, perl = TRUE)
}


#' Identify generic mentions of registration
#'
#' Extract the index of mentions such as: "Trial registration number is
#'     TCTR20151021001."
#'
#' @param article A string or a list of strings.
#' @param dict A list of regular expressions for each concept.
#' @return Index of element with phrase of interest
#' @noRd
.which_registration_1 <- function(article, dict) {

  words <- c("research_strict", "registration")  # Too generic without these

  research_registration <-
    dict %>%
    magrittr::extract(words) %>%
    lapply(.bound) %>%
    lapply(.encase) %>%
    paste(collapse = " ") %>%
    paste0("(:|-+)")

  c(research_registration, "([A-Z]{2}\\s*[0-9]{2}|[0-9]{5})") %>%
    paste(collapse = dict$txt) %>%
    grep(article, perl = TRUE)

}


#' Identify generic mentions of registration
#'
#' Extract the index of mentions such as: "Trial registration number is
#'     TCTR20151021001."
#'
#' @param article A string or a list of strings.
#' @param dict A list of regular expressions for each concept.
#' @return Index of element with phrase of interest
#' @noRd
.which_registration_2 <- function(article, dict) {

  words <- c("Research_strict", "registration")

  research_registration <-
    dict %>%
    magrittr::extract(words) %>%
    lapply(.bound) %>%
    lapply(.encase) %>%
    paste(collapse = " ")

  c(research_registration, "([A-Z]{2}\\s*[0-9]{2}|[0-9]{5})") %>%
    paste(collapse = dict$txt) %>%
    grep(article, perl = TRUE)

}


#' Identify generic mentions of registration
#'
#' Extract the index of mentions such as: "Public clinical trial registration
#'     (www.clinicaltrials.gov) ID# NCT02720653"
#'
#' @param article A string or a list of strings.
#' @return Index of element with phrase of interest
#' @noRd
.which_registration_3 <- function(article) {

  grep("(^|\\.).{0,35}\\b(|pre|pre-)registration.{0,35}NCT[0-9]{8}",
       article, perl = TRUE)

}


#' Identify generic mentions of registration
#'
#' Extract the index of mentions such as: "The RAPiD trial's International
#'     Standard Randomised Controlled Trial Number (ISRCTN) registration is
#'     ISRCTN 49204710."
#'
#' @param article A string or a list of strings.
#' @param dict A list of regular expressions for each concept.
#' @return Index of element with phrase of interest
#' @noRd
.which_registration_4 <- function(article, dict) {

  words <- c("This", "research_lower_strict", "registration")
  # Too generic without research & registered, or if including registration

  this_research <-
    dict %>%
    magrittr::extract(words[c(1, 2)]) %>%
    lapply(.bound) %>%
    lapply(.encase) %>%
    paste(collapse = .max_words(" ", 5, space_first = FALSE))

  c(this_research) %>%
    lapply(.encase) %>%
    .encase() %>%
    paste("registration", "([A-Z]{2}\\s*[0-9]{2}|[0-9]{5})", sep = dict$txt) %>%
    grep(article, perl = TRUE)

}


#' Identify mentions of registry
#'
#' Extract the index of mentions such as: "Here, we describe a collaboration
#'     between an international group of patient organisations advocating for
#'     patients with atypical haemolytic uraemic syndrome (aHUS), the aHUS
#'     Alliance, and an international aHUS patient registry (ClinicalTrials.gov
#'     NCT01522183)."
#'
#' @param article A string or a list of strings.
#' @param dict A list of regular expressions for each concept.
#' @return Index of element with phrase of interest
#' @noRd
.which_registry_1 <- function(article, dict) {

  words <- c("research_lower_strict", "registry")  # Too generic without these

  research_registry <-
    dict %>%
    magrittr::extract(words) %>%
    lapply(.bound) %>%
    lapply(.encase) %>%
    paste(collapse = dict$txt)

  a <-
    c(research_registry, "[\\s,;:]+([A-Z]{2,10}\\s*[0-9]{2}|[0-9]{5})") %>%
    paste(collapse = dict$txt) %>%
    grep(article, perl = TRUE)

  if (!!length(a)) {

    is_false <- .negate_registry_1(article[a], dict)
    a <- a[!is_false]

  }

  return(a)

}




#' Identify registration titles - sensitive with negation
#'
#' Extract the index of mentions such as: "Study registration: ..."
#'
#' @param article A string or a list of strings.
#' @param dict A list of regular expressions for each concept.
#' @return Index of element with phrase of interest
#' @noRd
.which_reg_title_1 <- function(article, dict) {

  b <- integer()
  d <- integer()

  words <- c("registration_title")

  a <-
    dict %>%
    magrittr::extract(words) %>%
    lapply(.bound) %>%
    lapply(.title_strict) %>%
    lapply(.encase) %>%
    paste() %>%
    grep(article, perl = TRUE)

  if (!!length(a)) {

    for (i in seq_along(a)) {

      if (is.na(article[a[i] + 1])) {

        b <- c(b, a[i])

      } else {

        if (nchar(article[a[i] + 1]) == 0) {
          b <- c(b, a[i], a[i] + 2)
        } else {
          b <- c(b, a[i], a[i] + 1)
        }
      }

      is_true <- any(.negate_reg_title_1(article[b]))
      if (is_true) d <- c(d, b)

    }
    if (!!length(d)) return(d)
  }

  a <-
    dict %>%
    magrittr::extract(words) %>%
    lapply(.bound) %>%
    lapply(.title_strict, within_text = TRUE) %>%
    lapply(.encase) %>%
    paste() %>%
    grep(article, perl = TRUE)

  if (!!length(a)) {

    is_true <- .negate_reg_title_1(article[a])
    a <- a[is_true]

  }

  return(a)
}


#' Identify registration titles - specific with no negation
#'
#' Extract the index of mentions such as: "Trial registration: ..."
#'
#' @param article A string or a list of strings.
#' @return Index of element with phrase of interest
#' @noRd
.which_reg_title_2 <- function(article) {

  b <- integer()
  d <- integer()

  reg_title_synonyms <- c(
    "R(?i)egistration info(|rmation)(?-i)",
    "R(?i)egistration detail(|s)(?-i)",
    "R(?i)egistration no(|s)(|\\.)(?-i)",
    "R(?i)egistration number(|s)(?-i)",
    "R(?i)egistration identifier(|s)(?-i)",
    "T(?i)rial(|s) identifier(|s)(?-i)",
    "C(?i)linical trial(|s) identifier(|s)(?-i)",
    "T(?i)rial(|s) registration(?-i)",
    "C(?i)linical trial(|s) registration(?-i)",
    "S(?i)tudy registration(?-i)"
  )

  a <-
    reg_title_synonyms %>%
    lapply(.bound) %>%
    lapply(.title_strict) %>%
    .encase %>%
    grep(article, perl = TRUE)

  if (!!length(a)) {

    for (i in seq_along(a)) {

      if (is.na(article[a[i] + 1])) {

        b <- c(b, a[i])

      } else {

        if (nchar(article[a[i] + 1]) == 0) {
          b <- c(b, a[i], a[i] + 2)
        } else {
          b <- c(b, a[i], a[i] + 1)
        }
      }

      is_true <- any(.negate_reg_title_1(article[b]))
      if (is_true) d <- c(d, b)

    }
    if (!!length(d)) return(d)
  }


  a <-
    reg_title_synonyms %>%
    lapply(.bound) %>%
    lapply(.title_strict, within_text = TRUE) %>%
    .encase %>%
    grep(article, perl = TRUE)

  return(a)
}


#' Identify registration titles - specific
#'
#' Extract the index of mentions such as: "Retrospective clinical trial
#'     registration:"
#'
#' @param article A string or a list of strings.
#' @param dict A list of regular expressions for each concept.
#' @return Index of element with phrase of interest
#' @noRd
.which_reg_title_3 <- function(article, dict) {

  b <- integer()
  d <- integer()

  # Protocol only contributed to FPs due to protocol ethical approval
  registration_synonyms <- c(
    "Registration",
    "Clinical [Tt]rial",
    "Trial"
  )

  number_synonyms <- c(
    "registration",
    "no(|s)(|\\.)",
    "number(|s)",
    "#",
    "ID(|s)",
    "identifier(|s)",
    "information",
    "details"
  )

  start <-  "^.{0,1}[A-Z](?i)\\w+"
  registration <- registration_synonyms %>% .bound %>% .encase
  number <- number_synonyms %>% .bound %>% .encase
  finish <- "(| \\w+)(\\.|:|-+)(?-i)"

  a <-
    paste(start, registration, number, finish, sep = "\\s*") %>%
    paste0("$") %>%
    grep(article, perl = TRUE)


  if (!!length(a)) {

    for (i in seq_along(a)) {

      if (is.na(article[a[i] + 1])) {

        b <- c(b, a[i])

      } else {

        if (nchar(article[a[i] + 1]) == 0) {
          b <- c(b, a[i], a[i] + 2)
        } else {
          b <- c(b, a[i], a[i] + 1)
        }
      }

      is_true <- any(.negate_reg_title_1(article[b]))
      if (is_true) d <- c(d, b)

    }
    if (!!length(d)) return(d)
  }


  a <-
    paste(start, registration, number, finish, sep = "\\s*") %>%
    grep(article, perl = TRUE)

  if (!!length(a)) {

    is_true <- .negate_reg_title_1(article[a])
    a <- a[is_true]

  }

  return(a)
}


#' Identify registration titles - specific
#'
#' Extract the index of mentions such as: "Clinical trial registration detail:"
#'
#' @param article A string or a list of strings.
#' @param dict A list of regular expressions for each concept.
#' @return Index of element with phrase of interest
#' @noRd
.which_reg_title_4 <- function(article, dict) {

  b <- integer()
  d <- integer()

  # Protocol only contributed to FPs due to protocol ethical approval
  registration_synonyms <- c(
    "R(?i)egistration",
    "C(?i)linical [Tt]rial",
    "T(?i)rial"
  )

  number_synonyms <- c(
    "registration",
    "no(|s)(|\\.)",
    "number(|s)",
    "#",
    "ID(|s)",
    "identifier(|s)",
    "information",
    "details"
  )

  start <-  "^.{0,1}"
  registration <- registration_synonyms %>% .bound %>% .encase
  number <- number_synonyms %>% .bound %>% .encase
  finish <- "(| \\w+)(\\.|:|-+)(?-i)"

  a <-
    paste(start, registration, number, finish, sep = "\\s*") %>%
    paste0("$") %>%
    grep(article, perl = TRUE)

  if (!!length(a)) {

    for (i in seq_along(a)) {

      if (is.na(article[a[i] + 1])) {

        b <- c(b, a[i])

      } else {

        if (nchar(article[a[i] + 1]) == 0) {
          b <- c(b, a[i], a[i] + 2)
        } else {
          b <- c(b, a[i], a[i] + 1)
        }
      }

      is_true <- any(.negate_reg_title_1(article[b]))
      if (is_true) d <- c(d, b)

    }
    if (!!length(d)) return(d)
  }

  a <-
    paste(start, registration, number, finish, sep = "\\s*") %>%
    grep(article, perl = TRUE)

  if (!!length(a)) {

    is_true <- .negate_reg_title_1(article[a])
    a <- a[is_true]

  }

  return(a)
}


#' Identify registration titles - specific with no negation
#'
#' Extract the index of mentions such as: "Trial registration: ..."
#'
#' @return Index of element with phrase of interest
#' @noRd
.create_register_pmc_title <- function() {

  title_starts <- c(
    "R(?i)egistration",
    "T(?i)rial",
    "C(?i)linical",
    "(?i)Systematic",
    "M(?i)eta-analysis"
  )

  title_ends <- c(
    "registration",
    "number",
    "\\bid(entifier)",
    "\\bno(|s)",
    "detail(|s)",
    "info(|rmation)"
  )

  list(title_starts, title_ends) %>%
    purrr::map(.encase) %>%
    paste(collapse = "(?i).{0,20}") %>%
    paste(".{0,4}", ., ".{0,1}", sep = "")
}


#' Identify mentions of protocol
#'
#' Extract the index of mentions such as: "The complete study protocol has been
#'     published previously (Supplement 1)"
#'
#' @param article A string or a list of strings.
#' @param dict A list of regular expressions for each concept.
#' @return Index of element with phrase of interest
#' @noRd
.which_protocol_1 <- function(article, dict) {

  words <- c("study protocol", "published", "previously")

  dict %>%
    magrittr::extract(words) %>%
    lapply(.bound) %>%
    lapply(.encase) %>%
    paste(collapse = dict$txt) %>%
    grep(article, perl = TRUE)

}


#' Identify mentions of protocol
#'
#' Extract the index of mentions such as: "Alliance for Clinical Trials in
#'     Oncology (formerly Cancer and Leukemia Group B) Protocol #369901"
#'
#' @param article A string or a list of strings.
#' @return Index of element with phrase of interest
#' @noRd
.which_protocol_2 <- function(article) {

  grep("[Pp]rotocol .{0,5}(|[A-Z]+)[0-9]{5}", article, perl = TRUE)

}


#' Identify mentions of funding followed by NCT
#'
#' Extract the index of mentions such as: "Funded by: the National Heart, Lung,
#'     and Blood Institute, the National Institute of Diabetes and Digestive and
#'     Kidney Disease, and others; SPECS ClinicalTrials.gov number, NCT00443599;
#'     Nutrition and Obesity Center at Harvard; NIH 5P30DK040561-17"
#'
#' @param article A string or a list of strings.
#' @param dict A list of regular expressions for each concept.
#' @return Index of element with phrase of interest
#' @noRd
.which_funded_ct_1 <- function(article, dict) {


  # Anything more general contributed more false than true matches
  funded_by_SOME_ct_NCT <- paste0(
    "[Ff]unded by",
    dict$txt,
    "[A-Z]{2,7}.{0,20}[Cc]linical[Tt]rial.{0,20}NCT[0-9]{8}"
  )

  grep(funded_by_SOME_ct_NCT, article, perl = TRUE)

}


#' Negate unwanted mentions of registry
#'
#' Negate mentions of registry such as: "Ethics approval and consent to
#'     participate This study was approved by the Institutional Review Board of
#'     Chang Gung Memorial Hospital under registry number 201601023B0."
#'
#' @param article A string or a list of strings.
#' @param dict A list of regular expressions for each concept.
#' @return Index of element with phrase of interest
#' @noRd
.negate_registry_1 <- function(article, dict) {

  research <- dict$research %>% .bound %>% .encase
  registry <- dict$registry %>% .bound %>% .encase
  code <- "[\\s,;:]+([A-Z]{2,10}\\s*[0-9]{2}|[0-9]{5})"

  paste(research, "approv", registry, code, sep = dict$txt) %>%
    grepl(article, perl = TRUE)

}


#' Negate unwanted mentions of title
#'
#' Negate title mentions that refer to unwanted text.
#'
#' @param article A string or a list of strings.
#' @return Index of element with phrase of interest
#' @noRd
.negate_reg_title_1 <- function(article) {

  article %>% stringr::str_detect("[A-Z]{2}\\s*[0-9]{2}|[0-9]{5}")

}


#' Negate titles that mention that there was no registration
#'
#' Negate mentions such as "Clinical Trial Registration: N/A"
#'
#' @param article A string or a list of strings.
#' @return Index of element with phrase of interest
#' @noRd
.negate_reg_title_2 <- function(article) {

  article %>% stringr::str_detect("\\bNA\\b|\\bN/A\\b|not registered")

}


#' Remove mentions of previously reported registered studies
#'
#' Removes mentions such as: "An active o <- servational cohort study was
#'     conducted as previously reported (ClinicalTrials.gov identifier
#'     NCT01280162) [16]."
#'
#' @param article A List with paragraphs of interest.
#' @return The list of paragraphs without mentions of financial COIs.
#' @noRd
.obliterate_refs_2 <- function(article) {

  # Good for finding, but not for substituting b/c it's a lookahead
  # words <- c(
  #   # positive lookahead makes these phrases interchangeable
  #   "(?=[a-zA-Z0-9\\s,()-]*(financial|support))",
  #   "(?=[a-zA-Z0-9\\s,()-]*(conflict|competing))"
  # )

  article %>%
    stringr::str_replace_all("NCT[0-9]{8}.{3}REFFF(|\\)|\\])", "") %>%
    stringr::str_replace_all("NCT[0-9]{8}[a-zA-Z0-9\\s,()\\[\\]/:-]*REFFF(|\\)|\\])", "")

  # TODO: This to be inserted only for .which_ct_2!

}


#' Remove references
#'
#' Removes mentions such as: "An active observational cohort study was
#'     conducted as previously reported (ClinicalTrials.gov identifier
#'     NCT01280162) [16]."
#'
#' @param article A List with paragraphs of interest.
#' @return The list of paragraphs without mentions of financial COIs.
#' @noRd
.obliterate_references_1 <- function(article) {

  # If within References or under references and starts with 1. or contains et al. then remove.

  ref_from <- .where_refs_txt(article)

  if (!!length(ref_from)) {

    ref_to <- length(article)

    article[ref_from] <- ""
    article[ref_from:ref_to] <-
      gsub("^([0-9]{1,3}\\.\\s|.*et al\\.).*$", "",
           article[ref_from:ref_to], perl = TRUE)

  }
  return(article)
}


.get_register_pmc_title <- function(article_xml) {

  b <- ""
  title_regex <- .create_register_pmc_title()

  # If I had not stripped the d1 namespace:
  # "back//fn-group//*[self::d1:title or self::d1:bold or self::d1:italic]"
  back_footnote <-
    article_xml %>%
    xml2::xml_find_all("back//fn-group//*[self::title or self::bold or self::italic]")

  a <-
    back_footnote %>%
    xml2::xml_text() %>%
    stringr::str_which(title_regex)

  if (!!length(a)) {

    b <-
      back_footnote %>%
      magrittr::extract(a) %>%
      xml2::xml_parent() %>%
      xml2::xml_contents() %>%
      xml2::xml_text() %>%
      paste(collapse = " ")

    return(b)
  }


  abstract <-
    article_xml %>%
    xml2::xml_find_all("front//abstract//*[self::title or self::bold or self::italic]")

  a <-
    abstract %>%
    xml2::xml_text() %>%
    stringr::str_which(title_regex)

  if (!!length(a)) {

    b <-
      abstract %>%
      magrittr::extract(a) %>%
      xml2::xml_parent() %>%
      xml2::xml_contents() %>%
      xml2::xml_text() %>%
      paste(collapse = " ")

    return(b)
  }


  front_footnote <-
    article_xml %>%
    xml2::xml_find_all("front/article-meta//fn//*[self::title or self::bold or self::italic]")

  a <-
    front_footnote %>%
    xml2::xml_text() %>%
    stringr::str_which(title_regex)

  if (!!length(a)) {

    b <-
      front_footnote %>%
      magrittr::extract(a) %>%
      xml2::xml_parent() %>%
      xml2::xml_contents() %>%
      xml2::xml_text() %>%
      paste(collapse = " ")

    return(b)
  }

  return(b)
}



#' Identify mentions of registration on ISRCTN
#'
#' @param article A string or a list of strings.
#' @return Index of element with phrase of interest
#' @noRd
.which_isrctn_1 <- function(article) {

  grep("\\bISRCTN[0-9]{8}\\b", article, perl = TRUE)

}


#' Identify mentions of registration on ANZCTR
#'
#' @param article A string or a list of strings.
#' @return Index of element with phrase of interest
#' @noRd
.which_anzctr_1 <- function(article) {

  grep("\\bACTRN[0-9]{14}\\b", article, perl = TRUE)

}


#' Identify mentions of registration on DRKS
#'
#' @param article A string or a list of strings.
#' @return Index of element with phrase of interest
#' @noRd
.which_drks_1 <- function(article) {

  grep("\\bDRKS[0-9]{8}\\b", article, perl = TRUE)

}


#' Identify mentions of registration on IRCT
#'
#' @param article A string or a list of strings.
#' @return Index of element with phrase of interest
#' @noRd
.which_irct_1 <- function(article) {

  # IRCT IDs follow the format IRCT + digits + uppercase letter + digits
  # e.g. IRCT20120526009954N3, IRCT138707012049N3
  grep("\\bIRCT[0-9]{5,}[A-Z][0-9]+\\b", article, perl = TRUE)

}


#' Identify mentions of registration on UMIN
#'
#' @param article A string or a list of strings.
#' @return Index of element with phrase of interest
#' @noRd
.which_umin_1 <- function(article) {

  grep("\\bUMIN[0-9]{9}\\b", article, perl = TRUE)

}


#' Identify flexible ClinicalTrials.gov registration statements.
#'
#' @param article A string or a list of strings.
#' @return Index of element with phrase of interest
#' @noRd
.which_ct_4 <- function(article) {

  grep(
    paste(
      "\\b(the|this|our) (study|trial).{0,80}registered at clinicaltrials\\.?gov.{0,80}NCT[0-9]{8}",
      "\\b(the|this|our|it) (study|trial)? ?was registered at clinicaltrial(s)?\\.?\\s*gov.{0,100}(registration number )?NCT[0-9]{8}",
      "\\bclinical trial registration.{0,100}clinicaltrial(s)?\\.?\\s*gov.{0,80}NCT[0-9]{8}",
      "\\bclinicaltrial(s)?\\.?\\s*gov identifier is NCT[0-9]{8}",
      # Generic: a registration verb or clinicaltrials.gov near an NCT id. NCT
      # ids are an unambiguous ClinicalTrials.gov registration format, so this
      # catches "registered with/on/in ClinicalTrials.gov (NCT...)" etc.
      "\\bregister(ed|ation)?\\b.{0,60}\\bNCT[0-9]{8}\\b",
      "\\bNCT[0-9]{8}\\b.{0,40}\\b(register(ed|ation)?|clinicaltrial)",
      "\\bclinicaltrial(s)?\\.?\\s*gov\\b.{0,50}\\bNCT[0-9]{8}\\b",
      "\\b(trial|study|protocol) registration (number|no|id)?\\b.{0,40}NCT[0-9]{8}",
      sep = "|"
    ),
    article,
    perl = TRUE,
    ignore.case = TRUE
  )

}


#' Identify mentions of registration on ChiCTR
#'
#' @param article A string or a list of strings.
#' @return Index of element with phrase of interest
#' @noRd
.which_chictr_1 <- function(article) {

  grep("\\bChiCTR[-A-Z]*[0-9]{5,}\\b|Chinese Clinical Trial Registry",
       article, perl = TRUE, ignore.case = TRUE)

}


#' Identify mentions of registered systematic-review protocols on INPLASY
#'
#' @param article A string or a list of strings.
#' @return Index of element with phrase of interest
#' @noRd
.which_inplasy_1 <- function(article) {

  grep("\\bINPLASY[0-9]{5,}\\b|International Platform of Registered Systematic Review",
       article, perl = TRUE, ignore.case = TRUE)

}


#' Identify registered protocols hosted on OSF
#'
#' @param article A string or a list of strings.
#' @return Index of element with phrase of interest
#' @noRd
.which_osf_protocol_1 <- function(article) {

  grep("protocol.{0,80}(registered|registration).{0,80}(Open Science Framework|OSF|osf\\.io|10\\.17605/OSF)",
       article, perl = TRUE, ignore.case = TRUE)

}


#' Identify preregistered studies hosted on OSF.
#'
#' @param article A string or a list of strings.
#' @return Index of element with phrase of interest
#' @noRd
.which_osf_preregistered_1 <- function(article) {

  grep(
    "\\b(this|the|our) (study|work|trial|review|protocol).{0,80}pre-?registered.{0,100}(Open Science Framework|OSF|osf\\.io|10\\.17605/OSF)",
    article,
    perl = TRUE,
    ignore.case = TRUE
  )

}


#' Identify blinded PROSPERO registration statements.
#'
#' @param article A string or a list of strings.
#' @return Index of element with phrase of interest
#' @noRd
.which_prospero_redacted_1 <- function(article) {

  grep(
    "\\bprotocol for (this|the) review.{0,100}registered with PROSPERO.{0,100}CRD number redacted",
    article,
    perl = TRUE,
    ignore.case = TRUE
  )

}


#' Identify and extract Registration statements in PMC XML files.
#'
#' Takes a PMC XML file as a list of strings and returns data related to the
#'     presence of a Registration statement, including whether such a statement
#'     exists. If a Registration statement exists, it extracts it. This is a
#'     modified version of the `rt_register_pmc` designed for integration with
#'     `rt_all_pmc`.
#'
#' @param article_ls A PMC XML as a list of strings.
#' @param pmc_reg_ls A list of results from the `.get_register_pmc` function.
#' @param dict A list of regular expressions for each concept.
#' @return A dataframe of results.
#' @noRd
.rt_register_pmc <- function(article_ls, pmc_reg_ls, dict) {

  # TODO Update to match format of rt_coi_pmc.

  # Creating and outputting these lists adds negligible time
  # Way faster than index_any[["reg_title_pmc"]] <- NA
  index_any <- list(
    prospero_1 = NA,
    prospero_2 = NA,
    prospero_redacted_1 = NA,
    ct_4 = NA,
    registered_1 = NA,
    registered_2 = NA,
    registered_3 = NA,
    registered_4 = NA,
    registered_5 = NA,
    not_registered_1 = NA,
    registration_1 = NA,
    registration_2 = NA,
    registration_3 = NA,
    registration_4 = NA,
    registry_1  = NA,
    reg_title_1 = NA,
    reg_title_2 = NA,
    reg_title_3 = NA,
    reg_title_4 = NA,
    funded_ct_1 = NA,
    isrctn_1    = NA,
    anzctr_1    = NA,
    drks_1      = NA,
    irct_1      = NA,
    umin_1      = NA,
    chictr_1    = NA,
    inplasy_1   = NA,
    osf_protocol_1 = NA,
    osf_preregistered_1 = NA
  )

  index_method <- list(
    ct_2 = NA,
    ct_3 = NA,
    protocol_1 = NA
  )

  out <- list(  # do not change order of pmid:doi - id extraction depends on it
    is_relevant_reg = NA,
    is_method = NA,
    is_NCT = NA,
    is_register_pred = FALSE,
    register_text = "",
    is_explicit_reg = NA
  )


  if (!pmc_reg_ls$is_research & !pmc_reg_ls$is_review) {

    return(c(out, index_any, index_method))

  }


  if (pmc_reg_ls$is_register_pred) {

    out$is_relevant_reg <- TRUE
    out$is_explicit_reg <- TRUE
    out$is_register_pred <- TRUE
    out$register_text <- pmc_reg_ls$register_text
    if (.is_false_register_statement(out$register_text)) {
      out$is_register_pred <- FALSE
      out$register_text <- ""
      out$is_explicit_reg <- NA
    }

    return(c(out, index_any, index_method))
  }



  # TODO Consider adding unique
  article <-
    article_ls[c("ack", "methods", "abstract", "footnotes")] %>%
    unlist()
  # unique()

  methods <- unlist(article_ls$methods)

  reg_regex <- "\\b(|-)([Rr]egist|(|[Cc]linical)[Tt]rial|NCT[0-9]{8}|ISRCTN|ChiCTR|INPLASY|PROSPERO|Open Science Framework|OSF|osf\\.io|10\\.17605/OSF)"
  article %<>% purrr::keep(stringr::str_detect, pattern = reg_regex)


  out$is_relevant_reg <- !!length(article)

  # Check for relevance
  if (!out$is_relevant_reg) {

    return(c(out, index_any, index_method))

  }

  # Check for methods
  out$is_method <- !!length(methods)

  if (!out$is_method) {

    return(c(out, index_any, index_method))

  }


  # Text pre-processing
  article_processed <-
    article %>%
    iconv(from = 'UTF-8', to = 'ASCII//TRANSLIT', sub = "") %>%   # keep first
    trimws() %>%
    .obliterate_fullstop_1() %>%
    .obliterate_semicolon_1() %>%  # adds minimal overhead
    .obliterate_comma_1() %>%   # adds minimal overhead
    .obliterate_apostrophe_1() %>%
    .obliterate_punct_1() %>%
    .obliterate_line_break_1() %>%
    .obliterate_refs_2()


  # Way faster than: out[["is_NCT"]] <- ...
  out$is_NCT <- purrr::some(article, stringr::str_detect, "NCT[0-9]{8}")

  index_any$prospero_1 <- .which_prospero_1(article_processed)
  index_any$prospero_2 <- .which_prospero_2(article_processed)
  index_any$prospero_redacted_1 <- .which_prospero_redacted_1(article_processed)
  index_any$ct_4 <- .which_ct_4(article_processed)
  index_any$registered_1 <- .which_registered_1(article_processed, dict)
  index_any$registered_2 <- .which_registered_2(article_processed, dict)
  index_any$registered_3 <- .which_registered_3(article_processed, dict)
  index_any$registered_4 <- .which_registered_4(article_processed, dict)
  index_any$registered_5 <- .which_registered_5(article_processed, dict)
  index_any$not_registered_1 <- .which_not_registered_1(article_processed, dict)
  index_any$registration_1 <- .which_registration_1(article_processed, dict)
  index_any$registration_2 <- .which_registration_2(article_processed, dict)
  index_any$registration_3 <- .which_registration_3(article_processed)
  index_any$registration_4 <- .which_registration_4(article_processed, dict)
  index_any$registry_1 <- .which_registry_1(article_processed, dict)
  index_any$reg_title_1 <- .which_reg_title_1(article_processed, dict)
  index_any$reg_title_2 <- .which_reg_title_2(article_processed)
  index_any$reg_title_3 <- .which_reg_title_3(article_processed, dict)
  index_any$reg_title_4 <- .which_reg_title_4(article_processed, dict)
  index_any$funded_ct_1 <- .which_funded_ct_1(article_processed, dict)
  index_any$isrctn_1    <- .which_isrctn_1(article_processed)
  index_any$anzctr_1    <- .which_anzctr_1(article_processed)
  index_any$drks_1      <- .which_drks_1(article_processed)
  index_any$irct_1      <- .which_irct_1(article_processed)
  index_any$umin_1      <- .which_umin_1(article_processed)
  index_any$chictr_1    <- .which_chictr_1(article_processed)
  index_any$inplasy_1   <- .which_inplasy_1(article_processed)
  index_any$osf_protocol_1 <- .which_osf_protocol_1(article_processed)
  index_any$osf_preregistered_1 <- .which_osf_preregistered_1(article_processed)
  index <- unlist(index_any) %>% unique() %>% sort()


  if (!!length(index)) {

    out$is_explicit_reg <- !!length(unlist(index_any))
    out$is_register_pred <- !!length(index)
    out$register_text <- article[index] %>% paste(collapse = " ")
    if (.is_false_register_statement(out$register_text)) {
      out$is_register_pred <- FALSE
      out$register_text <- ""
      out$is_explicit_reg <- NA
      index <- integer()
    }

    index_any %<>% purrr::map(function(x) !!length(x))

    return(c(out, index_any, index_method))
  }


  # Apply a more sensitive search in Methods
  if (out$is_method) {

    # x30 faster than obliterating methods again
    # article_processed %<>% purrr::keep(article %in% methods)
    i <- which(article %in% methods)

    # methods %<>%
    #   .obliterate_fullstop_1() %>%
    #   .obliterate_semicolon_1() %>%  # adds minimal overhead
    #   .obliterate_comma_1() %>%   # adds minimal overhead
    #   .obliterate_apostrophe_1() %>%
    #   .obliterate_hash_1() %>%
    #   .obliterate_backlash_1() %>%
    #   .obliterate_line_break_1()
    #
    # article %<>% purrr::keep(magrittr::is_in, methods)

    index_method$ct_2 <- .which_ct_2(article_processed[i], dict)
    index_method$ct_3 <- .which_ct_3(article_processed[i], dict)
    index_method$protocol_1 <- .which_protocol_1(article_processed[i], dict)

    index <- i[unlist(index_method) %>% unique() %>% sort()]
    index_method %<>% purrr::map(function(x) !!length(x))
  }

  out$is_register_pred <- !!length(index)
  out$register_text <- article[index] %>% paste(collapse = " ")
  if (out$is_register_pred && .is_false_register_statement(out$register_text)) {
    out$is_register_pred <- FALSE
    out$register_text <- ""
    out$is_explicit_reg <- NA
  }

  index_any %<>% purrr::map(function(x) !!length(x))

  if (out$is_register_pred) {

    out$is_explicit_reg <- FALSE

  }

  return(c(out, index_any, index_method))
}



#' Identify and extract Conflicts of Interest statements in PMC XML files.
#'
#' Takes a PMC XML file and returns data related to the
#'     presence of a Funding statement, including whether a Funding statement
#'     exists. If a Funding statement exists, it extracts it.
#'
#' @param filename The name of the PMC XML as a string.
#' @param remove_ns TRUE if an XML namespace exists, else FALSE (default).
#' @return A dataframe of results. It returns the unique article identifiers,
#'     whether this article was deemed a research, review or systematic review,
#'     whether the text was deemed relevant to registration (e.g. contained the
#'     word registration), whether a Methods section was identified, whether an
#'     NCT number was identified, whether a registration was explicitly
#'     identified (defunct), whether a registration statement was found, what
#'     the registration statement was, whether it the registration was
#'     identified from the PMC XML (i.e. it was found within a dedicated
#'     registration tag) and whether each labeling function identified a
#'     relevant text or not. The labeling functions are returned to
#'     add flexibility in how this package is used; for example, future
#'     definitions of Registration may differ from the one we used.
#' @examples
#' \donttest{
#' # Path to a bundled example PMC XML file.
#' filepath <- system.file(
#'   "extdata", "PMID32171256-PMC7071725.xml", package = "rtransparency"
#' )
#'
#' # Identify and extract meta-data and indicators of transparency.
#' results_table <- rt_register_pmc(filepath, remove_ns = TRUE)
#' }
#' @export
rt_register_pmc <- function(filename, remove_ns = FALSE) {

  # TODO Update to match format of rt_coi_pmc.

  xpath <- c(
    "front/article-meta/article-id[@pub-id-type = 'pmid']",
    "front/article-meta/article-id[@pub-id-type = 'pmc']",
    "front/article-meta/article-id[@pub-id-type = 'pmc-uid']",
    "front/article-meta/article-id[@pub-id-type = 'doi']"
  )

  var_names <- c(
    "pmid",
    "pmcid_pmc",
    "pmcid_uid",
    "doi"
  )

  # Creating and outputting these lists adds negligible time
  # Way faster than index_any[["reg_title_pmc"]] <- NA
  index_any <- list(
    reg_title_pmc = NA,
    prospero_1 = NA,
    prospero_2 = NA,
    prospero_redacted_1 = NA,
    ct_4 = NA,
    registered_1 = NA,
    registered_2 = NA,
    registered_3 = NA,
    registered_4 = NA,
    registered_5 = NA,
    not_registered_1 = NA,
    registration_1 = NA,
    registration_2 = NA,
    registration_3 = NA,
    registration_4 = NA,
    registry_1  = NA,
    reg_title_1 = NA,
    reg_title_2 = NA,
    reg_title_3 = NA,
    reg_title_4 = NA,
    funded_ct_1 = NA,
    isrctn_1    = NA,
    anzctr_1    = NA,
    drks_1      = NA,
    irct_1      = NA,
    umin_1      = NA,
    chictr_1    = NA,
    inplasy_1   = NA,
    osf_protocol_1 = NA,
    osf_preregistered_1 = NA
  )

  index_method <- list(
    ct_2 = NA,
    ct_3 = NA,
    protocol_1 = NA
  )

  out <- list(  # do not change order of pmid:doi - id extraction depends on it
    pmid = NA,
    pmcid_pmc = NA,
    pmcid_uid = NA,
    doi = NA,
    is_research = NA,
    is_review = NA,
    is_slr = FALSE,
    is_relevant = NA,
    is_method = NA,
    is_NCT = NA,
    is_register_pred = FALSE,
    register_text = "",
    is_explicit = NA
  )


  # A lot of the PMC XML files are malformed
  article_xml <- tryCatch(.get_xml(filename, remove_ns), error = function(e) e)

  if (inherits(article_xml, "error")) {

    return(tibble::tibble(filename, is_success = FALSE))

  }


  # Extract IDs
  out %<>% purrr::list_modify(!!!purrr::map(xpath, ~ .get_text(article_xml, .x, TRUE)))
  # out <-
  #   xpath %>%
  #   lapply(.get_text, article_xml = article_xml, find_first = TRUE) %>%
  #   {purrr::list_modify(out, !!!.)}


  # Check for type
  # Definitions at PMC -> Tagging Guidelines -> Document Objects
  research_types <- c(
    "research-article",
    "protocol",
    "letter",
    "brief-report",
    "data-paper"
  )

  review_types <- c(
    "review-article",  # SLRs can also be labelled as review-article...
    "systematic-review"
  )

  type <- article_xml %>% xml2::xml_attr("article-type")

  out$is_research <- magrittr::is_in(type, research_types)
  out$is_review <- magrittr::is_in(type, review_types)

  if (!out$is_research & !out$is_review) {

    return(tibble::as_tibble(c(out, index_any, index_method)))

  }

  # Check for SLR (code is marginally faster without this)
  # if (out$is_review) {
  #
  #   txt <- xml_text(article_xml)
  #   out$is_slr <- grepl("systematic review", txt, ignore.case = TRUE)
  #
  #   if (!out$is_slr) {
  #
  #     return(tibble::as_tibble(c(out, index_any, index_method)))
  #
  #   }
  # }


  # Go through titles
  title_txt <- .get_register_pmc_title(article_xml)
  is_title <- nchar(title_txt) > 0

  if (is_title) {

    index_any$reg_title_pmc <- TRUE
    out$is_relevant <- TRUE
    out$is_explicit <- TRUE
    out$is_register_pred <- TRUE
    out$register_text <- title_txt

    return(tibble::as_tibble(c(out, index_any, index_method)))

  }


  # Check for relevance
  # txt <- xml_text(article_xml)
  # rel_regex <- "\\b(|-)([Rr]egist|(|[Cc]linical)[Tt]rial|NCT[0-9]{8}|PROSPERO)"
  # out$is_relevant <- grepl(rel_regex, txt)
  #
  # if (!out$is_relevant) {
  #
  #   return(tibble::as_tibble(c(out, index_any, index_method)))
  #
  # }


  # Check for methods (10x more costly than xml_text(article_xml))
  # methods <- .xml_methods(article_xml)
  # out$is_method <- !!length(methods)
  #
  # if (!out$is_method) {
  #
  #   return(tibble::as_tibble(c(out, index_any, index_method)))
  #
  # }


  # Extract article text into a list
  # article[["ack"]] <- .xml_ack(article_xml)
  # article[["methods"]] <- .xml_methods(article_xml)
  # article[["abstract"]] <- .xml_abstract(article_xml)
  # article[["footnotes"]] <- .xml_footnotes(article_xml)


  # Extract article text into a vector
  ack <- .xml_ack(article_xml)
  methods <- .xml_methods(article_xml)
  abstract <- .xml_abstract(article_xml)
  footnotes <- .xml_footnotes(article_xml)
  article <- c(abstract, methods, footnotes, ack)

  # Can use xpath, but x2 slower (8 vs 4ms) and cannot get refs
  # article_xml %>%
  #     xml_find_all("//text()[contains(translate(., 'REGIST', 'regist'), \
  #                  'regist') or contains(translate(., 'TRIAL', 'trial'),\
  #                  'trial') or contains(., 'NCT') or contains(., 'PROSPERO')\
  #                  ]") %>%
  #     xml_text()

  # Can use article as list (negligible change in performance)
  # article <-
  #   list(abstract, methods, footnotes, ack) %>%
  #   purrr::compact() %>%
  #   purrr::map(~ keep(.x, ~ grepl(relevant_regex, .x))) %>%
  #   purrr::compact()

  # Adding PROPSERO adds negligible overhead
  rel_regex <- "\\b(|-)([Rr]egist|(|[Cc]linical)[Tt]rial|NCT[0-9]{8}|ISRCTN|ACTRN|DRKS|IRCT|UMIN|ChiCTR|INPLASY|PROSPERO|Open Science Framework|OSF|osf\\.io|10\\.17605/OSF)"
  article %<>% purrr::keep(stringr::str_detect, pattern = rel_regex)


  out$is_relevant <- !!length(article)

  # Check for relevance
  if (!out$is_relevant) {

    return(tibble::as_tibble(c(out, index_any, index_method)))

  }

  # Check for methods
  out$is_method <- !!length(methods)

  if (!out$is_method) {

    return(tibble::as_tibble(c(out, index_any, index_method)))

  }

  # Activate if I want to check for title here.
  # if (!!length(index_any$reg_title_pmc)) {
  #
  #   i <- index_any$reg_title_pmc[2]
  #   is_true <- .negate_reg_title_1(article[i])
  #
  #   if (is_true) {
  #
  #   index <- unlist(index_any)
  #   out[["is_register_pred"]] <- TRUE
  #   out[["register_text"]] <- article[index] %>% paste(collapse = " ")
  #
  #   return(tibble::as_tibble(c(out, index_any, index_method)))
  #   }
  # }


  # Text pre-processing
  # .xml_preprocess(article_xml)  # 5x faster to obliterate within each section
  article_processed <-
    article %>%
    iconv(from = 'UTF-8', to = 'ASCII//TRANSLIT', sub = "") %>%   # keep first
    .obliterate_fullstop_1() %>%
    .obliterate_semicolon_1() %>%  # adds minimal overhead
    .obliterate_comma_1() %>%   # adds minimal overhead
    .obliterate_apostrophe_1() %>%
    .obliterate_punct_1() %>%
    .obliterate_line_break_1() %>%
    .obliterate_refs_2()

  # Way faster than: out[["is_NCT"]] <- ...
  out$is_NCT <- purrr::some(article, stringr::str_detect, "NCT[0-9]{8}")

  dict <- .create_synonyms()
  index_any$reg_title_pmc <- integer()
  index_any$prospero_1 <- .which_prospero_1(article_processed)
  index_any$prospero_2 <- .which_prospero_2(article_processed)
  index_any$prospero_redacted_1 <- .which_prospero_redacted_1(article_processed)
  index_any$ct_4 <- .which_ct_4(article_processed)
  index_any$registered_1 <- .which_registered_1(article_processed, dict)
  index_any$registered_2 <- .which_registered_2(article_processed, dict)
  index_any$registered_3 <- .which_registered_3(article_processed, dict)
  index_any$registered_4 <- .which_registered_4(article_processed, dict)
  index_any$registered_5 <- .which_registered_5(article_processed, dict)
  index_any$not_registered_1 <- .which_not_registered_1(article_processed, dict)
  index_any$registration_1 <- .which_registration_1(article_processed, dict)
  index_any$registration_2 <- .which_registration_2(article_processed, dict)
  index_any$registration_3 <- .which_registration_3(article_processed)
  index_any$registration_4 <- .which_registration_4(article_processed, dict)
  index_any$registry_1 <- .which_registry_1(article_processed, dict)
  index_any$reg_title_1 <- .which_reg_title_1(article_processed, dict)
  index_any$reg_title_2 <- .which_reg_title_2(article_processed)
  index_any$reg_title_3 <- .which_reg_title_3(article_processed, dict)
  index_any$reg_title_4 <- .which_reg_title_4(article_processed, dict)
  index_any$funded_ct_1 <- .which_funded_ct_1(article_processed, dict)
  index_any$isrctn_1    <- .which_isrctn_1(article_processed)
  index_any$anzctr_1    <- .which_anzctr_1(article_processed)
  index_any$drks_1      <- .which_drks_1(article_processed)
  index_any$irct_1      <- .which_irct_1(article_processed)
  index_any$umin_1      <- .which_umin_1(article_processed)
  index_any$chictr_1    <- .which_chictr_1(article_processed)
  index_any$inplasy_1   <- .which_inplasy_1(article_processed)
  index_any$osf_protocol_1 <- .which_osf_protocol_1(article_processed)
  index_any$osf_preregistered_1 <- .which_osf_preregistered_1(article_processed)
  index <- unlist(index_any) %>% unique() %>% sort()

  # Tidier but takes a median 11.0 ms vs current, which takes 10.6 ms
#   index_any <- list(
#     prospero_1 = NA,
#     registered_1 = NA,
#     registered_2 = NA,
#     registered_3 = NA,
#     registered_4 = NA,
#     registered_5 = NA,
#     not_registered_1 = NA,
#     registration_1 = NA,
#     registration_2 = NA,
#     registration_3 = NA,
#     registration_4 = NA,
#     registry_1  = NA,
#     reg_title_1 = NA,
#     reg_title_2 = NA,
#     reg_title_3 = NA,
#     reg_title_4 = NA,
#     funded_ct_1 = NA
#   )
#
#   func <- list(
#     .which_prospero_1,
#     .which_registered_1,
#     .which_registered_2,
#     .which_registered_3,
#     .which_registered_4,
#     .which_registered_5,
#     .which_not_registered_1,
#     .which_registration_1,
#     .which_registration_2,
#     .which_registration_3,
#     .which_registration_4,
#     .which_registry_1,
#     .which_reg_title_1,
#     .which_reg_title_2,
#     .which_reg_title_3,
#     .which_reg_title_4,
#     .which_funded_ct_1
#   )
#
#   pepa <- article %>% purrr::invoke_map(func, .)
#   index_any %<>% list_modify(!!!pepa)
# }


  if (!!length(index)) {

    out$is_explicit <- !!length(unlist(index_any))
    out$is_register_pred <- !!length(index)
    out$register_text <- article[index] %>% paste(collapse = " ")

    if (.is_false_register_statement(out$register_text)) {

      out$is_register_pred <- FALSE
      out$register_text <- ""
      out$is_explicit <- NA
      index <- integer()

    } else {

      index_any %<>% purrr::map(function(x) !!length(x))

      return(tibble::as_tibble(c(out, index_any, index_method)))

    }
  }


  # Apply a more sensitive search in Methods
  if (out$is_method) {

    # x30 faster than obliterating methods again
    # article_processed %<>% purrr::keep(article %in% methods)
    i <- which(article %in% methods)

    # methods %<>%
    #   .obliterate_fullstop_1() %>%
    #   .obliterate_semicolon_1() %>%  # adds minimal overhead
    #   .obliterate_comma_1() %>%   # adds minimal overhead
    #   .obliterate_apostrophe_1() %>%
    #   .obliterate_hash_1() %>%
    #   .obliterate_backlash_1() %>%
    #   .obliterate_line_break_1()
    #
    # article %<>% purrr::keep(magrittr::is_in, methods)

    index_method$ct_2 <- .which_ct_2(article_processed[i], dict)
    index_method$ct_3 <- .which_ct_3(article_processed[i], dict)
    index_method$protocol_1 <- .which_protocol_1(article_processed[i], dict)

    index <- i[unlist(index_method) %>% unique() %>% sort()]
    index_method %<>% purrr::map(function(x) !!length(x))
  }

  out$is_register_pred <- !!length(index)
  out$register_text <- article[index] %>% paste(collapse = " ")

  index_any %<>% purrr::map(function(x) !!length(x))

  if (out$is_register_pred) {

    out$is_explicit <- FALSE

  }

  return(tibble::as_tibble(c(out, index_any, index_method)))
}

Try the rtransparency package in your browser

Any scripts or data that you put into this service are public.

rtransparency documentation built on July 1, 2026, 9:07 a.m.