rtransparent: Identifies indicators of transparency

Documented in rt_coi

#' Identify and extract Conflicts of Interest (COI) statements in TXT files.
#'
#' Takes a TXT file and returns data related to the presence of a COI
#'     statement, including whether a COI statement exists. If a COI statement
#'     exists, it extracts it.
#'
#' @param filename The name of the TXT file as a string.
#' @return A dataframe of results. It returns the filename, PMID (if it was part
#'     of the file name), whether a COI was found and the text identified.
#' @examples
#' \dontrun{
#' # Path to PMC XML.
#' filepath <- "../inst/extdata/00003-PMID26637448-PMC4737611.txt"
#'
#' # Identify and extract meta-data and indicators of transparency.
#' results_table <- rt_coi(filepath)
#' }
#' @export
rt_coi <- function(filename) {

  # TODO Redevelop to match rt_coi_pmc

  paper_text <- readr::read_file(filename)
  # Do the UTF-8 encoding here because it creates problems with multiple funs
  splitted <- strsplit(paper_text, "\n| \\*")[[1]] %>% utf8::utf8_encode()

  conflict <- "conflict of interest"
  conflicts <- "conflicts of interest"
  competing <- "competing interest"
  disclosure <- "disclosure"
  finance <- "competing financial interest"
  declare <- "declaration of interest"
  dual <- "duality of interest"

  is_conflict = agrep(conflict, splitted, ignore.case = T)
  is_conflicts = agrep(conflicts, splitted, ignore.case = T)
  is_competing = agrep(competing, splitted, ignore.case = T)
  is_disclosure = agrep(disclosure, splitted, ignore.case = T)
  is_finance = agrep(finance, splitted, ignore.case = T)
  is_declare = agrep(declare, splitted, ignore.case = T)
  is_dual = agrep(dual, splitted, ignore.case = T)

  # Exclude financial disclosures
  if (length(is_disclosure) > 0) {
    a <- grep("financial disclosure", splitted[is_disclosure], ignore.case = T, invert = T)
    is_disclosure <- is_disclosure[a]
  }

  # Exclude mentions of COI that are not COI
  # Assume if no capital C, no mention of "no", no punctuation and no reported
  # then, this is NOT a mention of COI.
  the_conflicts <- unique(c(is_conflict, is_conflicts, is_competing))

  if (length(the_conflicts) > 0) {
    for (j in seq_along(the_conflicts)) {
      # No capital C
      is_capital_1 <- agrepl("Conflict", splitted[the_conflicts[j]])
      is_capital_2 <- agrepl("CONFLICT", splitted[the_conflicts[j]])
      is_capital_3 <- agrepl("Compet", splitted[the_conflicts[j]])
      is_capital_4 <- agrepl("COMPET", splitted[the_conflicts[j]])
      is_capital <- any(c(is_capital_1, is_capital_2, is_capital_3, is_capital_4))
      # No mention of "no"
      is_no <- agrepl("no.{0,20}conflict", splitted[the_conflicts[j]], ignore.case = T)
      # No mention of authors (needed for 0245)
      is_author <- agrepl("author", splitted[the_conflicts[j]], ignore.case = T)
      # No punctuation after interests
      is_punctuation <-
        agrepl("interest.{0,1}[.:;,]", splitted[the_conflicts[j]], ignore.case = T)
      # No mention of reporting after interests
      is_report <- grepl("disclosed|reported|mentioned|declared|communicated|revealed|divulged|aired|voiced|expressed", splitted[the_conflicts[j]], ignore.case = T)

      if (!is_capital & !is_no & !is_punctuation & !is_report & !is_author) {
        the_conflicts[j] <- NA
      }
    }
    the_conflicts <- the_conflicts[!is.na(the_conflicts)]
  }

  index <- unique(c(the_conflicts, is_disclosure, is_finance, is_declare, is_dual)) %>% sort()

  article <- basename(filename)
  pmid <- gsub("^.*PMID([0-9]+).*$", "\\1", filename)

  # Try to protect from mentions of words in other sections of text
  # If a hit ends with interest or starts with Disclosure, more likely to be COI
  vals <- c("interest.{0,1}:|Disclosure.{0,1}:")
  if (length(index) > 1) {
    a <- grep(vals, splitted[index], ignore.case = T)

    if (length(a) > 0) {  # only do this if a hit was found
      index <- index[index >= index[min(a)]]
    }
  }

  # If in point form, sections are missed when they do not include the keywords.
  if (length(index) > 1 & any(diff(index) > 1)) {
    if (max(index) - min(index) < 10) {  # Safeguard
      index <- seq(min(index), max(index), 1)
    }
  }

  coi_text <- splitted[index] %>% unlist %>% paste(., collapse = " ")

  # Identify text that may have been  missed because it was in a new line
  if (length(index) == 1) {
    no_stop_words <- gsub(" of ", " ", coi_text,  ignore.case = T)
    if (length(strsplit(no_stop_words, " ")[[1]]) < 4) {
      if (!grepl("no", splitted[index], ignore.case = T)) {
        if (nchar(splitted[index + 1]) == 0) {
          index <- c(index, index + 2)
        } else {
          index <- c(index, index + 1)
        }
        new_str <- gsub("^.+(None.*$)", "\\1", splitted[index[2]])
        if (grepl("^.*\\.$", new_str)) {
          # make sure this is a whole sentence
          coi_text <- paste(coi_text, new_str)
        } else {
          coi_text <- paste(coi_text, new_str, splitted[index[2] + 1])
        }
      }
    }
  }

  # Exclude other mentions of disclosure that are not disclosures of interest
  # e.g. Patient information disclosure
  # If no capital D and no mention of competing/conflicts, or None
  # then false positive
  # I am doing this after the above b/c things like Disclosure/n None. would not be captured otherwise
  if (length(is_disclosure) > 0 & length(the_conflicts) == 0) {

    # Capital D
    is_capital <- grepl("Disclos|DISCLOS", coi_text)
    # Mention of conflict/competing
    # (this uses only mentions of conflict/competing interest deemed relevant)
    is_compconf <- length(the_conflicts) > 0
    # Mention negation
    is_no_1 <- grepl("None|Nothing|No|Nil", coi_text)
    is_no_2 <- grepl("NONE|NOTHING|NO|NIL", coi_text)
    is_no <- any(c(is_no_1, is_no_2))
    # Punctuation
    # (disclose. needed for 0245)
    is_punctuation <- grepl("disclosure.{0,1}[.:;,]", coi_text, ignore.case = T)
    # Mention of author
    is_author <- grepl("Author.{0,2} [dD]isclo", coi_text)

    if (!is_capital & !is_compconf & !is_no & !is_punctuation & !is_author) {
      index <- c()
      coi_text <- ""
    }
  }

  # Adjudicate presence of COI
  is_coi_pred <- length(index) > 0

  # Remove preceding text that is not relevant
  coi_text <-
    gsub(  # lazy match to stop at first occurrence of a word
      "^.*?(Disclosure.*conflict.*$)|^.*?(Disclosure.*compet.*$)|^.*?(Declaration of Conflict.*$)|^.*?(Declaration of Interest.*$)|^.*?(Potential conflict.{0,1} of interest.*$)|^.*?(Conflict.*$)|^.*?(Competing.*$)",  # disclosure may refer to financial disclosures
      "\\1\\2\\3\\4\\5\\6\\7",
      coi_text,
      fixed = F
    )

  # Capture disclosures that do not contain "conflict" but are relevant
  if (grepl("disclosure", coi_text, ignore.case = T)) {
    val <- "^.*conflict.*disclosure.*$|^.*compet.*disclosure.*$"
    if (!grepl(val, coi_text, ignore.case = T)) {
      coi_text <- gsub( "^.*?(Disclosure.*$)", "\\1", coi_text, fixed = F)
    }
  }

  # If only None/No/Nothing appear, stop after the fullstop.
  coi_text <-
    gsub(
      "(^.+None\\.).*$|(^.+None disclosed\\.).*$|(^.+None reported\\.).*$|(^.+None declared\\.).*$|(^.+None mentioned\\.).*$|(^.+None aired\\.).*$|(^.+None communicated\\.).*$|(^.+None revealed\\.).*$|(^.+Nothing to declare\\.).*$|(^.+No\\.).*$|(^.+Nil\\.).*$",
      "\\1\\2\\3\\4",
      coi_text,
      fixed = F, ignore.case = T
    )

  # Correct statements with repeating sentences (e.g. 0036, 0405)
  # 0405 not fixed b/c it contains 3 slightly different versions - hard to fix
  new <- strsplit(coi_text, "\\. {0,1}")[[1]]
  if (length(new) > 1) {
    if (all(duplicated(new)[2:length(new)])) {
      coi_text <- paste0(new[1], ".")
    }
  }

  # If 'The authors declare no competing financial interests' appears, extract.
  # (solves e.g. 0077, 0098)
  # e.g. "no competing interests exist."
  val1 <- "^.*(The author.+no.*competing.+interest.*$)"
  val2 <- "^.*(The author.+no.*conflict.+interest.*$)"
  val3 <- "^.*(None of the author.+no.*competing.+interest.*$)"
  val4 <- "^.*(None of the author.+no.*conflict.+interest.*$)"
  val5 <- "^.*(No conflict.*$)"
  val6 <- "^.*(No competing.*$)"
  vals <- paste(val1, val2, val3, val4, sep = "|")
  if (grepl(vals, coi_text)) {
    # Make sure that we are not missing the title
    new_n <- stringr::str_count(coi_text, "[cC]ompeting|[cC]onflict")
    # Identify all statements that are not preceded by a title
    # and extract everything from "The author" onwards
    if (new_n == 1 & length(c(is_disclosure, is_declare, is_dual)) == 0) {
      coi_text <- gsub(vals, "\\1\\2\\3\\4\\5\\6", coi_text)
    }

    # Do not extract info after the last such sentence, if such info exists.
    # Only do this if "author" is mentioned once.
    # (e.g. 0089, 0097, 0137, 0231)
    if (stringr::str_count(coi_text, "author") < 2) {
      val1 <- "(^.*The author.+no.*competing.+interest.*?\\.) [A-Z].*$"
      val2 <- "(^.*The author.+no.*conflict.+interest.*?\\.) [A-Z].*$"
      val3 <- "(^.*All authors.+no.*conflict.+interest.*?\\.) [A-Z].*$"
      val4 <- "(^.*Both authors.+no.*conflict.+interest.*?\\.) [A-Z].*$"
      val5 <- "(^.*No conflicts of interest.{0,12}?\\.) [A-Z].*$"
      val6 <- "(^.*No conflicting.{0,12} interest.{0,12}?\\.) [A-Z].*$"
      val7 <- "(^.*No competing.{0,12} interest.{0,12}?\\.) [A-Z].*$"
      vals <- paste(val1, val2, val3, val4, val5, val6, val7, sep = "|")
      coi_text <- gsub(vals, "\\1\\2\\3\\4\\5\\6\\7", coi_text)
    }
  }

  coi_text %<>% trimws()

  data.frame(article, pmid, is_coi_pred, coi_text, stringsAsFactors = F)
}