#' Identify standard mentions of COI statements
#'
#' Extract mentions of COI statements that contain standard phrases, such as
#' "Conflicts of Interest".
#'
#' @param article The text as a vector of strings.
#' @param dict A list of regular expressions for each concept.
#' @return Index of element with phrase of interest
.which_coi_1 <- function(article, dict) {
a <-
dict$conflict_title %>%
.encase() %>%
grep(article, ignore.case = T)
if (!!length(a)) {
a %<>% purrr::keep(~ .negate_coi_1(article[.x], dict))
}
return(a)
}
#' Identify less explicit mentions of COI statements.
#'
#' Extract mentions of COI statements that are less standard, e.g. "We declare
#' that we have no financial or other conflicts of interest."
#'
#' @param article The text as a vector of strings.
#' @param dict A list of regular expressions for each concept.
#' @return Index of elements with phrase of interest
.which_coi_2 <- function(article, dict) {
txt <- dict$txt
conflict_1 <- "no "
conflict_2 <- "(conflict|competing)"
conflict_3 <- "interest"
conflict_regex_1 <- paste0(conflict_1, txt, conflict_2, txt, conflict_3)
is_conflict_1 <- grep(conflict_regex_1, article, perl = T, ignore.case = T)
conflict_regex_2 <- paste0(conflict_1, txt, conflict_3, txt, conflict_2)
is_conflict_2 <- grep(conflict_regex_2, article, perl = T, ignore.case = T)
return(unique(c(is_conflict_1, is_conflict_2)))
}
#' Identify mentions of financial disclosures
#'
#' Extract mentions of financial disclosures, e.g. "Financial disclosure:
#' Nothing to disclose.
#'
#' @param article The text as a vector of strings.
#' @param dict A list of regular expressions for each concept.
#' @return Index of element with phrase of interest
.which_disclosure_1 <- function(article, dict) {
a <- agrep("disclosure", article, ignore.case = T)
is_financial <- agrepl("financial disclosure", article[a])
if (any(is_financial)) {
a %<>% purrr::keep(~ .negate_disclosure_1(article[.x], dict))
} else {
a %<>% purrr::keep(~ .negate_disclosure_2(article[.x], dict))
}
return(a)
}
#' Identify less explicit mentions of commercial interest
#'
#' Extract mentions of COI statements that are less standard and mention
#' commercial interests, e.g. "The authors of this study declare a financial
#' relationship with GSK."
#'
#' @param article The text as a vector of strings.
#' @param dict A list of regular expressions for each concept.
#' @return Index of elements with phrase of interest
.which_commercial_1 <- function(article, dict) {
txt <- dict$txt
commerce_0 <- "([Aa]uthor|[Ee]ditor|[Rr]eviewer|[Ss]tudy|party|has |have |No)"
commerce_1 <- "(commerically|commercial|financial)"
commerce_2 <- "(sponsored|financed|funded|interest(|s)|benefit|relationship)"
commercial_regex_1 <- paste0(commerce_0, txt, commerce_1, txt, commerce_2)
is_commercial_1 <- grep(commercial_regex_1, article, perl = T)
commercial_regex_2 <- paste0(commerce_0, txt, commerce_2, txt, commerce_1)
is_commercial_2 <- grep(commercial_regex_2, article, perl = T)
return(unique(c(is_commercial_1, is_commercial_2)))
}
#' Identify mentions of receiving benefits
#'
#' Extract mentions of benefits, e.g. "SS has received benefits of commercial
#' nature from GSK."
#'
#' @param article The text as a vector of strings.
#' @param dict A list of regular expressions for each concept.
#' @return Index of elements with phrase of interest
.which_benefit_1 <- function(article, dict) {
txt <- dict$txt
benefit_regex_1 <-
paste0("received", txt, "benefit", txt, "from", txt, "commercial")
benefit_regex_2 <-
paste0("benefit", txt, "received", txt, "from", txt, "commercial")
benefit_regex_3 <-
paste0("commercial", txt, "benefit|gain", txt, "received")
is_benefit_1 <- grep(benefit_regex_1, article, perl = T)
is_benefit_2 <- grep(benefit_regex_2, article, perl = T)
is_benefit_3 <- grep(benefit_regex_3, article, perl = T)
return(unique(c(is_benefit_1, is_benefit_2, is_benefit_3)))
}
#' Identify consultants
#'
#' Identify mentions such as "SS is a consultant for GSK."
#'
#' @param article The text as a vector of strings.
#' @param dict A list of regular expressions for each concept.
#' @return Index of elements with phrase of interest
.which_consultant_1 <- function(article, dict) {
txt <- dict$txt
consultant_0 <- "( is a| are)"
consultant_1 <- "consultant(|s)"
consultant_2 <- "for"
consultant_regex <- paste0(consultant_0, txt, consultant_1, txt, consultant_2)
is_consultant <- grep(consultant_regex, article, perl = T)
return(is_consultant)
}
#' Identify mentions of brief explanation
#'
#' Extract mentions of COI statements that mention "Brief explanation."
#'
#' @param article The text as a vector of strings.
#' @return Index of element with phrase of interest
.which_brief <- function(article) {
grep("Brief explanation for [A-Za-z]+:", article, ignore.case = T, fixed = F)
}
#' Identify receipt of grants
#'
#' Identify mentions such as: "SS has received research grants from GSK."
#'
#' @param article The text as a vector of strings.
#' @return Index of elements with phrase of interest
.which_grants_1 <- function(article) {
grep("received grants", article, perl = T)
}
#' Identify mentions of paid fees
#'
#' Identify mentions such as: "SS has received fees from GSK."
#'
#' @param article The text as a vector of strings.
#' @param dict A list of regular expressions for each concept.
#' @return Index of elements with phrase of interest
.which_fees_1 <- function(article, dict) {
words <- c("received_strict", "stock", "fees")
fees <-
dict %>%
magrittr::extract(words[c(2, 3)]) %>%
unlist() %>%
.bound() %>%
.encase()
received <-
dict %>%
magrittr::extract(words[1]) %>%
unlist() %>%
.bound() %>%
.encase()
c(received, fees) %>%
paste(collapse = dict$txt) %>%
grep(article, perl = T)
}
#' Identify mentions of consulting
#'
#' Identify mentions such as: "SS consults for GSK."
#'
#' @param article The text as a vector of strings.
#' @param dict A list of regular expressions for each concept.
#' @return Index of elements with phrase of interest
.which_consults_1 <- function(article, dict) {
words <- c("consult_all", "speaker")
consults <-
dict %>%
magrittr::extract(words) %>%
unlist() %>%
.bound() %>%
.encase()
of <-
c("of", "for", "on" , "by", "from", "by", "under", "within") %>%
.bound(location = "both") %>%
.encase
name_consults <- paste("[A-Z]\\s*[A-Z]", consults, of, sep = dict$txt)
consults_name <- paste(consults, "[A-Z]+", sep = dict$txt)
c(name_consults, consults_name) %>%
.encase() %>%
grep(article, perl = T)
}
#' Identify mentions of financial connections
#'
#' Identify mentions such as: "SS has a financial relationship with GSK."
#'
#' @param article The text as a vector of strings.
#' @param dict A list of regular expressions for each concept.
#' @return Index of elements with phrase of interest
.which_connections_1 <- function(article, dict) {
txt <- dict$txt
words <- c("relationship", "related", "relationship_strict", "related_strict")
commercial <- "\\bcommercial(|ly)"
financial <- "\\bfinancial(|y)"
commercial_relation <-
dict %>%
magrittr::extract(words[c(1, 2)]) %>%
unlist %>%
.bound %>%
.encase
financial_relation <-
dict %>%
magrittr::extract(words[c(3, 4)]) %>%
unlist %>%
.bound %>%
.encase
commercial_relation <- paste(commercial, commercial_relation, sep = txt)
financial_relation <- paste(financial, financial_relation, sep = txt)
c(commercial_relation, financial_relation) %>%
.encase %>%
grep(article, perl = T, ignore.case = T)
}
#' Identify declarations of connections
#'
#' Identify mentions such as: "SS declares a relationship with GSK."
#'
#' @param article The text as a vector of strings.
#' @param dict A list of regular expressions for each concept.
#' @return Index of elements with phrase of interest
.which_connections_2 <- function(article, dict) {
declare <- "(disclose(|s)|declare(|s)) (|a|an|no)"
words <- c("disclose", "relationship")
connection <-
dict %>%
magrittr::extract(words[2]) %>%
unlist %>%
.bound %>%
.encase
c(declare, connection) %>%
paste(collapse = "\\s*") %>%
grep(article, perl = T)
}
#' Identify mentions of commercial funding
#'
#' Identify mentions such as: "This study was commercially funded."
#'
#' @param article The text as a vector of strings.
#' @param dict A list of regular expressions for each concept.
#' @return Index of elements with phrase of interest
.which_commercial_ack_1 <- function(article, dict) {
txt <- dict$txt
words <- c("commercial", "funded", "interests")
interests <-
dict %>%
magrittr::extract(words[c(2, 3)]) %>%
unlist() %>%
.bound() %>%
.encase()
commercial <-
# dict %>%
# magrittr::extract(words[1]) %>%
"commercial(|ly)" %>%
unlist() %>%
.bound() %>%
.encase()
commercial_interests <- paste(interests, commercial, sep = txt)
interests_commercial <- paste(commercial, interests, sep = txt)
financialy_sponsored <- paste("financial(|y)", "sponsored", sep = txt)
sponsored_financialy <- paste("sponsored", "financial(|y)", sep = txt)
c(commercial_interests,
interests_commercial,
financialy_sponsored,
sponsored_financialy
) %>%
.encase() %>%
grep(article, perl = T, ignore.case = T)
}
#' Identify mentions of proprietary material
#'
#' Identify mentions such as: "SS has the rights to the presented tool."
#'
#' @param article The text as a vector of strings.
#' @param dict A list of regular expressions for each concept.
#' @return Index of elements with phrase of interest
.which_rights_1 <- function(article, dict) {
words <- c("received_strict", "proprietary")
dict %>%
magrittr::extract(words) %>%
lapply(.bound) %>%
lapply(.encase) %>%
paste(collapse = dict$txt) %>%
grep(article, perl = T, ignore.case = T)
}
#' Identify founders
#'
#' Identify mentions such as: "SS is a founding member of GSK."
#'
#' @param article The text as a vector of strings.
#' @param dict A list of regular expressions for each concept.
#' @return Index of elements with phrase of interest
.which_founder_1 <- function(article, dict) {
words <- c("founder", "for_of")
dict %>%
magrittr::extract(words) %>%
lapply(.bound) %>%
lapply(.encase) %>%
paste(collapse = dict$txt) %>%
grep(article, perl = T, ignore.case = T)
}
#' Identify advisors
#'
#' Identify mentions such as: "SS is a scientific advisor of GSK."
#'
#' @param article The text as a vector of strings.
#' @param dict A list of regular expressions for each concept.
#' @return Index of elements with phrase of interest
.which_advisor_1 <- function(article, dict) {
sci_advisor <- paste("scientific", "advisor", sep = dict$txt)
of <- dict$for_of %>% unlist %>% .bound %>% .encase
c(sci_advisor, of) %>%
paste(collapse = dict$txt) %>%
grep(article, ignore.case = T, perl = T)
}
#' Identify payments
#'
#' Identify mentions such as: "SS was paid by GSK."
#'
#' @param article The text as a vector of strings.
#' @param dict A list of regular expressions for each concept.
#' @return Index of elements with phrase of interest
.which_paid_1 <- function(article, dict) {
grep(" paid ", article, ignore.case = T, perl = T)
}
#' Identify board membership
#'
#' Identify mentions such as: "SS is a member of the board for GSK."
#'
#' @param article The text as a vector of strings.
#' @param dict A list of regular expressions for each concept.
#' @return Index of elements with phrase of interest
.which_board_1 <- function(article, dict) {
txt <- dict$txt
partake_synonyms <- c("member(|ship)", "serve(|s|d)", "participate(|s|d)")
partake <- paste0("(", paste(partake_synonyms, collapse = "|"), ") ")
preps_synonyms <- c("on", "of", "in", "for")
preps <- paste0("(", paste(preps_synonyms, collapse = "|"), ") ")
any_board <- paste0(partake, txt, preps, txt, "[Bb]oard")
grep(any_board, article, perl = T)
}
#' Identify statements of no COI
#'
#' Identify mentions such as: "SS was paid by GSK."
#'
#' @param article The text as a vector of strings.
#' @param dict A list of regular expressions for each concept.
#' @return Index of elements with phrase of interest
.which_no_coi_1 <- function(article, dict) {
grep("[Nn]o[a-zA-Z\\s]+conflict", article, perl = T)
}
#' Identify statements that the funders were not involved
#'
#' Identify mentions such as: "SS was paid by GSK."
#'
#' @param article The text as a vector of strings.
#' @param dict A list of regular expressions for each concept.
#' @return Index of elements with phrase of interest
.which_no_funder_role_1 <- function(article, dict) {
words <- c("is_have", "role")
funders <- "(The [Ff]unders|Funders)"
no <- c("no", "not") %>% .bound(location = "both") %>% .encase
had <-
dict %>%
magrittr::extract(words[1]) %>%
unlist %>%
.bound(location = "both") %>%
.encase
role <-
dict %>%
magrittr::extract(words[2]) %>%
unlist %>%
c("involved") %>%
.bound %>%
.encase
in_prep <- "\\bin\\b"
c(funders, had, no, role, in_prep) %>%
paste(collapse = dict$txt) %>%
grep(article, perl = T)
}
#' Negate statements that mention conflict but are not COI
#'
#' Keep statements such as "No conficts of interest reported", but do not
#' keep statements such as "The conflicts of interest are becoming
#' commoner."
#'
#' @param article The text as a vector of strings.
#' @param dict A list of regular expressions for each concept.
#' @return A boolean indicating whether a disclosure should be retained.
.negate_coi_1 <- function(article, dict) {
no <- dict$no %>% .bound() %>% .encase()
no_conflict <- paste(no, "conflict|compet", sep = dict$txt)
has_capital_c <- grepl("C(?i)onflict(?-i)|C(?i)ompet(?-i)", article, perl = T)
has_negation <- grepl(no_conflict, article)
has_author <- grepl("[Aa]uthor", article, perl = T)
has_punct <- grepl("interest(|s)[.:;,]", article, ignore.case = T)
any(c(has_capital_c, has_negation, has_author, has_punct))
}
#' Negate disclosures that are funding statements
#'
#' Keep statements such as "No financial disclosures reported.", but do not
#' keep statements such as "The authors disclose not funding received."
#'
#' @param article The text as a vector of strings.
#' @param dict A list of regular expressions for each concept.
#' @return A boolean indicating whether a disclosure should be retained.
.negate_disclosure_1 <- function(article, dict) {
words <- c("no_financial_disclosure", "consult_all", "speaker", "proprietary")
dict %>%
magrittr::extract(words) %>%
unlist() %>%
.encase() %>%
grepl(article, perl = T)
}
#' Negate disclosures that are neither COI nor funding statements
#'
#' Remove mentions of disclosures such as "Patient Information Disclosure".
#'
#' @param article The text as a vector of strings.
#' @param dict A list of regular expressions for each concept.
#' @return A boolean indicating whether a disclosure should be retained.
.negate_disclosure_2 <- function(article, dict) {
has_capital_d <- grepl("Disclos|DISCLOS", article)
has_negation <- grepl(.encase(dict$No), article)
has_author <- grepl("[Aa]uthor[a-zA-Z\\s-]*[dD]isclo", article, perl = T)
has_punct <- grepl("disclosure(|s)[.:;,]", article, ignore.case = T)
has_online <- grepl("\\bonline\\b", article, ignore.case = T)
any(has_capital_d, has_negation, has_author, has_punct, has_online)
}
#' Remove irrelevat mentions of honoraria
#'
#' Remove mentions of honoraria such as "Patients received honoraria".
#'
#' @param article The text as a vector of strings.
#' @param dict A list of regular expressions for each concept.
#' @return A boolean indicating whether a disclosure should be retained.
.obliterate_honoraria_1 <- function(article, dict) {
honorary_1 <- "([Pp]articipant|[Pp]rovider|[Ss]ubject|[Pp]atient|[Ff]amil)"
honorary_2 <- "[Hh]onorari(um|a)"
honorary_regex <- paste0(honorary_1, dict$txt, honorary_2)
gsub(honorary_regex, "", article, perl = T)
}
#' Identify COI titles using XML labels
#'
#' Extract XML titles related to COI statements and all text children.
#'
#' @param article_xml The text as an xml_document.
#' @param dict A list of regular expressions for each concept.
#' @return The title and its related text as a string.
.get_coi_pmc_title <- function(article_xml, dict) {
b <- ""
words <- c("conflict_title", "disclosure_coi_title")
# In case I want to programmatically add these
# prefix <- "(C(?i)|(?-i)[A-Z](?i)[a-z]+ c)"
# suffix <- "(?-i)(| \\(COI\\)| \\w+)"
coi_titles <-
dict %>%
magrittr::extract(words) %>%
unlist %>%
# .title %>% # If I uncomment, add the "disclosure" synonyms as postfix
.encase
# back_xpath <-
# c(
# "back/ack//*[self::title or self::bold or self::italic]",
# "back/fn-group//*[self::title or self::bold or self::italic]",
# "back/notes//*[self::title or self::bold or self::italic]"
# ) %>%
# paste(collapse = " | ")
# If I had not stripped the d1 namespace:
# "back//fn-group//*[self::d1:title or self::d1:bold or self::d1:italic]"
# back_matter <-
# article_xml %>%
# xml_find_all(back_xpath)
# TODO: Turn into a function applied to the back, front and body matters
back_matter <-
article_xml %>%
xml2::xml_find_all("back/*[not(name()='ref-list')]") %>%
xml2::xml_find_all(".//*[self::title or self::bold or self::italic or self::sup]")
a <-
back_matter %>%
xml2::xml_text() %>%
stringr::str_which(stringr::regex(coi_titles, ignore_case = T))
if (!!length(a)) {
# I can try to use ancestor to avoid the nested if statement in the future
# ancestor_xpaths <- c(
# "./ancestor::fn",
# "./ancestor::sec",
# "./ancestor::notes",
# "./ancestor::author-notes"
# )
#
# ancestor_xpath <- paste(ancestor_xpaths, collapse = " | ")
a <-
back_matter %>%
magrittr::extract(a) %>%
xml2::xml_parent() %>%
xml2::xml_contents()
if (length(a) == 1) {
b <-
a %>%
xml2::xml_parent() %>%
xml2::xml_parent() %>%
xml2::xml_contents() %>%
xml2::xml_text() %>%
paste(collapse = ": ")
} else {
b <-
a %>%
xml2::xml_text() %>%
paste(collapse = " ")
}
return(b)
}
front_matter <-
article_xml %>%
xml2::xml_find_all("front//fn//*[self::title or self::bold or self::italic or self::sup]")
a <-
front_matter %>%
xml2::xml_text() %>%
stringr::str_which(stringr::regex(coi_titles, ignore_case = T))
if (!!length(a)) {
a <-
front_matter %>%
magrittr::extract(a) %>%
xml2::xml_parent() %>%
xml2::xml_contents()
if (length(a) == 1) {
b <-
a %>%
xml2::xml_parent() %>%
xml2::xml_parent() %>%
xml2::xml_contents() %>%
xml2::xml_text() %>%
paste(collapse = ": ")
} else {
b <-
a %>%
xml2::xml_text() %>%
paste(collapse = " ")
}
return(b)
}
coi_titles <-
dict %>%
magrittr::extract(words) %>%
unlist %>%
# .title_strict %>%
.encase
body_matter <-
article_xml %>%
xml2::xml_find_all("body/sec//*[self::title or self::bold or self::italic or self::sup]")
a <-
body_matter %>%
xml2::xml_text() %>%
stringr::str_which(stringr::regex(coi_titles, ignore_case = T))
if (!!length(a)) {
a <-
body_matter %>%
magrittr::extract(a) %>%
xml2::xml_parent() %>%
xml2::xml_contents()
if (length(a) == 1) {
b <-
a %>%
xml2::xml_parent() %>%
xml2::xml_parent() %>%
xml2::xml_contents() %>%
xml2::xml_text() %>%
paste(collapse = ": ")
} else {
b <-
a %>%
xml2::xml_text() %>%
paste(collapse = " ")
}
}
return(b)
}
#' Identify elements with COI attributes
#'
#' Extract elements with COI attributes from NLM XLMs.
#'
#' @param article_xml The text as an xml_document.
#' @return The title and its related text as a string.
.get_coi_pmc_fn <- function(article_xml) {
fn_xpaths <- c(
"back//fn[@fn-type = 'conflict' or @fn-type = 'COI-statement']",
"front//fn[@fn-type = 'conflict' or @fn-type = 'COI-statement']"
)
fn_xpath <- paste(fn_xpaths, collapse = " | ")
coi_fn <-
article_xml %>%
xml2::xml_find_all(fn_xpath) %>%
xml2::xml_contents() %>%
xml2::xml_text() %>%
paste0(collapse = ": ")
if (nchar(coi_fn) == 0) {
sec_xpaths <- c(
"back//sec[@sec-type = 'conflict' or @fn-type = 'COI-statement']",
"front//sec[@sec-type = 'conflict' or @fn-type = 'COI-statement']"
)
sec_xpath <- paste(sec_xpaths, collapse = " | ")
coi_fn <-
article_xml %>%
xml2::xml_find_all(sec_xpath) %>%
purrr::map(function(x) xml2::xml_contents(x) %>% xml2::xml_text()) %>%
purrr::map_chr(paste, collapse = ": ") %>%
paste(collapse = " ")
}
return(coi_fn)
}
.is_relevant_coi <- function(article) {
hi_synonyms <- c(
"conflict",
"compet",
"disclos",
"declar",
"\\commercial"
)
lo_synonyms <- c(
"fee(|s)\\b",
"honorari",
"\\bboard\\b",
"consult",
"relation",
"connection",
"\\bfinancial",
"\\b(co|co-)founder",
"\\bpaid\\b",
"speaker",
"\\bemployee",
"member\\b",
"funder"
)
hi_regex <- paste(hi_synonyms, collapse = "|")
lo_regex <- paste(lo_synonyms, collapse = "|")
hi_relevance <-
article %>%
stringr::str_detect(stringr::regex(hi_regex, ignore_case = T))
lo_relevance <-
article %>%
stringr::str_detect(stringr::regex(lo_regex, ignore_case = T))
is_relevant_hi = any(hi_relevance)
is_relevant_lo = any(lo_relevance)
list(
is_relevant_coi = any(is_relevant_hi, is_relevant_lo),
is_relevant_coi_hi = is_relevant_hi,
is_relevant_coi_lo = is_relevant_lo,
index = hi_relevance | lo_relevance
)
}
#' Identify and extract Conflicts of Interest (COI) statements in PMC XML files.
#'
#' Takes a PMC XML file as a list of strings and returns data related to the
#' presence of a COI statement, including whether a COI statement
#' exists. If a Funding statement exists, it extracts it. This is a modified
#' version of the `rt_coi_pmc` designed for integration with `rt_all_pmc`.
#'
#' @param article_ls A PMC XML as a list of strings.
#' @param pmc_coi_ls A list of results from the `.get_coi_pmc` function.
#' @param dict A list of regular expressions for each concept.
#' @return A dataframe of results.
.rt_coi_pmc <- function(article_ls, pmc_coi_ls, dict) {
index <- integer()
# Way faster than index_any[["reg_title_pmc"]] <- NA
index_any <- list(
coi_1 = NA,
coi_2 = NA,
coi_disclosure_1 = NA,
commercial_1 = NA,
benefit_1 = NA,
consultant_1 = NA,
grants_1 = NA,
brief_1 = NA
)
index_ack <- list(
fees_1 = NA,
consults_1 = NA,
connect_1 = NA,
connect_2 = NA,
commercial_ack_1 = NA,
rights_1 = NA,
founder_1 = NA,
advisor_1 = NA,
paid_1 = NA,
board_1 = NA,
no_coi_1 = NA,
no_funder_role_1 = NA
)
relevance_ls <- list(
is_relevant_coi = NA,
is_relevant_coi_hi = NA,
is_relevant_coi_lo = NA
)
out <- list(
is_coi_pred = FALSE,
coi_text = "",
is_explicit_coi = NA
)
if (pmc_coi_ls$is_coi_pred) {
out$is_coi_pred <- TRUE
out$coi_text <- pmc_coi_ls$coi_text
relevance_ls$is_relevant_coi <- TRUE
relevance_ls$is_relevant_coi_hi <- TRUE
return(c(relevance_ls, out, index_any, index_ack))
}
# TODO Consider adding unique
article <-
article_ls[c("ack", "body", "footnotes")] %>%
unlist()
# unique()
relevance_ls <- .is_relevant_coi(article)
if (!relevance_ls$is_relevant_coi) {
return(c(relevance_ls[-4], out, index_any, index_ack))
}
article %<>% purrr::keep(relevance_ls$index)
relevance_ls$index <- NULL
article_processed <-
article %>%
iconv(from = 'UTF-8', to = 'ASCII//TRANSLIT', sub = "") %>% # keep first
trimws() %>%
.obliterate_fullstop_1() %>%
.obliterate_semicolon_1() %>% # adds minimal overhead
.obliterate_comma_1() %>% # adds minimal overhead
.obliterate_apostrophe_1() %>%
.obliterate_punct_1() %>%
.obliterate_line_break_1() %>%
.obliterate_honoraria_1(dict)
# No change in speed by recreating the list
index_any <- list(
coi_1 = .which_coi_1(article_processed, dict),
coi_2 = .which_coi_2(article_processed, dict),
coi_disclosure_1 = .which_disclosure_1(article_processed, dict),
commercial_1 = .which_commercial_1(article_processed, dict),
benefit_1 = .which_benefit_1(article_processed, dict),
consultant_1 = .which_consultant_1(article_processed, dict),
grants_1 = .which_grants_1(article_processed),
brief_1 = .which_brief(article_processed)
)
index <- unlist(index_any) %>% unique() %>% sort()
if (!!length(index)) {
out$is_explicit_coi <- !!length(unlist(index_any))
out$is_coi_pred <- !!length(index)
out$coi_text <- article[index] %>% paste(collapse = " ")
index_any %<>% purrr::map(function(x) !!length(x))
if (length(index) == 1) {
coi_only <- dict$conflict_title %>% .encase %>% .title_strict
is_coi_only <- stringr::str_detect(out$coi_text, coi_only)
if (is_coi_only & !is.na(article[index + 1])) {
out$coi_text <- article[c(index, index + 1)] %>% paste(collapse = ": ")
}
}
return(c(out, index_any, index_ack))
}
# Identify potentially missed signals
i <- which(article %in% c(article_ls$ack, article_ls$footnotes))
if (!!length(i)) {
index_ack <- list(
fees_1 = .which_fees_1(article_processed[i], dict),
consults_1 = .which_consults_1(article_processed[i], dict),
connect_1 = .which_connections_1(article_processed[i], dict),
connect_2 = .which_connections_2(article_processed[i], dict),
commercial_ack_1 = .which_commercial_ack_1(article_processed[i], dict),
rights_1 = .which_rights_1(article_processed[i], dict),
founder_1 = .which_founder_1(article_processed[i], dict),
advisor_1 = .which_advisor_1(article_processed[i], dict),
paid_1 = .which_paid_1(article_processed[i], dict),
board_1 = .which_board_1(article_processed[i], dict),
no_coi_1 = .which_no_coi_1(article_processed[i], dict),
no_funder_role_1 = .which_no_funder_role_1(article_processed[i], dict)
)
index <- i[unlist(index_ack) %>% unique() %>% sort()]
index_ack %<>% purrr::map(function(x) !!length(x))
}
out$is_coi_pred <- !!length(index)
out$coi_text <- article[index] %>% paste(collapse = " ")
index_any %<>% purrr::map(function(x) !!length(x))
if (out$is_coi_pred) {
out$is_explicit_coi <- FALSE
}
return(c(relevance_ls, out, index_any, index_ack))
}
#' Identify and extract Conflicts of Interest (COI) statements in PMC XML files.
#'
#' Takes a PMC XML file and returns data related to the presence of a COI
#' statement, including whether a COI statement exists. If a COI statement
#' exists, it extracts it.
#'
#' @param filename The name of the PMC XML as a string.
#' @param remove_ns TRUE if an XML namespace exists, else FALSE (default).
#' @return A dataframe of results. It returns unique article identifiers,
#' whether this article was deemed relevant to COI, whether a COI was found,
#' the text that suggested the presence of COI and the name of the function
#' that identified this text. The functions are returned to add flexibility
#' in how this package is used, such as future definitions of COI that may
#' differ from the one we used.
#' @examples
#' \dontrun{
#' # Path to PMC XML.
#' filepath <- "../inst/extdata/00003-PMID26637448-PMC4737611.xml"
#'
#' # Identify and extract meta-data and indicators of transparency.
#' results_table <- rt_coi_pmc(filepath, remove_ns = T)
#' }
#' @export
rt_coi_pmc <- function(filename, remove_ns = F) {
index <- integer()
dict <- .create_synonyms()
xpath <- c(
"front/article-meta/article-id[@pub-id-type = 'pmid']",
"front/article-meta/article-id[@pub-id-type = 'pmc']",
"front/article-meta/article-id[@pub-id-type = 'pmc-uid']",
"front/article-meta/article-id[@pub-id-type = 'doi']"
)
# Way faster than index_any[["reg_title_pmc"]] <- NA
index_any <- list(
coi_fn_pmc = NA,
coi_title_pmc = NA,
coi_1 = NA,
coi_2 = NA,
disclosure_1 = NA,
commercial_1 = NA,
benefit_1 = NA,
consultant_1 = NA,
grants_1 = NA,
brief_1 = NA
)
index_ack <- list(
fees_1 = NA,
consults_1 = NA,
connect_1 = NA,
connect_2 = NA,
commercial_ack_1 = NA,
rights_1 = NA,
founder_1 = NA,
advisor_1 = NA,
paid_1 = NA,
board_1 = NA,
no_coi_1 = NA,
no_funder_role_1 = NA
)
out <- list(
pmid = NA,
pmcid_pmc = NA,
pmcid_uid = NA,
doi = NA,
is_relevant = NA,
is_relevant_hi = NA,
is_relevant_lo = NA,
is_coi_pred = FALSE,
coi_text = "",
is_explicit = NA
)
# A lot of the PMC XML files are malformed
article_xml <- tryCatch(.get_xml(filename, remove_ns), error = function(e) e)
if (inherits(article_xml, "error")) {
return(tibble::tibble(filename, is_success = F))
}
# Extract IDs
out %<>% purrr::list_modify(!!!purrr::map(xpath, ~ .get_text(article_xml, .x, T)))
# Capture coi fn elements
out$coi_text <- .get_coi_pmc_fn(article_xml)
index_any$coi_fn_pmc <- nchar(out$coi_text) > 0
if (index_any$coi_fn_pmc) {
out$is_relevant <- TRUE
out$is_coi_pred <- TRUE
return(tibble::as_tibble(c(out, index_any, index_ack)))
}
# Go through titles
title_txt <- .get_coi_pmc_title(article_xml, dict)
is_title <- nchar(title_txt) > 0
if (is_title) {
index_any$coi_title_pmc <- TRUE
out$coi_text <- title_txt
out$is_relevant <- TRUE
out$is_explicit <- TRUE
out$is_coi_pred <- TRUE
return(tibble::as_tibble(c(out, index_any, index_ack)))
}
# Extract article text into a vector
ack <- .xml_ack(article_xml)
body <- .xml_body(article_xml, get_last_two = T)
footnotes <- .xml_footnotes(article_xml) %>% .obliterate_contribs()
article <- c(footnotes, body, ack)
# Check relevance
hi <- "conflict|compet|disclos|declar|\\bcommercial"
lo <- "fee(|s)\\b|honorari|\\bboard\\b|consult|relation|connection|\\bfinancial|\\b(co|co-)founder|\\bpaid\\b|speaker|\\bemployee|member\\b|funder"
hi_relevance <- stringr::str_detect(article, stringr::regex(hi, ignore_case = T))
lo_relevance <- stringr::str_detect(article, stringr::regex(lo, ignore_case = T))
article <- article[(hi_relevance + lo_relevance) > 0]
# rel_regex <- paste(hi_regex, lo_regex, sep = "|")
# article %<>% purrr::keep(~ str_detect(.x, regex(rel_regex, ignore_case = T)))
out$is_relevant_hi <- any(hi_relevance)
out$is_relevant_lo <- any(lo_relevance)
out$is_relevant <- with(out, any(c(is_relevant_hi, is_relevant_lo)))
# Check for relevance
if (!out$is_relevant) {
return(tibble::as_tibble(c(out, index_any, index_ack)))
}
# Text pre-processing
# .xml_preprocess(article_xml) # 5x faster to obliterate within each section
article_processed <-
article %>%
iconv(from = 'UTF-8', to = 'ASCII//TRANSLIT', sub = "") %>% # keep first
trimws() %>%
.obliterate_fullstop_1() %>%
.obliterate_semicolon_1() %>% # adds minimal overhead
.obliterate_comma_1() %>% # adds minimal overhead
.obliterate_apostrophe_1() %>%
.obliterate_punct_1() %>%
.obliterate_line_break_1() %>%
.obliterate_honoraria_1(dict)
index_any$coi_fn_pmc <- integer()
index_any$coi_title_pmc <- integer()
index_any$coi_1 <- .which_coi_1(article_processed, dict)
index_any$coi_2 <- .which_coi_2(article_processed, dict)
index_any$disclosure_1 <- .which_disclosure_1(article_processed, dict)
index_any$commercial_1 <- .which_commercial_1(article_processed, dict)
index_any$benefit_1 <- .which_benefit_1(article_processed, dict)
index_any$consultant_1 <- .which_consultant_1(article_processed, dict)
index_any$grants_1 <- .which_grants_1(article_processed)
index_any$brief_1 <- .which_brief(article_processed)
index <- unlist(index_any) %>% unique() %>% sort()
if (!!length(index)) {
out$is_explicit <- !!length(unlist(index_any))
out$is_coi_pred <- !!length(index)
out$coi_text <- article[index] %>% paste(collapse = " ")
index_any %<>% purrr::map(function(x) !!length(x))
if (length(index) == 1) {
coi_only <- dict$conflict_title %>% .encase %>% .title_strict
is_coi_only <- stringr::str_detect(out$coi_text, coi_only)
if (is_coi_only & !is.na(article[index + 1])) {
out$coi_text <- article[c(index, index + 1)] %>% paste(collapse = ": ")
}
}
return(tibble::as_tibble(c(out, index_any, index_ack)))
}
# Identify potentially missed signals
i <- which(article %in% c(ack, footnotes))
if (!!length(i)) {
index_ack$fees_1 <- .which_fees_1(article_processed[i], dict)
index_ack$consults_1 <- .which_consults_1(article_processed[i], dict)
index_ack$connect_1 <- .which_connections_1(article_processed[i], dict)
index_ack$connect_2 <- .which_connections_2(article_processed[i], dict)
index_ack$commercial_ack_1 <-
.which_commercial_ack_1(article_processed[i], dict)
index_ack$rights_1 <- .which_rights_1(article_processed[i], dict)
index_ack$founder_1 <- .which_founder_1(article_processed[i], dict)
index_ack$advisor_1 <- .which_advisor_1(article_processed[i], dict)
index_ack$paid_1 <- .which_paid_1(article_processed[i], dict)
index_ack$board_1 <- .which_board_1(article_processed[i], dict)
index_ack$no_coi_1 <- .which_no_coi_1(article_processed[i], dict)
index_ack$no_funder_role_1 <-
.which_no_funder_role_1(article_processed[i], dict)
index <- i[unlist(index_ack) %>% unique() %>% sort()]
index_ack %<>% purrr::map(function(x) !!length(x))
}
out$is_coi_pred <- !!length(index)
out$coi_text <- article[index] %>% paste(collapse = " ")
index_any %<>% purrr::map(function(x) !!length(x))
if (out$is_coi_pred) {
out$is_explicit <- FALSE
}
return(tibble::as_tibble(c(out, index_any, index_ack)))
}
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.