#' Creates an environment containing CDB files
#'
#' Extracts SNOMED CT concepts from appropriate places in the
#' hierarchy to create a set of CDB files in an environment.
#' Uses WordNet and manual synonyms if available.
#'
#' @param SNOMED environment containing a SNOMED dictionary
#' @param TRANSITIVE transitive closure table, generated by
#' createTransitive. It is generated automatically if not provided.
#' @param WN WordNet data.table as returned by downloadWordnet
#' containing WordNet data from appropriate
#' categories, in the format: cat (character), wordnetId (integer64),
#' synonyms (list), parents (list), adj (list)
#' @param MANUAL_SYNONYMS data.table with columns snomed, synonym and
#' bidirectional, containing additional exact synonyms or abbreviations
#' @param noisy whether to output status messages
#' @param stopwords vector of stopwords
#' @return environment containing the following data tables: FINDINGS,
#' QUAL, CAUSES, OTHERCAUSE, BODY, MORPH, SEVERITY, LATERALITY,
#' BODY_LATERALITY, STAGE, OVERLAP, SEMTYPE and TRANSITIVE, together
#' with SNOMED CT metadata and vectors of key concept IDs
#' @export
#' @family CDB functions
#' @seealso exportMiADECDB, MANUAL_SYNONYMS
#' @examples
#' # Not run
#' # data(MANUAL_SYNONYMS)
#' # WN <- downloadWordnet()
#' # MANUAL_SYNONYMS <- rbind(MANUAL_SYNONYMS, downloadOrphanet())
#' # CDBNEW <- createCDB(WN = WN, MANUAL_SYNONYMS = MANUAL_SYNONYMS)
createCDB <- function(SNOMED = getSNOMED(), TRANSITIVE = NULL,
WN = NULL, MANUAL_SYNONYMS = NULL, noisy = TRUE,
stopwords = c('the', 'of', 'by', 'with', 'to', 'into', 'and', 'or',
'at', 'as', 'and/or', 'in')){
# Returns an environment containing data.tables used for
# generating decompositions
CDB <- new.env()
# Declare symbols to avoid R check error
type <- conceptId <- term <- typeId <- adj <- wordnetId <- NULL
synonyms <- syn <- part <- whole <- snomed <- synonym <- NULL
extra <- sourceId <- destinationId <- laterality <- NULL
nonlat_parentId <- N <- semType <- findingId <- otherId <- NULL
descendantId <- ancestorId <- bidirectional <- multipart <- NULL
#### USEFUL FUNCTIONS ####
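# s(): look up SNOMED CT concept IDs from descriptions or IDs
# desc() / anc(): descendants / ancestors using the transitive closure table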
s <- function(x) SNOMEDconcept(x, SNOMED = SNOMED)
desc <- function(x, ...){
descendants(x, SNOMED = SNOMED, TRANSITIVE = TRANSITIVE, ...)
}
anc <- function(x, ...){
ancestors(x, SNOMED = SNOMED, TRANSITIVE = TRANSITIVE, ...)
}
init <- function(x){
x <- desc(x)
# Add acronyms e.g. 'AF - atrial fibrillation' --> AF
# Add concepts with phrases in parentheses removed unless
# the phrase in brackets alters the meaning of the root phrase
# , e.g.
# ERCP (Endoscopic retrograde cholangiopancreatography) normal
# --> ERCP normal
# but not if brackets start with except|exclud|with|without
DESC <- description(x, SNOMED = SNOMED,
include_synonyms = TRUE)[type == 'Synonym']
if (nrow(DESC) > 0){
INIT <- rbind(DESC[, list(conceptId, term = std_term(term))],
DESC[, list(conceptId, term = std_term(term,
stopwords = stopwords, hyphens_to_space = TRUE))],
DESC[, list(conceptId, term = std_term(term,
stopwords = stopwords, remove_stopwords = TRUE))],
DESC[, list(conceptId, term = std_term(term,
stopwords = stopwords,
remove_words_in_parentheses = TRUE))],
acronyms(x, SNOMED = SNOMED)[, list(conceptId,
term = paste0(' ', term, ' '))])
return(INIT[!duplicated(INIT)])
} else {
return(DESC)
}
}
#### CAUSES (NOT USING TRANSITIVE TABLE) ####
if (noisy) message('Initialising causes.')
CAUSES <- description(unique(
SNOMED$RELATIONSHIP[typeId %in% s(
c('Due to', 'Causative agent', 'After'))]$destinationId),
SNOMED = SNOMED, include_synonyms = TRUE)[type == 'Synonym',
list(conceptId, term = std_term(term))]
#### GENERATE TRANSITIVE TABLE ####
# Include sampleSNOMED() for testing purposes
if (is.null(TRANSITIVE)){
if (noisy) message('Creating transitive closure table.')
TRANSITIVE <- createTransitive(c(sampleSNOMED()$CONCEPT$id,
descendants(c('Clinical finding', 'Qualifier value',
'Body structure'), SNOMED = SNOMED, include_self = TRUE),
CAUSES$conceptId), SNOMED = SNOMED)
if (noisy){
message(paste0('Transitive closure table created, ',
nrow(TRANSITIVE), ' rows.'))
}
}
#### FINDINGS ####
if (noisy) message('Initialising findings and qualifiers.')
FINDINGS <- init('Clinical finding')
#### QUALIFIERS ####
QUAL <- init(c('Grades (qualifier value)',
'Groups (qualifier value)', 'Levels (qualifier value)',
'Scores (qualifier value)', 'Types (qualifier value)',
'Classification system (qualifier value)',
'Descriptor (qualifier value)', 'Finding value (qualifier value)',
'Courses (qualifier value)',
'General site descriptor (qualifier value)'))
#### BODY STRUCTURES ####
if (noisy) message('Initialising body structures.')
BODY <- init('Body structure')
# Remove 'entire', 'structure of', 'of' and 'the'
strip_structure <- ' entire | region structure | structure of | structure | of the | of | the '
if (nrow(BODY) > 0){
BODY <- rbind(BODY, BODY[, list(conceptId,
term = gsub(strip_structure, ' ',
gsub(strip_structure, ' ', term)))])
BODY <- BODY[!duplicated(BODY)]
# Remove structure type (e.g. 'muscle structure' etc.) if a name is
# unique (e.g. there is only one rectus femoris and it is a muscle)
structure_types <- c(' muscle | skeletal muscle ', ' ligament ',
' tendon ', ' bone | bone structure ', ' joint ', ' artery ',
' vein ')
structure_terms <- lapply(structure_types, function(structure_type){
# Find concepts with structure
ANCESTOR <- BODY[term %like% paste0('^(', structure_type, ')$')]
# first generation children are non-specific and are not to have
# their body type stripped out
if (nrow(ANCESTOR) > 0){
if (noisy) message(paste0('Removing the word(s) ',
structure_type, ' where possible.'))
TEMP <- BODY[term %like% structure_type &
conceptId %in% desc(ANCESTOR$conceptId),
list(conceptId, term = gsub(paste0(strip_structure, '|',
structure_type), ' ', gsub(paste0(strip_structure, '|',
structure_type), ' ', term)))]
TEMP <- TEMP[!duplicated(TEMP)]
TEMP
} else {
data.table(conceptId = bit64::integer64(0),
term = character(0))
}
})
setattr(structure_terms, 'names', structure_types)
# For each structure type, we have a list of putative terms.
# Need to check if these are unique
other_structure_terms <- lapply(structure_types, function(structure_type){
# Find concepts with structure
unique(unlist(lapply(
structure_terms[setdiff(structure_types, structure_type)],
function(x){
x$term
})))
})
setattr(other_structure_terms, 'names', structure_types)
# Body parts without a structure
body_ancestors <- BODY[term %like% paste0('^(',
paste(structure_types, collapse = '|'), ')$')]$conceptId
OTHER_BODY_PARTS <- BODY[!(conceptId %in% desc(body_ancestors))]
# Find concepts for generic body parts (these should not have
# their body type stripped)
firstgen <- BODY[term %like% paste0('^(',
paste(structure_types, collapse = '|'),
')(part |)$')]$conceptId
firstgen <- children(firstgen, include_self = TRUE,
SNOMED = SNOMED)
# Find unique structure terms without body part type
unique_structure_terms <- lapply(structure_types,
function(structure_type){
TEMP <- structure_terms[[structure_type]]
searchtext <- paste(setdiff(structure_types, structure_type),
collapse = '|')
exclude <- c(TEMP[(term %in% c(OTHER_BODY_PARTS$term,
other_structure_terms[[structure_type]])) |
(term %like% searchtext)]$conceptId,
firstgen)
# Exclude if term is ambiguous or has another structure
# type mentioned
TEMP[!(conceptId %in% exclude),
list(conceptId, term, structure_type)]
}
)
setattr(unique_structure_terms, 'names', structure_types)
EXTRA_WITHOUT_STRUCTURE <- rbindlist(unique_structure_terms)
BODY <- rbind(BODY, EXTRA_WITHOUT_STRUCTURE[,
list(conceptId, term)])
}
#### ADDING WORDNET ADJECTIVES ####
if (!is.null(WN)){
if (noisy) message('Adding WordNet adjectives.')
# Extra concepts for
# XXX of XXX --> XXX XXX (adjectival)
# e.g. neck of femur --> femoral neck
# dissection of artery --> arterial dissection
ADJ <- WN[, list(adj = adj[1][[1]]), by = list(wordnetId, cat)]
ADJ[, adj := sub('[1-9]$', '', adj)]
SYN <- WN[, list(syn = synonyms[1][[1]]), by = wordnetId]
SYN[, syn := sub('[1-9]$', '', syn)]
ADJ <- merge(ADJ, SYN, by = 'wordnetId',
allow.cartesian = TRUE)[adj != syn, list(adj, syn, cat)]
ADJ <- ADJ[!duplicated(ADJ)]
# Body structures
ADJBODY <- BODY[term %like% '^ ([[:alpha:]]+) of ([[:alpha:]]+) $']
ADJBODY[, part := sub('^ ([[:alpha:]]+) of ([[:alpha:]]+) $',
'\\1', term)]
ADJBODY[, whole := sub('^ ([[:alpha:]]+) of ([[:alpha:]]+) $',
'\\2', term)]
TEMP <- merge(ADJBODY[, list(syn = whole, part, conceptId)],
ADJ[cat %in% c('noun.body')], by = 'syn')
if (nrow(TEMP) > 0){
BODY <- rbind(BODY, TEMP[,
list(conceptId, term = paste0(' ', adj, ' ', part, ' '))])
if (noisy){
message(paste0('Added ', nrow(TEMP),
' adjectival phrases for body structures.'))
}
BODY <- BODY[!duplicated(BODY)]
}
# Findings
ADJFINDINGS <- FINDINGS[
term %like% '^ ([[:alpha:]]+) of ([[:alpha:]]+) $' &
!(term %like% ' (fear|risk|finding|observation) of ')]
ADJFINDINGS[, part := sub('^ ([[:alpha:]]+) of ([[:alpha:]]+) $',
'\\1', term)]
ADJFINDINGS[, whole := sub('^ ([[:alpha:]]+) of ([[:alpha:]]+) $',
'\\2', term)]
TEMP <- merge(ADJFINDINGS[, list(syn = whole, part, conceptId)],
ADJ[cat %in% c('noun.state', 'noun.process', 'noun.phenomenon')],
by = 'syn')
if (nrow(TEMP) > 0){
FINDINGS <- rbind(FINDINGS, TEMP[,
list(conceptId, term = paste0(' ', adj, ' ', part, ' '))])
if (noisy){
message(paste0('Added ', nrow(TEMP),
' adjectival phrases for findings.'))
}
FINDINGS <- FINDINGS[!duplicated(FINDINGS)]
}
TEMP <- NULL
}
# Severity codes as per FHIR valueset plus a few extra
if (noisy) message('Creating severity and stage lists.')
SEVERITY <- QUAL[conceptId %in% desc(c(
'Degrees of severity (qualifier value)',
'Severities (qualifier value)'))]
QUAL <- QUAL[!(conceptId %in% SEVERITY$conceptId)]
# Concepts for stage definition as per FHIR valueset with a few extra
# from 'Stages'
STAGE <- desc(c('Stages', 'Tumour stage finding'))
STAGE <- rbind(FINDINGS[conceptId %in% STAGE],
QUAL[conceptId %in% STAGE])
QUAL <- QUAL[!(conceptId %in% STAGE$conceptId)]
#### LATERALITY ####
# Create a list of lateralised body structures
if (noisy) message('Creating lists of lateralised structures.')
left_structures <- relatedConcepts('Left', typeId = 'Laterality',
reverse = TRUE, SNOMED = SNOMED)
right_structures <- relatedConcepts('Right', typeId = 'Laterality',
reverse = TRUE, SNOMED = SNOMED)
lateralisable_structures <- relatedConcepts('Side', typeId = 'Laterality',
reverse = TRUE, SNOMED = SNOMED)
bilateral_structures <- intersect(left_structures, right_structures)
# Laterality concepts
CDB$latConcepts <- s(c('Left', 'Right', 'Bilateral'))
setattr(CDB$latConcepts, 'names', c('Left', 'Right', 'Bilateral'))
LATERALITY <- QUAL[conceptId %in% CDB$latConcepts]
LATERALITY <- rbind(LATERALITY,
data.table(conceptId = s('Left'),
term = c(' L ', ' lt ', ' left sided ')))
LATERALITY <- rbind(LATERALITY,
data.table(conceptId = s('Right'),
term = c(' R ', ' rt ', ' right sided ')))
LATERALITY <- rbind(LATERALITY,
data.table(conceptId = s('Bilateral'),
term = c(' left and right ', ' both sided ', ' bilat ',
' L and R ', ' R and L ')))
QUAL <- QUAL[!(conceptId %in% LATERALITY$conceptId)]
QUAL <- QUAL[!(term %in% paste0(' ', stopwords, ' '))]
#### PREPARE CDB ENVIRONMENT ####
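# addmw(): add manual synonyms (MANUAL_SYNONYMS) and, if available,
# WordNet synonyms from the given categories to a concept-term table,
# returning a de-duplicated table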
addmw <- function(X, wn_categories){
if (!is.null(MANUAL_SYNONYMS)){
M <- copy(as.data.table(MANUAL_SYNONYMS))
M <- M[!duplicated(M)]
M[, snomed := sub('^ *', ' ', sub(' *$', ' ', snomed))]
M[, synonym := sub('^ *', ' ', sub(' *$', ' ', synonym))]
X <- rbind(X,
merge(X, M[, list(term = snomed, extra = synonym)],
by = 'term')[, list(conceptId, term = extra)],
merge(X, M[bidirectional == TRUE, list(term = synonym,
extra = snomed)], by = 'term')[,
list(conceptId, term = extra)], fill = TRUE)
}
if (!is.null(WN)){
return(addWordnet(X, wn_categories = wn_categories,
WN = WN, CHECK_TABLE = rbind(QUAL, LATERALITY,
FINDINGS, BODY, STAGE, CAUSES, SEVERITY,
fill = TRUE), noisy = noisy))
} else {
return(X[!duplicated(X)])
}
}
CDB$FINDINGS <- addmw(FINDINGS, c('noun.state',
'noun.process', 'noun.phenomenon'))
CDB$QUAL <- addmw(QUAL, c('noun.state',
'noun.process', 'noun.phenomenon'))
CDB$CAUSES <- addmw(CAUSES, c('noun.state',
'noun.process', 'noun.phenomenon', 'noun.animal', 'noun.plant'))
BODY <- addmw(BODY, c('noun.body'))
BODY <- BODY[!term %in% LATERALITY$term]
#### PROCESS LATERALITY
# Find the lateralised version of each concept
# Need to allow for empty set of lateralised structures
lat_structures <- union(left_structures,
union(right_structures, bilateral_structures))
LAT <- SNOMED$RELATIONSHIP[sourceId %in% lat_structures
& typeId == s('Is a')]
if (nrow(LAT) == 0){
LAT <- data.table(conceptId = bit64::as.integer64(0),
nonlat_parentId = bit64::as.integer64(0))
} else {
LAT <- LAT[!(destinationId %in% lat_structures),
list(conceptId = sourceId, nonlat_parentId = destinationId)]
LAT <- LAT[!duplicated(LAT)]
}
# Create a laterality flag
BODY[, laterality := "No laterality"]
BODY[conceptId %in% lateralisable_structures,
laterality := "Lateralisable"]
BODY[conceptId %in% left_structures, laterality := 'Left']
BODY[conceptId %in% right_structures, laterality := 'Right']
BODY[conceptId %in% bilateral_structures,
laterality := 'Bilateral']
# Mark body site concepts that actually describe two separate
# body parts (e.g. proximal end of radius and ulna), based on
# description text containing 'and', 'or', 'and/or' or '(combined site)'
BODY[, multipart := conceptId %in%
description(unique(BODY$conceptId), SNOMED = SNOMED)[
tolower(term) %like%
' and | or | and/or |\\(combined site\\)']$conceptId]
# Remove morphologic abnormalities from BODY to FINDINGS
MORPH <- BODY[semanticType(conceptId, SNOMED = SNOMED) ==
'morphologic abnormality', list(conceptId, term)]
if (nrow(MORPH) > 0){
BODY <- BODY[!(conceptId %in% MORPH$conceptId)]
}
CDB$MORPH <- MORPH
CDB$BODY <- BODY
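# BODY_LATERALITY: lookup of lateralised body structures (Left / Right /
# Bilateral) against their non-lateralised parent concepts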
CDB$BODY_LATERALITY <- merge(BODY, LAT, by = 'conceptId')
CDB$BODY_LATERALITY <- CDB$BODY_LATERALITY[
laterality %in% c('Left', 'Right', 'Bilateral'),
list(conceptId, laterality, nonlat_parentId)]
CDB$BODY_LATERALITY <- CDB$BODY_LATERALITY[
!duplicated(CDB$BODY_LATERALITY)]
# Remove ambiguous parent concepts
toremove <- CDB$BODY_LATERALITY[, .N,
by = list(laterality, nonlat_parentId)][N > 1]$nonlat_parentId
if (nrow(CDB$BODY_LATERALITY) > 0){
CDB$BODY_LATERALITY <- CDB$BODY_LATERALITY[
!(nonlat_parentId %in% toremove) & !is.na(nonlat_parentId)]
}
CDB$SEVERITY <- SEVERITY
CDB$LATERALITY <- LATERALITY
CDB$STAGE <- STAGE
# OTHERCAUSE = causes which are not also findings (may be events,
# substances or organisms)
CDB$OTHERCAUSE <- CAUSES[!(conceptId %in% FINDINGS$conceptId)]
CDB$OTHERCAUSE[, semType := semanticType(conceptId, SNOMED = SNOMED)]
# OVERLAP = concepts that are in findings as well as another
# (qual etc.)
FindingsFSN <- description(CDB$FINDINGS$conceptId, SNOMED = SNOMED)[,
list(findingId = conceptId, term = sub(' \\([^\\(]+\\)$', '', term))]
OtherFSN <- description(c(CDB$QUAL$conceptId, CDB$BODY$conceptId,
CDB$MORPH$conceptId), SNOMED = SNOMED)[,
list(otherId = conceptId, term = sub(' \\([^\\(]+\\)$', '', term))]
CDB$OVERLAP <- merge(FindingsFSN, OtherFSN)[, list(findingId, otherId)]
CDB$OVERLAP <- CDB$OVERLAP[!duplicated(CDB$OVERLAP)]
CDB$TRANSITIVE <- TRANSITIVE
CDB$metadata <- SNOMED$metadata
CDB$SCT_assoc <- s('Associated with')
CDB$SCT_cause <- s('Causative agent')
CDB$SCT_after <- s('After')
CDB$SCT_dueto <- s('Due to')
CDB$SCT_findingsite <- s('Finding site')
CDB$SCT_disorder <- s('Disorder')
CDB$SCT_finding <- s('Clinical finding')
CDB$allergyConcepts <- union(s(c('Allergic disposition',
'Adverse reaction',
'Intolerance to substance',
'Hypersensitivity disposition')), s('281647001'))
# remove allergy as synonym of 'allergic reaction'. Ensure that
# there is at least one allergy concept regardless of version of
# SNOMED dictionary used, to avoid error.
CDB$stopwords <- stopwords
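# SEMTYPE: lookup table assigning a semantic type to each CDB concept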
CDB$SEMTYPE <- rbind(
CDB$FINDINGS[, list(conceptId, semType = semanticType(conceptId,
SNOMED = SNOMED))],
CDB$MORPH[, list(conceptId, semType = 'morphologic abnormality')],
CDB$BODY[, list(conceptId, semType = 'body structure')],
CDB$QUAL[, list(conceptId, semType = 'qualifier value')],
CDB$LATERALITY[, list(conceptId, semType = 'laterality')],
CDB$SEVERITY[, list(conceptId, semType = 'severity')],
CDB$OTHERCAUSE[!(conceptId %in% c(CDB$FINDINGS$conceptId,
CDB$BODY$conceptId, CDB$QUAL$conceptId,
CDB$LATERALITY$conceptId, CDB$SEVERITY$conceptId)),
list(conceptId, semType)])
# contents of stage are mostly findings so don't need to include
CDB$SEMTYPE <- CDB$SEMTYPE[!duplicated(CDB$SEMTYPE)]
# Set keys for fast searching
if (noisy) message('Creating indices for fast searching')
setkey(CDB$SEMTYPE, conceptId)
setkey(CDB$FINDINGS, term); setindex(CDB$FINDINGS, conceptId)
setkey(CDB$MORPH, term); setindex(CDB$MORPH, conceptId)
setkey(CDB$BODY, term); setindex(CDB$BODY, conceptId)
setkey(CDB$QUAL, term); setindex(CDB$QUAL, conceptId)
setkey(CDB$LATERALITY, term); setindex(CDB$LATERALITY, conceptId)
setkey(CDB$SEVERITY, term); setindex(CDB$SEVERITY, conceptId)
setkey(CDB$STAGE, term); setindex(CDB$STAGE, conceptId)
setkey(CDB$OTHERCAUSE, term); setindex(CDB$OTHERCAUSE, conceptId)
setkey(CDB$CAUSES, term); setindex(CDB$CAUSES, conceptId)
setkey(CDB$OVERLAP, otherId)
setkey(CDB$BODY_LATERALITY, conceptId)
setindex(CDB$BODY_LATERALITY, laterality)
setindex(CDB$BODY_LATERALITY, nonlat_parentId)
return(CDB)
}
createDisambiguationTrainer <- function(CDB, SNOMED){
# Create disambiguation trainer for unigrams among SNOMED concepts
# of semantic type 'Clinical finding' and 'Body structure'
# Output a data.table table text,p with {p ...} surrounding the
# acronym, which can be converted to MedCAT training data
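# Example (not run):
# TRAINING <- createDisambiguationTrainer(CDB, SNOMED = getSNOMED())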
# Declare symbols to avoid R check error
term <- conceptId <- has_unigram <- text <- NULL
findings_to_disambiguate <- unique(CDB$FINDINGS[,
list(has_unigram = any(term %like% '^ [[:alpha:]]+ $')),
by = conceptId][has_unigram == TRUE]$conceptId)
body_to_disambiguate <- unique(CDB$BODY[,
list(has_unigram = any(term %like% '^ [[:alpha:]]+ $')),
by = conceptId][has_unigram == TRUE]$conceptId)
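# disambiguate_concept(): for a given concept, find descendant terms
# that contain one of the concept's own terms, and wrap that term in
# {prefix ...} markers to create training text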
disambiguate_concept <- function(the_conceptId, prefix){
terms <- union(CDB$FINDINGS[conceptId == the_conceptId]$term,
CDB$BODY[conceptId == the_conceptId]$term)
descendant_concepts <- descendants(the_conceptId,
SNOMED = SNOMED, TRANSITIVE = CDB$TRANSITIVE,
include_self = FALSE)
descendant_terms <- union(
CDB$FINDINGS[conceptId %in% descendant_concepts]$term,
CDB$BODY[conceptId %in% descendant_concepts]$term)
if (length(terms) > 0 & length(descendant_terms) > 0){
rbindlist(lapply(terms, function(term){
if (any(descendant_terms %like% term)){
data.table(conceptId = the_conceptId,
text = sub(term, paste0(' {', prefix,
sub(' $', '', term), '} '),
descendant_terms[descendant_terms %like% term]))
} else {
data.table(conceptId = bit64::integer64(0),
text = character(0))
}
}))
} else {
data.table(conceptId = bit64::integer64(0),
text = character(0))
}
}
OUT <- rbind(
rbindlist(lapply(seq_along(findings_to_disambiguate), function(x){
disambiguate_concept(findings_to_disambiguate[x], prefix = 'p')
})),
rbindlist(lapply(seq_along(body_to_disambiguate), function(x){
disambiguate_concept(body_to_disambiguate[x], prefix = 'p')
}))
)
# Exclude entries with no context
OUT <- OUT[!(text %like% '^ \\{.*\\} $')]
}
#' Exports CDB files for MedCAT / MiADE
#'
#' Produces a set of files for the findings / problems algorithm of
#' MedCAT and MiADE. Uses the CDB environment created using createCDB
#' which can incorporate additional manual synonyms or synonyms from
#' WordNet.
#'
#' The following files are exported:
#'
#' For MedCAT (named entity recognition and linking):
#'
#' \describe{
#' \item{cdb_problems.csv}{ - CSV file in MedCAT concept database
#' format
#' containing cui (SNOMED CT concept ID), name, name_status ('P'
#' for preferred term, 'N' for terms that must be disambiguated
#' (e.g. acronyms or short terms), 'A' for synonym),
#' ontologies = SNO (for SNOMED CT)}
#' }
#'
#' For MiADE postprocessing:
#'
#' \describe{
#' \item{negated.csv}{ - CSV file with columns findingId (SNOMED CT
#' concept ID of the underlying finding / disorder) and
#' situationId (SNOMED CT concept ID of the pre-coordinated
#' situation concept for negation of the finding / disorder).
#' Sorted by findingId.}
#' \item{historic.csv}{ - CSV file with columns findingId (SNOMED CT
#' concept ID of the underlying finding / disorder) and
#' situationId (SNOMED CT concept ID of the pre-coordinated
#' situation concept for 'history of' the finding / disorder).
#' Sorted by findingId.}
#' \item{suspected.csv}{ - CSV file with columns findingId (SNOMED CT
#' concept ID of the underlying finding / disorder) and
#' situationId (SNOMED CT concept ID of the pre-coordinated
#' situation concept for 'suspected' finding / disorder).
#' Sorted by findingId.}
#' \item{overlap.csv}{ - CSV file with columns findingId (SNOMED CT
#' concept ID of the underlying finding / disorder) and
#' otherId (SNOMED CT concept ID of a concept with the same
#' description but a different semantic type, typically a
#' morphologic abnormality). Sorted by otherId.}
#' \item{problem_blacklist.csv}{ - CSV file without header with one
#' column containing SNOMED CT concept IDs for concepts that
#' may be identified by MedCAT as part of text analysis but
#' should not be included in final MiADE output. Examples include
#' procedure codes which may be used to link to precoordinated
#' `history of...' concepts. This file can also be used to
#' force MiADE to ignore any specific SNOMED CT concepts in the
#' output. Sorted in ascending order.}
#' }
#'
#' For more information about MiADE, visit
#' \url{https://www.ucl.ac.uk/health-informatics/research/miade/miade-software-and-availability}
#'
#' For more information about MedCAT, visit
#' \url{https://github.com/CogStack/MedCAT}
#'
#' @param CDB concept database environment created by createCDB
#' @param export_folderpath folder path to export to
#' @param lang_refset_files character vector of file paths to
#' SNOMED CT language refset files, in order to identify the
#' preferred term for each concept. If NULL, the Fully Specified Name
#' minus the semantic type suffix is used as the preferred term
#' (e.g. if the Fully Specified Name is `Cancer (disorder)', the
#' default preferred term is `Cancer').
#' @param exclude a SNOMEDconcept or SNOMEDcodelist object specifying
#' concepts to exclude from the concept database. By
#' default, all concepts in the FINDINGS, CAUSES, BODY, LATERALITY,
#' MORPH, SEVERITY, STAGE and QUAL tables will be included.
#' @param include a SNOMEDconcept or SNOMEDcodelist object specifying
#' additional concepts to include in the concept database. By
#' default, all findings are included for potential export, but
#' there may be additional concepts of other semantic types
#' (e.g. situation concepts) that need to be included. Inclusion
#' takes place after exclusion, i.e. a concept in both the include
#' and exclude lists will be included.
#' @param exclude_historic a SNOMEDconcept or SNOMEDcodelist object
#' specifying concepts to be excluded from the `historic' lookup,
#' i.e. those that should not be converted into historic forms.
#' The default is to not do this conversion for disorders, only for
#' procedures.
#' @param blacklist a SNOMEDconcept or SNOMEDcodelist object specifying
#' concepts to filter out of the final output. By default, concepts
#' in the CDB of any semantic type other than `finding' or `disorder'
#' are excluded. The blacklist can be used to exclude a subset of
#' findings or disorders that are not useful for the particular
#' application.
#' @param SNOMED environment containing a SNOMED dictionary
#' @seealso createCDB, downloadWordnet, downloadOrphanet, MANUAL_SYNONYMS,
#' exclude_irrelevant_findings
#' @return TRUE if successful
#' @family MiADE functions
#' @export
#' @examples
#' # Not run
#' # exportMiADECDB(CDB, export_folderpath = tempdir())
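#' # With exclusion and blacklist (assumed workflow, not run):
#' # exportMiADECDB(CDB, export_folderpath = tempdir(),
#' #   exclude = exclude_irrelevant_findings(),
#' #   blacklist = blacklist_vague_findings())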
exportMiADECDB <- function(CDB, export_folderpath,
lang_refset_files = NULL, exclude = NULL, include = NULL,
exclude_historic = descendants('Disorder', SNOMED = getSNOMED()),
blacklist = NULL, SNOMED = getSNOMED()){
# Declare symbols to avoid R check error
active <- effectiveTime <- referencedComponentId <- NULL
pref <- acceptabilityId <- Synonym <- typeId <- NULL
conceptId <- term <- lowerterm <- destinationId <- NULL
sourceId <- situationId <- findingId <- keep <- choose <- NULL
Nsit <- toremove <- name_status <- name <- otherId <- NULL
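# Ensure the export folder path ends with a path separator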
export_folderpath <- sub('([^/\\])$', '\\1/', export_folderpath)
if (!is.null(lang_refset_files)){
message(paste0('Loading language refset from ',
lang_refset_files))
# Load up language Refset
LANG <- data.table(id = character(0), effectiveTime = integer(0),
active = integer(0), moduleId = bit64::integer64(0),
refsetId = bit64::integer64(0),
referencedComponentId = bit64::integer64(0),
acceptabilityId = bit64::integer64(0))
for (i in lang_refset_files){
LANG <- rbind(LANG, fread(i), fill = TRUE)
}
# Keep only the most recent value for active concepts
LANG[, keep := active == 1 & effectiveTime == max(effectiveTime),
by = referencedComponentId]
LANG <- LANG[keep == TRUE]
LANG <- LANG[!duplicated(LANG)]
# Mark the UK preferred terms
sct_pref <- bit64::as.integer64(SNOMEDconcept('Preferred',
SNOMED = SNOMED))
SNOMED$DESCRIPTION[, pref :=
LANG[, list(pref = any(acceptabilityId == sct_pref)),
by = list(id = referencedComponentId)][
SNOMED$DESCRIPTION, on = 'id']$pref]
synonym_type <- SNOMEDconcept('Synonym', SNOMED = SNOMED)
SNOMED$DESCRIPTION[, Synonym := typeId == synonym_type]
# Select a single UK preferred concept
setkey(SNOMED$DESCRIPTION, conceptId, pref, Synonym)
SCT <- SNOMED$DESCRIPTION[, choose := c(rep(FALSE, .N - 1),
TRUE), by = conceptId][choose == TRUE, list(conceptId, term)]
message('Using the language preferred terms')
} else {
# Choose the Fully Specified Name without the suffix
setkey(SNOMED$DESCRIPTION, conceptId, typeId)
SCT <- SNOMED$DESCRIPTION[, choose := c(TRUE, rep(FALSE, .N - 1)),
by = conceptId][choose == TRUE,
list(conceptId, term = sub(' \\([^\\)]+\\)$', '', term))]
message('Using Fully Specified Names without suffixes as preferred terms')
}
# remove old ICD-10-style concepts like [X]Depression
SCT[, term := sub('^\\[X\\]', '', term)]
SCT[, lowerterm := tolower(term)]
#### SUSPECTED, HISTORIC, NEGATED, BLACKLIST ####
# Suspected, Historic, Negated version of concepts (where available)
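# relation_source(): concepts that have any relationship pointing to
# the given destination concept (e.g. a finding context value)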
relation_source <- function(destination){
sort(unique(as.SNOMEDconcept(SNOMED$RELATIONSHIP[
destinationId %in% SNOMEDconcept(destination,
SNOMED = SNOMED)]$sourceId)))
}
# Procedures can be used for historic concepts for problem list
# but not for current procedures
message('Creating SNOMED CT relation lists')
findings <- descendants('Clinical finding', SNOMED = SNOMED,
include_self = TRUE)
disorders <- descendants('Disorder', SNOMED = SNOMED)
acute_diseases <- descendants('Acute disease', SNOMED = SNOMED)
procedures <- descendants('Procedure', SNOMED = SNOMED)
suspected <- relation_source('Suspected')
known_absent <- relation_source('Known absent')
known_present <- relation_source('Known present')
subject_of_record <- relation_source('Subject of record')
family_person <- relation_source('Person in family of subject')
in_the_past <- relation_source('In the past')
done <- relation_source('Done')
current_or_specified_time <-
relation_source('Current or specified time')
ASSOC_FINDPROC <- SNOMED$RELATIONSHIP[
typeId %in% SNOMEDconcept(c('Associated finding',
'Associated procedure'), SNOMED = SNOMED),
list(situationId = sourceId, findingId = destinationId)]
if (is.null(exclude_historic)){
exclude_historic <- bit64::as.integer64(0)
} else if (is.SNOMEDcodelist(exclude_historic)){
exclude_historic <- as.SNOMEDconcept(exclude_historic$conceptId)
} else {
exclude_historic <- as.SNOMEDconcept(exclude_historic)
}
HISTORIC <- ASSOC_FINDPROC[situationId %in% intersect(
c(known_present, done),
intersect(subject_of_record, in_the_past)) &
findingId %in% setdiff(union(procedures, disorders),
exclude_historic)][order(findingId)]
# Keep if only one situation per finding/procedure, to avoid errors
# e.g. 'Implantation procedure' --> 'History of heart valve recipient'
HISTORIC[, Nsit := .N, by = findingId]
HISTORIC <- HISTORIC[Nsit == 1 &
findingId != SNOMEDconcept('Aftercare', SNOMED = SNOMED)]
NEGATED <- ASSOC_FINDPROC[situationId %in% intersect(known_absent,
intersect(subject_of_record, current_or_specified_time))][
order(findingId)]
# Keep if only one situation per finding, to avoid errors
NEGATED[, Nsit := .N, by = findingId]
NEGATED <- NEGATED[Nsit == 1]
SUSPECTED <- ASSOC_FINDPROC[situationId %in% intersect(suspected,
intersect(subject_of_record, current_or_specified_time))][
order(findingId)]
# Keep if only one situation per finding, to avoid errors
SUSPECTED[, Nsit := .N, by = findingId]
SUSPECTED <- SUSPECTED[Nsit == 1]
#### PROCESS SNOMED CONCEPTS ####
message('Initialising CDB')
SCT_CDB <- rbind(CDB$FINDINGS, CDB$CAUSES, CDB$BODY,
CDB$LATERALITY, CDB$MORPH, CDB$SEVERITY, CDB$STAGE, CDB$QUAL,
fill = TRUE)[, list(conceptId, term = gsub('^ +| +$', '', term),
lowerterm = tolower(gsub('^ +| +$', '', term)))]
SCT_CDB <- SCT_CDB[!duplicated(SCT_CDB)]
# Exclude SCT_CDB synonyms that have the same lowerterm as
# a preferred term (in which case to keep the preferred term
# wording as is)
# Find terms to remove that correspond to preferred names
# (in order to re-capitalise)
MATCH <- merge(SCT_CDB, SCT[, list(conceptId,
lowerterm = tolower(term))], by = c('conceptId', 'lowerterm'))
# Mark terms to remove
SCT_CDB <- merge(SCT_CDB,
MATCH[, list(conceptId, lowerterm, toremove = TRUE)],
by = c('conceptId', 'lowerterm'), all.x = TRUE)
# Remove terms and replace with original term for preferred name
SCT <- rbind(SCT_CDB[is.na(toremove), list(conceptId,
term, name_status = 'A')],
SCT[conceptId %in% SCT_CDB$conceptId, list(conceptId,
term, name_status = 'P')])
rm(MATCH)
message('Concept database has ', nrow(SCT), ' rows (',
uniqueN(SCT$conceptId), ' concepts).')
# Verify only one preferred per concept Id
stopifnot(nrow(SCT[name_status == 'P']) == SCT[name_status == 'P'][,
uniqueN(conceptId)])
stopifnot(nrow(SCT[name_status == 'P']) == SCT[,
uniqueN(conceptId)])
# Force disambiguation for all short terms (less than 4 characters)
SCT[name_status == 'A' & nchar(term) <= 3, name_status := 'N']
#### INCLUSION AND EXCLUSION ####
# Remove exclusion concepts
if (is.null(exclude)){
message('No exclusions.')
} else {
message('Applying exclusion list.')
if (is.SNOMEDcodelist(exclude)){
SCT <- SCT[!conceptId %in% exclude$conceptId]
} else {
exclude <- as.SNOMEDconcept(exclude)
SCT <- SCT[!conceptId %in% exclude]
}
message('After exclusions, concept database has ',
nrow(SCT), ' rows (', uniqueN(SCT$conceptId),
' concepts).')
}
# Add inclusion concepts
if (!is.null(include)){
if (is.SNOMEDcodelist(include)){
include <- setdiff(as.SNOMEDconcept(include$conceptId),
SCT$conceptId)
} else {
include <- setdiff(as.SNOMEDconcept(include), SCT$conceptId)
}
message(paste0('Including ', length(include),
' additional concepts from include list.'))
EXTRA <- SNOMED$DESCRIPTION[conceptId %in% include,
list(conceptId, term, name_status = ifelse(typeId ==
as.SNOMEDconcept('Fully Specified Name', SNOMED = SNOMED),
'P', 'A'))]
SCT <- rbind(SCT, EXTRA)
} else {
include <- as.SNOMEDconcept(bit64::integer64(0))
}
# Prepare blacklist
if (is.null(blacklist)){
message('Creating default blacklist.')
blacklist <- as.SNOMEDconcept(bit64::integer64(0))
} else {
message('Applying blacklist.')
if (is.SNOMEDcodelist(blacklist)){
blacklist <- as.SNOMEDconcept(blacklist$conceptId)
} else {
blacklist <- as.SNOMEDconcept(blacklist)
}
}
keep <- union(intersect(findings, SCT$conceptId), include)
blacklist <- union(intersect(blacklist, SCT$conceptId),
setdiff(as.SNOMEDconcept(SCT$conceptId, SNOMED = SNOMED), keep))
#### EXPORT DATASETS ####
message(paste0('Exporting dataset to ', export_folderpath))
# Format of output file for MedCAT:
# cui, name, ontologies, name_status
setkey(SCT, conceptId, name_status, term)
fwrite(SCT[, list(cui = conceptId, name = term,
ontologies = 'SNO', name_status)],
file = paste0(export_folderpath, 'cdb_problems.csv'))
# Export lookup files for MiADE
fwrite(NEGATED[, list(findingId, situationId)][order(findingId)],
file = paste0(export_folderpath, 'negated.csv'))
fwrite(HISTORIC[, list(findingId, situationId)][order(findingId)],
file = paste0(export_folderpath, 'historic.csv'))
fwrite(SUSPECTED[, list(findingId, situationId)][order(findingId)],
file = paste0(export_folderpath, 'suspected.csv'))
fwrite(CDB$OVERLAP[, list(findingId, otherId)][order(otherId)],
file = paste0(export_folderpath, 'overlap.csv'))
# Blacklist of ignorable concepts not to present as final output
write(as.character(sort(unique(blacklist))),
file = paste0(export_folderpath, 'problem_blacklist.csv'),
ncolumns = 1)
return(TRUE)
}
#' Sample inclusion, exclusion and blacklist sets for a MiADE CDB
#'
#' Returns a set of SNOMED concepts (as a SNOMEDconcept vector)
#' which can be used to exclude
#' findings in the MedCAT named entity recognition step, or blacklist
#' (filter out) findings from the final output.
#'
#' \describe{
#' \item{exclude_irrelevant_findings}{social history
#' (except housing problems and care needs),
#' administrative statuses (except registered disabled) and
#' normal findings, intended to be used as an exclusion list
#' for concept detection}
#' \item{blacklist_vague_findings}{vague findings and disorders,
#' intended to be used in the blacklist}
#' \item{blacklist_almost_all_except_diseases}{almost all findings
#' and vague disorders, intended to be used in the blacklist}
#' }
#'
#' @seealso exportMiADECDB, createCDB
#' @family MiADE functions
#' @param SNOMED environment containing a SNOMED dictionary
#' @return SNOMEDconcept vector containing findings to exclude
#' @export
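#' @examples
#' # Not run
#' # exclude <- exclude_irrelevant_findings()
#' # blacklist <- blacklist_vague_findings()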
exclude_irrelevant_findings <- function(SNOMED = getSNOMED()){
# Remove social history (except housing problems and care needs),
# administrative statuses (except registered disabled) and
# normal findings
# Define symbols for R CMD check
term <- NULL
social_history <- descendants('Social and personal history finding',
SNOMED = SNOMED, include_self = TRUE)
admin <- descendants('Administrative statuses',
SNOMED = SNOMED, include_self = TRUE)
# Keep disability registration and housing and care needs
regstatus_keep <- SNOMEDconcept(c('On adult protection register',
'On learning disability register',
'On social services disability register',
'Registered blind',
'Registered partially sighted',
'Registered deaf',
'Registered hearing impaired',
'Registered sight impaired'), SNOMED = SNOMED)
housing_and_care_keep <- descendants(c(
'Homeless',
'No fixed abode',
'Lives alone',
'Lives in supported home',
'Unsatisfactory living conditions',
'Finding related to care and support circumstances and networks'),
SNOMED = SNOMED, include_self = TRUE)
# Remove general and normal findings
disorders <- descendants('Disorder', SNOMED = SNOMED,
include_self = TRUE)
general_findings <- setdiff(
union(descendants(c('Evaluation finding',
'Colour finding', 'Patient condition finding'),
SNOMED = SNOMED, include_self = TRUE),
SNOMEDconcept(c('General symptom',
'Complaining of a general symptom',
'Urine finding',
'Finding related to pregnancy',
'Delivery finding',
'Safety finding',
'Feeding finding',
'Finding of movement',
'Stool finding'), SNOMED = SNOMED)), disorders)
normal <- SNOMEDconcept('^Normal| normal', exact_match = FALSE,
SNOMED = SNOMED)
normal <- normal[semanticType(normal, SNOMED = SNOMED) == 'finding']
normal <- union(normal, descendants('No abnormality detected',
include_self = TRUE, SNOMED = SNOMED))
# Remove generic body system findings
BODY_DESC <- description(
descendants('Body structure', SNOMED = SNOMED),
SNOMED = SNOMED, include_synonyms = TRUE)
body_desc_generic_terms <- c(paste(BODY_DESC$term, 'finding'),
paste('Finding of', tolower(BODY_DESC$term)),
paste(BODY_DESC$term, 'system finding'),
paste('Finding of', tolower(BODY_DESC$term), 'system'),
paste(BODY_DESC$term, 'observation'),
paste('Observation of', tolower(BODY_DESC$term)),
paste(BODY_DESC$term, 'system observation'),
paste('Observation of', tolower(BODY_DESC$term), 'system'))
body_desc_generic_findings <- SNOMED$DESCRIPTION[term %in%
body_desc_generic_terms]$conceptId
setdiff(union(union(
union(admin, social_history),
union(normal, general_findings)),
body_desc_generic_findings),
union(housing_and_care_keep, regstatus_keep))
}
#' @rdname exclude_irrelevant_findings
#' @family MiADE functions
#' @export
blacklist_vague_findings <- function(SNOMED = getSNOMED()){
intersect(descendants('Clinical finding', SNOMED = SNOMED,
include_self = TRUE), SNOMEDconcept(c(
'Disease',
'Clinical finding',
'Problem',
'Impairment',
'Chief complaint',
'Sign',
'Complaint',
'Sequela',
'Early complication',
'Co-morbid conditions',
'Pre-existing condition',
'Acute disease',
'Subacute disease',
'Chronic disease',
'General problem AND/OR complaint',
'Evaluation finding',
'Administrative statuses',
'Finding by site',
'Finding by method',
'Clinical history and observation findings',
'Behaviour',
'Adverse incident outcome categories',
'Prognosis/outlook finding',
'General clinical state finding',
'Disorder by body site',
'Failure',
'Acute failure',
'Subacute failure',
'Chronic failure',
'Decompensation',
'Discrepancy',
'Idiosyncrasy',
'Inefficiency',
'General body state finding',
'General clinical state finding',
'Pressure',
'Disease related state',
'Absence of pressure',
'Decreased pressure',
'Increased pressure',
'Swelling',
'Disease condition finding',
'Allergic disposition',
'Pain',
'Values (community)',
'Fit and well',
'No sensitivity to pain'), SNOMED = SNOMED))
}
#' @rdname exclude_irrelevant_findings
#' @family MiADE functions
#' @export
blacklist_almost_all_except_diseases <- function(SNOMED = getSNOMED()){
dont_exclude <- descendants(c(
'Disorder',
'Amputee',
'Mass of body region',
'Functional disease present',
'Seizure',
'Syncope',
'Falls',
'Recurrent falls',
'Elderly fall',
'Unexplained recurrent falls',
'Unexplained falls',
'Falls caused by medication'), SNOMED = SNOMED,
include_self = TRUE)
union(setdiff(descendants('Clinical finding', include_self = TRUE,
SNOMED = SNOMED), dont_exclude),
blacklist_vague_findings(SNOMED = SNOMED))
}
std_term <- function(x, stopwords = c('the', 'of', 'by', 'with', 'to',
'into', 'and', 'or', 'at', 'as', 'and/or', 'in'),
hyphens_to_space = FALSE, remove_stopwords = FALSE,
remove_words_in_parentheses = FALSE,
regex_do_not_remove_parentheses =
'\\(exclud|\\(with|\\(except|\\(includ'){
# lowercase except if single word concepts with second, third
# or final letter upper case (i.e. an acronym like HbA1c or
# nSTEMI)
# decapitalise the first letter if rest of first word is
# lower case and non-numeric, otherwise keep case as is.
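# e.g. 'Myocardial infarction' --> ' myocardial infarction ',
# but 'HbA1c' keeps its original capitalisation: ' HbA1c '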
if (remove_words_in_parentheses){
x <- ifelse(x %like% regex_do_not_remove_parentheses, x,
sub('(.) \\([^\\)]+\\)', '\\1', x))
}
if (hyphens_to_space){
x <- gsub('-', ' ', x)
}
x <- gsub(' +', ' ', gsub('^ *|-|,|\\(|\\)| *$', '', x))
words <- strsplit(x, ' ')
x <- paste0(' ', sapply(lapply(words, function(y){
ifelse(y %like% '^[A-Z][a-z]+$' |
y %in% toupper(stopwords), tolower(y), y)
}), function(y) paste(y, collapse = ' ')), ' ')
if (remove_stopwords){
x <- gsub(paste0(' ', paste0(stopwords, collapse = ' | '),
' '), ' ', x)
}
x
}