R/cdb.R

Defines functions std_term blacklist_almost_all_except_diseases blacklist_vague_findings exclude_irrelevant_findings exportMiADECDB createDisambiguationTrainer createCDB

Documented in blacklist_almost_all_except_diseases blacklist_vague_findings createCDB exclude_irrelevant_findings exportMiADECDB

#' Creates an environment containing CDB files
#'
#' Extracts SNOMED CT concepts from appropriate places in the 
#' hierarchy to create a set of CDB files in an environment.
#' Uses WordNet and manual synonyms if available.
#'
#' @param SNOMED environment containing a SNOMED dictionary
#' @param TRANSITIVE transitive closure table, generated by
#'   createTransitive. It is regenerated if not provided.
#' @param WN WordNet data.table as returned by downloadWordnet
#'   containing WordNet data from appropriate
#'   categories, in the format: cat (character), wordnetId (integer64),
#'   synonyms (list), parents (list), adj (list)
#' @param MANUAL_SYNONYMS data.table with columns term1 and term2,
#'   containing additional exact synonyms or abbreviations
#' @param noisy whether to output status messages
#' @param stopwords vector of stopwords 
#' @return environment containing the following data tables: FINDINGS,
#'   QUAL, CAUSES, BODY, OTHERCAUSE, OTHERSEARCH, OVERLAP, TRANSITIVE
#' @export
#' @family CDB functions
#' @seealso exportMiADECDB, MANUAL_SYNONYMS
#' @examples
#' # Not run
#' # data(MANUAL_SYNONYMS)
#' # WN <- downloadWordnet()
#' # MANUAL_SYNONYMS <- rbind(MANUAL_SYNONYMS, downloadOrphanet())
#' # CDBNEW <- createCDB(WN = WN, MANUAL_SYNONYMS = MANUAL_SYNONYMS)
createCDB <- function(SNOMED = getSNOMED(), TRANSITIVE = NULL,
	WN = NULL, MANUAL_SYNONYMS = NULL, noisy = TRUE,
	stopwords = c('the', 'of', 'by', 'with', 'to', 'into', 'and', 'or',
	'at', 'as', 'and/or', 'in')){
	# Returns an environment containing data.tables used for
	# generating decompositions 
	CDB <- new.env()
	
	# Declare symbols to avoid R check error
	type <- conceptId <- term <- typeId <- adj <- wordnetId <- NULL
	synonyms <- syn <- part <- whole <- snomed <- synonym <- NULL
	extra <- sourceId <- destinationId <- laterality <- NULL
	nonlat_parentId <- N <- semType <- findingId <- otherId <- NULL
	descendantId <- ancestorId <- bidirectional <- multipart <- NULL

	#### USEFUL FUNCTIONS ####
	
	s <- function(x) SNOMEDconcept(x, SNOMED = SNOMED)
	desc <- function(x, ...){
		descendants(x, SNOMED = SNOMED, TRANSITIVE = TRANSITIVE, ...)
	}
	anc <- function(x, ...){
		ancestors(x, SNOMED = SNOMED, TRANSITIVE = TRANSITIVE, ...)
	}
	init <- function(x){
		x <- desc(x)
		# Add acronyms e.g. 'AF - atrial fibrillation' --> AF
		# Add concepts with phrases in parentheses removed unless
		# the phrase in brackets alters the meaning of the root phrase
		# , e.g.
		# ERCP (Endoscopic retrograde cholangiopancreatography) normal
		# --> ERCP normal
		# but not if brackets start with except|exclud|with|without
		DESC <- description(x, SNOMED = SNOMED,
			include_synonyms = TRUE)[type == 'Synonym']
		if (nrow(DESC) > 0){
			INIT <- rbind(DESC[, list(conceptId, term = std_term(term))],
				DESC[, list(conceptId, term = std_term(term,
					stopwords = stopwords, hyphens_to_space = TRUE))],
				DESC[, list(conceptId, term = std_term(term,
					stopwords = stopwords, remove_stopwords = TRUE))],
				DESC[, list(conceptId, term = std_term(term,
					stopwords = stopwords,
					remove_words_in_parentheses = TRUE))],
				acronyms(x, SNOMED = SNOMED)[, list(conceptId,
					term = paste0(' ', term, ' '))])
			return(INIT[!duplicated(INIT)])
		} else {
			return(DESC)
		}
	}

	#### CAUSES (NOT USING TRANSITIVE TABLE) ####
	if (noisy) message('Initialising causes.')
	CAUSES <- description(unique(
		SNOMED$RELATIONSHIP[typeId %in% s(
		c('Due to', 'Causative agent', 'After'))]$destinationId),
		SNOMED = SNOMED, include_synonyms = T)[type == 'Synonym',
		list(conceptId, term = std_term(term))]
	
	#### GENERATE TRANSITIVE TABLE ####
	# Include sampleSNOMED() for testing purposes
	if (is.null(TRANSITIVE)){
		if (noisy) message('Creating transitive closure table.')
		TRANSITIVE <- createTransitive(c(sampleSNOMED()$CONCEPT$id,
			descendants(c('Clinical finding', 'Qualifier value',
			'Body structure'), SNOMED = SNOMED, include_self = TRUE),
			CAUSES$conceptId), SNOMED = SNOMED)
		if (noisy){
			message(paste0('Transitive closure table created, ',
				nrow(TRANSITIVE), ' rows.'))
		}
	}

	#### FINDINGS ####
	if (noisy) message('Initialising findings and qualifiers.')
	FINDINGS <- init('Clinical finding')
	
	#### QUALIFIERS ####
	QUAL <- init(c('Grades (qualifier value)',
		'Groups (qualifier value)', 'Levels (qualifier value)',
		'Scores (qualifier value)', 'Types (qualifier value)',
		'Classification system (qualifier value)',
		'Descriptor (qualifier value)', 'Finding value (qualifier value)',
		'Courses (qualifier value)',
		'General site descriptor (qualifier value)'))

	#### BODY STRUCTURES ####
	if (noisy) message('Initialising body structures.')
	BODY <- init('Body structure')
	
	# Remove 'entire', 'structure of', 'of' and 'the'
	strip_structure <- ' entire | region structure | structure of | structure | of the | of | the '
	if (nrow(BODY) > 0){
		BODY <- rbind(BODY, BODY[, list(conceptId,
			term = gsub(strip_structure, ' ',
			gsub(strip_structure, ' ', term)))])
		BODY <- BODY[!duplicated(BODY)]

		# Remove structure type (e.g. 'muscle structure' etc.) if a name is
		# unique (e.g. there is only one rectus femoris and it is a muscle)
		structure_types <- c(' muscle | skeletal muscle ', ' ligament ',
			' tendon ', ' bone | bone structure ', ' joint ', ' artery ',
			' vein ')
		structure_terms <- lapply(structure_types, function(structure_type){
			# Find concepts with structure
			ANCESTOR <- BODY[term %like% paste0('^(', structure_type, ')$')]
			# first generation children are non-specific and are not to have
			# their body type stripped out
			if (nrow(ANCESTOR) > 0){
				if (noisy) message(paste0('Removing the word(s) ',
					structure_type, ' where possible.'))
				TEMP <- BODY[term %like% structure_type &
					conceptId %in% desc(ANCESTOR$conceptId),
					list(conceptId, term = gsub(paste0(strip_structure, '|',
					structure_type), ' ', gsub(paste0(strip_structure, '|',
					structure_type), ' ', term)))]
				TEMP[!duplicated(TEMP)]
				TEMP
			} else {
				data.table(conceptId = bit64::integer64(0),
					term = character(0))
			}
		})
		setattr(structure_terms, 'names', structure_types)

		# For each structure types, we have a list of putative terms. 
		# Need to check if these are unique
		other_structure_terms <- lapply(structure_types, function(structure_type){
			# Find concepts with structure
			unique(unlist(lapply(
			structure_terms[setdiff(structure_types, structure_type)],
			function(x){
				x$term
			})))
		})
		setattr(other_structure_terms, 'names', structure_types)

		# Body parts without a structure
		body_ancestors <- BODY[term %like% paste0('^(',
			paste(structure_types, collapse = '|'), ')$')]$conceptId
		OTHER_BODY_PARTS <- BODY[!(conceptId %in% desc(body_ancestors))]

		# Find concepts for generic body parts (these should not have
		# their body type stripped)
		firstgen <- BODY[term %like% paste0('^(',
			paste(structure_types, collapse = '|'),
			')(part |)$')]$conceptId
		firstgen <- children(firstgen, include_self = TRUE,
			SNOMED = SNOMED)
		
		# Find unique structure terms without body part type
		unique_structure_terms <- lapply(structure_types,
			function(structure_type){
				TEMP <- structure_terms[[structure_type]]
				searchtext <- paste(setdiff(structure_types, structure_type),
					collapse = '|')
				exclude <- c(TEMP[(term %in% c(OTHER_BODY_PARTS$term,
					other_structure_terms[[structure_type]])) |
					(term %like% searchtext)]$conceptId, 
					firstgen)
				# Exclude if term is ambiguous or has another structure
				# type mentioned
				TEMP[!(conceptId %in% exclude), 
					list(conceptId, term, structure_type)]
			}
		)
		setattr(unique_structure_terms, 'names', structure_types)

		EXTRA_WITHOUT_STRUCTURE <- rbindlist(unique_structure_terms)
		BODY <- rbind(BODY, EXTRA_WITHOUT_STRUCTURE[,
			list(conceptId, term)])
	}

	#### ADDING WORDNET ADJECTIVES ####
	if (!is.null(WN)){
		if (noisy) message('Adding WordNet adjectives.')
		# Extra concepts for 
		# XXX of XXX --> XXX XXX (adjectival)
		# e.g. neck of femur --> femoral neck
		#      dissection of artery --> arterial dissection
		ADJ <- WN[, list(adj = adj[1][[1]]), by = list(wordnetId, cat)]
		ADJ[, adj := sub('[1-9]$', '', adj)]
		SYN <- WN[, list(syn = synonyms[1][[1]]), by = wordnetId]
		SYN[, syn := sub('[1-9]$', '', syn)]
		ADJ <- merge(ADJ, SYN, by = 'wordnetId',
			allow.cartesian = TRUE)[adj != syn, list(adj, syn, cat)]
		ADJ <- ADJ[!duplicated(ADJ)]

		# Body structures
		ADJBODY <- BODY[term %like% '^ ([[:alpha:]]+) of ([[:alpha:]]+) $']
		ADJBODY[, part := sub('^ ([[:alpha:]]+) of ([[:alpha:]]+) $',
			'\\1', term)]
		ADJBODY[, whole := sub('^ ([[:alpha:]]+) of ([[:alpha:]]+) $',
			'\\2', term)]

		TEMP <- merge(ADJBODY[, list(syn = whole, part, conceptId)],
			ADJ[cat %in% c('noun.body')], by = 'syn')
		if (nrow(TEMP) > 0){
			BODY <- rbind(BODY, TEMP[,
				list(conceptId, term = paste0(' ', adj, ' ', part, ' '))])
			if (noisy){
				message(paste0('Added ', nrow(TEMP),
					' adjectival phrases for body structures.'))
			}
			BODY <- BODY[!duplicated(BODY)]
		}

		# Findings
		ADJFINDINGS <- FINDINGS[
			term %like% '^ ([[:alpha:]]+) of ([[:alpha:]]+) $' & 
			!(term %like% ' (fear|risk|finding|observation) of ')]
		ADJFINDINGS[, part := sub('^ ([[:alpha:]]+) of ([[:alpha:]]+) $',
			'\\1', term)]
		ADJFINDINGS[, whole := sub('^ ([[:alpha:]]+) of ([[:alpha:]]+) $',
			'\\2', term)]

		TEMP <- merge(ADJFINDINGS[, list(syn = whole, part, conceptId)],
			ADJ[cat %in% c('noun.state', 'noun.process', 'noun.phenomenon')],
			by = 'syn')
		if (nrow(TEMP) > 0){
			FINDINGS <- rbind(FINDINGS, TEMP[,
				list(conceptId, term = paste0(' ', adj, ' ', part, ' '))])
			if (noisy){
				message(paste0('Added ', nrow(TEMP),
					' adjectival phrases for findings.'))
			}
			FINDINGS <- FINDINGS[!duplicated(FINDINGS)]
		}
		TEMP <- NULL
	}

	# Severity codes as per FHIR valueset plus a few extra
	if (noisy) message('Creating severity and stage lists.')
	SEVERITY <- QUAL[conceptId %in% desc(c(
		'Degrees of severity (qualifier value)',
		'Severities (qualifier value)'))]
	QUAL <- QUAL[!(conceptId %in% SEVERITY$conceptId)]

	# Concepts for stage definition as per FHIR valueset with a few extra
	# from 'Stages' 
	STAGE <- desc(c('Stages', 'Tumour stage finding'))
	STAGE <- rbind(FINDINGS[conceptId %in% STAGE],
		QUAL[conceptId %in% STAGE])
	QUAL <- QUAL[!(conceptId %in% STAGE$conceptId)]

	#### LATERALITY ####
	# Create a list of lateralised body structures
	if (noisy) message('Creating lists of lateralised structures.')
	left_structures <- relatedConcepts('Left', typeId = 'Laterality',
		reverse = TRUE, SNOMED = SNOMED)
	right_structures <- relatedConcepts('Right', typeId = 'Laterality',
		reverse = TRUE, SNOMED = SNOMED)
	lateralisable_structures <- relatedConcepts('Side', typeId = 'Laterality',
		reverse = TRUE, SNOMED = SNOMED)
	bilateral_structures <- intersect(left_structures, right_structures)
	
	# Laterality concepts
	CDB$latConcepts <- s(c('Left', 'Right', 'Bilateral'))
	setattr(CDB$latConcepts, 'names', c('Left', 'Right', 'Bilateral'))
	LATERALITY <- QUAL[conceptId %in% CDB$latConcepts]
	LATERALITY <- rbind(LATERALITY,
		data.table(conceptId = s('Left'),
		term = c(' L ', ' lt ', ' left sided ')))
	LATERALITY <- rbind(LATERALITY,
		data.table(conceptId = s('Right'),
		term = c(' R ', ' rt ', ' right sided ')))
	LATERALITY <- rbind(LATERALITY,
		data.table(conceptId = s('Bilateral'),
		term = c(' left and right ', ' both sided ', ' bilat ',
			' L and R ', ' R and L ')))
	QUAL <- QUAL[!(conceptId %in% LATERALITY$conceptId)]
	QUAL <- QUAL[!(term %in% paste0(' ', stopwords, ' '))]

	#### PREPARE CDB ENVIRONMENT ####
	addmw <- function(X, wn_categories){
		if (!is.null(MANUAL_SYNONYMS)){
			M <- copy(as.data.table(MANUAL_SYNONYMS))
			M <- M[!duplicated(M)]
			M[, snomed := sub('^ *', ' ', sub(' *$', ' ', snomed))]
			M[, synonym := sub('^ *', ' ', sub(' *$', ' ', synonym))]
			X <- rbind(X,
				merge(X, M[, list(term = snomed, extra = synonym)],
				by = 'term')[, list(conceptId, term = extra)],
				merge(X, M[bidirectional == TRUE, list(term = synonym,
				extra = snomed)], by = 'term')[,
				list(conceptId, term = extra)], fill = TRUE)
		}
		if (!is.null(WN)){
			return(addWordnet(X, wn_categories = wn_categories,
				WN = WN, CHECK_TABLE <- rbind(QUAL, LATERALITY,
				FINDINGS, BODY, STAGE, CAUSES, SEVERITY,
				fill = TRUE), noisy = noisy))
		} else {
			return(X[!duplicated(X)])
		}
	}

	CDB$FINDINGS <- addmw(FINDINGS, c('noun.state',
		'noun.process', 'noun.phenomenon'))
	CDB$QUAL <- addmw(QUAL, c('noun.state',
		'noun.process', 'noun.phenomenon'))
	CDB$CAUSES <- addmw(CAUSES, c('noun.state',
		'noun.process', 'noun.phenomenon', 'noun.animal', 'noun.plant'))
	BODY <- addmw(BODY, c('noun.body'))
	BODY <- BODY[!term %in% LATERALITY$term]
	
	#### PROCESS LATERALITY
	
	# Find the lateralised version of each concept
	# Need to allow for empty set of lateralised structures
	lat_structures <- union(left_structures,
		union(right_structures, bilateral_structures))
		LAT <- SNOMED$RELATIONSHIP[sourceId %in% lat_structures
			& typeId == s('Is a')]
	if (nrow(LAT) == 0){
		LAT <- data.table(conceptId = bit64::as.integer64(0),
			nonlat_parentId = bit64::as.integer64(0))
	} else {
		LAT <- LAT[!(destinationId %in% lat_structures),
			list(conceptId = sourceId, nonlat_parentId = destinationId)]
		LAT <- LAT[!duplicated(LAT)]
	}
	
	# Create a laterality flag
	BODY[, laterality := "No laterality"]
	BODY[conceptId %in% lateralisable_structures,
		laterality := "Lateralisable"]
	BODY[conceptId %in% left_structures, laterality := 'Left']
	BODY[conceptId %in% right_structures, laterality := 'Right']
	BODY[conceptId %in% bilateral_structures,
		laterality := 'Bilateral']
	
	# Mark body site concepts that are actually concepts describing
	# two separate body parts (e.g. proximal end of radius and ulna)
	# Use the 
	BODY[, multipart := conceptId %in% 
		description(unique(BODY$conceptId), SNOMED = SNOMED)[
		tolower(term) %like%
		' and | or | and/or |\\(combined site\\)']$conceptId]
	
	# Remove morphologic abnormalities from BODY to FINDINGS
	MORPH <- BODY[semanticType(conceptId, SNOMED = SNOMED) ==
		'morphologic abnormality', list(conceptId, term)]
	if (nrow(MORPH) > 0){
		BODY <- BODY[!(conceptId %in% MORPH$conceptId)]
	}
	
	CDB$MORPH <- MORPH
	CDB$BODY <- BODY

	CDB$BODY_LATERALITY <- merge(BODY, LAT, by = 'conceptId')
	CDB$BODY_LATERALITY <- CDB$BODY_LATERALITY[
		laterality %in% c('Left', 'Right', 'Bilateral'),
		list(conceptId, laterality, nonlat_parentId)]
	CDB$BODY_LATERALITY <- CDB$BODY_LATERALITY[
		!duplicated(CDB$BODY_LATERALITY)]

	# Remove ambiguous parent concepts 
	CDB$BODY_LATERALITY[, .N, by = list(laterality, nonlat_parentId)][
		N > 1]$nonlat_parentId -> toremove
	if (nrow(CDB$BODY_LATERALITY) > 0){
		CDB$BODY_LATERALITY <- CDB$BODY_LATERALITY[
			!(nonlat_parentId %in% toremove) & !is.na(nonlat_parentId)]
	}

	CDB$SEVERITY <- SEVERITY
	CDB$LATERALITY <- LATERALITY
	CDB$STAGE <- STAGE
	
	# Causes which are not other findings (may be events,
	#	substances or organisms)
	CDB$CAUSES <- CAUSES
	CDB$OTHERCAUSE <- CAUSES[!(conceptId %in% FINDINGS$conceptId)]
	CDB$OTHERCAUSE[, semType := semanticType(conceptId, SNOMED = SNOMED)]
	
	# OVERLAP = concepts that are in findings as well as another
	# (qual etc.)
	FindingsFSN <- description(CDB$FINDINGS$conceptId, SNOMED = SNOMED)[,
		list(findingId = conceptId, term = sub(' \\([^\\(]+\\)$', '', term))]
	OtherFSN <- description(c(CDB$QUAL$conceptId, CDB$BODY$conceptId,
		CDB$MORPH$conceptId), SNOMED = SNOMED)[,
		list(otherId = conceptId, term = sub(' \\([^\\(]+\\)$', '', term))]
	CDB$OVERLAP <- merge(FindingsFSN, OtherFSN)[, list(findingId, otherId)]
	CDB$OVERLAP <- CDB$OVERLAP[!duplicated(CDB$OVERLAP)]

	CDB$TRANSITIVE <- TRANSITIVE
	CDB$metadata <- SNOMED$metadata
	
	CDB$SCT_assoc <- s('Associated with')
	CDB$SCT_cause <- s('Causative agent')
	CDB$SCT_after <- s('After')
	CDB$SCT_dueto <- s('Due to')
	CDB$SCT_findingsite <- s('Finding site')
	CDB$SCT_disorder <- s('Disorder')
	CDB$SCT_finding <- s('Clinical finding')
	CDB$allergyConcepts <- union(s(c('Allergic disposition',
		'Adverse reaction',
		'Intolerance to substance',
		'Hypersensitivity disposition')), s('281647001'))
	# remove allergy as synonym of 'allergic reaction'. Ensure that
	# there is at least one allergy concept regardless of version of
	# SNOMED dictionary used, to avoid error.
	
	CDB$stopwords <- stopwords
	CDB$SEMTYPE <- rbind(
		CDB$FINDINGS[, list(conceptId, semType = semanticType(conceptId,
			SNOMED = SNOMED))],
		CDB$MORPH[, list(conceptId, semType = 'morphologic abnormality')],
		CDB$BODY[, list(conceptId, semType = 'body structure')],
		CDB$QUAL[, list(conceptId, semType = 'qualifier value')],
		CDB$LATERALITY[, list(conceptId, semType = 'laterality')],
		CDB$SEVERITY[, list(conceptId, semType = 'severity')],
		CDB$OTHERCAUSE[!(conceptId %in% c(CDB$FINDINGS$conceptId,
			CDB$BODY$conceptId, CDB$QUAL$conceptId,
			CDB$LATERALITY$conceptId, CDB$SEVERITY$conceptId)),
			list(conceptId, semType)])
		# contents of stage are mostly findings so don't need to include
	CDB$SEMTYPE <- CDB$SEMTYPE[!duplicated(CDB$SEMTYPE)]

	# Set keys for fast searching
	if (noisy) message('Creating indices for fast searching')
	setkey(CDB$SEMTYPE, conceptId)
	setkey(CDB$FINDINGS, term); setindex(CDB$FINDINGS, conceptId)
	setkey(CDB$MORPH, term); setindex(CDB$MORPH, conceptId)
	setkey(CDB$BODY, term); setindex(CDB$BODY, conceptId)
	setkey(CDB$QUAL, term); setindex(CDB$QUAL, conceptId)
	setkey(CDB$LATERALITY, term); setindex(CDB$LATERALITY, conceptId)
	setkey(CDB$SEVERITY, term); setindex(CDB$SEVERITY, conceptId)
	setkey(CDB$STAGE, term); setindex(CDB$STAGE, conceptId)
	setkey(CDB$OTHERCAUSE, term); setindex(CDB$OTHERCAUSE, conceptId)
	setkey(CDB$CAUSES, term); setindex(CDB$CAUSES, conceptId)
	setkey(CDB$OVERLAP, otherId)
	setkey(CDB$BODY_LATERALITY, conceptId)
	setindex(CDB$BODY_LATERALITY, laterality)
	setindex(CDB$BODY_LATERALITY, nonlat_parentId)
	return(CDB)
}

createDisambiguationTrainer <- function(CDB, SNOMED){
	# Create disambiguation trainer for unigrams among SNOMED concepts
	# of semantic type 'Clinical finding' and 'Body structure'
	# Output a data.table table text,p with {p ...} surrounding the
	# acronym, which can be converted to MedCAT training data
	
	# Declare symbols to avoid R check error
	term <- conceptId <- has_unigram <- conceptId <- text  <- NULL
	
	findings_to_disambiguate <- unique(CDB$FINDINGS[,
		list(has_unigram = any(term %like% '^ [[:alpha:]]+ $')),
		by = conceptId][has_unigram == TRUE]$conceptId)

	body_to_disambiguate <- unique(CDB$BODY[,
		list(has_unigram = any(term %like% '^ [[:alpha:]]+ $')),
		by = conceptId][has_unigram == TRUE]$conceptId)

	disambiguate_concept <- function(the_conceptId, prefix){
		terms <- union(CDB$FINDINGS[conceptId == the_conceptId]$term,
			CDB$BODY[conceptId == the_conceptId]$term)
		descendant_concepts <- descendants(the_conceptId,
			SNOMED = SNOMED, TRANSITIVE = CDB$TRANSITIVE,
			include_self = FALSE)
		descendant_terms <- union(
			CDB$FINDINGS[conceptId %in% descendant_concepts]$term,
			CDB$BODY[conceptId %in% descendant_concepts]$term)
		if (length(terms) > 0 & length(descendant_terms) > 0){
			rbindlist(lapply(terms, function(term){
				if (any(descendant_terms %like% term)){
					data.table(conceptId = the_conceptId,
						text = sub(term, paste0(' {', prefix,
						sub(' $', '', term), '} '),
						descendant_terms[descendant_terms %like% term]))
				} else {
					data.table(conceptId = bit64::integer64(0),
						text = character(0))
				}
			}))
		} else {
			data.table(conceptId = bit64::integer64(0),
					text = character(0))
		}
	}

	OUT <- rbind(
		rbindlist(lapply(1:length(findings_to_disambiguate), function(x){
			disambiguate_concept(findings_to_disambiguate[x], prefix = 'p')
		})),
		rbindlist(lapply(1:length(body_to_disambiguate), function(x){
			disambiguate_concept(body_to_disambiguate[x], prefix = 'p')
		}))
	)
	# Exclude entries with no context
	OUT <- OUT[!(text %like% '^ \\{.*\\} $')]
}

#' Exports CDB files for MedCAT / MiADE
#'
#' Produces a set of files for the findings / problems algorithm of
#' MedCAT and MiADE. Uses the CDB environment created using createCDB
#' which can incorporate additional manual synonyms or synonyms from
#' WordNet.
#'
#' The following files are exported:
#'
#' For MedCAT (named entity recognition and linking):
#'
#' \describe{
#'   \item{problems_cdb.csv}{ - CSV file in MedCAT concept database
#'      format
#'      containing cui (SNOMED CT concept ID), name, name_status ('P'
#'      for preferred term, 'N' for terms that must be disambiguated
#'      (e.g. acronyms or short terms), 'A' for synonym),
#'      ontologies = SNO (for SNOMED CT)} 
#' }
#'
#' For MiADE postprocessing:
#'
#' \describe{
#'   \item{negated.csv}{ - CSV file with columns findingId (SNOMED CT
#'      concept ID of the underlying finding / disorder) and
#'      situationId (SNOMED CT concept ID of the pre-coordinated
#'      situation concept for negation of the finding / disorder).
#'      Sorted by findingId.}
#'   \item{historic.csv}{ - CSV file with columns findingId (SNOMED CT
#'      concept ID of the underlying finding / disorder) and
#'      situationId (SNOMED CT concept ID of the pre-coordinated
#'      situation concept for 'history of' the finding / disorder).
#'      Sorted by findingId.}
#'   \item{suspected.csv}{ - CSV file with columns findingId (SNOMED CT
#'      concept ID of the underlying finding / disorder) and
#'      situationId (SNOMED CT concept ID of the pre-coordinated
#'      situation concept for 'suspected' finding / disorder).
#'      Sorted by findingId.}
#'   \item{overlap.csv}{ - CSV file with columns findingId (SNOMED CT
#'      concept ID of the underlying finding / disorder) and
#'      otherId (SNOMED CT concept ID of a concept with the same
#'      description but a different semantic type, typically a
#'      morphologic abnormality). Sorted by otherId.}
#'   \item{problem_blacklist.csv}{ - CSV file without header with one
#'      column containing SNOMED CT concept IDs for concepts that
#'      may be identified by MedCAT as part of text analysis but 
#'      should not be included in final MiADE output, Examples include
#'      procedure codes which may be used to link to precoordinated
#'      `history of...' concepts. This file can also be used to
#'      force MiADE to ignore any specific SNOMED CT concepts in the
#'      output. Sorted in ascending order.}
#' }
#'
#' For more information about MiADE, visit
#' \url{https://www.ucl.ac.uk/health-informatics/research/miade/miade-software-and-availability}
#' 
#' For more information about MedCAT, visit 
#' \url{https://github.com/CogStack/MedCAT}
#'
#' @param CDB concept database environment created by createCDB
#' @param export_folderpath folder path to export to
#' @param lang_refset_files character vector of file paths to
#'   SNOMED CT language refset files, in order to identify the
#'   preferred term for each concept. If NULL, the Fully Specified Name
#'   minus the semantic type suffix is used as the preferred term
#'   (e.g. if the Fully Specified Name is `Cancer (disorder)', the
#'   default preferred term is `Cancer'.
#' @param exclude a SNOMEDconcept or SNOMEDcodelist object specifying
#'   concepts to exclude from the concept database. By
#'   default, all concepts in the FINDINGS, CAUSES, BODY, LATERALITY,
#'   MORPH, SEVERITY, STAGE and QUAL tables will be included.
#' @param include a SNOMEDconcept or SNOMEDcodelist object specifying
#'   additional concepts to include in the concept database. By
#'   default, all findings are included for potential export, but
#'   there may additional concepts of other semantic types
#'   (e.g. situation concepts) that need to be included. Inclusion
#'   takes place after exclusion, i.e. a concept in both the include
#'   and exclude lists will be included.
#' @param exclude_historic a SNOMEDconcept or SNOMEDcodelist object
#'   specifying concepts to be excluded from the `historic' lookup,
#'   i.e. those that should not be converted into historic forms.
#'   The default is to not do this conversion for disorders, only for
#'   procedures.
#' @param blacklist a SNOMEDconcept or SNOMEDcodelist object specifying
#'   concepts to filter out of the final output. By default, concepts
#'   in the CDB of any semantic type other than `finding' or `disorder'
#'   are excluded. The blacklist can be used to exclude a subset of
#'   findings or disorders that are not useful for the particular
#'   application. 
#' @param SNOMED environment containing a SNOMED dictionary
#' @seealso createCDB, downloadWordnet, downloadOrphanet, MANUAL_SYNONYMS,
#'   exclude_irrelevant_findings
#' @return TRUE if successful
#' @family MiADE functions
#' @export
#' @examples
#' # Not run
#' # exportMiADECDB(CDB, export_folderpath = tempdir())
exportMiADECDB <- function(CDB, export_folderpath, 
	lang_refset_files = NULL, exclude = NULL, include = NULL,
	exclude_historic = descendants('Disorder', SNOMED = getSNOMED()),
	blacklist = NULL, SNOMED = getSNOMED()){
		
	# Declare symbols to avoid R check error
	active <- effectiveTime <- referencedComponentId <- NULL
	pref <- acceptabilityId <- Synonym <- typeId <- NULL
	conceptId <- term <- lowerterm <- destinationId <- NULL
	sourceId <- destinationId <- situationId <- findingId <- NULL
	Nsit <- toremove <- name_status <- name <- otherId <- NULL

	export_folderpath <- sub('([^/\\])$', '\\1/', export_folderpath)

	if (!is.null(lang_refset_files)){
		message(paste0('Loading language refset from ',
			lang_refset_files))
		# Load up language Refset
		LANG <- data.table(id = character(0), effectiveTime = integer(0),
			active = integer(0), moduleId = bit64::integer64(0),
			refsetId = bit64::integer64(0),
			referencedComponentId = bit64::integer64(0),
			acceptabilityId = bit64::integer64(0))
		for (i in lang_refset_files){
			LANG <- rbind(LANG, fread(i), fill = TRUE)
		}
		# Keep only the most recent value for active concepts
		LANG[, keep := active == 1 & effectiveTime == max(effectiveTime),
			by = referencedComponentId]
		LANG <- LANG[keep == TRUE]
		LANG <- LANG[!duplicated(LANG)]

		# Mark the UK preferred terms
		sct_pref <- bit64::as.integer64(SNOMEDconcept('Preferred'))
		SNOMED$DESCRIPTION[, pref := 
			LANG[, list(pref = any(acceptabilityId == sct_pref)),
			by = list(id = referencedComponentId)][
			SNOMED$DESCRIPTION, on = 'id']$pref]
		synonym_type <- SNOMEDconcept('Synonym')
		SNOMED$DESCRIPTION[, Synonym := typeId == synonym_type]

		# Select a single UK preferred concept
		setkey(SNOMED$DESCRIPTION, conceptId, pref, Synonym)
		SCT <- SNOMED$DESCRIPTION[, choose := c(rep(FALSE, .N - 1),
			TRUE), by = conceptId][, list(conceptId, term)]
		message('Using the language preferred terms')
	} else {
		# Choose the Fully Specified Name without the suffix 
		setkey(SNOMED$DESCRIPTION, conceptId, typeId)
		SCT <- SNOMED$DESCRIPTION[, choose := c(TRUE, rep(FALSE, .N - 1)),
			by = conceptId][choose == TRUE,
			list(conceptId, term = sub(' \\([^\\)]+\\)$', '', term))]
		message('Using Fully Specified Names without suffixes as preferred terms')
	}
	# remove old ICD-10-style concepts like [X]Depression
	SCT[, term := sub('^\\[X\\]', '', term)]
	SCT[, lowerterm := tolower(term)]

	#### SUSPECTED, HISTORIC, NEGATED, BLACKLIST ####

	# Suspected, Historic, Negated version of concepts (where available)
	relation_source <- function(destination){
		sort(unique(as.SNOMEDconcept(SNOMED$RELATIONSHIP[
			destinationId %in% SNOMEDconcept(destination)]$sourceId)))
	}

	# Procedures can be used for historic concepts for problem list
	# but not for current procedures
	message('Creating SNOMED CT relation lists') 
	findings <- descendants('Clinical finding', SNOMED = SNOMED,
		include_self = TRUE)
	disorders <- descendants('Disorder', SNOMED = SNOMED)
	acute_diseases <- descendants('Acute disease', SNOMED = SNOMED)
	procedures <- descendants('Procedure', SNOMED = SNOMED)
	suspected <- relation_source('Suspected')
	known_absent <- relation_source('Known absent')
	known_present <- relation_source('Known present')
	subject_of_record <- relation_source('Subject of record')
	family_person <- relation_source('Person in family of subject')
	in_the_past <- relation_source('In the past')
	done <- relation_source('Done')
	current_or_specified_time <-
		relation_source('Current or specified time')

	ASSOC_FINDPROC <- SNOMED$RELATIONSHIP[
		typeId %in% SNOMEDconcept(c('Associated finding',
		'Associated procedure'), SNOMED = SNOMED),
		list(situationId = sourceId, findingId = destinationId)]

	if (is.null(exclude_historic)){
		exclude_historic <- bit64::as.integer64(0)
	} else if (is.SNOMEDcodelist(exclude_historic)){
		exclude_historic <- as.SNOMEDconcept(exclude_historic$conceptId)
	} else {
		exclude_historic <- as.SNOMEDconcept(exclude_historic)
	}

	HISTORIC <- ASSOC_FINDPROC[situationId %in% intersect(
		c(known_present, done),
		intersect(subject_of_record, in_the_past)) &
		findingId %in% setdiff(union(procedures, disorders), 
		exclude_historic)][order(findingId)]
	# Keep if only one situation per finding/procedure, to avoid errors
	# e.g. 'Implantation procedure' --> 'History of heart valve recipient'
	HISTORIC[, Nsit := .N, by = findingId]
	HISTORIC <- HISTORIC[Nsit == 1 &
		findingId != SNOMEDconcept('Aftercare', SNOMED = SNOMED)]

	NEGATED <- ASSOC_FINDPROC[situationId %in% intersect(known_absent,
		intersect(subject_of_record, current_or_specified_time))][
		order(findingId)]
	# Keep if only one situation per finding, to avoid errors
	NEGATED[, Nsit := .N, by = findingId]
	NEGATED <- NEGATED[Nsit == 1]

	SUSPECTED <- ASSOC_FINDPROC[situationId %in% intersect(suspected,
		intersect(subject_of_record, current_or_specified_time))][
		order(findingId)]
	# Keep if only one situation per finding, to avoid errors
	SUSPECTED[, Nsit := .N, by = findingId]
	SUSPECTED <- SUSPECTED[Nsit == 1]

	#### PROCESS SNOMED CONCEPTS ####

	message('Initialising CDB')
	SCT_CDB <- rbind(CDB$FINDINGS, CDB$CAUSES, CDB$BODY,
		CDB$LATERALITY, CDB$MORPH, CDB$SEVERITY, CDB$STAGE, CDB$QUAL,
		fill = TRUE)[, list(conceptId, term = gsub('^ +| +$', '', term),
		lowerterm = tolower(gsub('^ +| +$', '', term)))]
	SCT_CDB <- SCT_CDB[!duplicated(SCT_CDB)]
	
	# Exclude SCT_CDB synonyms that have the same lowerterm as
	# a preferred term (in which case to keep the preferred term
	# wording as is)
	
	# Find terms to remove that correspond to preferred names
	# (in order to re-capitalise)
	MATCH <- merge(SCT_CDB, SCT[, list(conceptId,
		lowerterm = tolower(term))], by = c('conceptId', 'lowerterm'))
	# Mark terms to remove
	SCT_CDB <- merge(SCT_CDB,
		MATCH[, list(conceptId, lowerterm, toremove = TRUE)],
		by = c('conceptId', 'lowerterm'), all.x = TRUE)
	# Remove terms and replace with original term for preferred name
	SCT <- rbind(SCT_CDB[is.na(toremove), list(conceptId,
		term, name_status = 'A')],
		SCT[conceptId %in% SCT_CDB$conceptId, list(conceptId,
		term, name_status = 'P')])
	rm(MATCH)

	message('Concept database has ', nrow(SCT), ' rows (',
		uniqueN(SCT$conceptId), ' concepts).')

	# Verify only one preferred per concept Id
	stopifnot(nrow(SCT[name_status == 'P']) == SCT[name_status == 'P'][,
		uniqueN(conceptId)])
	stopifnot(nrow(SCT[name_status == 'P']) == SCT[,
		uniqueN(conceptId)])

	# Force disambiguation for all short terms (less than 4 characters)
	SCT[name_status == 'A' & nchar(term) <= 3, name_status := 'N']

	#### INCLUSION AND EXCLUSION ####

	# Remove exclusion concepts
	if (is.null(exclude)){
		message('No exclusions.')
	} else {
		message('Applying exclusion list.')
		if (is.SNOMEDcodelist(exclude)){
			SCT <- SCT[!conceptId %in% exclude$conceptId]
		} else {
			exclude <- as.SNOMEDconcept(exclude)
			SCT <- SCT[!conceptId %in% exclude]
		}
		message('After exclusions, concept database has ',
			nrow(SCT), ' rows (', uniqueN(SCT$conceptId),
			' concepts).')
	}

	# Add inclusion concepts
	if (!is.null(include)){
		if (is.SNOMEDcodelist(include)){
			include <- setdiff(as.SNOMEDconcept(include$conceptId),
				SCT$conceptId)
		} else {
			include <- setdiff(as.SNOMEDconcept(include), SCT$conceptId)
		}
		message(paste0('Including ', length(include),
			' additional concepts from include list.'))
		EXTRA <- SNOMED$DESCRIPTION[conceptId %in% include,
			list(conceptId, term, name_status = ifelse(typeId ==
			as.SNOMEDconcept('Fully Specified Name', SNOMED = SNOMED),
			'P', 'A'))]
		SCT <- rbind(SCT, EXTRA)
	} else {
		include <- as.SNOMEDconcept(bit64::integer64(0))
	}

	# Prepare blacklist
	if (is.null(blacklist)){
		message('Creating default blacklist.')
		blacklist <- as.SNOMEDconcept(bit64::integer64(0))
	} else {
		message('Applying blacklist.')
		if (is.SNOMEDcodelist(blacklist)){
			blacklist <- as.SNOMEDconcept(blacklist$conceptId)
		} else {
			blacklist <- as.SNOMEDconcept(blacklist)
		}
	}
	keep <- union(intersect(findings, SCT$conceptId), include)
	blacklist <- union(intersect(blacklist, SCT$conceptId),
		setdiff(as.SNOMEDconcept(SCT$conceptId, SNOMED = SNOMED), keep)) 

	#### EXPORT DATASETS ####
	message(paste0('Exporting dataset to ', export_folderpath))
	
	# Format of output file for MedCAT:
	# cui, name, ontologies, name_status
	setkey(SCT, conceptId, name_status, term)
	
	fwrite(SCT[, list(cui = conceptId, name = term,
		ontologies = 'SNO', name_status)],
		file = paste0(export_folderpath, 'cdb_problems.csv'))

	# Export lookup files for MiADE
	fwrite(NEGATED[, list(findingId, situationId)][order(findingId)],
		file = paste0(export_folderpath, 'negated.csv')) 
	fwrite(HISTORIC[, list(findingId, situationId)][order(findingId)],
		file = paste0(export_folderpath, 'historic.csv')) 
	fwrite(SUSPECTED[, list(findingId, situationId)][order(findingId)],
		file = paste0(export_folderpath, 'suspected.csv')) 
	fwrite(CDB$OVERLAP[, list(findingId, otherId)][order(otherId)],
		file = paste0(export_folderpath, 'overlap.csv')) 

	# Blacklist of ignorable concepts not to present as final output 
	write(as.character(sort(unique(blacklist))),
		file = paste0(export_folderpath, 'problem_blacklist.csv'),
		ncolumns = 1)
	
	return(TRUE)
}

#' Sample inclusion, exclusion and blacklist sets for a MiADE CDB
#'
#' Returns a set of SNOMED concepts (as a SNOMEDconcept vector)
#' which can be used to exclude
#' findings in the MedCAT named entity recognition step, or blacklist
#' (filter out) findings from the final output.
#'
#' \describe{
#'   \item{exclude_irrelevant_findings}{social history 
#'      (except housing problems and care needs),
#'      administrative statuses (except registered disabled) and
#       normal findings, intended to be used as an exclusion list
#'      for concept detection}
#'   \item{blacklist_vague_findings}{vague findings and disorders,
#'      intended to be used in the blacklist}
#'   \item{blacklist_almost_all_except_diseases}{almost all findings
#'      and vague disorders, intended to be used in the blacklist}
#' }
#'
#' @seealso exportMiADECDB, createCDB
#' @family MiADE functions
#' @param SNOMED environment containing a SNOMED dictionary
#' @return SNOMEDconcept vector containing findings to exclude
#' @export
exclude_irrelevant_findings <- function(SNOMED = getSNOMED()){
	# Remove social history (except housing problems and care needs),
	# administrative statuses (except registered disabled) and
	# normal findings
	
	# Define symbols for R CMD check
	term <- NULL
	
	social_history <- descendants('Social and personal history finding',
		SNOMED = SNOMED, include_self = TRUE)
	admin <- descendants('Administrative statuses',
		SNOMED = SNOMED, include_self = TRUE)

	# Keep disability registration and housing and care needs
	regstatus_keep <- SNOMEDconcept(c('On adult protection register',
		'On learning disability register',
		'On social services disability register',
		'Registered blind',
		'Registered partially sighted',
		'Registered deaf',
		'Registered hearing impaired',
		'Registered sight impaired'), SNOMED = SNOMED)
	housing_and_care_keep <- descendants(c(
		'Homeless',
		'No fixed abode',
		'Lives alone',
		'Lives in supported home',
		'Unsatisfactory living conditions',
		'Finding related to care and support circumstances and networks'),
		SNOMED = SNOMED, include_self = TRUE)
	
	# Remove general and normal findings
	disorders <- descendants('Disorder', SNOMED = SNOMED,
		include_self = TRUE)
	general_findings <- setdiff(
		union(descendants(c('Evaluation finding',
		'Colour finding', 'Patient condition finding'), 
		SNOMED = SNOMED, include_self = TRUE),
		SNOMEDconcept(c('General symptom',
		'Complaining of a general symptom',
		'Urine finding',
		'Finding related to pregnancy',
		'Delivery finding',
		'Safety finding',
		'Feeding finding',
		'Finding of movement',
		'Stool finding'), SNOMED = SNOMED)), disorders)
	normal <- SNOMEDconcept('^Normal| normal', exact_match = FALSE,
		SNOMED = SNOMED)
	normal <- normal[semanticType(normal) == 'finding']
	normal <- union(normal, descendants('No abnormality detected',
		include_self = TRUE, SNOMED = SNOMED))
	
	# Remove generic body system findings
	BODY_DESC <- description(
		descendants('Body structure', SNOMED = SNOMED),
		SNOMED = SNOMED, include_synonyms = TRUE)

	body_desc_generic_terms <- c(paste(BODY_DESC$term, 'finding'),
		paste('Finding of', tolower(BODY_DESC$term)),
		paste(BODY_DESC$term, 'system finding'),
		paste('Finding of', tolower(BODY_DESC$term), 'system'),
		paste(BODY_DESC$term, 'observation'),
		paste('Observation of', tolower(BODY_DESC$term)),
		paste(BODY_DESC$term, 'system observation'),
		paste('Observation of', tolower(BODY_DESC$term), 'system'))

	body_desc_generic_findings <- SNOMED$DESCRIPTION[term %in%
		body_desc_generic_terms]$conceptId
	
	setdiff(union(union(
		union(admin, social_history),
		union(normal, general_findings)),
		body_desc_generic_findings),
		union(housing_and_care_keep, regstatus_keep))
}

#' @rdname exclude_irrelevant_findings
#' @family MiADE functions
#' @export
blacklist_vague_findings <- function(SNOMED = getSNOMED()){
	intersect(descendants('Clinical finding', SNOMED = SNOMED,
		include_self = TRUE), SNOMEDconcept(c(
		'Disease',
		'Clinical finding',
		'Problem',
		'Impairment',
		'Chief complaint',
		'Sign',
		'Complaint',
		'Sequela',
		'Early complication',
		'Co-morbid conditions',
		'Pre-existing condition',
		'Acute disease',
		'Subacute disease',
		'Chronic disease',
		'General problem AND/OR complaint',
		'Evaluation finding',
		'Administrative statuses', 
		'Finding by site',
		'Finding by method',
		'Clinical history and observation findings',
		'Behaviour',
		'Adverse incident outcome categories', 
		'Prognosis/outlook finding',
		'General clinical state finding',
		'Disorder by body site',
		'Failure',
		'Acute failure',
		'Subacute failure',
		'Chronic failure',
		'Decompensation',
		'Discrepancy',
		'Idiosyncrasy',
		'Inefficiency',
		'General body state finding',
		'General clinical state finding',
		'Pressure',
		'Disease related state',
		'Absence of pressure',
		'Decreased pressure',
		'Increased pressure',
		'Swelling',
		'Disease condition finding',
		'Allergic disposition',
		'Pain',
		'Values (community)',
		'Fit and well',
		'No sensitivity to pain'), SNOMED = SNOMED))
}

#' @rdname exclude_irrelevant_findings
#' @family MiADE functions
#' @export
blacklist_almost_all_except_diseases <- function(SNOMED = getSNOMED()){
	dont_exclude <- descendants(c(
		'Disorder',
		'Amputee',
		'Mass of body region',
		'Functional disease present',
		'Seizure',
		'Syncope',
		'Falls',
		'Recurrent falls', 
		'Elderly fall',
		'Unexplained recurrent falls',
		'Unexplained falls',
		'Falls caused by medication'), SNOMED = SNOMED,
		include_self = TRUE)
	union(setdiff(descendants('Clinical finding', include_self = TRUE,
		SNOMED = SNOMED), dont_exclude),
		blacklist_vague_findings(SNOMED = SNOMED))
}

std_term <- function(x, stopwords = c('the', 'of', 'by', 'with', 'to',
	'into', 'and', 'or', 'at', 'as', 'and/or', 'in'),
	hyphens_to_space = FALSE, remove_stopwords = FALSE,
	remove_words_in_parentheses = FALSE,
	regex_do_not_remove_parentheses =
	'\\(exclud|\\(with|\\(except|\\(includ'){
	# lowercase except if single word concepts with second, third
	# or final letter upper case (i.e. an acronym like HbA1c or
	# nSTEMI)
	# decapitalise the first letter if rest of first word is 
	# lower case and non-numeric, otherwise keep case as is.
	if (remove_words_in_parentheses){
		x <- ifelse(x %like% regex_do_not_remove_parentheses, x,
			sub('(.) \\([^\\)]+\\)', '\\1', x))
	}
	if (hyphens_to_space){
		x <- gsub('-', ' ', x)
	}
	x <- gsub(' +', ' ', gsub('^ *|-|,|\\(|\\)| *$', '', x))
	words <- strsplit(x, ' ')
	x <- paste0(' ', sapply(lapply(words, function(y){
		ifelse(y %like% '^[A-Z][a-z]+$' |
			y %in% toupper(stopwords), tolower(y), y)
	}), function(y) paste(y, collapse = ' ')), ' ')
	if (remove_stopwords){
		x <- gsub(paste0(' ', paste0(stopwords, collapse = ' | '),
			' '), ' ', x)
	}
	x
}

Try the Rdiagnosislist package in your browser

Any scripts or data that you put into this service are public.

Rdiagnosislist documentation built on April 4, 2025, 2:41 a.m.