if (getRversion() >= "2.15.1")
utils::globalVariables(
c(
"PatientID",
".SD",
"CStage",
"NumbOfBx",
"Years",
"Difference",
"barplot",
"head",
"read.table",
"eHospitalNum",
"pHospitalNum",
".",
"EVENT",
"MonthYear",
"freq",
"Endoscopist",
"avg",
"v",
"destination",
"dcast",
"complete.cases",
"g",
"gvisSankey",
"head",
"pHospitalNum",
"par",
"plot",
"r",
"read.table",
"region",
"rgb",
"setDT",
"Myendo",
"Mypath",
"doc_id",
"sentence",
"upos",
"xpos",
"feats",
"udmodel_english",
"cbind_dependencies",
"head_token_id",
"deps",
"morph_voice",
"OriginalReport",
"dep_rel",
"misc",
"has_morph",
"morph_abbr",
"morph_case",
"morph_definite",
"morph_degree",
"morph_foreign",
"morph_gender",
"morph_mood",
"morph_number",
"morph_numtype",
"morph_person",
"morph_poss",
"morph_prontype",
"morph_reflex",
"morph_tense",
"morph_verbform",
"str_subset",
"lower",
"upper",
"pathSiteString",
"outputFinal",
"theme_foundation",
"udmodel",
"x3",
"FINDINGSmyDx",
"FindingsAfterProcessing",
"primDxVector",
"Temporal",
"sentence_id"
)
)
##########Text preparation##########
#' textPrep function
#'
#' This function prepares the data by cleaning
#' punctuation, checking spelling against the lexicons, mapping terms
#' according to the lexicons, removing negative expressions
#' and lower casing everything. It wraps several of the other functions
#' in the package for ease of use. The user can decide whether to also include
#' POS tagging and negative-phrase removal, as well as which extractor to use.
#' By default the extractor called 'Extractor' (which assumes all headers are
#' present in the same order in each text entry) is used, the negative phrases
#' are removed and POS tagging is not performed.
#' @keywords Find and replace
#' @param inputText The relevant pathology text column
#' @param delim the delimiters so the extractor can be used
#' @param NegEx parameter to say whether the NegativeRemove function is used.
#' @param Extractor this states which Extractor you want to use. '1' is
#' Extractor (for uniformly ordered headers), '2' is Extractor2 for
#' text where headers are sometimes missing
#' @param ExtractPOS whether POS tagging is performed: '1' runs EndoPOS and
#' merges the tags in, '2' (the default) skips tagging
#' @importFrom stringi stri_split_boundaries
#' @importFrom stringr str_split
#' @export
#' @return This returns a string vector.
#' @examples mywords<-c("Hospital Number","Patient Name:","DOB:","General Practitioner:",
#' "Date received:","Clinical Details:","Macroscopic description:",
#' "Histology:","Diagnosis:")
#' CleanResults<-textPrep(PathDataFrameFinal$PathReportWhole,mywords,NegEx="TRUE",Extractor="1",ExtractPOS="2")
textPrep<-function(inputText,delim,NegEx=c('TRUE','FALSE'),Extractor=c('1','2'),ExtractPOS=c('1','2')){
#1. Flatten the text
inputText<-tolower(inputText)
#1b General cleanup tasks
inputText <- ColumnCleanUp(inputText)
#2. Build the lexicon lists for fuzzy find and replace and term mapping
HistolType<-paste0(unlist(HistolType(),use.names=F),collapse="|")
LocationList<-paste0(unlist(LocationList(),use.names=F),collapse="|")
EventList<-paste0(unlist(EventList(),use.names=F),collapse="|")
#Spellcheck using the lexicons as reference
L <- tolower(str_split(LocationList,"\\|"))
inputText<-Reduce(function(x, nm) spellCheck(nm, L[[nm]], x), init = inputText, names(L))
L <- tolower(str_split(HistolType,"\\|"))
inputText<-Reduce(function(x, nm) spellCheck(nm, L[[nm]], x), init = inputText, names(L))
L <- tolower(str_split(EventList,"\\|"))
inputText<-Reduce(function(x, nm) spellCheck(nm, L[[nm]], x), init = inputText, names(L))
#3.Remove all the negative phrases from the report if the parameter has been supplied
#Need to write here if the NegativeRemove has been ticked then should use it
if (missing(NegEx)||NegEx=="TRUE")
{
inputText<-NegativeRemove(inputText)
}
#4. Need to map the terms to the lexicons to make sure everything standardised.
inputText<-DictionaryInPlaceReplace(inputText,LocationList())
inputText<-DictionaryInPlaceReplace(inputText,EventList())
inputText<-DictionaryInPlaceReplace(inputText,HistolType())
#returns a lower case version
inputText<-tolower(inputText)
#Merge the POS frame with the original text so tagging happens right at the beginning
#Will also need to add the Extractor output to the dataframe.
standardisedTextOutput<-stri_split_boundaries(inputText, type="sentence")
standardisedTextOutput<-lapply(standardisedTextOutput, function(x) paste0(unlist(x),collapse="\n"))
#If the standard Extractor (uniformly ordered headers) is required:
if (missing(Extractor)||Extractor=="1")
{
MyCompleteFrame<-Extractor(as.character(standardisedTextOutput),tolower(delim))
}
#If the more flexible Extractor2 (for headers that are sometimes missing) is required:
if (Extractor=="2")
{
#Convert the delimiters into a list and use it to initiate an empty data frame
#which also contains the original column:
#EndoscTree<-list(delim)
easydf <- data.frame(matrix(ncol = length(delim),nrow=length(inputText)))
#Make sure the delimiters are lower case as the text will be by now
delim<-tolower(delim)
#Name the new dataframe columns
#colnames(easydf)<-delim
easydf$inputText<-inputText
for(i in 1:(length(delim)-1)) {
#Each pass adds a new extracted column, so accumulate into easydf
easydf<-Extractor2(easydf,'inputText',as.character(delim[i]),
as.character(delim[i+1]),as.character(delim[i]))
}
MyCompleteFrame<-easydf
}
#Optionally add the POS (parameter driven)
if (ExtractPOS=="1")
{
#Create the dataframe with all the POS extracted:
MyPOSframe<-EndoPOS(as.character(standardisedTextOutput))
#Create a column in both dataframes that can act as a join
MyPOSframe$RowIndex<-as.numeric(rownames(MyPOSframe))
MyCompleteFrame$RowIndex<-as.numeric(rownames(MyCompleteFrame))
#Now merge the POS tags and the extraction:
MyCompleteFrame<-merge(MyCompleteFrame,MyPOSframe,by="RowIndex")
MyCompleteFrame<-data.frame(MyCompleteFrame,stringsAsFactors = FALSE)
}
if (missing(ExtractPOS)||ExtractPOS=="2")
{
MyCompleteFrame<-MyCompleteFrame
}
#Last minute clean up:
names(MyCompleteFrame) <- gsub(".", "", names(MyCompleteFrame), fixed = TRUE)
return(MyCompleteFrame)
}
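#A minimal (not run) sketch contrasting the two extractor modes; 'mywords' and
#'PathDataFrameFinal' are the same objects as in the roxygen example above:
#mywords<-c("Hospital Number","Patient Name:","DOB:","General Practitioner:",
#"Date received:","Clinical Details:","Macroscopic description:",
#"Histology:","Diagnosis:")
##Headers uniformly ordered in every report:
#res1<-textPrep(PathDataFrameFinal$PathReportWhole,mywords,NegEx="TRUE",Extractor="1",ExtractPOS="2")
##Headers sometimes missing or re-ordered, so use Extractor2 instead:
#res2<-textPrep(PathDataFrameFinal$PathReportWhole,mywords,NegEx="TRUE",Extractor="2",ExtractPOS="2")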
#' Parts of speech tagging for reports
#'
#' This uses udpipe to tag the text. It then compresses all of the text
#' so you have continuous POS tagging of the whole text. The udpipe package has to be
#' pre-downloaded to run this.
#'
#'
#' @param inputString The input string vector
#' @keywords Macroscopic
#' @importFrom stringr str_replace
#' @importFrom udpipe udpipe_download_model udpipe_load_model udpipe_annotate cbind_morphological
#' @importFrom dplyr group_by summarise %>%
#' @export
#' @examples library(udpipe)
#'
#' #Just some quick cleaning up- this will be done in the actual data set eventually
#' #Myendo$OGDReportWhole<-gsub("\\.\\s+\\,"," ",Myendo$OGDReportWhole)
#' #Myendo$OGDReportWhole<-gsub("^\\s+\\,"," ",Myendo$OGDReportWhole)
#' #Myendo$RowIndex<-as.numeric(rownames(Myendo))
#'
#' #We will only use the first 100
#' #Myendo2<-head(Myendo,100)
#'
#' #Run the function
#' #MyPOSframe<-EndoPOS(Myendo2$OGDReportWhole) #returns a dataframe
#'
#' #Then merge the MyPOSframe with the original by row number.
#' #Myendo$RowIndex<-as.numeric(rownames(Myendo))
#' #Get the whole merged dataset with all the POS tags and morphological
#' #and all the dependencies.
#' #MergedUp<-merge(Myendo2,MyPOSframe,by.x="RowIndex",by.y="doc_id")
EndoPOS<-function(inputString){
#Get into a tokenised form first
#Have to do this on the raw pre-prepared data so that sentences can be recognised.
#Get the model from the data folder to save user from having to download it each time.
udmodel_english<-udpipe_load_model(file = system.file("POS_Corpus","english-ewt-ud-2.3-181115.udpipe",package = "EndoMineR"))
x <- udpipe_annotate(udmodel_english, x = inputString)
x2 <- as.data.frame(x)
#To get all in one long cell per original document:
Newx<-x2 %>% group_by(doc_id,sentence_id) %>% summarise(upos = paste(upos, collapse=";"),
sentence=paste(unique(sentence), collapse=";"),
xpos = paste(xpos, collapse=";"),
feats = paste(feats, collapse=";"),
head_token_id = paste(head_token_id, collapse=";"),
dep_rel = paste(dep_rel, collapse=";"),
deps = paste(deps, collapse=";"),
misc = paste(misc, collapse=";"))%>%group_by(doc_id)
#has_morph = paste(has_morph, collapse=";"),
#morph_abbr = paste(morph_abbr, collapse=";"),
#morph_case = paste(morph_case, collapse=";"),
#morph_definite = paste(morph_definite, collapse=";"),
#morph_degree = paste(morph_degree, collapse=";"),
#morph_gender = paste(morph_gender, collapse=";"),
#morph_mood = paste(morph_mood, collapse=";"),
#morph_number = paste(morph_number, collapse=";"),
#morph_numtype = paste(morph_numtype, collapse=";"),
#morph_person = paste(morph_person, collapse=";"),
#morph_prontype = paste(morph_prontype, collapse=";"),
#morph_tense = paste(morph_tense, collapse=";"),
#morph_typo = paste(morph_typo, collapse=";"),
#morph_verbform = paste(morph_verbform, collapse=";")
#morph_voice = paste(morph_voice, collapse=";"))
Newx<-data.frame(Newx)
Newxdoc<-Newx %>%
group_by(doc_id) %>% summarise(
sentence = paste(sentence, collapse="\n"),
upos = paste(upos, collapse="\n"),
xpos= paste(collapse="\n"),
feats = paste(feats, collapse="\n"),
head_token_id = paste(head_token_id, collapse="\n"),
dep_rel = paste(dep_rel, collapse="\n"),
deps = paste(deps, collapse="\n"),
misc = paste(misc, collapse="\n"))
#has_morph = paste(has_morph, collapse="\n"),
#morph_abbr = paste(morph_abbr, collapse="\n"),
#morph_case = paste(morph_case, collapse="\n"),
#morph_definite = paste(morph_definite, collapse="\n"),
#morph_degree = paste(morph_degree, collapse="\n"),
#morph_gender = paste(morph_gender, collapse="\n"),
#morph_mood = paste(morph_mood, collapse="\n"),
#morph_number = paste(morph_number, collapse="\n"),
#morph_numtype = paste(morph_numtype, collapse="\n"),
#morph_person = paste(morph_person, collapse="\n"),
#morph_poss = paste(morph_poss, collapse="\n"),
#morph_prontype = paste(morph_prontype, collapse="\n"),
#morph_tense = paste(morph_tense, collapse="\n"),
#morph_typo = paste(morph_typo, collapse="\n"),
#morph_verbform = paste(morph_verbform, collapse="\n"),
#morph_voice = paste(morph_voice, collapse="\n"))
# Need to convert the docids as they are alphabetically grouped, to allow merge
# with original dataframe
Newxdoc$doc_id<-as.numeric(gsub("doc","",Newxdoc$doc_id))
Newxdoc<-data.frame(Newxdoc[order(Newxdoc$doc_id),],stringsAsFactors=FALSE)
#Newxdoc$sentence<-OriginalReport
return(Newxdoc)
}
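#A minimal (not run) sketch of joining the EndoPOS output back onto the source
#dataframe; 'Myendo' is the dataset used in the roxygen example, and the merge
#assumes the numeric doc_ids line up with the row order of the input:
#Myendo$RowIndex<-as.numeric(rownames(Myendo))
#MyPOSframe<-EndoPOS(Myendo$OGDReportWhole)
#MergedUp<-merge(Myendo,MyPOSframe,by.x="RowIndex",by.y="doc_id")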
#' Extracts the columns from the raw report
#'
#' This is the main extractor for the Endoscopy and Histology report.
#' It depends on the user creating a list of words or characters that
#' act as the boundaries the text should be split against. The list entries
#' form the regex used to split the text, and whatever has been specified
#' in the list is used as a column header. Column headers don't tolerate
#' special characters such as :, ? or /, and can't start with a number,
#' so these have to be dealt with in the text before processing
#'
#' @param inputString the column to extract from
#' @param delim the vector of words that will be used as the boundaries to
#' extract against
#' @importFrom stringr str_extract
#' @importFrom tidyr separate
#' @importFrom rlang sym
#' @keywords Extraction
#' @export
#' @examples
#' # As column names can't start with a number, one of the dividing
#' # words has to be converted
#' # A list of dividing words (which will also act as column names)
#' # is then constructed
#' mywords<-c("Hospital Number","Patient Name:","DOB:","General Practitioner:",
#' "Date received:","Clinical Details:","Macroscopic description:",
#' "Histology:","Diagnosis:")
#' Mypath2<-Extractor(PathDataFrameFinal$PathReportWhole,mywords)
#'
Extractor <- function(inputString, delim) {
#Save inputString so it can be merged back in as the original later
inputStringForLater<- inputString
#Create dataframe for tidyverse usage
inputStringdf <- data.frame(inputString,stringsAsFactors = FALSE)
#Do the separation according to delimiters
inputStringdf <- inputStringdf %>%
tidyr::separate(inputString, into = c("added_name",delim),
sep = paste(delim, collapse = "|"),
extra = "drop", fill = "right")
#Make sure columns names are correct
names(inputStringdf) <- gsub(".", "", names(inputStringdf), fixed = TRUE)
#Get rid of the errant first column
inputStringdf <- inputStringdf [,-1]
#Add the original column back in so have the original reference
inputStringdf$Original<- inputStringForLater
inputStringdf <-data.frame(inputStringdf,stringsAsFactors = FALSE)
names(inputStringdf)<-gsub(".","",names(inputStringdf),fixed=TRUE)
return(inputStringdf)
}
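#A minimal (not run) toy example of the splitting logic with hypothetical
#reports, independent of the package datasets; the delimiter words become
#(punctuation-stripped) column names:
#reports<-c("Findings: small polyp Diagnosis: benign",
#"Findings: normal mucosa Diagnosis: no pathology")
#out<-Extractor(reports,c("Findings:","Diagnosis:"))
##out now has 'Findings' and 'Diagnosis' columns plus the 'Original' text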
#' Extractor2
#'
#' This is the alternative extractor for the Endoscopy and Histology report.
#' This performs essentially the same task as the main extractor but is useful when the
#' semi-structured text is organised in a non-standard way, ie the delimiting text is
#' not always in the same order.
#' As per the main Extractor, this function relies on the user creating a list of
#' words or characters that act as the words that should be split against.
#' The list is then fed to Extractor2 in a loop so that each entry acts as the
#' beginning and the end of the regex used to split the text. Whatever has been
#' specified in the list is used as a column header. Column headers don't tolerate
#' special characters such as :, ? or /, and can't start with a number, so these
#' have to be dealt with in the text before processing
#'
#' @param x the dataframe
#' @param y the column to extract from
#' @param stra the start of the boundary to extract
#' @param strb the end of the boundary to extract
#' @param t the column name to create
#' @importFrom stringr str_extract
#' @keywords Extraction
#' @export
#' @examples v<-TheOGDReportFinal
#' Myendo<-TheOGDReportFinal
#' Myendo$OGDReportWhole<-gsub('2nd Endoscopist:','Second endoscopist:',
#' Myendo$OGDReportWhole)
#'
#' EndoscTree<-list('Hospital Number:','Patient Name:','General Practitioner:',
#' 'Date of procedure:','Endoscopist:','Second Endoscopist:','Medications',
#' 'Instrument','Extent of Exam:','Indications:','Procedure Performed:',
#' 'Findings:','Endoscopic Diagnosis:')
#'
#' for(i in 1:(length(EndoscTree)-1)) {
#' Myendo<-Extractor2(Myendo,'OGDReportWhole',as.character(EndoscTree[i]),
#' as.character(EndoscTree[i+1]),as.character(EndoscTree[i]))
#' }
#' res<-Myendo
Extractor2 <- function(x, y, stra, strb, t) {
t <- gsub("[^[:alnum:],]", " ", t)
t <- gsub(" ", "", t, fixed = TRUE)
x[, t] <- stringr::str_extract(x[, y], stringr::regex(paste(stra,
"(.*)", strb, sep = ""), dotall = TRUE))
x[, t] <- gsub("\\\\.*", "", x[, t])
names(x[, t]) <- gsub(".", "", names(x[, t]), fixed = TRUE)
x[, t] <- gsub(" ", "", x[, t])
x[, t] <- gsub(stra, "", x[, t], fixed = TRUE)
if (strb != "") {
x[, t] <- gsub(strb, "", x[, t], fixed = TRUE)
}
x[, t] <- gsub(" ", "", x[, t])
x[, t]<- ColumnCleanUp(x[, t])
return(x)
}
#' Dictionary In Place Replace
#'
#' This maps terms in the text and replaces them with the
#' standardised term (as mapped in the lexicon file) within the text.
#' It is used within the textPrep function.
#'
#' @param inputString the input string
#' @param list The replacing list
#' @keywords Replace
#' @export
#' @return This returns a character vector
#' @examples inputText<-DictionaryInPlaceReplace(TheOGDReportFinal$OGDReportWhole,LocationList())
DictionaryInPlaceReplace <- function(inputString,list) {
key<-names(list)
value<-as.character(t(data.frame(list,stringsAsFactors=FALSE))[,1])
list<-data.frame(key,value,stringsAsFactors = FALSE)
new_string <- inputString
for (k in seq_len(nrow(list))) {
new_string <- gsub(list$key[k], list$value[k], new_string)
}
return(new_string)
}
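#A minimal (not run) toy example with a hypothetical two-entry lexicon, showing
#the key (pattern) to value (standard term) in-place replacement:
#lex<-list("gastric antrum"="antrum","oesophagogastric junction"="goj")
#DictionaryInPlaceReplace("biopsy from the gastric antrum",lex)
##returns "biopsy from the antrum"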
#' Removes negative and normal sentences
#'
#' Extraction of the Negative sentences so that normal findings can be
#' removed and not counted when searching for true diseases. eg remove
#' No evidence of candidal infection so it doesn't get included if
#' looking for candidal infections.
#' It should be used after the Extractor and the optional
#' NewLines has been used. It can be used as part of the other functions
#' or as a way of providing a 'positive diagnosis only' type output (see
#' HistolDx)
#' @param inputText column of interest
#' @keywords Negative Sentences
#' @importFrom stringr str_replace
#' @export
#' @return This returns a column within a dataframe. This should be changed to a
#' character vector eventually
#' @examples # Build a character vector and then
#' # incorporate into a dataframe
#' anexample<-c("There is no evidence of polyp here",
#' "Although the prep was poor,there was no adenoma found",
#' "The colon was basically inflammed, but no polyp was seen",
#' "The Barrett's segment was not biopsied",
#' "The C0M7 stretch of Barrett's was flat")
#' anexample<-data.frame(anexample)
#' names(anexample)<-"Thecol"
#' # Run the function on the dataframe and it should get rid of sentences (and
#' # parts of sentences) with negative parts in them.
#' hh<-NegativeRemove(anexample$Thecol)
NegativeRemove <- function(inputText) {
# Conjunctions
inputText <- gsub(
"(but|although|however|though|apart| -|otherwise|unremarkable|\\,)[a-zA-Z0-9_ ]+(no |negative|unremarkable|-ve|normal).*?(\\.|\\n|:|$)\\R*",
"\\.\n",
inputText,
perl = TRUE,
ignore.case = TRUE
)
inputText <-
gsub(
"(no |negative|unremarkable|-ve| normal) .*?([Bb]ut| [Aa]lthough| [Hh]owever| [Tt]hough| [Aa]part| -|[Oo]therwise| [Uu]nremarkable)\\R*",
"",
inputText,
perl = TRUE,
ignore.case = TRUE
)
# Nots
inputText<-
gsub(
".*(?:does|is|was|were|are|have) not.*?(\\.|\n|:|$)\\R*",
"",
inputText,
perl = TRUE,
ignore.case = TRUE
)
inputText <-
gsub(
"not (biop|seen).*?(\\.|\n|:|$)\\R*",
"",
inputText,
perl = TRUE,
ignore.case = TRUE
)
# Nos
inputText <-
gsub(
".*(?:((?<!with)|(?<!there is )|(?<!there are ))\\bno\\b(?![?:A-Za-z])|([?:]\\s*N(?![A-Za-z]))).*\\R*",
"",
inputText,
perl = TRUE,
ignore.case = TRUE
)
#Withouts
inputText <-
gsub(
".*without.*\\R*",
"",
inputText,
perl = TRUE,
ignore.case = TRUE
)
inputText <-
gsub(
".*(:|[?])\\s*(\\bno\\b|n)\\s*[^A-Za-z0-9].*?(\\.|\n|:|$)\\R*",
"",
inputText,
perl = TRUE,
ignore.case = TRUE
)
inputText <-
gsub(
".*(negative|neither).*?(\\.|\n|:|$)\\R*",
"",
inputText,
perl = TRUE,
ignore.case = TRUE
)
# Keep abnormal in- don't ignore case as it messes it up
inputText <- str_replace(inputText,
".*(?<!b)[Nn]ormal.*?(\\.|\n|:|$)", "")
# Other negatives
inputText <- gsub(
".*there (is|are|were) \\bno\\b .*?(\\.|\n|:|$)\\R*",
"",
inputText,
perl = TRUE,
ignore.case = TRUE
)
inputText <- gsub(
"(within|with) (normal|\\bno\\b) .*?(\\.|\n|:|$)\\R*",
"",
inputText,
perl = TRUE,
ignore.case = TRUE
)
# Specific cases
inputText <- gsub(
".*duct.*clear.*?(\\.|\n|:|$)\\R*",
"",
inputText,
perl = TRUE,
ignore.case = TRUE
)
# Unanswered prompt lines
inputText <- gsub(".*:(\\.|\n)\\R*",
"",
inputText,
perl = TRUE,
ignore.case = TRUE)
# Time related phrases eg post and previous
inputText <- gsub(" (post|previous|prior)[^a-z].+?[A-Za-z]{3}",
" TIME_REPLACED",
inputText,
perl = TRUE,
ignore.case = TRUE)
return(inputText)
}
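#A minimal (not run) before/after sketch of the negation stripping:
#NegativeRemove("The colon was inflamed, but no polyp was seen")
##the ", but no polyp was seen" clause is stripped, leaving the positive finding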
#' Find and Replace function
#'
#' This is a helper function for finding and replacing from dictionaries like the event list
#' It uses fuzzy find and replace to account for spelling errors
#' @keywords Find and replace
#' @importFrom utils aregexec
#' @param pattern the pattern to look for
#' @param replacement the pattern to replace it with
#' @param x the target string
#' @param fixed whether the pattern should be treated as a fixed string
#' @param ... further arguments passed to aregexec
#' @return This returns a character vector
#' @examples L <- tolower(stringr::str_split(HistolType(),"\\|"))
#' inputText<-TheOGDReportFinal$OGDReportWhole
#' inputText<-Reduce(function(x, nm) spellCheck(nm, L[[nm]], x), init = inputText, names(L))
#'
spellCheck <- function(pattern, replacement, x, fixed = FALSE, ...) {
m <- aregexec(pattern, x, fixed = fixed, ignore.case = TRUE, ...)
r <- regmatches(x, m)
lens <- lengths(r)
if (all(lens == 0)) return(x) else
replace(x, lens > 0, mapply(sub, r[lens > 0], replacement, x[lens > 0]))
}
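#A minimal (not run) sketch of the fuzzy matching: aregexec tolerates small
#edit distances, so a hypothetical one-letter typo of 'antrum' is corrected:
#spellCheck("antrum","antrum",c("biopsy from the antrim","normal stomach"))
##returns c("biopsy from the antrum","normal stomach")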
#' Tidies up messy columns
#'
#' This does a general clean up of whitespace,
#' semi-colons,full stops at the start
#' of lines and converts end sentence full stops to new lines.
#' @param vector column of interest
#' @keywords Cleaner
#' @export
#' @importFrom stringr str_replace
#' @importFrom stringi stri_split_boundaries
#' @return This returns a character vector
#' @examples ii<-ColumnCleanUp(Myendo$Findings)
ColumnCleanUp <- function(vector) {
#Optimise for tokenisation: full stops followed by a number confuse the sentence splitter, so add a letter before the number
vector<-gsub("\\.\\s*(\\d)","\\.T\\1",vector)
vector<-gsub("([A-Za-z]\\s*)\\.(\\s*[A-Za-z])","\\1\n\\2",vector)
vector<-gsub("([A-Za-z]+.*)\\?(.*[A-Za-z]+.*)","\\1 \\2",vector)
vector<-gsub("\\.,","\\.",vector)
vector<-gsub(",([A-Z])","\\.\\1",vector)
vector<-gsub("\\. ,",".",vector)
vector<-gsub("\\.\\s+\\,"," ",vector)
vector<-gsub("^\\s+\\,"," ",vector)
vector<-gsub("\\\\.*", "", vector)
vector<-gsub(" ", "", vector)
#Get rid of query type punctuation:
vector<-gsub("(.*)\\?(.*[A-Za-z]+)","\\1 \\2",vector)
vector<-gsub("'","",vector,fixed=TRUE)
#Have to tokenize here so you can strip punctuation without getting rid of newlines
standardisedTextOutput<-stri_split_boundaries(vector, type="sentence")
#Get rid of whitespace
standardisedTextOutput<-lapply(standardisedTextOutput, function(x) trimws(x))
#Get rid of trailing punctuation
standardisedTextOutput<-lapply(standardisedTextOutput,function(x) gsub("^[[:punct:]]+","",x))
standardisedTextOutput<-lapply(standardisedTextOutput,function(x) gsub("[[:punct:]]+$","",x))
#Question marks result in tokenised sentences, so whenever anyone writes query Barrett's it gets split.
standardisedTextOutput<-lapply(standardisedTextOutput,function(x) gsub("([A-Za-z]+.*)\\?(.*[A-Za-z]+.*)","\\1 \\2",x))
standardisedTextOutput<-lapply(standardisedTextOutput,function(x) gsub("(Dr.*?[A-Za-z]+)|([Rr]eported.*)|([Dd]ictated by.*)","",x))
#Get rid of strange things
standardisedTextOutput<-lapply(standardisedTextOutput,function(x) gsub("\\.\\s+\\,"," ",x))
standardisedTextOutput<-lapply(standardisedTextOutput,function(x) gsub("^\\s+\\,"," ",x))
retVector<-sapply(standardisedTextOutput, function(x) paste(x,collapse="\n"))
return(retVector)
}
############## Extraction Utilities - Basic ###################
#' Extract sentences with the POS tags desired
#'
#' This uses udpipe to tag the text. It then compresses all of the text
#' so you have continuous POS tagging or the whole text. The udpipe package has to be
#' pre downloaded to run this.
#'
#'
#' @param POS_seq The POS tag sequence to extract
#' @param columnPOS The column with the POS tags
#' @param columnSentence The column with the sentence
#' @keywords Macroscopic
#' @export
#' @examples #Has to be after POS extraction is done
#' #mylist<-POS_Extract("NOUN;ADJ;NOUN",MergedUp$upos,MergedUp$sentence)
#' #MergedUp$Extr<-mylist
POS_Extract<-function(POS_seq,columnPOS,columnSentence){
#Convert the POS tags to list nested list
columnPOS<-strsplit(columnPOS,"\n")
#Search the list for the tags that match, and keep the matching indices in a list
myIndexes<-lapply(columnPOS,function(x) grep(POS_seq,x))
#Get the index for the matched tags
#Return the index of the matched tags for the other columns too
# Select the corresponding indices for the next column:
columnSentence<-strsplit(columnSentence,"\n")
#Select the indices
my<-Map(`[`, columnSentence, myIndexes)
#For each part of the list select out the index of the list:
return(my)
}
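#A minimal (not run) sketch: the tag and sentence columns are parallel
#newline-separated strings, so sentences are selected wherever the tag string
#matches; 'MergedUp' is the merged frame from the EndoPOS example:
#mylist<-POS_Extract("NOUN;ADJ",MergedUp$upos,MergedUp$sentence)
##each list element holds the sentences whose POS sequence contained "NOUN;ADJ"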
#' Extrapolate from Dictionary
#'
#' Provides term mapping and extraction in one.
#' Standardises any term according to a mapping lexicon provided and then
#' extracts the term. This is
#' different from DictionaryInPlaceReplace in that it provides a new column
#' with the extracted terms as opposed to changing them in place
#'
#' @param inputString The text string to process
#' @param list The mapping lexicon (named list) of terms to iterate through
#' @keywords Withdrawal
#' @importFrom fuzzyjoin regex_left_join
#' @importFrom dplyr as_tibble pull
#' @importFrom purrr map
#' @export
#' @examples #Firstly we extract histology from the raw report
#' # The function then standardises the histology terms through a series of
#' # regular expressions and then extracts the type of tissue
#' Mypath$Tissue<-ExtrapolatefromDictionary(Mypath$Histology,HistolType())
ExtrapolatefromDictionary<-function(inputString,list){
#lower case the input string
inputString<-tolower(inputString)
#Get the names from the named list
mylist<-paste0(unlist(list,use.names=F),collapse="|")
#Make the inputstrings unique
ToIndex<-lapply(inputString, function(x) unique(x))
#Give each an index in the list (taken from the location list)
#The results to replace in the string
replace<-names(list)
#The result of the replacement
replaceValue<-paste0(unlist(list,use.names=F))
#Create a dataframe
df1<-data.frame(key = replace, val = replaceValue)
#Create a tibble to merge with the list
d1 <- as_tibble(df1)
#Select the elements that have characters in them
i1 <- lengths(ToIndex) > 0
#Do the merge. This takes the key to lookup and then if found, replaces
#the value with value associated with it in the table. I think the pull
#function also creates a new column with the value in it. This is an important
#function as uses a table lookup
ToIndex[i1] <- map(ToIndex[i1], ~
tibble::tibble(key = .x) %>%
fuzzyjoin::regex_left_join(d1) %>%
pull(val))
#This unlists the nested list
ToIndex<-lapply(ToIndex, function(x) unlist(x,recursive=F))
#This collapses the list so that the output is a single string
ToIndex<-unlist(lapply(ToIndex, function(x) paste(x,collapse=";")))
return(ToIndex)
}
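#A minimal (not run) toy example with a hypothetical lexicon; each input string
#is fuzzily joined against the lexicon keys (as regexes) and the standardised
#value is returned, ready to become a new column:
#lex<-list("biopsy"="biopsy","polypectomy"="polypectomy")
#ExtrapolatefromDictionary(c("Antral biopsy x2","Polypectomy specimen"),lex)
##returns c("biopsy","polypectomy")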
############## Extraction Utilities- Colocation ###################
#' EntityPairs_OneSentence
#'
#' Use to see if words from two lists co-exist within a sentence, eg site and tissue type.
#' This function only looks within a single sentence for the two terms; use
#' EntityPairs_TwoSentence when the related terms may be in different sentences.
#' @keywords PathPairLookup
#' @param inputText The relevant pathology text column
#' @param list1 First list to refer to
#' @param list2 The second list to look for
#' @importFrom purrr flatten_chr map_chr map map_if
#' @importFrom stringr str_extract_all
#' @importFrom stringi stri_split_boundaries
#' @export
#' @examples # tbb<-EntityPairs_OneSentence(Mypath$Histology,HistolType(),LocationList())
EntityPairs_OneSentence<-function(inputText,list1,list2){
#dataframe<-data.frame(dataframe,stringsAsFactors = FALSE)
list1<-paste0(unlist(list1,use.names=F),collapse="|")
list2<-paste0(unlist(list2,use.names=F),collapse="|")
#text<-textPrep(inputText)
text<-stri_split_boundaries(inputText, type="sentence")
r1 <-lapply(text,function(x) Map(paste, str_extract_all(tolower(x),tolower(list2)), str_extract_all(tolower(x),tolower(list1)), MoreArgs = list(sep=":")))
r1<-lapply(r1,function(x) unlist(x))
#Unlist into a single row-This should output a character vector
out<-lapply(r1,function(x) paste(x,collapse=","))
return(out)
}
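#A minimal (not run) toy example pairing a site list with a tissue list; both
#lists are hypothetical stand-ins for LocationList() and HistolType():
#sites<-list("oesophagus","antrum")
#tissue<-list("biopsy","polypectomy")
#EntityPairs_OneSentence("Biopsy taken from the antrum.",tissue,sites)
##first element of the returned list is "antrum:biopsy"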
#' EntityPairs_TwoSentence
#'
#' This is used to look for relationships between site and event especially for endoscopy events
#' @keywords Find and replace
#' @param inputString The relevant pathology text column
#' @param list1 The initial list to assess
#' @param list2 The other list to look for
#' @importFrom stringr str_replace_na str_c str_split str_which str_extract_all regex str_subset
#' @importFrom stringi stri_split_boundaries
#' @importFrom rlang is_empty
#' @importFrom purrr flatten_chr map_chr map map_if
#' @export
#' @examples # tbb<-EntityPairs_TwoSentence(Myendo$Findings,EventList(),HistolType())
EntityPairs_TwoSentence<-function(inputString,list1,list2){
#Prepare the text to be back into a tokenised version.
#text<-textPrep(inputText)
text<-stri_split_boundaries(inputString, type="sentence")
text<-lapply(text,function(x) tolower(x))
#Some clean up to get rid of white space - probably already covered by ColumnCleanUp but kept for later investigation
text<-lapply(text,function(x) gsub("[[:punct:]]+"," ",x))
#Prepare the list to use as a lookup:
tofind <-paste(tolower(list2),collapse="|")
#Prepare the second list to use as a lookup
EventList<-unique(tolower(unlist(list1,use.names = FALSE)))
text<-sapply(text,function(x) {
#Cleaning
x<-trimws(x)
#Prepare the text so that all empty text is replaced with NA and
#ready for processing
try(words <-
x %>%
unlist() %>%
str_replace_na()%>%
str_c(collapse = ' ') %>%
str_split(' ') %>%
`[[`(1))
words<-words[words != ""]
x1 <- str_extract_all(tolower(x),tolower(paste(unlist(list1), collapse="|")))
i1 <- which(lengths(x1) > 0)
try(if(any(i1)) {
EventList %>%
map(
~words %>%
str_which(paste0('^.*', .x)) %>%
map_chr(
~words[1:.x] %>%
str_c(collapse = ' ') %>%
str_extract_all(regex(tofind, ignore_case = TRUE)) %>%
map_if(rlang::is_empty, ~ NA_character_) %>%
flatten_chr()%>%
`[[`(length(.)) %>%
.[length(.)]
) %>%
paste0(':', .x)
) %>%
unlist() %>%
str_subset('.+:')
} else "")
}
)
return(text)
}