R/Basic_CleanUp.R

Defines functions EntityPairs_TwoSentence EntityPairs_OneSentence ExtrapolatefromDictionary EndoPaste ColumnCleanUp spellCheck NegativeRemove NegativeRemoveWrapper DictionaryInPlaceReplace Extractor textPrep

Documented in ColumnCleanUp DictionaryInPlaceReplace EndoPaste EntityPairs_OneSentence EntityPairs_TwoSentence Extractor ExtrapolatefromDictionary NegativeRemove NegativeRemoveWrapper spellCheck textPrep

if (getRversion() >= "2.15.1")
  utils::globalVariables(
    c(
      "PatientID",
      ".SD",
      "CStage",
      "NumbOfBx",
      "Years",
      "Difference",
      "barplot",
      "head",
      "read.table",
      "eHospitalNum",
      "pHospitalNum",
      ".",
      "EVENT",
      "MonthYear",
      "freq",
      "Endoscopist",
      "avg",
      "v",
      "destination",
      "dcast",
      "complete.cases",
      "g",
      "gvisSankey",
      "head",
      "pHospitalNum",
      "par",
      "plot",
      "r",
      "read.table",
      "region",
      "rgb",
      "setDT",
      "Myendo",
      "Mypath",
      "doc_id",
      "sentence",
      "upos",
      "xpos",
      "feats",
      "udmodel_english",
      "cbind_dependencies",
      "head_token_id",
      "deps",
      "morph_voice",
      "OriginalReport",
      "dep_rel",
      "misc",
      "has_morph",
      "morph_abbr",
      "morph_case",
      "morph_definite",
      "morph_degree",
      "morph_foreign",
      "morph_gender",
      "morph_mood",
      "morph_number",
      "morph_numtype",
      "morph_person",
      "morph_poss",
      "morph_prontype",
      "morph_reflex",
      "morph_tense",
      "morph_verbform",
      "str_subset",
      "lower",
      "upper",
      "pathSiteString",
      "outputFinal",
      "theme_foundation",
      "udmodel",
      "x3",
      "FINDINGSmyDx",
      "FindingsAfterProcessing",
      "primDxVector",
      "Temporal",
      "sentence_id",
      "img",
      "Endo_ResultEntered",
      "V1",
      "pandoc.image.return"
    )
  )











##########Text preparation##########



#' Combine all the text cleaning and extraction functions into one
#'
#' This function prepares the data by cleaning 
#' punctuation, checking spelling against the lexicons, mapping terms
#' according to the lexicons and lower casing everything. 
#' It contains several of the other functions
#' in the package for ease of use. 
#' @keywords text cleaning
#' @param inputText The relevant pathology text columns
#' @param delim the delimitors so the extractor can be used
#' @importFrom stringi stri_split_boundaries
#' @export
#' @family NLP - Text Cleaning and Extraction
#' @return This returns a string vector.
#' @examples mywords<-c("Hospital Number","Patient Name:","DOB:","General Practitioner:",
#' "Date received:","Clinical Details:","Macroscopic description:",
#' "Histology:","Diagnosis:")
#' CleanResults<-textPrep(PathDataFrameFinal$PathReportWhole,mywords)



#Need to make sure the sentences are separated in the Extractor column by a separator such as carriage return
#So that a tokenizer can be used for NegEx or any other function.
#Also need to get rid of ASCII \\X10 etc in the Column Cleanup.


textPrep<-function(inputText,delim){
  
  #1. Flatten the text..
  inputText<-tolower(inputText)
  #Need to do clean up on the data first before sentence splitting:
  inputText<-ColumnCleanUp(inputText)
  
  #1b General cleanup tasks tokenize then clean then recombine
  standardisedTextOutput<-stri_split_boundaries(inputText, type="sentence")
  standardisedTextOutput<-lapply(standardisedTextOutput, function(x) ColumnCleanUp(x))
  inputText<-lapply(standardisedTextOutput, function(x) paste0(unlist(x),collapse="\n"))
  
  #2a . Fuzzy find and replace and term mapping using the find and replace function above using the Location list
  HistolType<-paste0(unlist(HistolType(),use.names=F),collapse="|")
  LocationList<-paste0(unlist(LocationList(),use.names=F),collapse="|")
  EventList<-paste0(unlist(EventList(),use.names=F),collapse="|")
  
  #Spellcheck using the lexicons as reference
  L <- tolower(str_split(LocationList,"\\|"))
  inputText<-Reduce(function(x, nm) spellCheck(nm, L[[nm]], x), init = inputText, names(L))
  L <- tolower(str_split(HistolType,"\\|"))
  inputText<-Reduce(function(x, nm) spellCheck(nm, L[[nm]], x), init = inputText, names(L))
  L <- tolower(unique(unlist(EventList, use.names = FALSE)))
  inputText<-Reduce(function(x, nm) spellCheck(nm, L[[nm]], x), init = inputText, names(L))
  

  #4. Need to map the terms to the lexicons to make sure everything standardised.
  inputText<-DictionaryInPlaceReplace(inputText,LocationList())
  inputText<-DictionaryInPlaceReplace(inputText,EventList())
  inputText<-DictionaryInPlaceReplace(inputText,HistolType())
  
  #returns a lower case version
  inputText<-tolower(inputText)
  #Will also need to add the Extractor output to the dataframe.
  MyCompleteFrame<-Extractor(as.character(inputText),tolower(delim))
  #Last minute clean up:
  names(MyCompleteFrame) <- gsub(".", "", names(MyCompleteFrame), fixed = TRUE)
  
  return(MyCompleteFrame)
}




#' Extract columns from the raw text
#'
#' This is the main extractor for the Endoscopy and Histology report.
#' This relies on the user creating a list of words representing the
#' subheadings. The list is then fed to the
#' Extractor so that it acts as the beginning and the end of the
#' regex used to split the text. Whatever has been specified in the list
#' is used as a column header. Column headers don't tolerate special characters
#' like : or ? and / and don't allow numbers as the start character so these
#' have to be dealt with in the text before processing
#'
#' @param inputString the column to extract from
#' @param delim the vector of words that will be used as the boundaries to
#' extract against
#' @importFrom stringr str_extract
#' @importFrom stringi stri_replace_all_fixed stri_replace_all
#' @importFrom tidyr separate
#' @importFrom rlang sym
#' @keywords Extraction
#' @family NLP - Text Cleaning and Extraction
#' @export
#' @examples
#' # As column names cant start with a number, one of the dividing
#' # words has to be converted
#' # A list of dividing words (which will also act as column names)
#' # is then constructed
#' mywords<-c("Hospital Number","Patient Name:","DOB:","General Practitioner:",
#' "Date received:","Clinical Details:","Macroscopic description:",
#' "Histology:","Diagnosis:")
#' Mypath2<-Extractor(PathDataFrameFinal$PathReportWhole,mywords)


Extractor <- function(inputString, delim) {
  
  
  #Create a named list of words
  delim <- stri_replace_all_fixed(delim,":","")
  names(delim) <- trimws(delim)
  #Add a : to the tags 
  
  delim <- gsub("(.*)","\\1: ",delim)
  delim<-as.list(delim)
  
  inputString<-stri_replace_all(inputString,"",regex="'|\\)|:|\\(")
  
  
  #Do the find and replace to place the tags in the input text
  inputString<-DictionaryInPlaceReplace(inputString,delim)
  
  #Do a bit more cleaning to make it into a dcf file:
  inputString<-stri_replace_all_fixed(inputString,": :",": ")
  
  inputString<-gsub(":([A-Za-z0-9])",": \\1",inputString)
  #Don't remove newlines as these are used as the sentence separators
  inputString<-gsub("\n",". ",inputString,fixed=TRUE)
  inputString<-stri_replace_all(inputString,"Start:",regex="^")
  inputString<-stri_replace_all_fixed(inputString,"::",":")
  
  #Create the dcf file
  pat <- sprintf("(%s)", paste(delim, collapse = "|"))
  g <- gsub(pat, "\n\\1", paste0(inputString, "\n"))
  m <- read.dcf(textConnection(g))
  m<-data.frame(m,stringsAsFactors = FALSE)
  return(m)
  
}



#' Dictionary In Place Replace
#'
#' This maps terms in the text and replaces them with the 
#' standardised term (mapped in the lexicon file) within the text.
#' It is used within the textPrep function.

#'
#' @param inputString the input string (ie the full medical report)
#' @param list The replacing list
#' @keywords Replace
#' @importFrom stringi stri_replace stri_replace_all
#' @export
#' @return This returns a character vector
#' @family NLP - Text Cleaning and Extraction
#' @examples inputText<-DictionaryInPlaceReplace(TheOGDReportFinal$OGDReportWhole,LocationList())



DictionaryInPlaceReplace <- function(inputString,list) {
  
  key<-names(list)
  value<-as.character(t(data.frame(list,stringsAsFactors=FALSE))[,1])
  list<-data.frame(key,value,stringsAsFactors = FALSE)
  
  new_string <- inputString
  vapply(1:nrow(list),
         function (k) {
           #new_string <<- gsub(list$key[k], list$value[k], new_string)
           new_string <<- stri_replace_all(new_string,list$value[k],regex=list$key[k])
           0L
         }, integer(1))
  
  return(new_string)
}


#' Wrapper for Negative Remove
#'
#' This performs negative removal on a per sentance basis 
#' @keywords Negative Sentences
#' @importFrom stringr str_replace
#' @importFrom Hmisc capitalize
#' @param inputText the text to remove Negatives from
#' @export
#' @return This returns a column within a dataframe. This should be changed to a 
#' character vector eventually
#' @family NLP - Text Cleaning and Extraction
#' @examples # Build a character vector and then
#' # incorporate into a dataframe
#' anexample<-c("There is no evidence of polyp here",
#' "Although the prep was poor,there was no adenoma found",
#' "The colon was basically inflammed, but no polyp was seen",
#' "The Barrett's segment was not biopsied",
#' "The C0M7 stretch of Barrett's was flat")
#' anexample<-data.frame(anexample)
#' names(anexample)<-"Thecol"
#' # Run the function on the dataframe and it should get rid of sentences (and
#' # parts of sentences) with negative parts in them.
#' #hh<-NegativeRemoveWrapper(anexample$Thecol)

NegativeRemoveWrapper <- function(inputText) {
  
  standardisedTextOutput<-stringr::str_split(inputText, "\\.")
  standardisedTextOutput<-lapply(standardisedTextOutput, function(x) Hmisc::capitalize(as.character(x)))
  standardisedTextOutput<-lapply(standardisedTextOutput, function(x) NegativeRemove(x))
  inputText<-sapply(standardisedTextOutput, function(x) paste(x,collapse="."))
  
}








#' Remove negative and normal sentences
#'
#' Extraction of the negative sentences so that normal findings can be
#' removed and not counted when searching for true diseases. eg remove
#' 'No evidence of candidal infection' so it doesn't get included if
#' looking for candidal infections. It is used by default as part of
#' the textPrep function but can be turned off as an optional parameter
#' @param inputText column of interest
#' @keywords Negative Sentences
#' @importFrom stringr str_replace
#' @export
#' @return This returns a column within a dataframe. THis should be changed to a 
#' character vector eventually
#' @family NLP - Text Cleaning and Extraction
#' @examples # Build a character vector and then
#' # incorporate into a dataframe
#' anexample<-c("There is no evidence of polyp here",
#' "Although the prep was poor,there was no adenoma found",
#' "The colon was basically inflammed, but no polyp was seen",
#' "The Barrett's segment was not biopsied",
#' "The C0M7 stretch of Barrett's was flat")
#' anexample<-data.frame(anexample)
#' names(anexample)<-"Thecol"
#' # Run the function on the dataframe and it should get rid of sentences (and
#' # parts of sentences) with negative parts in them.
#' hh<-NegativeRemove(anexample$Thecol)

NegativeRemove <- function(inputText) {
  # Conjunctions
  inputText <- gsub(
    "(but|although|however|though|apart| -|otherwise
    |unremarkable|\\,)[a-zA-Z0-9_ ]+(no |negative|
    unremarkable|-ve|normal).*?(\\.|\\n|:|$)\\R*",
    "\\.\n",
    inputText,
    perl = TRUE,
    ignore.case = TRUE
    )
  inputText <-
    gsub(
      "(no |negative|unremarkable|-ve| normal) .*?([Bb]ut|
      [Aa]lthough| [Hh]owever| [Tt]hough| [Aa]part| -|[Oo]therwise|
      [Uu]nremarkable)\\R*",
      "",
      inputText,
      perl = TRUE,
      ignore.case = TRUE
      )
  # Nots
  inputText<-
    gsub(
      ".*(?:does|is|was|were|are|have) not.*?(\\.|\n|:|$)\\R*",
      "",
      inputText,
      perl = TRUE,
      ignore.case = TRUE
    )
  inputText <-
    gsub(
      "not (biop|seen).*?(\\.|\n|:|$)\\R*",
      "",
      inputText,
      perl = TRUE,
      ignore.case = TRUE
    )
  # Nos
  inputText <-
    gsub(
      ".*(?:((?<!with)|(?<!there is )|(?<!there are ))\\bno\\b(?![?:A-Za-z])|
      ([?:]\\s*N?![A-Za-z])).*\\R*",
      "",
      inputText,
      perl = TRUE,
      ignore.case = TRUE
      )
  
  #Withouts
  inputText <-
    gsub(
      ".*without.*\\R*",
      "",
      inputText,
      perl = TRUE,
      ignore.case = TRUE
    )
  
  
  inputText <-
    gsub(
      ".*(:|[?])\\s*(\\bno\\b|n)\\s*[^A-Za-z0-9].*?(\\.|\n|:|$)
      \\R*",
      "",
      inputText,
      perl = TRUE,
      ignore.case = TRUE
    )
  inputText <-
    gsub(
      ".*(negative|neither).*?(\\.|\n|:|$)\\R*",
      "",
      inputText,
      perl = TRUE,
      ignore.case = TRUE
    )
  # Keep abnormal in- don't ignore case as it messes it up
  inputText <- str_replace(inputText,
                           ".*(?<!b)[Nn]ormal.*?(\\.|\n|:|$)", "")
  # Other negatives
  inputText <- gsub(
    ".*there (is|are|were) \\bno\\b .*?(\\.|\n|:|$)\\R*",
    "",
    inputText,
    perl = TRUE,
    ignore.case = TRUE
  )
  inputText <- gsub(
    "(within|with) (normal|\\bno\\b) .*?(\\.|\n|:|$)\\R*",
    "",
    inputText,
    perl = TRUE,
    ignore.case = TRUE
  )
  # Specific cases
  inputText <- gsub(
    ".*duct.*clear.*?(\\.|\n|:|$)\\R*",
    "",
    inputText,
    perl = TRUE,
    ignore.case = TRUE
  )
  
  # Time related phrases eg post and previous
  inputText <- gsub(" (post|previous|prior)[^a-z].+?[A-Za-z]{3}",
                    " TIME_REPLACED-",
                    inputText,
                    perl = TRUE,
                    ignore.case = TRUE)
  
  
  return(inputText)
}

#' Find and Replace
#'
#' This is a helper function for finding and replacing from lexicons
#' like the event list. The lexicons are all named lists where the name
#' is the text to replace and the value what it should be replaced with
#' It uses fuzzy find and replace to account for spelling errors
#' @keywords Find and replace
#' @importFrom utils aregexec
#' @param pattern the pattern to look for
#' @param fixed whether the pattern is regex or not. Default not.
#' @param replacement the pattern replaceme with
#' @param x the target string
#' @return This returns a character vector
#' @examples L <- tolower(stringr::str_split(HistolType(),"\\|"))
#' @family NLP - Text Cleaning and Extraction
#' inputText<-TheOGDReportFinal$OGDReportWhole
#' inputText<-Reduce(function(x, nm) spellCheck(nm, L[[nm]], x), init = inputText, names(L))
#' 


spellCheck <- function(pattern, replacement, x, fixed = FALSE) {
  m <- aregexec(pattern, x, fixed = fixed,ignore.case = T)
  r <- regmatches(x, m)
  lens <- lengths(r)
  if (all(lens == 0)) return(x) else
    replace(x, lens > 0, mapply(sub, r[lens > 0], replacement, x[lens > 0]))
}

#' Tidy up messy columns
#'
#' This does a general clean up of whitespace,
#' semi-colons,full stops at the start
#' of lines and converts end sentence full stops to new lines.
#' @param vector column of interest
#' @keywords Cleaner
#' @export
#' @importFrom stringr str_replace str_trim
#' @importFrom stringi stri_split_boundaries stri_replace stri_replace_all
#' @return This returns a character vector
#' @family NLP - Text Cleaning and Extraction
#' @examples ii<-ColumnCleanUp(Myendo$Findings)


ColumnCleanUp <- function(vector) {
  
  
  
  #Optimise for tokenisation eg full stops followed by a number need to change so add a Letter before the number
  #vector<-gsub("\\.\\s*(\\d)","\\.T\\1",vector)
  
  #Get rid of convert anything that has a full stop in the middle into a new line eg line .Ever
  #vector<-gsub("([A-Za-z]\\s*)\\.(\\s*[A-Za-z])","\\1\n\\2",vector)
  
  
  #vector<-gsub("([A-Za-z]+.*)\\?(.*[A-Za-z]+.*)","\\1 \\2",vector)
  
  #Convert word return to space
  vector<-gsub("\r\n"," ",vector)
  vector<-gsub("\r","\n",vector)
  
  
  
  #Convert ,hi to fullstop the the word if it is a capital letter
  vector<-gsub(",([A-Z])","\n\\1",vector)
  
  
  #Conver "., or . ,"  to a space and vice versa
  vector<-stri_replace_all(vector,"\\.",regex="(\\.\\s*\\,)|(\\,\\s*\\.)|((\\.\\s*)+)")
  #vector<-gsub("\\.\\s*\\,","\\.",vector)
  #vector<-gsub("\\,\\s*\\.","\\.",vector)
  #vector<-gsub("(\\.\\s*)+","\\.",vector)
  
  #Get rid of middle of line newlines which seems to
  #happen e.g. I am Sebastian and \n I live in a hole
  vector<-gsub("(?:\\h*\\R)++(?!\\z)\\h*", " ", vector, perl=TRUE)
  
  #Get rid of ASCCII hex here
  vector<-gsub("\\\\[Xx].*?\\\\", " ", vector)
  
  #Get rid of query type punctuation:
  vector<-gsub("(.*)\\?(.*[A-Za-z]+)","\\1 \\2",vector)
  
  #Get rid of pointless single quote marks
  vector<-gsub("'","",vector,fixed=TRUE)
  
  #Have to tokenize here so you can strip punctuation without getting rid of newlines
  standardisedTextOutput<-stringi::stri_split_boundaries(vector, type="sentence")
  
  #Get rid of whitespace
  standardisedTextOutput<-lapply(standardisedTextOutput, function(x) gsub("(^[[:space:]]+|[[:space:]]+$)", "", x))
  
  #Get rid of trailing punctuation
  standardisedTextOutput<-lapply(standardisedTextOutput,function(x) gsub("^[[:punct:]]+","",x))
  
  #standardisedTextOutput<-lapply(standardisedTextOutput,function(x) gsub("[[:punct:]]+$","",x))
  #Question marks result in tokenized sentences so whenever anyone write query Barrett's, it gets split.
  standardisedTextOutput<-lapply(standardisedTextOutput,function(x) gsub("([A-Za-z]+.*)\\?(.*[A-Za-z]+.*)","\\1 \\2",x))
  #standardisedTextOutput<-lapply(standardisedTextOutput,function(x) gsub("(Dr.*?[A-Za-z]+)|([Rr]eported.*)|([Dd]ictated by.*)"," ",x))
  
  #Get rid of strange things in the text
  #standardisedTextOutput<-lapply(standardisedTextOutput,function(x) gsub("\\.\\s+\\,","\\.",x))
  #standardisedTextOutput<-lapply(standardisedTextOutput,function(x) str_replace_all(x,"(\\.\\s+\\,)|(^\\s+\\,)|(^[[:punct:]]+)|((Dr.*?[A-Za-z]+)|([Rr]eported.*)|([Dd]ictated by.*))","\\."))
  
  standardisedTextOutput<-lapply(standardisedTextOutput,function(x) stri_replace_all(x,"\\.",regex="(\\.\\s+\\,)|(^\\s+\\,)|(^[[:punct:]]+)|((Dr.*?[A-Za-z]+))"))
  
  
  #standardisedTextOutput<-lapply(standardisedTextOutput,function(x) gsub("^\\s+\\,"," ",x))
  #standardisedTextOutput<-lapply(standardisedTextOutput,function(x) gsub("^[[:punct:]]+","",x))
  retVector<-sapply(standardisedTextOutput, function(x) paste(x,collapse="."))
  retVector<-gsub("(\\.\\s*){2,}","\\.",retVector)
  return(retVector)
}

#' Paste endoscopy and histology results into one
#'
#' As spreadsheets are likely to be submitted with pre-segregated data as appears from 
#' endoscopy software output, these should be remerged prior to cleaning. This function
#' takes the column headers and places it before each text so that the original
#' full text is recreated. It will use the column headers as the delimiter. This should 
#' be used before textPrep as the textPrep function takes a character vector (ie the whole
#' report and not a segregated one) only
#' @keywords Merge dataframe columns into one text
#' @param x the dataframe
#' @export
#' @return This returns a list with a dataframe containing one column of the merged text
#' and a character vector which is the delimiter list for when the textPrep function is used
#' @family NLP - Text merging:
#' @examples testList<-structure(list(PatientName = c("Tom Hardy", "Elma Fudd", "Bingo Man"
#' ), HospitalNumber = c("H55435", "Y3425345", "Z343424"), Text = c("All bad. Not good", 
#' "Serious issues", "from a land far away")), class = "data.frame", row.names = c(NA, -3L))
#' EndoPaste(testList)


EndoPaste<-function(x){
  names(x)<-ColumnCleanUp(names(x))
  names(x)<-gsub("\n+"," ",names(x))
  delim<-paste(names(x))
  v1<-do.call(paste, c(Map(paste, names(x), x), " ",sep="\n"))
  df<-data.frame(X1_X2_X3 = unname(v1))
  return(list(df,delim))
}



############## Extraction Utilities - Basic ###################

#' Extrapolate from Dictionary
#'
#' Provides term mapping and extraction in one.
#' Standardises any term according to a mapping lexicon provided and then
#' extracts the term. This is
#' different to the DictionaryInPlaceReplace in that it provides a new column
#' with the extracted terms as opposed to changing it in place
#'
#' @param inputString The text string to process
#' @param list of words to iterate through
#' @keywords Withdrawal
#' @importFrom fuzzyjoin regex_left_join
#' @importFrom dplyr as_tibble pull
#' @export
#' @family Basic Column mutators 
#' @examples #Firstly we extract histology from the raw report
#' # The function then standardises the histology terms through a series of
#' # regular expressions and then extracts the type of tissue 

#' Mypath$Tissue<-suppressWarnings(
#' suppressMessages(
#' ExtrapolatefromDictionary(Mypath$Histology,HistolType()
#' )
#' )
#' )
#' rm(MypathExtraction)


ExtrapolatefromDictionary<-function(inputString,list){
  #lower case the input string
  inputString<-tolower(inputString)
  
  #Get the names from the named list
  mylist<-paste0(unlist(list,use.names=F),collapse="|")
  
  #Make the inputstrings unique
  ToIndex<-lapply(inputString, function(x) unique(x))
  #Give each an index in the list (taken from the location list)
  
  #The results to replace in the string
  replace<-names(list)
  
  #The result of the replacement 
  replaceValue<-paste0(unlist(list,use.names=F))
  
  #Create a dataframe
  df1<-data.frame(key = replace, val = replaceValue)
  
  #Create a tibble to merge with the list
  d1 <- as_tibble(df1)
  
  
  #Select the elements that have characters in them
  i1 <- lengths(ToIndex) > 0 
  
  #Do the merge. This takes the key to lookup and then if found, replaces
  #the value with value associated with it in the table. I think the pull
  #function also creates a new column with the value in it. This is an important
  #function as uses a table lookup
  ToIndex[i1] <- map(ToIndex[i1], ~ 
                       tibble::tibble(key = .x) %>%
                       fuzzyjoin::regex_left_join(d1) %>%
                       pull(val))
  
  #This unlists the nested list
  ToIndex<-lapply(ToIndex, function(x) unlist(x,recursive=F))
  
  #This collapses the list so that the output is a single string
  ToIndex<-unlist(lapply(ToIndex, function(x) paste(x,collapse=";")))
  
  return(ToIndex)
}

############## Extraction Utilities- Colocation ###################

#' See if words from two lists co-exist within a sentence 
#'
#' See if words from two lists co-exist within a sentence. Eg site and tissue type.
#' This function only looks in one sentence for the two terms. If you suspect the terms may
#' occur in adjacent sentences then use the EntityPairs_TwoSentence function.
#' @keywords PathPairLookup
#' @param inputText The relevant pathology text column
#' @param list1 First list to refer to
#' @param list2 The second list to look for
#' @importFrom purrr flatten_chr map_chr map map_if
#' @export
#' @family Basic Column mutators 
#' @examples # tbb<-EntityPairs_OneSentence(Mypath$Histology,HistolType(),LocationList())

EntityPairs_OneSentence<-function(inputText,list1,list2){
  
  list1<-paste0(unlist(list1,use.names=F),collapse="|")
  list2<-paste0(unlist(list2,use.names=F),collapse="|")
  
  text<-standardisedTextOutput<-stri_split_boundaries(inputText, type="sentence")
  r1 <-lapply(text,function(x) Map(paste, str_extract_all(tolower(x),tolower(list2)), 
                                   str_extract_all(tolower(x),tolower(list1)), MoreArgs = list(sep=":")))
  
  r1<-lapply(r1,function(x) unlist(x))
  #Unlist into a single row-This should output a character vector
  out<-lapply(r1,function(x) paste(x,collapse=","))
  
  return(out)
}

#' Look for relationships between site and event
#' 
#' This is used to look for relationships between site and event especially for endoscopy events
#' where sentences such as 'The stomach polyp was large. It was removed with a snare' ie the therapy
#' and the site are in two different locations.
#' @keywords Find and replace
#' @param inputString The relevant pathology text column
#' @param list1 The intial list to assess
#' @param list2 The other list to look for
#' @importFrom stringr str_replace_na str_c str_split str_which str_extract_all regex str_subset
#' @importFrom stringi stri_split_boundaries
#' @importFrom rlang is_empty
#' @importFrom purrr flatten_chr map_chr map map_if
#' @export
#' @family Basic Column mutators 
#' @examples # tbb<-EntityPairs_TwoSentence(Myendo$Findings,EventList(),HistolType())

EntityPairs_TwoSentence<-function(inputString,list1,list2){
  
  #Prepare the text to be back into a tokenised version.
  #text<-textPrep(inputText)
  text<-standardisedTextOutput<-stri_split_boundaries(inputString, type="sentence")
  text<-lapply(text,function(x) tolower(x))
  
  
  #Some clean up to get rid of white space- all of this prob already covered in the ColumnCleanUp function but for investigation later
  text<-lapply(text,function(x) gsub("[[:punct:]]+"," ",x))
  #Prepare the list to use as a lookup:
  tofind <-paste(tolower(list2),collapse="|")
  
  #Prepare the second list to use as a lookup
  EventList<-unique(tolower(unlist(list1,use.names = FALSE)))
  
  
  text<-sapply(text,function(x) {
    
    #Cleaning
    x<-trimws(x)
    
    
    
    #Prepare the text so that all empty text is replaced with NA and 
    #ready for processing
    try(words <-
          x %>%
          unlist() %>%
          str_replace_na()%>%
          str_c(collapse = ' ') %>%
          str_split(' ') %>%
          `[[`(1))
    
    
    words<-words[words != ""] 
    x1 <- str_extract_all(tolower(x),tolower(paste(unlist(list1), collapse="|")))
    i1 <- which(lengths(x1) > 0)
    
    
    try(if(any(i1)) {
      EventList %>%
        map(
          ~words %>%
            str_which(paste0('^.*', .x)) %>%
            map_chr(
              ~words[1:.x] %>%
                str_c(collapse = ' ') %>%
                
                str_extract_all(regex(tofind, ignore_case = TRUE)) %>%
                map_if(rlang::is_empty, ~ NA_character_) %>%
                flatten_chr()%>%
                `[[`(length(.)) %>%
                
                .[length(.)]
            ) %>%
            paste0(':', .x)
        ) %>%
        unlist() %>%
        str_subset('.+:')
      
    } else "")
    
  }
  )
  return(text)
}

#' Clean html endoscopic images
#'
#' This is used to pick and clean endoscopic images from html exports so they can be prepared
#' before being linked to pathology and endoscopy reports
#' @keywords Image extraction
#' @param file The html report to extract (the html will have all the images references in it)
#' @param delim The phrase that separates individual endoscopies
#' @param location The folder containing the actual images
#' @importFrom stringr str_extract 
#' @importFrom lubridate parse_date_time
#' @importFrom tidyr separate_rows
#' @importFrom pander pandoc.image.return
#' @importFrom data.table as.data.table
#' @export
#' @family Basic Column mutators 
#' @examples # MyImgLibrary("~/Images Captured with Proc Data Audit_Findings1.html",
#' #                         "procedureperformed","~/")

MyImgLibrary<-function (file, delim, location) 
{
  location <- gsub(".*\\/www\\/", "", location)
  htmlCode = readLines(file)
  mergedhtml <- paste(htmlCode, sep = "", collapse = "")
  df <- strsplit(mergedhtml, delim, fixed = FALSE, perl = FALSE, 
                 useBytes = FALSE)
  df <- as.data.frame(df)
  colnames(df) <- c("df")
  df$Endo_ResultEntered <- str_extract(df$df, "(\\d{4}[[:punct:]]\\d{2}[^:alnum:]\\d{2})|(\\d{2}[^:alnum:]\\d{2}[^:alnum:]\\d{4})")
  df$Endo_ResultEntered <- parse_date_time(df$Endo_ResultEntered, 
                                           orders = c("dmy", "ymd"))
  df$PatientID <- str_extract(df$df, "(?:[A-Z]{1,}[0-9]{3,}[0-9]{1})|(?:[0-9]{1,}[0-9]{3,}[A-Z]{1})")
  df$img <- stringr::str_extract_all(df$df, "img src.*?(jpg|png|gif|bmp)")
  df$df <- NULL
  mergeddf <- as.data.frame(as.data.table(df)[, toString(img), 
                                              by = list(Endo_ResultEntered, PatientID)])
  mergeddf <- separate_rows(mergeddf, V1, sep = ",")
  mergeddf$V1 <- gsub("img src=\"", "", mergeddf$V1)
  mergeddf$V1 <- trimws(mergeddf$V1)
  mergeddf$img <- str_extract(mergeddf$V1, "[A-Za-z0-9]+[.]bmp")
  mergeddf$url <- lapply(mergeddf$img, function(x) paste0("<img src=", 
                                                          location, "/", x, "'>"))
  mergeddf$base64 <- lapply(mergeddf$img, function(x) paste0(location, 
                                                             "/", x))
  
  # mergeddf$V1 <- NULL
  mergeddf$url <- gsub("=", "='", mergeddf$url)
  mergeddf$img <- str_extract(mergeddf$V1, "[A-Za-z0-9]+[.]bmp")
  mergeddf$url <- sapply(mergeddf$url, pandoc.image.return)
  mergeddf <- as.data.frame(as.data.table(mergeddf)[, toString(url), 
                                                    by = list(Endo_ResultEntered, PatientID)])
  mergeddf$url <- mergeddf$V1
  mergedhtml <- gsub("<(img src=.*?)>", ":\\1:", mergedhtml)
  mergedhtml <- gsub("<.*?>", ":", mergedhtml)
  mergedhtml <- gsub("([A-Za-z0-9]+):::([A-Za-z0-9]+)", "\\1;\\2", 
                     mergedhtml)
  outddd <- strsplit(mergedhtml, "Patient MRN")
  outddd <- lapply(outddd, function(x) gsub("^;", "Patient MRN;", 
                                            x))
  outddd <- lapply(outddd, function(x) gsub("img src", "Image Name;img src", 
                                            x))
  delim <- c("Patient MRN", "Date of procedure", "Procedure Performed", 
             "Image label", "Image Comment", "Image Name")
  mydf <- data.frame(lapply(outddd, function(x) EndoMineR::Extractor(x, 
                                                                     delim)))
  #mydf<-mydf%>%rename(PatientID=Patient.MRN)
  mydf$img <- str_extract(mydf$Image.Name, "[A-Za-z0-9]+[.]bmp")
  mydf2 <- apply(mydf, 2, function(x) gsub(";", "", x))
  mydf2 <- data.frame(mydf2)
  mydf2$Date.of.procedure <- as.Date(mydf2$Date.of.procedure, 
                                     format = "%Y-%m-%d")
  mydf2 <- mydf2 %>% rename(Endo_ResultEntered = "Date.of.procedure", 
                            PatientID = "Patient.MRN")
  mergeddf$img <- str_extract(mergeddf$url, "[A-Za-z0-9]+[.]bmp")
  mergeddf <- merge(mergeddf, mydf2, by = c("PatientID", "Endo_ResultEntered", 
                                            "img"))
  return(mergeddf)
}
sebastiz/EndoMineR documentation built on Dec. 4, 2022, 7:04 p.m.