R/Module_Barretts.R

if (getRversion() >= "2.15.1")
  utils::globalVariables(
    c(
      "PatientID",
      ".SD",
      "CStage",
      "NumbOfBx",
      "Years",
      "Difference",
      "barplot",
      "head",
      "read.table",
      "eHospitalNum",
      "pHospitalNum",
      ".",
      "n",
      "EVENT",
      "MonthYear",
      "freq",
      "Endoscopist",
      "avg",
      "v",
      "destination",
      "dcast",
      "complete.cases",
      "g",
      "gvisSankey",
      "head",
      "pHospitalNum",
      "par",
      "plot",
      "r",
      "read.table",
      "region",
      "rgb",
      "setDT",
      "ind",
      "mytext",
      "NumBx"
    )
  )
###### Barrett's specific extrapolation Functions ######

#' Prague score extraction
#'
#' The aim is to extract a C and M stage (Prague score) for Barrett's samples.
#' This is done using a regex where C and M stages are explicitly mentioned in
#' the free text
#' Specfically it extracts the Prague score
#' @param dataframe dataframe with column of interest
#' @param EndoReportColumn column of interest
#' @param EndoReportColumn2 second column of interest
#' @importFrom stringr str_extract str_replace str_extract_all
#' @importFrom purrr map
#' @keywords  Prague score
#' @export
#' @examples #The example takes the endoscopy demo dataset and searches the
#' # Findings column (which contains endoscopy free text about the
#' # procedure itself). It then extracts the Prague score if relevant. I
#' # find it easiest to use this on a Barrett's subset of data rather than
#' # a dump of all endoscopies but of course this is a permissible dataset
#' # too
#'
#'
#' aa<-Barretts_PragueScore(Myendo,'Findings','OGDReportWhole')

#Change to BarrPrague nee Barretts_PragueScore
Barretts_PragueScore <- function(dataframe, EndoReportColumn,EndoReportColumn2) {
  dataframe <- data.frame(dataframe)

  
  dataframe$CStage <-   
    #If the CStage is present then extract it
    ifelse(grepl("([Cc](\\s|=)*\\d+)",dataframe[,EndoReportColumn]),
           stringr::str_replace(stringr::str_extract(dataframe[,EndoReportColumn],'([Cc](\\s|=)*\\d+)'),"[Cc]", ""),
           ifelse(grepl("([Cc](\\s|=)*\\d+)",dataframe[,EndoReportColumn2]),
                  stringr::str_replace(stringr::str_extract(dataframe[,EndoReportColumn2],'([Cc](\\s|=)*\\d+)'),"[Cc]", ""),
                  ifelse(grepl("([Cc](\\s|=)*O(\\s)*[Mm](\\s)*\\d+)",dataframe[,EndoReportColumn]),
                         stringr::str_replace(stringr::str_extract(dataframe[,EndoReportColumn],'([Cc](\\s|=)*O(\\s)*[Mm](\\s)*\\d+)'),"[Cc]", ""),
                  "Insufficient")))
  
dataframe$CStage<-trimws(unlist(dataframe$CStage))
dataframe$CStage<-gsub("O","0",dataframe$CStage)
dataframe$CStage<-gsub("M.*","",dataframe$CStage)

dataframe$mytext<-stri_split_boundaries(dataframe[,EndoReportColumn], type="sentence")
dataframe$mytext<-lapply(dataframe$mytext,function(x) trimws(x))


dataframe<-dataframe %>%
  mutate(
    MStage = map(
      mytext, ~ case_when(
        grepl("( [Mm](\\s|=)*\\d+)",.x) ~ stringr::str_replace(stringr::str_extract(.x,"( [Mm](\\s|=)*\\d+)"),"M", ""),
        #dataframe[,CStage]!="Insufficient" ~ dataframe[,CStage],
        grepl("(?=[^\\.]*Barr)[^\\.]*\\s+\\d{2}\\s*[cm]*\\s*(to |-| and)\\s*\\d{2}\\s*[cm]*\\s*",.x,ignore.case = TRUE,perl=TRUE)  ~ as.character(as.numeric(sapply(stringr::str_extract_all(stringr::str_extract(.x,"\\d{2}\\s*[cm]*\\s*(to|-|and)\\s*\\d{2}\\s*[cm]*\\s*"),"\\d{2}"), function(y) abs(diff(as.numeric(y)))))),
        grepl("(?=[^\\.]*cm)(?=[^\\.]*Barr)(?=[^\\.]*(of |length))[^\\.]*", .x, perl=TRUE)  ~  stringr::str_extract(paste0(stringr::str_match(.x,"(?=[^\\.]*cm)(?=[^\\.]*[Bb]arr)(?=[^\\.]*(of |length))[^\\.]*"),collapse=""),"\\d+"),
        grepl("(\\.|^|\n)(?=[^\\.]*(small|tiny|tongue|finger))(?=[^\\.]*Barr)[^\\.]*(\\.|\n|$)", .x, perl=TRUE) ~  stringr::str_replace(.x, ".*","1"),
                TRUE ~ "Insufficient")
    )
  )


dataframe$MStage<-lapply(dataframe$MStage, function(x) gsub("Insufficient","",x))
dataframe$MStage<-lapply(dataframe$MStage, function(x) gsub("m","",x))

dataframe$MStage<-suppressWarnings(unlist(lapply(dataframe$MStage, function(x) max(as.numeric(x),na.rm=TRUE))))
#If there are more than two numbers pick the highest one

dataframe$MStage<-ifelse(is.infinite(dataframe$MStage),ifelse(dataframe$CStage!="Insufficient",dataframe$CStage,"Insufficient"),dataframe$MStage)

  return(dataframe)
}


#' Worst pathological stage Barrett's
#'
#' This extracts the pathological stage from the histopathology specimen. It is
#' done using 'degradation' so that it will look for the worst overall grade
#' in the histology specimen and if not found it will look for the next worst
#' and so on. It looks per report not per biopsy (it is more common
#' for histopathology reports to contain the worst overall grade
#' rather than individual biopsy grades).
#' Specfically it extracts the histopathology worst grade within the specimen
#' FOr the sake of accuracy this should alwats be used after the HistolDx function
#' and this removes negative sentences such as 'there is no dysplasia'. 
#' This current function should be used on the column derived from HistolDx
#' which is called Dx_Simplified
#' @param dataframe dataframe with column of interest
#' @param PathColumn column of interest
#' @keywords Pathology extraction
#' @export
#' @importFrom rlang sym
#' @examples # Firstly relevant columns are extrapolated from the
#' # Mypath demo dataset. These functions are all part of Histology data 
#' # cleaning as part of the package.
#' # The function then takes the Histology column from the merged data set (v).
#' # It extracts the worst histological grade for a specimen
#' b<-Barretts_PathStage(Mypath,'Histology')
#' rm(v)

Barretts_PathStage <- function(dataframe, PathColumn) {
  # Get the worst pathology for that sample inc SM stages
  dataframe <- data.frame(dataframe)
  PathColumna <- rlang::sym(PathColumn)
  
  df<-dataframe %>%
    mutate(
      IMorNoIM = case_when(
          grepl("sm2", !!PathColumna,ignore.case=TRUE) ~ "SM2",
          grepl("sm1", !!PathColumna,ignore.case=TRUE) ~ "SM1",
          grepl("T1b", !!PathColumna,ignore.case=TRUE) ~  "T1b",
          grepl("T1a|ntramucosal", !!PathColumna,ignore.case=TRUE) ~  "T1a",
          grepl("[Hh]igh [Gg]rade ", !!PathColumna,ignore.case=TRUE,perl=TRUE) ~  "HGD",
          grepl("[Ll]ow [Gg]rade", !!PathColumna,ignore.case=TRUE,perl=TRUE) ~  "LGD",
          grepl("[Ii]ndef", !!PathColumna,ignore.case=TRUE,perl=TRUE) ~  "IGD",
          grepl("[Ii]ntestinal|[^-][Ss]pecialised", !!PathColumna,ignore.case=TRUE,perl=TRUE) ~  "IM",
          grepl("[Mm]etaplasia|[Cc]olumnar|Glandular", !!PathColumna,ignore.case=TRUE,perl=TRUE) ~  "No_IM",
          is.na(!!PathColumna) ~  "Insufficient",
          TRUE ~ "Insufficient")
      )
    
  return(df$IMorNoIM)
}




      

#' Follow up group determination
#'
#' This determines the follow up rule a patient should fit in to (according to
#' the British Society for Gastroenterology guidance on Barrett's oesophagus)
#' Specfically it combines the presence of intestinal metaplasia with
#' Prague score so the follow-up group can be determined. It relies on the
#' presence of a Prague score. It should be run after
#' Barretts_PathStage which looks for the worst stage of a
#' specimen and which will determine the presence or absence of intestinal
#' metaplasia if the sample is non-dysplastic. Because reports often do not record
#' a Prague score a more pragmatic approach as been to assess the M stage and if 
#' this is not present then to use the C stage extrapolated using the 
#' Barretts_Prague function
#' @param dataframe the dataframe(which has to have been processed by the
#' Barretts_PathStage function first to get IMorNoIM and the Barretts_PragueScore
#' to get the C and M stage if available),
#' @param CStage CStage column
#' @param MStage MStage column
#' @param IMorNoIM IMorNoIM column
#' @keywords Follow-Up
#' @importFrom stringr str_extract str_replace
#' @export
#' @examples # Firstly relevant columns are extrapolated from the
#' # Mypath demo dataset. These functions are all part of Histology data
#' # cleaning as part of the package.
#' # Mypath demo dataset. These functions are all part of Histology data
#' # cleaning as part of the package.
#' v<-Mypath
#' v$NumBx<-HistolNumbOfBx(v$Macroscopicdescription,'specimen')
#' v$BxSize<-HistolBxSize(v$Macroscopicdescription)
#' # The histology is then merged with the Endoscopy dataset. The merge occurs
#' # according to date and Hospital number
#' v<-Endomerge2(Myendo,'Dateofprocedure','HospitalNumber',v,'Dateofprocedure',
#' 'HospitalNumber')
#' #The function relies on the other Barrett's functions being run as well:
#' v$IMorNoIM<-Barretts_PathStage(v,'Histology')
#' v<-Barretts_PragueScore(v,'Findings')
#' 
#' #The follow-up group depends on the histology and the Prague score for a
#' # patient so it takes the processed Barrett's data and then looks in the
#' # Findings column for permutations of the Prague score.
#' v$FU_Type<-Barretts_FUType(v,"CStage","MStage","IMorNoIM")
#' rm(v)

Barretts_FUType <- function(dataframe,CStage,MStage,IMorNoIM) {
  dataframe <- data.frame(dataframe)
  
  CStagea<-rlang::sym(CStage)
  MStagea<-rlang::sym(MStage)
  IMorNoIMa<-rlang::sym(IMorNoIM)

  df<-dataframe %>%
    mutate(
      FU_Type = case_when(
        grepl("SM2|SM1|T1b_Unspec|T1a|LGD|HGD|IGD", !!IMorNoIMa,ignore.case=TRUE) ~ "Therapy",
        !!CStagea == "Insufficient" & !!MStagea == "Insufficient" ~ "NoRules",
        !!IMorNoIMa == "No_IM" & !is.na(!!MStagea) & as.numeric(!!MStagea) < 3 ~ "Rule1",
        !!IMorNoIMa == "IM" & !is.na(!!MStagea) & as.numeric(!!MStagea) < 3 ~ "Rule2",
        (!is.na(!!MStagea) & as.numeric(!!MStagea)) >= 3 ~ "Rule3",
        !!IMorNoIMa == "No_IM" & !is.na(!!CStagea) & as.numeric(!!CStagea) < 3 ~"Rule1",
        !!IMorNoIMa == "IM" & !is.na(!!CStagea) & as.numeric(!!CStagea) < 3 ~"Rule2",
        (!is.na(!!CStagea) & as.numeric(!!CStagea)) >= 3 ~ "Rule3",
        TRUE ~ "NoRules")
    )
  
  return(df$FU_Type)

}


#' Paris vs histopath Barrett's
#'
#' This looks at the Paris grades of each EMR and then creates a heatmap
#' of pathological grade vs
#' endoscopic Paris grade.This should only be run after all the
#' BarrettsDataAccord functions.
#' @param Column Endoscopy report field of interest as a string vector
#' @param Column2 Another endoscopy report field of interest as a string vector
#' @keywords Does something with data
#' @export
#' @return a string vector
#' @examples # 
#' Myendo$EMR<-BarrettsParisEMR(Myendo$ProcedurePerformed,Myendo$Findings)
#' rm(v)

BarrettsParisEMR <- function(Column, Column2) {
  
  #NewCol<-paste0(Column, Column2)
  NewCol<-paste0(Column,Column2)
  NewCol <- data.frame(NewCol,stringsAsFactors = FALSE)
  
  # Get the worst pathology for that sample inc SM stages
  df<-NewCol %>%
    mutate(
      ParisClass = case_when(
        grepl("11a_c|2a_c|[Ii][Ii]a_c", NewCol,ignore.case=TRUE) ~ "2a_c",
        grepl("[Ii][Ii]a|2a|11a", NewCol,ignore.case=TRUE) ~ "2a",
        grepl("[Ii][Ii]b|2b|11b", NewCol,ignore.case=TRUE) ~  "2b",
        grepl("[Ii][Ii][Ii]|III", NewCol,ignore.case=TRUE) ~  "3",
        grepl("Paris [Tt]ype [Ii]s|1s ", NewCol,ignore.case=TRUE,perl=TRUE) ~  "1s",
        grepl(" [Ii]p |1p", NewCol,ignore.case=TRUE,perl=TRUE) ~  "1p",
        TRUE ~ "No_Paris")
    )
  
  return(df$ParisClass)
  
}



############## Pathology Quality #############


#' Barrett's number of biopsies
#'
#' This function gets the biopsies taken per endoscopy and compares to the
#' Prague score for that endoscopy.
#' @param dataframe dataframe
#' @param Endo_ResultPerformed Date of the Endocscopy
#' @param PatientID Patient's unique identifier
#' @param Endoscopist name of the column with the Endoscopist names
#' @importFrom dplyr summarise group_by filter
#' @importFrom rlang sym
#' @importFrom ggplot2 ggplot geom_point labs theme xlab ylab unit element_text
#' @keywords Does something with data
#' @export
#' @examples # Firstly relevant columns are extrapolated from the
#' # Mypath demo dataset. These functions are all part of Histology data
#' # cleaning as part of the package.
#' Mypath$NumBx<-HistolNumbOfBx(Mypath$Macroscopicdescription,'specimen')
#' Mypath$BxSize<-HistolBxSize(Mypath$Macroscopicdescription)
#' 
#' # The histology is then merged with the Endoscopy dataset. The merge occurs
#' # according to date and Hospital number
#' v<-Endomerge2(Myendo,'Dateofprocedure','HospitalNumber',Mypath,'Dateofprocedure',
#' 'HospitalNumber')
#' 
#' # The function relies on the other Barrett's functions being run as well:
#' b1<-Barretts_PragueScore(v,'Findings')
#' b1$PathStage<-Barretts_PathStage(b1,'Histology')

#' # The follow-up group depends on the histology and the Prague score for a
#' # patient so it takes the processed Barrett's data and then looks in the
#' # Findings column for permutations of the Prague score.
#' b1$FU_Type<-Barretts_FUType(b1,"CStage","MStage","PathStage")
#' colnames(b1)[colnames(b1) == 'pHospitalNum'] <- 'HospitalNumber'
#' 
#' # The number of average number of biopsies is then calculated and
#' # compared to the average Prague C score so that those who are taking
#' # too few biopsies can be determined
#' hh<-BarrettsBxQual(b1,'Date.x','HospitalNumber',
#'                                      'Endoscopist')
#' rm(v)

BarrettsBxQual <- function(dataframe,
                                                 Endo_ResultPerformed,
                                                 PatientID,
                                                 Endoscopist) {
  dataframe <- data.frame(dataframe)
  PatientIDa <- rlang::sym(PatientID)
  Endo_ResultPerformeda <- rlang::sym(Endo_ResultPerformed)
  Endoscopista <- rlang::sym(Endoscopist)
  
  #Make sure the C and M stage is mueric (wil be character from the PragueScore function to
  #incorporate "Insufficient " as an outcome)
  dataframe$CStage<-as.numeric(dataframe$CStage)
  dataframe$MStage<-as.numeric(dataframe$MStage)
  
  GroupedByEndoscopy <-
    suppressWarnings(dataframe %>% filter(!is.na(CStage), !is.na(NumBx)) %>%
    group_by(as.Date(!!Endo_ResultPerformeda),!!PatientID,
             !!Endoscopista) %>%
    summarise(Sum = sum(NumBx), AvgC = mean(CStage)))
  
  GroupedByEndoscopy$ExpectedNumber <-
    (GroupedByEndoscopy$AvgC + 1) * 2
  GroupedByEndoscopy$Difference <-
    GroupedByEndoscopy$Sum - GroupedByEndoscopy$ExpectedNumber
  
  # Now group the difference by endoscopist
  BxShortfallPre <-
    GroupedByEndoscopy %>% group_by(!!Endoscopista) %>%
    summarise(MeanDiff = mean(Difference))
  
  BxShortfallPre<-data.frame(BxShortfallPre)
  return(BxShortfallPre)
  
}
sebastiz/EndoMineR_devlop documentation built on May 29, 2019, 7:33 a.m.