R/Module_Barretts.R

Defines functions BarrettsBxQual BarrettsParisEMR BarrettsAll Barretts_FUType Barretts_PathStage Barretts_PragueScore

Documented in BarrettsAll BarrettsBxQual Barretts_FUType BarrettsParisEMR Barretts_PathStage Barretts_PragueScore

if (getRversion() >= "2.15.1") {
  utils::globalVariables(
    c(
      "PatientID",
      ".SD",
      "CStage",
      "NumbOfBx",
      "Years",
      "Difference",
      "barplot",
      "head",
      "read.table",
      "eHospitalNum",
      "pHospitalNum",
      ".",
      "n",
      "EVENT",
      "MonthYear",
      "freq",
      "Endoscopist",
      "avg",
      "v",
      "destination",
      "dcast",
      "complete.cases",
      "g",
      "gvisSankey",
      "head",
      "pHospitalNum",
      "par",
      "plot",
      "r",
      "read.table",
      "region",
      "rgb",
      "setDT",
      "ind",
      "mytext",
      "NumBx"
    )
  )
}
###### Barrett's specific extrapolation Functions ######

#' Extract the Prague score
#'
#' The aim is to extract a C and M stage (Prague score) for Barrett's samples.
#' This is done using a regex where C and M stages are explicitly mentioned in
#' the free text
#' Specfically it extracts the Prague score
#' @param dataframe dataframe with column of interest
#' @param EndoReportColumn column of interest
#' @param EndoReportColumn2 second column of interest
#' @importFrom stringr str_extract str_replace str_extract_all
#' @importFrom purrr map
#' @importFrom dplyr case_when
#' @keywords  Prague score
#' @export
#' @family Disease Specific Analysis - Barretts Data
#' @examples
#' # The example takes the endoscopy demo dataset and searches the
#' # Findings column (which contains endoscopy free text about the
#' # procedure itself). It then extracts the Prague score if relevant. I
#' # find it easiest to use this on a Barrett's subset of data rather than
#' # a dump of all endoscopies but of course this is a permissible dataset
#' # too
#' 
#' 
#' aa <- Barretts_PragueScore(Myendo, "Findings", "OGDReportWhole")
Barretts_PragueScore <- function(dataframe, EndoReportColumn, EndoReportColumn2) {
  dataframe <- data.frame(dataframe)


  dataframe$CStage <-
    # If the CStage is present then extract it
    ifelse(grepl("([Cc](\\s|=)*\\d+)", dataframe[, EndoReportColumn]),
      stringr::str_replace(stringr::str_extract(dataframe[, EndoReportColumn], "([Cc](\\s|=)*\\d+)"), "[Cc]", ""),
      ifelse(grepl("([Cc](\\s|=)*\\d+)", dataframe[, EndoReportColumn2]),
        stringr::str_replace(stringr::str_extract(dataframe[, EndoReportColumn2], "([Cc](\\s|=)*\\d+)"), "[Cc]", ""),
        ifelse(grepl("([Cc](\\s|=)*O(\\s)*[Mm](\\s)*\\d+)", dataframe[, EndoReportColumn]),
          stringr::str_replace(stringr::str_extract(dataframe[, EndoReportColumn], "([Cc](\\s|=)*O(\\s)*[Mm](\\s)*\\d+)"), "[Cc]", ""),
          "Insufficient"
        )
      )
    )

  dataframe$CStage <- trimws(unlist(dataframe$CStage))
  dataframe$CStage <- gsub("O", "0", dataframe$CStage)
  dataframe$CStage <- gsub("M.*", "", dataframe$CStage)

  dataframe$mytext <- stri_split_boundaries(dataframe[, EndoReportColumn], type = "sentence")
  dataframe$mytext <- lapply(dataframe$mytext, function(x) trimws(x))
  
  

  dataframe <- dataframe %>%
    mutate(
      MStage = map(
        mytext, ~ case_when(
          grepl("(?<=\\d)\\s*[Mm](?:\\s|=)*\\d+", .x,perl = TRUE) ~ stringr::str_replace(stringr::str_extract(.x, "(?<=\\d)\\s*[Mm](?:\\s|=)*\\d+"), "M", ""),
          grepl("(?=[^\\.]*[Bb]arr)[^\\.]*\\s+\\d{2}\\s*[cm]*\\s*(to |-| and)\\s*\\d{2}\\s*[cm]*\\s*", .x, ignore.case = TRUE, perl = TRUE) ~ as.character(as.numeric(sapply(stringr::str_extract_all(stringr::str_extract(.x, "\\d{2}\\s*[cm]*\\s*(to|-|and)\\s*\\d{2}\\s*[cm]*\\s*"), "\\d{2}"), function(y) abs(diff(as.numeric(y)))))),
          grepl("(?=[^\\.]*cm)(?=[^\\.]*[Bb]arr)(?=[^\\.]*(of |length))[^\\.]*", .x, perl = TRUE) ~ stringr::str_extract(paste0(stringr::str_match(.x, "(?=[^\\.]*cm)(?=[^\\.]*[Bb]arr)(?=[^\\.]*(of |length))[^\\.]*"), collapse = ""), "\\d+"),
          grepl("(\\.|^|\n)(?=[^\\.]*(small|tiny|tongue|finger))(?=[^\\.]*[Bb]arr)[^\\.]*(\\.|\n|$)", .x, perl = TRUE) ~ stringr::str_replace(.x, ".*", "1"),
          TRUE ~ "Insufficient"
        )
      )
    )


  dataframe$MStage <- lapply(dataframe$MStage, function(x) gsub("Insufficient", "", x))
  dataframe$MStage <- lapply(dataframe$MStage, function(x) gsub("m", "", x))
  dataframe$MStage <- suppressWarnings(unlist(lapply(dataframe$MStage, function(x) max(as.numeric(x), na.rm = TRUE))))
  # If there are more than two numbers pick the highest one

  dataframe$MStage <- ifelse(is.infinite(dataframe$MStage), ifelse(dataframe$CStage != "Insufficient", dataframe$CStage, "Insufficient"), dataframe$MStage)

  return(dataframe)
}


#' Get the worst pathological stage for Barrett's
#'
#' This extracts the pathological stage from the histopathology specimen. It is
#' done using 'degradation' so that it will look for the worst overall grade
#' in the histology specimen and if not found it will look for the next worst
#' and so on. It looks per report not per biopsy (it is more common
#' for histopathology reports to contain the worst overall grade
#' rather than individual biopsy grades).
#' Specfically it extracts the histopathology worst grade within the specimen
#' FOr the sake of accuracy this should alwats be used after the HistolDx function
#' and this removes negative sentences such as 'there is no dysplasia'.
#' This current function should be used on the column derived from HistolDx
#' which is called Dx_Simplified
#' @param dataframe dataframe with column of interest
#' @param PathColumn column of interest
#' @keywords Pathology extraction
#' @export
#' @importFrom rlang sym
#' @importFrom dplyr case_when
#' @family Disease Specific Analysis - Barretts Data
#' @examples
#' # Firstly relevant columns are extrapolated from the
#' # Mypath demo dataset. These functions are all part of Histology data
#' # cleaning as part of the package.
#' # The function then takes the Histology column from the merged data set (v).
#' # It extracts the worst histological grade for a specimen
#' b <- Barretts_PathStage(Mypath, "Histology")
#' rm(v)
Barretts_PathStage <- function(dataframe, PathColumn) {
  # Get the worst pathology for that sample inc SM stages
  dataframe <- data.frame(dataframe)
  PathColumna <- rlang::sym(PathColumn)

  df <- dataframe %>%
    mutate(
      IMorNoIM = case_when(
        grepl("[Ss][Mm]2", !!PathColumna, ignore.case = TRUE) ~ "SM2",
        grepl("[Ss][Mm]1", !!PathColumna, ignore.case = TRUE) ~ "SM1",
        grepl("[Tt]1b", !!PathColumna, ignore.case = TRUE) ~ "T1b",
        grepl("[Tt]1a|ntramucosal", !!PathColumna, ignore.case = TRUE) ~ "T1a",
        grepl("denocarcino", !!PathColumna, ignore.case = TRUE) ~ "Cancer unstaged",
        grepl("[Hh]igh [Gg]rade ", !!PathColumna, ignore.case = TRUE, perl = TRUE) ~ "HGD",
        grepl("[Ll]ow [Gg]rade", !!PathColumna, ignore.case = TRUE, perl = TRUE) ~ "LGD",
        grepl("[Ii]ndef", !!PathColumna, ignore.case = TRUE, perl = TRUE) ~ "IGD",
        grepl("[Ii]ntestinal|[^-][Ss]pecialised", !!PathColumna, ignore.case = TRUE, perl = TRUE) ~ "IM",
        grepl("[Mm]etaplasia|[Cc]olumnar|[Gg]landular", !!PathColumna, ignore.case = TRUE, perl = TRUE) ~ "No_IM",
        is.na(!!PathColumna) ~ "Insufficient",
        TRUE ~ "Insufficient"
      )
    )

  return(df$IMorNoIM)
}






#' Determine the Follow up group
#'
#' This determines the follow up rule a patient should fit in to (according to
#' the British Society for Gastroenterology guidance on Barrett's oesophagus)
#' Specfically it combines the presence of intestinal metaplasia with
#' Prague score so the follow-up group can be determined. It relies on the
#' presence of a Prague score. It should be run after
#' Barretts_PathStage which looks for the worst stage of a
#' specimen and which will determine the presence or absence of intestinal
#' metaplasia if the sample is non-dysplastic. Because reports often do not record
#' a Prague score a more pragmatic approach as been to assess the M stage and if
#' this is not present then to use the C stage extrapolated using the
#' Barretts_Prague function
#' @param dataframe the dataframe(which has to have been processed by the
#' Barretts_PathStage function first to get IMorNoIM and the Barretts_PragueScore
#' to get the C and M stage if available),
#' @param CStage CStage column
#' @param MStage MStage column
#' @param IMorNoIM IMorNoIM column
#' @keywords Follow-Up
#' @importFrom stringr str_extract str_replace
#' @importFrom dplyr case_when
#' @export
#' @family Disease Specific Analysis - Barretts Data
#' @examples
#' # Firstly relevant columns are extrapolated from the
#' # Mypath demo dataset. These functions are all part of Histology data
#' # cleaning as part of the package.
#' # Mypath demo dataset. These functions are all part of Histology data
#' # cleaning as part of the package.
#' v <- Mypath
#' v$NumBx <- HistolNumbOfBx(v$Macroscopicdescription, "specimen")
#' v$BxSize <- HistolBxSize(v$Macroscopicdescription)
#' # The histology is then merged with the Endoscopy dataset. The merge occurs
#' # according to date and Hospital number
#' v <- Endomerge2(
#'   Myendo, "Dateofprocedure", "HospitalNumber", v, "Dateofprocedure",
#'   "HospitalNumber"
#' )
#' # The function relies on the other Barrett's functions being run as well:
#' v$IMorNoIM <- Barretts_PathStage(v, "Histology")
#' v <- Barretts_PragueScore(v, "Findings")
#' 
#' # The follow-up group depends on the histology and the Prague score for a
#' # patient so it takes the processed Barrett's data and then looks in the
#' # Findings column for permutations of the Prague score.
#' v$FU_Type <- Barretts_FUType(v, "CStage", "MStage", "IMorNoIM")
#' rm(v)
Barretts_FUType <- function(dataframe, CStage, MStage, IMorNoIM) {
  dataframe <- data.frame(dataframe)

  CStagea <- rlang::sym(CStage)
  MStagea <- rlang::sym(MStage)
  IMorNoIMa <- rlang::sym(IMorNoIM)

  df <- dataframe %>%
    mutate(
      FU_Type = case_when(
        grepl("SM2|SM1|T1b_Unspec|T1a|T1b|LGD|HGD|IGD|Cancer unstaged", !!IMorNoIMa, ignore.case = TRUE) ~ "Therapy", #Therapy group
        !!CStagea == "Insufficient" & !!MStagea == "Insufficient" ~ "NoRules", # Can't figure it out
        !!IMorNoIMa == "No_IM" & !is.na(!!MStagea) & as.numeric(!!MStagea) < 3 ~ "Rule1", #Short segment No IM
        !!IMorNoIMa == "IM" & !is.na(!!MStagea) & as.numeric(!!MStagea) < 3 ~ "Rule2", #Short segment IM
        !is.na(!!MStagea) & as.numeric(!!MStagea) >= 3 ~ "Rule3", #Long segment whatever the path
        !!IMorNoIMa == "No_IM" & !is.na(!!CStagea) & as.numeric(!!CStagea) < 3 ~ "Rule1", # If M stage missing - Short segment no IM
        !!IMorNoIMa == "IM" & !is.na(!!CStagea) & as.numeric(!!CStagea) < 3 ~ "Rule2", # If M stage missing - Short segment  IM
        !is.na(!!CStagea) & as.numeric(!!CStagea) >= 3 ~ "Rule3", # If M stage missing - Long segment
        TRUE ~ "NoRules"
      )
    )

  return(df$FU_Type)
}



#' Run all the basic Barrett's functions
#'
#' Function to encapsulate all the Barrett's functions together. This includes the Prague
#' score and the worst pathological grade and then feeds both of these things into
#' the follow up function. The output is a dataframe with all the original data as
#' well as the new columns that have been created.
#' @param Endodataframe endoscopy dataframe of interest
#' @param EndoReportColumn Endoscopy report field of interest as a string vector
#' @param EndoReportColumn2 Second endoscopy report field of interest as a string vector
#' @param Pathdataframe pathology dataframe of interest
#' @param PathColumn Pathology report field of interest as a string vector
#' @keywords Does something with data
#' @importFrom dplyr case_when
#' @export
#' @return Newdf
#' @family Disease Specific Analysis - Barretts Data
#' @examples
#' Barretts_df <- BarrettsAll(Myendo, "Findings", "OGDReportWhole", Mypath, "Histology")
BarrettsAll <- function(Endodataframe, EndoReportColumn, EndoReportColumn2, Pathdataframe, PathColumn) {
  Newdf <- Barretts_PragueScore(Endodataframe, EndoReportColumn, EndoReportColumn2)
  Newdf$IMorNoIM <- Barretts_PathStage(Pathdataframe, PathColumn)
  # The named columns here are derived from the previous functions outputs
  Newdf$FU_Type <- Barretts_FUType(Newdf, "CStage", "MStage", "IMorNoIM")
  return(Newdf)
}


#' Run the Paris classification versus worst histopath grade for Barrett's
#'
#' This creates a column of Paris grade for all samples where this is mentioned.
#' @param Column Endoscopy report field of interest as a string vector
#' @param Column2 Another endoscopy report field of interest as a string vector
#' @keywords Does something with data
#' @importFrom dplyr case_when
#' @export
#' @return a string vector
#' @family Disease Specific Analysis - Barretts Data
#' @examples # 
#' Myendo$EMR<-BarrettsParisEMR(Myendo$ProcedurePerformed,Myendo$Findings)


BarrettsParisEMR <- function(Column, Column2) {
  NewCol <- paste0(Column, Column2)
  NewCol <- data.frame(NewCol, stringsAsFactors = FALSE)

  # Get the worst pathology for that sample inc SM stages
  df <- NewCol %>%
    mutate(
      ParisClass = case_when(
        grepl("11a_c|2a_c|[Ii][Ii]a_c", NewCol, ignore.case = TRUE) ~ "2a_c",
        grepl("[Ii][Ii]a|2a|11a", NewCol, ignore.case = TRUE) ~ "2a",
        grepl("[Ii][Ii]b|2b|11b", NewCol, ignore.case = TRUE) ~ "2b",
        grepl("[Ii][Ii][Ii]|III", NewCol, ignore.case = TRUE) ~ "3",
        grepl("Paris [Tt]ype [Ii]s|1s ", NewCol, ignore.case = TRUE, perl = TRUE) ~ "1s",
        grepl(" [Ii]p |1p", NewCol, ignore.case = TRUE, perl = TRUE) ~ "1p",
        TRUE ~ "No_Paris"
      )
    )

  return(df$ParisClass)
}



############## Pathology Quality #############


#' Get the number of Barrett's biopsies taken
#'
#' This function gets the number of biopsies taken per 
#' endoscopy and compares it to the
#' Prague score for that endoscopy.Endoscopists should be taking a certain
#' number of biopsies given the length of a Barrett's segment so it
#' should be straightforward to detect a shortfall in the number
#' of biopsies being taken. The output is the shortfall per endoscopist
#' @param dataframe dataframe
#' @param Endo_ResultPerformed Date of the Endoscopy
#' @param PatientID Patient's unique identifier
#' @param Endoscopist name of the column with the Endoscopist names
#' @importFrom dplyr summarise group_by filter
#' @importFrom rlang sym
#' @importFrom ggplot2 ggplot geom_point labs theme xlab ylab unit element_text
#' @keywords Does something with data
#' @export
#' @family Disease Specific Analysis - Barretts Data
#' @examples
#' # Firstly relevant columns are extrapolated from the
#' # Mypath demo dataset. These functions are all part of Histology data
#' # cleaning as part of the package.
#' Mypath$NumBx <- HistolNumbOfBx(Mypath$Macroscopicdescription, "specimen")
#' Mypath$BxSize <- HistolBxSize(Mypath$Macroscopicdescription)
#' 
#' # The histology is then merged with the Endoscopy dataset. The merge occurs
#' # according to date and Hospital number
#' v <- Endomerge2(
#'   Myendo, "Dateofprocedure", "HospitalNumber", Mypath, "Dateofprocedure",
#'   "HospitalNumber"
#' )
#' 
#' # The function relies on the other Barrett's functions being run as well:
#' b1 <- Barretts_PragueScore(v, "Findings")
#' b1$PathStage <- Barretts_PathStage(b1, "Histology")
#' 
#' # The follow-up group depends on the histology and the Prague score for a
#' # patient so it takes the processed Barrett's data and then looks in the
#' # Findings column for permutations of the Prague score.
#' b1$FU_Type <- Barretts_FUType(b1, "CStage", "MStage", "PathStage")
#' 
#' 
#' colnames(b1)[colnames(b1) == "pHospitalNum"] <- "HospitalNumber"
#' # The number of average number of biopsies is then calculated and
#' # compared to the average Prague C score so that those who are taking
#' # too few biopsies can be determined
#' hh <- BarrettsBxQual(
#'   b1, "Date.x", "HospitalNumber",
#'   "Endoscopist"
#' )
#' rm(v)
BarrettsBxQual <- function(dataframe,
                           Endo_ResultPerformed,
                           PatientID,
                           Endoscopist) {
  dataframe <- data.frame(dataframe)
  PatientIDa <- rlang::sym(PatientID)
  Endo_ResultPerformeda <- rlang::sym(Endo_ResultPerformed)
  Endoscopista <- rlang::sym(Endoscopist)

  # Make sure the C and M stage is mueric (wil be character from the PragueScore function to
  # incorporate "Insufficient " as an outcome)
  dataframe$CStage <- as.numeric(dataframe$CStage)
  dataframe$MStage <- as.numeric(dataframe$MStage)

  GroupedByEndoscopy <-
    suppressWarnings(dataframe %>%
      filter(!is.na(CStage), !is.na(NumBx)) %>%
      group_by(
        as.Date(!!Endo_ResultPerformeda), !!PatientID,
        !!Endoscopista
      ) %>%
      summarise(Sum = sum(NumBx), AvgC = mean(CStage)))

  GroupedByEndoscopy$ExpectedNumber <-
    (GroupedByEndoscopy$AvgC + 1) * 2
  GroupedByEndoscopy$Difference <-
    GroupedByEndoscopy$Sum - GroupedByEndoscopy$ExpectedNumber

  # Now group the difference by endoscopist
  BxShortfallPre <-
    GroupedByEndoscopy %>%
    group_by(!!Endoscopista) %>%
    summarise(MeanDiff = round(mean(Difference),2))

  BxShortfallPre <- data.frame(BxShortfallPre)
  return(BxShortfallPre)
}
ropensci/EndoMineR documentation built on March 14, 2023, 3:58 a.m.