# EndoMineR: Functions to mine endoscopic and associated pathology datasets

##### Fake Data Creation #####

# Endoscopies() creates a spreadsheet of Endoscopy data; Histop_df() creates a dataframe of
# Histopathology data. Endomerge() associates them together.
# EndoRaw() creates an Endoscopy report (so the data has not already been extracted) for upper GI
# ColonEndoRaw() creates an Endoscopy report (so the data has not already been extracted) for lower GI
# pathRep() creates a Pathology report (so the data has not already been extracted) for upper GI
# ColonpathRep() creates a Pathology report (so the data has not already been extracted) for lower GI

#' FakeEndoCreator
#'
#' Creates fake endoscopic data to play with as a spreadsheet format. It assumes
#' that some of the data has already been separated out. EndoRaw() for upper GI
#' and ColonEndoRaw() for lower GI are better functions for the real
#'  scenario of just getting the report as a series of unextracted text files.
#' (Their histology equivalents are pathRep and ColonpathRep respectively)
#'
#' Each of the 1000 rows is first built as one free-text report and then
#' split back into columns with regular expressions.
#' @param x None needed (never used).
#' @return A data frame with 1000 rows and the columns \code{HospNum_Id},
#'   \code{EndoReports} (the whole report text), \code{Date} (Date),
#'   \code{Endoscopist}, \code{Midazolam} (numeric, mg), \code{Fentanyl}
#'   (numeric, mcg), \code{Indication}, \code{Diagnosis}, \code{BarrC} and
#'   \code{BarrM} (numeric; \code{NA} when the report has no Barrett's length).
#' @keywords Fake endoscopy
#' @export
#' @examples
#' Endoscopies()
Endoscopies <- function(x) {
  # Draw a single element from a vector of choices.
  pick_one <- function(choices) sample(choices, 1)

  Endoscopist <- c(
    "Dr Jonny Begood", "Dr Elvis Presley", "Dr Bilbo Baggins",
    "Dr Elmo Fudd", "Dr Jimminey Cricket", "Dr Davy Jones",
    "Dr Bugs Bunny", "Dr Rara Rasputin", "Dr Chubby Checker",
    "Dr Frank Sinatra", "Dr Charles Dickens", "Dr Joseph Conrad",
    "Dr Florence Nightingale", "Dr Sal Addin", "Dr King Richard III"
  )
  Midazolam <- c("1mg", "2mg", "3mg", "4mg", "5mg", "6mg", "7mg", "8mg")
  Fentanyl <- c(
    "12.5mcg", "25mcg", "50mcg", "75mcg", "100mcg", "125mcg", "150mcg"
  )
  # "Other-" appears twice on purpose-preserving grounds: the original list
  # contained it twice, which doubles its sampling weight.
  Indication <- c(
    "Therapeutic- Dilatation", "Other-", "Follow-up ULCER HEALING",
    "Haematemesis or Melaena/Blood PR", "Previous OGD ? 8 months ago",
    "Dysphagia/Odynophagia", "Surveillance-Barrett's",
    "Nausea and/or Vomiting", "Weight Loss",
    "Dysphagia/intermittent for a few months",
    "Other-", "Small Bowel Biopsy",
    "Dyspepsia", "Reflux-like Symptoms/Atypical Chest Pain",
    "chronic abdo pain and constipaton",
    "Oesophagus- Dysplasia", "Therapeutic- RFA"
  )
  Diagnosis <- c(
    "Ulcer- Oesophageal. ",
    "Post chemo-radiotherapy stricture ",
    "Possible achalasia.", "Oesophagitis. ",
    "Food bolus obstructing the oesophagus.",
    "Hiatus Hernia. ", "Extensive neoplastic looking esophageal lesion. ",
    "Esophageal candidiasis ", "Barretts oesophagus. ",
    "Gastritis"
  )
  Endodat <- sample(
    seq(as.Date("2013/01/01"), as.Date("2017/05/01"), by = "day"),
    1000
  )
  EndoHospNum <- sample(c(
    "P433224", "P633443", "K522332",
    "G244224", "S553322", "D0739033", "U873352",
    "P223333", "Y763634", "I927282", "P223311",
    "P029834", "U22415", "U234252", "S141141",
    "O349253", "T622722", "J322909", "F630230",
    "T432452"
  ), 1000, replace = TRUE)
  # Yes I know... This was just easier..
  BarrettsLength <- c(
    "C0M1", "C0M2", "C0M3", "C0M4",
    "C0M5", "C0M6", "C0M7", "C0M8", "C0M9", "C0M10",
    "C1M2", "C1M3", "C1M4", "C1M5", "C1M6", "C1M7",
    "C1M8", "C1M9", "C1M10", "C2M3", "C2M4", "C2M5",
    "C2M6", "C2M7", "C2M8", "C2M9", "C2M10", "C3M4",
    "C3M5", "C3M6", "C3M7", "C3M8", "C3M9", "C3M10",
    "C4M5", "C4M6", "C4M7", "C4M8", "C4M9", "C4M10",
    "C5M6", "C5M7", "C5M8", "C5M9", "C5M10", "C6M7",
    "C6M8", "C6M9"
  )

  # Build one free-text report per row. paste() separates the pieces with a
  # single space, so labels that already end in a space are followed by two.
  EndoReports <- replicate(1000, paste(
    "Date of Procedure", pick_one(Endodat),
    " Endoscopist: ", pick_one(Endoscopist),
    "Midazolam: ", pick_one(Midazolam),
    "Fentanyl: ", pick_one(Fentanyl),
    "Indication:", pick_one(Indication),
    "Diagnosis:", stringr::str_c(
      sample(Diagnosis, sample(1:10, 1)),
      collapse = "."
    ),
    # Half of the reports get a Barrett's length, the other half nothing
    pick_one(c("", paste(
      "Barrett's oesophagus length:",
      pick_one(BarrettsLength)
    )))
  ))
  endo <- data.frame(EndoHospNum, EndoReports)
  names(endo) <- c("HospNum_Id", "EndoReports")

  ######### Data accordionisation: cut the report into the span between one
  ######### label and the next
  report <- endo$EndoReports
  endo$Date <- stringr::str_extract(report, "Date of Procedure.*Endoscopist")
  endo$Endoscopist <- stringr::str_extract(report, "Endoscopist:.*Midazolam")
  endo$Midazolam <- stringr::str_extract(report, "Midazolam:.*Fentanyl")
  endo$Fentanyl <- stringr::str_extract(report, "Fentanyl:.*Indication")
  endo$Indication <- stringr::str_extract(report, "Indication:.*Diagnosis")
  endo$Diagnosis <- stringr::str_extract(report, "Diagnosis:.*")
  endo$BarrC <- stringr::str_extract(report, " oesophagus length: C.*M.*")
  # BarrM must be taken from the still-uncleaned BarrC text (e.g. "...C1M3")
  endo$BarrM <- stringr::str_extract(endo$BarrC, "M.*")

  ######### Data cleaning: strip the labels and convert the column types
  endo$Date <- gsub("Date of Procedure", "", endo$Date)
  endo$Date <- gsub(" Endoscopist", "", endo$Date)
  # Trim the padding left by the label removal before parsing the date
  endo$Date <- as.Date(trimws(endo$Date), format = "%Y-%m-%d")

  endo$Endoscopist <- gsub("Endoscopist:  Dr ", "", endo$Endoscopist)
  endo$Endoscopist <- trimws(gsub("Midazolam", "", endo$Endoscopist))

  # Doses become numeric once the unit and the following label are removed
  endo$Midazolam <- gsub("Midazolam: ", "", endo$Midazolam)
  endo$Midazolam <- as.numeric(gsub("mg Fentanyl", "", endo$Midazolam))

  endo$Fentanyl <- gsub("Fentanyl: ", "", endo$Fentanyl)
  endo$Fentanyl <- as.numeric(gsub("mcg Indication", "", endo$Fentanyl))

  endo$Indication <- gsub("Indication: ", "", endo$Indication)
  endo$Indication <- gsub(" Diagnosis", "", endo$Indication)

  # The extracted text starts with its own label "Diagnosis: "; remove that
  # (the previous patterns were copied from Indication and never matched).
  endo$Diagnosis <- trimws(sub("^Diagnosis: ", "", endo$Diagnosis))

  # Prague C value: the digits between "C" and "M"
  endo$BarrC <- gsub("oesophagus length: ", "", endo$BarrC)
  endo$BarrC <- gsub("M.*", "", endo$BarrC)
  endo$BarrC <- as.numeric(gsub("C", "", endo$BarrC))
  # Prague M value: the digits after "M"
  endo$BarrM <- as.numeric(gsub("M", "", endo$BarrM))
  # load(file = "Endoscopies.rda")
  return(endo)
}



#' FakeHistolCreator
#'
#' Creates fake histology data to play with. Hospital numbers and dates are
#' taken from an endoscopy data frame so that the two datasets can be merged:
#' each histology date is the endoscopy date plus 0-2 days.
#' @param x Optional. A data frame with \code{HospNum_Id} and \code{Date}
#'   columns, such as the result of \code{Endoscopies()}. When missing (or not
#'   a data frame) a fresh \code{Endoscopies()} dataset is generated.
#' @return A data frame with one row per endoscopy row and the columns
#'   \code{HospNum_Id}, \code{HistoReport} (the whole report text),
#'   \code{Date} (Date), \code{Macro} and \code{Diagnoses}.
#' @keywords Fake histology spreadshet data
#' @export
#' @examples
#' Histop_df()
Histop_df <- function(x) {
  # The endoscopy data must be a data frame; the bare name `Endoscopies`
  # refers to the generator function, so it has to be called.
  endo <- if (!missing(x) && is.data.frame(x)) x else Endoscopies()
  stopifnot(
    all(c("HospNum_Id", "Date") %in% names(endo)),
    nrow(endo) > 0
  )
  n_reports <- nrow(endo)

  # Generate a load of strings
  line <- c(
    "Intestinal metaplasia is present.",
    "Basal hyperplasia is prominent", "There is no dysplasia or malignancy.",
    "No Helicobacter are seen.", "There is some ulceration.",
    "There is no intercellular oedema in the surface epithelium.",
    " PAS staining shows occasional spores, consistent with candida.",
    " No herpetic viral inclusions are seen.",
    " There is no dysplasia and no invasive carcinoma.",
    " There is mild regenerative epithelial change, but neither dysplasia nor malignancy is seen.",
    "The appearances are consistent with the endoscopic diagnosis of Barrett's oesophagus with active chronic inflammation.",
    "The biopsies of oesophageal squamous mucosa show surface erosion and active chronic inflammation.",
    "Numerous Candida spores and hyphae are present admixed with ulcer slough.",
    "There is reactive basal cell hyperplasia and mild inflammatory epithelial atypia.",
    "There is no significant increase in intraepithelial eosinophils.",
    "No granulomas or viral inclusions are seen.",
    "The appearances are those of Candida oesophagitis.",
    "Neither dysplasia nor malignancy is seen.",
    "The appearances are consistent with, but not specific for Barrett's (columnar lined) oesophagus.",
    "High grade dysplasia is present throughout this sample",
    "There is low grade dysplasia", "This is a dysplastic sample"
  )
  # Pool of 1000 macroscopic descriptions to draw from
  macro_pool <- replicate(1000, paste(
    "Macrosopic description:",
    sample(1:10, 1), "specimens collected the largest measuring",
    sample(1:5, 1), "x", sample(1:5, 1), "x", sample(1:5, 1),
    "mm and the smallest",
    sample(1:5, 1), "x", sample(1:5, 1), "x", sample(1:5, 1), "mm"
  ))

  # Merge the strings together randomly: one macroscopic description followed
  # by 3-10 distinct diagnostic sentences
  histop <- replicate(n_reports, paste(
    sample(macro_pool, 1),
    paste("Diagnoses", stringr::str_c(
      sample(line, sample(3:10, 1)),
      collapse = "."
    ))
  ))

  # Because we eventually will merge histopath and endoscopy together we
  # generate the histopath dates from the endoscopy dates with 0-2 days
  # difference, drawn independently for every row
  dat <- as.Date(endo$Date) + sample(0:2, n_reports, replace = TRUE)
  # Hospital numbers come from the same endoscopy rows
  HospNum_Id <- endo$HospNum_Id

  histo <- data.frame(HospNum_Id, dat, paste("Date received:", dat, histop))
  names(histo) <- c("HospNum_Id", "dat", "HistoReport")

  ######### Data accordionisation: cut the report into the span between one
  ######### label and the next
  histo$Date <- stringr::str_extract(
    histo$HistoReport,
    "Date received:.*Macrosopic description:"
  )
  histo$Macro <- stringr::str_extract(
    histo$HistoReport,
    "Macrosopic description:.*Diagnoses"
  )
  histo$Diagnoses <- stringr::str_extract(histo$HistoReport, "Diagnoses.*")

  ######### Data cleaning: strip the labels and convert the column types
  histo$Date <- gsub("Date received: ", "", histo$Date)
  histo$Date <- as.Date(
    gsub("Macrosopic description:", "", histo$Date),
    format = "%Y-%m-%d"
  )

  histo$Macro <- gsub("Macrosopic description: ", "", histo$Macro)
  histo$Macro <- gsub("Diagnoses", "", histo$Macro)

  histo$Diagnoses <- gsub("Diagnoses", "", histo$Diagnoses)
  # Lets get rid of a column we don't need
  histo$dat <- NULL
  # load(file = "Histop_df.rda")
  return(histo)
}

######### Data merging: we can merge straight away as we
######### have the same names for the columns date and
######### HospNum_Id, so no need to mess around. We will use
######### the fuzzyjoin method as there is sometimes a gap
######### between the endoscopy date and the date that the
######### histopathology was received.






# Shared inputs for the raw-report generators below (EndoRaw2, pathRep2,
# ColonEndoRaw, ColonpathRep). They are created once so that the endoscopy and
# pathology reports describe the same patients and procedure dates.

# Number of reports to generate
samplenumber <- 2000

# samplenumber - 1900 hospital numbers: a shuffled capital letter (recycled)
# followed by a seven-digit number. The vector is shorter than samplenumber,
# so it is recycled when placed in a data frame with samplenumber rows.
HospitalNumberID <- paste0(
  "Hospital Number: ",
  sample(LETTERS),
  sample(1e+06:9999999, (samplenumber - 1900), replace = TRUE)
)

# The same trust line for every report
NHS_Trust <- rep("Hospital: Random NHS Foundation Trust", samplenumber)

# NOTE(review): "first" and "last" are passed positionally to randomNames();
# confirm against the randomNames signature which parameters they bind to.
Patient_Name <- paste(
  "Patient Name: ",
  randomNames::randomNames(samplenumber, "first", "last")
)

Date_of_Birth <- paste(
  "DOB: ",
  generator::r_date_of_births(
    samplenumber,
    start = as.Date("1900-01-01"),
    end = as.Date("1999-01-01")
  )
)

GeneralPractictioner <- paste0(
  "General Practitioner: Dr. ",
  randomNames::randomNames(samplenumber, "first", "last")
)

# Procedure dates shared by the endoscopy and pathology generators
Date_of_ProcedureAll <- generator::r_date_of_births(
  samplenumber,
  start = as.Date("2001-01-01"),
  end = as.Date("2017-01-01")
)

#' EndoRaw2
#'
#' Generates fake upper GI endoscopy (OGD) reports as raw, unextracted text.
#'
#' The whole-report data frame is saved as \code{TheOGDReportFinal.rda}. The
#' reports are then split into one column per section with \code{Extractor()}
#' and saved as \code{Myendo.rda}. The function relies on the file-level
#' objects \code{samplenumber}, \code{NHS_Trust}, \code{HospitalNumberID},
#' \code{Patient_Name}, \code{GeneralPractictioner} and
#' \code{Date_of_ProcedureAll}, and reads the findings phrases from disk.
#' @return Called for its side effects (two \code{.rda} files); the value of
#'   the final \code{save()} call is returned.
#' @keywords fake endoscopy data
#' @import randomNames
#' @import generator
#' @export
#' @examples
#' \dontrun{
#' EndoRaw2()
#' }
EndoRaw2 <- function() {
  # Draw a single element from a vector of choices.
  pick_one <- function(choices) sample(choices, 1)

  Date <- paste("Date of procedure: ", Date_of_ProcedureAll)

  # Ten endoscopists (and ten second endoscopists) shared by all reports
  EndoscopistList <- sample(
    randomNames::randomNames(samplenumber, "first", "last"),
    10,
    replace = TRUE
  )
  Second_EndoscopistList <- sample(
    randomNames::randomNames(samplenumber, "first", "last"),
    10,
    replace = TRUE
  )
  Endoscopist <- replicate(
    samplenumber,
    paste0("Endoscopist: Dr. ", pick_one(EndoscopistList))
  )
  Second_Endoscopist <- replicate(
    samplenumber,
    paste0("2nd Endoscopist: Dr. ", pick_one(Second_EndoscopistList))
  )
  MedicationsFent <- replicate(samplenumber, paste(
    "Medications: Fentanyl ",
    pick_one(c(
      "12.5mcg", "25mcg", "50mcg", "75mcg", "100mcg", "125mcg", "150mcg"
    ))
  ))
  MedicationsMidaz <- replicate(samplenumber, paste(
    "Midazolam ",
    pick_one(c("1mg", "2mg", "3mg", "4mg", "5mg", "6mg", "7mg"))
  ))
  Instrument <- replicate(samplenumber, paste(
    "Instrument: ",
    pick_one(c("FG1", "FG2", "FG3", "FG4", "FG5", "FG6", "FG7"))
  ))
  Extent_of_Exam <- replicate(samplenumber, paste(
    "Extent of Exam: ",
    pick_one(c(
      "Failed intubation", "Oesophagus", "Stomach body", "D1", "D2",
      "Pylorus", "GOJ"
    ))
  ))
  # "Other-" is listed twice, as in the original, doubling its weight
  INDICATIONS_FOR_EXAMINATION <- replicate(samplenumber, paste(
    "Indications:",
    pick_one(c(
      "Therapeutic- Dilatation",
      "Other-", "Follow-up ULCER HEALING",
      "Haematemesis or Melaena/Blood PR",
      "Previous OGD ? 8 months ago", "Dysphagia/Odynophagia",
      "Surveillance-Barrett's", "Nausea and/or Vomiting",
      "Weight Loss", "Dysphagia/intermittent for a few months",
      "Other-", "Small Bowel Biopsy",
      "Dyspepsia", "Reflux-like Symptoms/Atypical Chest Pain",
      "chronic abdo pain and constipaton",
      "Oesophagus- Dysplasia", "Therapeutic- RFA"
    ))
  ))
  PROCEDURE_PERFORMED <- "Procedure Performed: Gastroscopy (OGD)"

  # NOTE(review): hard-coded absolute path; the file must have a column `x`.
  findings_pool <- read.table(
    "/home/rstudio/EndoMineR/data-raw/data/FindingsText",
    header = TRUE, stringsAsFactors = FALSE
  )
  # 1-10 phrases per report. sample() needs a scalar size, so draw one value
  # from 1:10 rather than passing the whole permutation sample(1:10).
  FINDINGS <- replicate(samplenumber, paste(
    "Findings: ",
    stringr::str_c(
      sample(findings_pool$x, sample(1:10, 1), replace = TRUE),
      collapse = ","
    )
  ))

  # 6 therapy phrases and 19 blanks: most reports carry no therapy line
  TherapyorNot <- replicate(samplenumber, paste(pick_one(c(
    "Therapeutic- Dilatation was performed",
    "HALO 90 done with good effect",
    "TTS HALO to area",
    "A lesion underwent EMR",
    "Area APC'd",
    "Therapeutic- RFA",
    rep("", 19)
  ))))

  diagnosis_pool <- c(
    "Ulcer- Oesophageal. ",
    "Post chemo-radiotherapy stricture ", "Possible achalasia.",
    "Oesophagitis. ", "Food bolus obstructing the oesophagus.",
    "Hiatus Hernia. ", "Extensive neoplastic looking esophageal lesion. ",
    "Esophageal candidiasis ", "Barretts oesophagus. ",
    "Gastritis"
  )
  # 1-3 distinct diagnoses per report (scalar size, see FINDINGS above)
  ENDOSCOPIC_DIAGNOSIS <- replicate(samplenumber, paste(
    "Endoscopic Diagnosis: ",
    stringr::str_c(
      sample(diagnosis_pool, sample(1:3, 1), replace = FALSE),
      collapse = ","
    )
  ))

  # Now put it all together in one long text to
  # simulate a real Endoscopic report
  TheOGDReport <- data.frame(
    NHS_Trust, HospitalNumberID,
    Patient_Name, GeneralPractictioner, Date, Endoscopist,
    Second_Endoscopist, MedicationsFent, MedicationsMidaz,
    Instrument, Extent_of_Exam, INDICATIONS_FOR_EXAMINATION,
    PROCEDURE_PERFORMED, FINDINGS, TherapyorNot, ENDOSCOPIC_DIAGNOSIS
  )
  # Paste every column of a row together, one section per line. The new
  # column name is given directly; unite() expects a name here, not a call.
  TheOGDReportFinal <- tidyr::unite(TheOGDReport,
    "OGDReportWhole",
    colnames(TheOGDReport),
    sep = "\n"
  )
  # NOTE(review): hard-coded absolute output path.
  save(TheOGDReportFinal, file = "/home/rstudio/EndoMineR/data/TheOGDReportFinal.rda")
  # return(TheOGDReportFinal)

  Myendo <- TheOGDReportFinal
  Myendo$OGDReportWhole <- gsub("2nd Endoscopist:", "Second endoscopist:", Myendo$OGDReportWhole)
  # Section labels in report order; each section runs from its own label to
  # the next one, so the last label only acts as an end delimiter.
  EndoscTree <- c(
    "Hospital Number:", "Patient Name:", "General Practitioner:",
    "Date of procedure:", "Endoscopist:", "Second endoscopist:", "Medications",
    "Instrument", "Extent of Exam:", "Indications:", "Procedure Performed:", "Findings:",
    "Endoscopic Diagnosis:"
  )
  for (i in seq_len(length(EndoscTree) - 1)) {
    Myendo <- Extractor(
      Myendo, "OGDReportWhole", EndoscTree[i],
      EndoscTree[i + 1], EndoscTree[i]
    )
  }
  Myendo$Dateofprocedure <- as.Date(Myendo$Dateofprocedure)

  save(Myendo, file = "/home/rstudio/EndoMineR/data/Myendo.rda")
}













#' pathRep2
#'
#' Creates raw upper GI Pathology reports as unextracted text.
#'
#' The whole-report data frame is saved as \code{PathDataFrameFinal.rda}. The
#' reports are then split into one column per section with \code{Extractor()}
#' and saved as \code{Mypath.rda}. The function relies on the file-level
#' objects \code{samplenumber}, \code{NHS_Trust}, \code{HospitalNumberID},
#' \code{Patient_Name}, \code{Date_of_Birth}, \code{GeneralPractictioner} and
#' \code{Date_of_ProcedureAll}, and reads its phrase files relative to the
#' working directory.
#' @return Called for its side effects (two \code{.rda} files); the value of
#'   the final \code{save()} call is returned.
#' @import stringr
#' @import generator
#' @keywords Pathology reports
#' @export
#' @examples
#' \dontrun{
#' pathRep2()
#' }
pathRep2 <- function() {
  # Read a phrase file (column `x`) and build one labelled line per report
  # from a random number of phrases. sample() needs a scalar size, so one
  # value is drawn from `counts` rather than passing a whole permutation.
  phrase_lines <- function(path, label, counts) {
    pool <- read.table(path, header = TRUE, stringsAsFactors = FALSE)
    replicate(samplenumber, paste(
      label,
      stringr::str_c(
        sample(pool$x, sample(counts, 1), replace = TRUE),
        collapse = ","
      )
    ))
  }

  # One accession number per report; the two-digit prefix is drawn with
  # replacement so its length matches samplenumber instead of being recycled.
  AccessionNum <- paste0(
    "SP-", sample(10:99, samplenumber, replace = TRUE),
    "-", sample(1000000:9999999, samplenumber, replace = FALSE)
  )
  # Received 1-12 days after the procedure (same offset for every report)
  Date <- Date_of_ProcedureAll + sample(1:12, 1)
  Date <- paste("Date received: ", Date)

  # Clinical Details
  ClinDet <- phrase_lines(
    "./data-raw/data/HistolClinDetText", "Clinical Details: ", 1:10
  )

  # Nature of the specimen
  NatureOfSpec <- read.table("./data-raw/data/HistolMacDescription.txt",
    header = TRUE, stringsAsFactors = FALSE
  )
  NatureOfSpec <- replicate(samplenumber, paste(
    sample(1:10, 1, replace = TRUE), "specimen. Nature of specimen: ",
    stringr::str_c(
      sample(NatureOfSpec$x, sample(1:10, 1), replace = TRUE),
      collapse = ","
    )
  ))

  MacDescrip <- replicate(samplenumber, paste(
    "Macroscopic description:",
    sample(1:10, 1), "specimens collected the largest measuring",
    sample(1:5, 1), "x", sample(1:5, 1), "x", sample(1:5, 1),
    "mm and the smallest",
    sample(1:5, 1), "x", sample(1:5, 1), "x", sample(1:5, 1), "mm"
  ))

  # Histol Details
  Histol <- phrase_lines("./data-raw/data/HistolText", "Histology: ", 1:10)
  # Diagnostic details (5-10 phrases)
  Diagnostic <- phrase_lines(
    "./data-raw/data/HistolDxText", "Diagnosis: ", 5:10
  )

  PathDataFrameReport <- data.frame(
    AccessionNum, NHS_Trust, HospitalNumberID,
    Patient_Name, Date_of_Birth, GeneralPractictioner,
    Date, ClinDet, NatureOfSpec, MacDescrip, Histol,
    Diagnostic
  )
  # Paste every column of a row together, one section per line. The new
  # column name is given directly; unite() expects a name here, not a call.
  PathDataFrameFinal <- tidyr::unite(PathDataFrameReport,
    "PathReportWhole",
    colnames(PathDataFrameReport),
    sep = "\n"
  )
  # NOTE(review): hard-coded absolute output path.
  save(PathDataFrameFinal, file = "/home/rstudio/EndoMineR/data/PathDataFrameFinal.rda")


  #################


  Mypath <- PathDataFrameFinal
  # Section labels in report order; each section runs from its own label to
  # the next one. The trailing "" is the end delimiter for "Diagnosis:".
  HistolTree <- c(
    "Hospital Number", "Patient Name", "DOB:",
    "General Practitioner:",
    "Date received:", "Clinical Details:", "Macroscopic description:", "Histology:",
    "Diagnosis:", ""
  )
  for (i in seq_len(length(HistolTree) - 1)) {
    Mypath <- Extractor(
      Mypath, "PathReportWhole", HistolTree[i],
      HistolTree[i + 1], HistolTree[i]
    )
  }
  # Use the same date column name as the endoscopy data set
  colnames(Mypath)[which(names(Mypath) == "Datereceived")] <- "Dateofprocedure"
  Mypath$Dateofprocedure <- as.Date(Mypath$Dateofprocedure)
  save(Mypath, file = "/home/rstudio/EndoMineR/data/Mypath.rda")
}




#' ColonEndoRaw
#'
#' Creates raw lower GI endoscopy (colonoscopy) reports as unextracted text.
#'
#' The function relies on the file-level objects \code{samplenumber},
#' \code{NHS_Trust}, \code{HospitalNumberID}, \code{Patient_Name},
#' \code{GeneralPractictioner} and \code{Date_of_ProcedureAll}, reads the
#' findings phrases from disk and saves the result as \code{ColonFinal.rda}.
#' @param x None needed (never used).
#' @return A data frame with \code{samplenumber} rows and a single column
#'   \code{OGDReportWhole} holding the report text.
#' @keywords Endoscopy reports
#' @import randomNames
#' @import generator
#' @export
#' @examples
#' \dontrun{
#' ColonEndoRaw()
#' }
ColonEndoRaw <- function(x) {
  # Draw a single element from a vector of choices.
  pick_one <- function(choices) sample(choices, 1)

  Date <- paste("Date of procedure: ", Date_of_ProcedureAll)

  # Ten endoscopists (and ten second endoscopists) shared by all reports
  EndoscopistList <- sample(
    randomNames::randomNames(samplenumber, "first", "last"),
    10,
    replace = TRUE
  )
  Second_EndoscopistList <- sample(
    randomNames::randomNames(samplenumber, "first", "last"),
    10,
    replace = TRUE
  )
  Endoscopist <- replicate(
    samplenumber,
    paste0("Endoscopist: Dr. ", pick_one(EndoscopistList))
  )
  Second_Endoscopist <- replicate(
    samplenumber,
    paste0("2nd Endoscopist: Dr. ", pick_one(Second_EndoscopistList))
  )
  MedicationsFent <- replicate(samplenumber, paste(
    "Medications: Fentanyl ",
    pick_one(c(
      "12.5mcg", "25mcg", "50mcg", "75mcg", "100mcg", "125mcg", "150mcg"
    ))
  ))
  MedicationsMidaz <- replicate(samplenumber, paste(
    "Midazolam ",
    pick_one(c("1mg", "2mg", "3mg", "4mg", "5mg", "6mg", "7mg"))
  ))
  Instrument <- replicate(samplenumber, paste(
    "Instrument: ",
    pick_one(c("FC1", "FC2", "FC3", "FC4", "FC5", "FC6", "FC7"))
  ))
  Extent_of_Exam <- replicate(samplenumber, paste(
    "Extent of Exam: ",
    pick_one(c(
      "Failed intubation", "Recum",
      "Sigmoid", "Descending Colon",
      "Transverse Colon", "Ascending Colon",
      "Caecum"
    ))
  ))
  INDICATIONS_FOR_EXAMINATION <- replicate(samplenumber, paste(
    "Indications:",
    pick_one(c(
      "Therapeutic- Dilatation",
      "Other-", "Diarrrhoea", "Weight loss",
      "IBD Surveillance", "PR Bleeding",
      "Family History CRC", "Nausea and/or Vomiting",
      "Abnormal Imaging", "Planned polypectomy",
      "Fe deficiency anaemia", "Chronic abdominal pain"
    ))
  ))
  PROCEDURE_PERFORMED <- "Procedure Performed: Colonoscopy"

  # NOTE(review): hard-coded absolute path; the file must have a column `x`.
  findings_pool <- read.table(
    "/home/rstudio/EndoMineR/data-raw/data/FindingsTextColon.txt",
    header = TRUE, stringsAsFactors = FALSE
  )
  # 1-10 phrases per report. sample() needs a scalar size, so draw one value
  # from 1:10 rather than passing the whole permutation sample(1:10).
  FINDINGS <- replicate(samplenumber, paste(
    "Findings: ",
    stringr::str_c(
      sample(findings_pool$x, sample(1:10, 1), replace = TRUE),
      collapse = ","
    )
  ))

  # NOTE(review): these are the same upper GI diagnoses used for the OGD
  # reports - confirm whether colonic diagnoses were intended.
  diagnosis_pool <- c(
    "Ulcer- Oesophageal. ",
    "Post chemo-radiotherapy stricture ", "Possible achalasia.",
    "Oesophagitis. ", "Food bolus obstructing the oesophagus.",
    "Hiatus Hernia. ", "Extensive neoplastic looking esophageal lesion. ",
    "Esophageal candidiasis ", "Barretts oesophagus. ",
    "Gastritis"
  )
  # 1-3 distinct diagnoses per report (scalar size, see FINDINGS above)
  ENDOSCOPIC_DIAGNOSIS <- replicate(samplenumber, paste(
    "Endoscopic Diagnosis: ",
    stringr::str_c(
      sample(diagnosis_pool, sample(1:3, 1), replace = FALSE),
      collapse = ","
    )
  ))

  # Now put it all together in one long text to
  # simulate a real Endoscopic report
  TheOGDReport <- data.frame(
    NHS_Trust, HospitalNumberID,
    Patient_Name, GeneralPractictioner, Date, Endoscopist,
    Second_Endoscopist, MedicationsFent, MedicationsMidaz,
    Instrument, Extent_of_Exam, INDICATIONS_FOR_EXAMINATION,
    PROCEDURE_PERFORMED, FINDINGS, ENDOSCOPIC_DIAGNOSIS
  )
  # Paste every column of a row together, one section per line. The new
  # column name is given directly; unite() expects a name here, not a call.
  ColonFinal <- tidyr::unite(TheOGDReport,
    "OGDReportWhole",
    colnames(TheOGDReport),
    sep = "\n"
  )
  # load(file = "ColonFinal.rda")

  # NOTE(review): hard-coded absolute output path.
  save(ColonFinal, file = "/home/rstudio/EndoMineR/data/ColonFinal.rda")
  return(ColonFinal)
}


#' ColonpathRep
#'
#' Creates raw lower GI (colon) Pathology reports as unextracted text.
#'
#' The function relies on the file-level objects \code{samplenumber},
#' \code{NHS_Trust}, \code{HospitalNumberID}, \code{Patient_Name},
#' \code{Date_of_Birth}, \code{GeneralPractictioner} and
#' \code{Date_of_ProcedureAll}, reads its phrase files relative to the working
#' directory and saves the result as \code{PathDataFrameFinalColon.rda}.
#' @param x None needed (never used).
#' @return A data frame with \code{samplenumber} rows and a single column
#'   \code{PathReportWhole} holding the report text.
#' @import stringr
#' @keywords Pathology reports
#' @export
#' @examples
#' \dontrun{
#' ColonpathRep()
#' }
ColonpathRep <- function(x) {
  # Read a headerless phrase file (first column `V1`) and build one labelled
  # line per report from a random number of phrases. sample() needs a scalar
  # size, so one value is drawn from `counts` rather than a whole permutation.
  phrase_lines <- function(path, label, counts) {
    pool <- read.table(path, header = FALSE, stringsAsFactors = FALSE)
    replicate(samplenumber, paste(
      label,
      stringr::str_c(
        sample(pool$V1, sample(counts, 1), replace = TRUE),
        collapse = ","
      )
    ))
  }

  # Received 1-12 days after the procedure (same offset for every report)
  Date <- Date_of_ProcedureAll + sample(1:12, 1)
  Date <- paste("Date received: ", Date)

  # Clinical Details
  ClinDet <- phrase_lines(
    "./data-raw/data/Histopath_ClinDetPhrasesColon.txt",
    "Clinical Details: ", 1:10
  )

  # Nature of the specimen
  NatureOfSpec <- read.table(
    "./data-raw/data/Histopath_MacDescripPhrasesColon.txt",
    header = FALSE, stringsAsFactors = FALSE
  )
  NatureOfSpec <- replicate(samplenumber, paste(
    sample(1:10, 1, replace = TRUE), "specimen. Nature of specimen: ",
    stringr::str_c(
      sample(NatureOfSpec$V1, sample(1:10, 1), replace = TRUE),
      collapse = ","
    )
  ))

  MacDescrip <- replicate(samplenumber, paste(
    "Macroscopic description:",
    sample(1:10, 1), "specimens collected the largest measuring",
    sample(1:5, 1), "x", sample(1:5, 1), "x", sample(1:5, 1),
    "mm and the smallest",
    sample(1:5, 1), "x", sample(1:5, 1), "x", sample(1:5, 1), "mm"
  ))

  # Histol Details
  Histol <- phrase_lines(
    "./data-raw/data/HistolTextColon", "Histology: ", 1:10
  )
  # Diagnostic details (5-10 phrases)
  Diagnostic <- phrase_lines(
    "./data-raw/data/Histopath_DxRawColon.txt", "Diagnosis: ", 5:10
  )

  PathDataFrameReport <- data.frame(
    NHS_Trust, HospitalNumberID,
    Patient_Name, Date_of_Birth, GeneralPractictioner,
    Date, ClinDet, NatureOfSpec, MacDescrip, Histol,
    Diagnostic
  )
  # Paste every column of a row together, one section per line. The new
  # column name is given directly; unite() expects a name here, not a call.
  PathDataFrameFinalColon <- tidyr::unite(PathDataFrameReport,
    "PathReportWhole",
    colnames(PathDataFrameReport),
    sep = "\n"
  )
  # load(file = "./data_raw/data/PathDataFrameFinalColon.rda")
  # NOTE(review): hard-coded absolute output path.
  save(PathDataFrameFinalColon, file = "/home/rstudio/EndoMineR/data/PathDataFrameFinalColon.rda")
  return(PathDataFrameFinalColon)
}
# sebastiz/EndoMineR documentation built on Dec. 4, 2022, 7:04 p.m.
# rdrr.io home R language documentation Run R code online
# CRAN packages Bioconductor packages R-Forge packages GitHub packages
# Note that we can't provide technical support on individual packages. You should contact the package authors for that.
# sebastiz/EndoMineR
# Functions to mine endoscopic and associated pathology datasets

# data-raw/EndoFakeData.R
# In sebastiz/EndoMineR: Functions to mine endoscopic and associated pathology datasets

# R Package Documentation

# Browse R Packages

# We want your feedback!

# sebastiz/EndoMineR Functions to mine endoscopic and associated pathology datasets

# data-raw/EndoFakeData.R In sebastiz/EndoMineR: Functions to mine endoscopic and associated pathology datasets

# R Package Documentation

# Browse R Packages

# We want your feedback!

# sebastiz/EndoMineR
# Functions to mine endoscopic and associated pathology datasets

# data-raw/EndoFakeData.R
# In sebastiz/EndoMineR: Functions to mine endoscopic and associated pathology datasets