R/Untitled.R

Defines functions getMorphDT

pattern <- "a.b"
strings <- c("abb", "a.b")
str_detect(strings, pattern)
str_detect(strings, fixed(pattern))
str_detect(strings, coll(pattern))

# coll() is useful for locale-aware case-insensitive matching
i <- c("I", "\u0130", "i")
i
str_detect(i, fixed("i", TRUE))
str_detect(i, coll("i", TRUE))
str_detect(i, coll("i", TRUE, locale = "tr"))

# Word boundaries
words <- c("These are   some words.")
str_count(words, boundary("word"))
str_split(words, " ")[[1]]
str_split(words, boundary("word"))[[1]]

# Regular expression variations
str_extract_all("The Cat in the Hat", "[a-z]+")
str_extract_all("The Cat in the Hat", regex("[a-z]+", TRUE))

str_extract_all("a\nb\nc", "^.")
str_extract_all("a\nb\nc", regex("^.", multiline = TRUE))

str_extract_all("a\nb\nc", "a.")
str_extract_all("a\nb\nc", regex("a.", dotall = TRUE))





# today 9/5/19 ------------------------------------------------------------

library(data.table)
library(stringr)
library(fst)

gb_ddir <- "inst/data/mldata/"

gb_cols <- c(
  'CaseNumber','StartTimeGMT',
  'ProcedureID', 'Hospital', "Surgeon",
  'TuID', 'PartNumber', 'SequenceNum','ToolName', 'ToolType',
  'ProcedureCategory', 'ProcedureSubject', 'ProcedureName',
  'DurationMin', 'ToolDurationMin'
)

"%_&_%" <- function(a, b){
  randomvar_set_in_funenviron <- "hello world"
  return(rlang::current_fn())

  rlang::curr
  rlang::call_name()
  # "%_&_%"
  # Reduce("&", lapply(a, function(i) stringr::str_detect(b, i)))
}
obj <- do.call("%_&_%", list(a =1, b = 1))

env <- rlang::env_clone(rlang::as_closure())
rlang::env_names(env)
ls(envir = env)

rlang::call_name()

"%_|_%" <- function(a, b)
  Reduce("|", lapply(a, function(i) stringr::str_detect(b, i)))


getMorphDT <- function(){

  DT <- rbindlist(
    lapply(
      list.files(gb_ddir, "*_full.fst", TRUE, TRUE),
      fst::read_fst,
      as.data.table = TRUE,
      columns = gb_cols
    ))

  oldnam <- c("ProcedureSubject", "ProcedureName", "ProcedureCategory", "ToolName", "ToolType")
  newnam <- c("psub", "pnam", "pcat", "tnam", "ttype", )
  setnames(DT, oldnam, newnam)

  for(nam in newnam){
    set(DT, NULL, nam, stringr::str_to_lower(DT[, get(nam)]))
  }

  DT[, "corp_pnam" := .(str_split(ProcedureCategory, boundary("word"))) ]
  DT[, "corp_psub" := .(str_split(ProcedureSubject, boundary("word"))) ]
  DT[, "corp_pcat" := .(str_split(ProcedureName, boundary("word"))) ]

  DT[, "corp_tool" := .(str_split(ToolName, boundary("word"))) ]





  ## pull out reloads into a dedicted column and filter out reduntant rows
  ##
  sterms <- c("stapler", "sureform")
  colrs <- c("blue", "green", "white", "gray", "black")


  ##
  ##
  ## IM HERE- START HERE AFTER ARRIVE AT WORK
  ##
  ##
  ## CHECK IF SUREFORM AND STAPLER SHARE A PARTNUMBER AND IF SO STANDARDIZE!!
  ##
  ##
  ##
  ##
  ##

  setkeyv(DT, c("ProcedureName", "ProcedureSubject", "ProcedureCategory"))

  DT[ProcedureCategory == "Urology", .N, .(ProcedureSubject, ProcedureName)][order(ProcedureName)]
  cDT <- DT[
    DT[, .N, c("ProcedureSubject", "ProcedureID")][, .(n_obs = sum(N), n_procs = .N), "ProcedureSubject"]
    ]

  DT <- cDT[, as.data.table(
    sort(table(str_split_fixed(ToolName, " ", 2)[, 1])), keep.rownames = TRUE),
    .(ProcedureSubject, n_procs, n_obs)
    ][ order(ProcedureSubject, -N),
       .(ProcedureSubject,
         tn = V1,
         pct_tot_obs = round(N/tot_obs, 3))][
           order(tn, -pct_tot_obs)
           ]




  # pobsDT[,1-sum(.SD[1:5][["pct_tot_obs"]], na.rm = TRUE), ProcedureSubject]


  plotDT <- pobsDT[,({

    rows_top5 <- .SD[seq(1, min(5, .N))]

    as.list(
      rbind(
        rows_top5,
        data.table(tn = "Other",
                   pct_tot_obs = 1-sum(rows_top5[["pct_tot_obs"]]))
      )
    )
  }), ProcedureSubject]

  plotDT[, "tn" := factor(tn, levels = c("Other", sort(unique(tn)[-which(unique(tn) == "Other")]))) ]

  plotDT[ProcedureSubject == "Dvp"]


  plotDT[, levels(tn)]

  ggplot(plotDT) +
    geom_bar(aes(ProcedureSubject, pct_tot_obs,fill=tn),stat="identity")


  setkeyv(DT, c("ProcedureSubject"))







  stapleDT <- DT[
    sterms %_|_% ToolName,
    list(
      ProcedureID, ProcedureSubject, ProcedureName,
      PartNumber, SequenceNum,
      ToolName,
      ToolNameKey = "stapler",
      is_sureform = str_detect(ToolName, "sureform"),
      is_reload = str_detect(ToolName, "reload"),
      colr_reload = str_extract(ToolName, paste0(colrs, collapse = "|")),
      size_reload = str_extract(ToolName, "[0-9]+")
    )][!str_detect(ToolName, "gui|\\$motor")]

  stapleDT[,.(
    tot_reloads = sum(is_reload),
    tot_less_reloads = .N - sum(is_reload),
    med_seqnum = list(unique(SequenceNum))
  ), ProcedureID][order(tot_less_reloads)]

  stapleDT[, .N, .(ProcedureSubject, ProcedureName)][order(-N)]

  DT[c("stapl", "sure") %_|_% ToolName, .N, ToolName
     ][, table(stringr::str_remove_all(ToolName, paste0(c(sterms, " *"), collapse = "|")))]

  DT[c("sure", "reload") %_&_% ToolName, .N, ToolName]

  ## pull out reloads into a dedicted column and filter out reduntant rows
  DT[, "tot_reloads" :=
       sum(stringr::str_detect(ToolName, "Reload")),
     ProcedureID]

  cDT <- DT[!stringr::str_detect(ToolName, "Reload|Stapler\\$Motor")]
  return(DT[])
}


getFreqTable <- function(search_terms=NULL,
                         std_toolname=NULL,
                         stat_groupby=NULL,
                         .data = NULL){

  ## if data not given then fetch it
  if(is.null(.data)){
    DT <- getMorphDT()
  }



  ## main computation done in this function \
  ## - SELECTION - all terms provided as arg "search_terms" must be detected
  ##
  MAP <- .data[, ({
    ind <- search_terms %_&_% ToolName

    fq_sel <- as.data.table(table(ToolName[ ind ]), TRUE)
    fq_else <- as.data.table(table(ToolName[ !ind ]), TRUE)

    names(fq_sel) <- c("tn_sel", "N")
    names(fq_else) <- c("tn_not", "N")

    data.table(
      "tn_select" = list(fq_sel),
      "tn_others" = list(fq_else),
      "totObs" = .N,
      "count_ok" = sum(fq_sel$N, fq_else$N) == .N
    )
  }),
  keyby = stat_groupby]

  return(MAP)
}




tnFreq <- getFreqTable()

setkey(DT, ProcedureSubject, ProcedureName)









getToolFreq(
  tn_filter_terms = c("large?", "clip", "applier?"),
  map_val = "large clip applier",
  freq_grp_by = "ProcedureSubject",
  .data = DT
)


getToolFreq(
  tn_terms = c("needle", "drive"),
  map_value = "",
  count_grp = "ProcedureSubject",
  .data = DT
)




##
## NEEDLE DRIVER TOOLS
##
tn <- c("needle", "drive")
expr <- substitute(paste0(X, paste0(tn, collapse = " "), "r"))

map_entry(c("larg", tn), raw_tn, eval(expr, list(X = "large ")))
map_entry(c("mega", tn), raw_tn, eval(expr, list(X = "mega ")))
map_entry(c("curve", tn), raw_tn, eval(expr, list(X = "curved ")))
map_entry(c("wrist", tn), raw_tn, eval(expr, list(X = "wristed ")))
map_entry(c("^su|^ne", tn), raw_tn, eval(expr, list(X = "")))

map_entry(c("suction", "irrigat"), raw_tn, "suction irrigator")
map_entry(c("ves", "seal", "ext"), raw_tn, "vessel sealer extend")



map_entry("harmonic", raw_tn, "harmonic ace")

map_entry("grasper", raw_tn, "")
map_entry("retract", raw_tn, "")
map_entry("scissor", raw_tn, "")

map_entry(c("atrial", "retract"), raw_tn, "atrial retractor")

map_entry("fenestrated grasper", raw_tn, "fenestrated grasper")

map_entry("endoscope", raw_tn, "endoscope")









DT[ stringr::str_detect(mToolName, "staple.+(?<=reload)"),
    mToolName :=
      paste0("stapler reload ", stringr::str_extract(mToolName, "blue|green|white|gray|black"))
    ]

DT[ stringr::str_detect(ToolName, "needle.*(?<=drive)"),

    # stringr::str_trim(paste0(stringr::str_extract(ToolName, "mega|large|"), " needle driver"), "both"))
    unique(ToolName)
    ]

DT[ stringr::str_detect(DT$mToolName, "sureform.+(?<=reload)"),
    mToolName :=
      paste0("sureform reload ", stringr::str_extract(mToolName, "blue|green|white|black"))
    ]

DT[ stringr::str_detect(mToolName, "stapler( [0-9]+|\\$)"), mToolName := "stapler 30/45"]




###
### FORM THE FINALIZED MAPPING
###
toolMapIndex <- DT[, .N, .("tn_morpheus" = ToolName, "tn_standardized" = mToolName)]

# as.data.table(unlist(stringr::str_split(res[, .N, mapped][, mapped], " ")))[, .N, V1][order(V1)]
return(res[])
}




# misc code below ---------------------------------------------------------

mail
-s "test subject"
-F bfatemi07@gmail.com
-c cc-addr
-b
< /dev/null


SYS_CMD <- "ping localhost"
WORK_DIR <- getwd()
ENV_VARS <- c(VAR1 = "hello", VAR2 = "world")
rstudioapi::terminalExecute(command = SYS_CMD,
                            workingDir = WORK_DIR,
                            env = ENV_VARS,
                            show = TRUE)
rstudioapi::terminalBuffer()
rstudioapi::terminalExecute()
rstudioapi::restartSession()
system(paste0("kill -9 ", Sys.getpid()))



# previous ----------------------------------------------------------------


GLOB_DDIR <- "inst/data/mldata/"

GLOB_COLS <- c(
  'CaseNumber','StartTimeGMT',
  'ProcedureID', 'Hospital', "Surgeon",
  'TuID', 'PartNumber', 'SequenceNum',
  'ToolName', 'ToolType',
  'ProcedureCategory', 'ProcedureSubject', 'ProcedureName',
  'DurationMin', 'ToolDurationMin',
  "UsesRemaining", "UsesUsedThisProc", "MaxToolUses"
  # "WorkData1", "WorkData2", "WorkData3"
)


# "%_&_%" <- function(a, b)
#   which(Reduce("&", lapply(a, function(i) stringr::str_detect(b, i))))

"%_&_%" <- function(a, b)
  Reduce("&", lapply(a, function(i) stringr::str_detect(b, i)))

"%_|_%" <- function(a, b)
  which(Reduce("|", lapply(a, function(i) stringr::str_detect(b, i))))


# getToolMap <- function(DT, as_copy = FALSE, tn_only = TRUE){







paths <- list.files(GLOB_DDIR, "*_full.fst", TRUE, TRUE)

DT <- data.table::rbindlist(
  lapply(
    paths, fst::read_fst,
    as.data.table = TRUE,
    columns = GLOB_COLS
  ))

DT[, ToolName := stringr::str_to_lower(ToolName)][]

getFreqTable <- function(search_terms=NULL,
                         std_toolname=NULL,
                         stat_groupby=NULL,
                         .data = NULL){
  .data <- copy(DT)
  search_terms <- c(".")
  std_toolname <- NA_character_
  stat_groupby <- c("ProcedureSubject")

  out <- .data[, ({
    # TN <- .SD[["ToolName"]]
    ind <- search_terms %_&_% ToolName

    fq_sel <- as.data.table(table(ToolName[ ind ]), TRUE)
    fq_else <- as.data.table(table(ToolName[ !ind ]), TRUE)

    names(fq_sel) <- c("tn_sel", "N")
    names(fq_else) <- c("tn_not", "N")

    data.table(
      "tn_select" = list(fq_sel),
      "tn_others" = list(fq_else),
      "totObs" = .N,
      "count_ok" = sum(fq_sel$N, fq_else$N) == .N
    )
  }),
  keyby = stat_groupby]

  return(out)
}


fwrite(getFreqTable()[ProcedureSubject == "Dvp", tn_select][[1]][order(tn_sel)], "data-raw/dvp_tool_freq.csv")

tnFreq <- getFreqTable()

setkey(DT, ProcedureSubject, ProcedureName)









getToolFreq(
  tn_filter_terms = c("large?", "clip", "applier?"),
  map_val = "large clip applier",
  freq_grp_by = "ProcedureSubject",
  .data = DT
)


getToolFreq(
  tn_terms = c("needle", "drive"),
  map_value = "",
  count_grp = "ProcedureSubject",
  .data = DT
)




##
## NEEDLE DRIVER TOOLS
##
tn <- c("needle", "drive")
expr <- substitute(paste0(X, paste0(tn, collapse = " "), "r"))

map_entry(c("larg", tn), raw_tn, eval(expr, list(X = "large ")))
map_entry(c("mega", tn), raw_tn, eval(expr, list(X = "mega ")))
map_entry(c("curve", tn), raw_tn, eval(expr, list(X = "curved ")))
map_entry(c("wrist", tn), raw_tn, eval(expr, list(X = "wristed ")))
map_entry(c("^su|^ne", tn), raw_tn, eval(expr, list(X = "")))

map_entry(c("suction", "irrigat"), raw_tn, "suction irrigator")
map_entry(c("ves", "seal", "ext"), raw_tn, "vessel sealer extend")



map_entry("harmonic", raw_tn, "harmonic ace")

map_entry("grasper", raw_tn, "")
map_entry("retract", raw_tn, "")
map_entry("scissor", raw_tn, "")

map_entry(c("atrial", "retract"), raw_tn, "atrial retractor")

map_entry("fenestrated grasper", raw_tn, "fenestrated grasper")

map_entry("endoscope", raw_tn, "endoscope")









DT[ stringr::str_detect(mToolName, "staple.+(?<=reload)"),
    mToolName :=
      paste0("stapler reload ", stringr::str_extract(mToolName, "blue|green|white|gray|black"))
    ]

DT[ stringr::str_detect(ToolName, "needle.*(?<=drive)"),

    # stringr::str_trim(paste0(stringr::str_extract(ToolName, "mega|large|"), " needle driver"), "both"))
    unique(ToolName)
    ]

DT[ stringr::str_detect(DT$mToolName, "sureform.+(?<=reload)"),
    mToolName :=
      paste0("sureform reload ", stringr::str_extract(mToolName, "blue|green|white|black"))
    ]

DT[ stringr::str_detect(mToolName, "stapler( [0-9]+|\\$)"), mToolName := "stapler 30/45"]




###
### FORM THE FINALIZED MAPPING
###
toolMapIndex <- DT[, .N, .("tn_morpheus" = ToolName, "tn_standardized" = mToolName)]

# as.data.table(unlist(stringr::str_split(res[, .N, mapped][, mapped], " ")))[, .N, V1][order(V1)]
return(res[])
}




# misc code below ---------------------------------------------------------

mail
-s "test subject"
-F bfatemi07@gmail.com
-c cc-addr
-b
< /dev/null


SYS_CMD <- "ping localhost"
WORK_DIR <- getwd()
ENV_VARS <- c(VAR1 = "hello", VAR2 = "world")
rstudioapi::terminalExecute(command = SYS_CMD,
                            workingDir = WORK_DIR,
                            env = ENV_VARS,
                            show = TRUE)
rstudioapi::terminalBuffer()
rstudioapi::terminalExecute()
rstudioapi::restartSession()
system(paste0("kill -9 ", Sys.getpid()))
bfatemi/ninjar documentation built on Sept. 8, 2019, 7:37 p.m.