healthdb.R
In healthdb: Working with Healthcare Databases

## ----include = FALSE----------------------------------------------------------
knitr::opts_chunk$set(
  collapse = TRUE,
  comment = "#>"
)

## ----eval = FALSE-------------------------------------------------------------
#  install.packages("healthdb")

## ----setup, message=FALSE-----------------------------------------------------
library(dplyr)
library(dbplyr)
library(lubridate)
library(glue)
library(purrr)
library(healthdb)

## -----------------------------------------------------------------------------
# make_test_dat() makes either a toy data.frame or database table in memory with known number of rows that satisfy the query we will show later
claim_db <- make_test_dat(vals_kept = c("303", "304", "305", "291", "292", glue("30{30:59}"), glue("29{10:29}"), noise_val = c("999", "111")), type = "database")

# this is a database table
# note that in-memory SQLite database stores dates as numbers
claim_db %>% head()

## -----------------------------------------------------------------------------
hosp_df <- make_test_dat(vals_kept = c(glue("F{10:19}"), glue("F{100:199}"), noise_val = "999"), type = "data.frame")

# this is a local data.frame/tibble
hosp_df %>% head()

# convert Date to numeric to be consistent with claim_db
hosp_df <- hosp_df %>% 
  mutate(dates = julian(dates))

## ----eval=FALSE---------------------------------------------------------------
#  ## not run
#  claim_db %>%
#    # identify the target codes
#    filter(if_any(starts_with("diagx"), ~ str_like(., c("291%", "292%", "303%", "304%", "305%")))) %>%
#    # each clnt has at least 2 records on different dates
#    group_by(clnt_id) %>%
#    # the n_distinct step is mainly for reducing computation in the next step
#    filter(n_distinct(dates) >= 2) %>%
#    # any two dates within one year?
#    filter((max(dates) - min(dates)) <= 365)
#  ## end

## -----------------------------------------------------------------------------
result1 <- claim_db %>%
  identify_row(
vars = starts_with("diagx"),
match = "start",
vals = c(291:292, 303:305)
  )

## -----------------------------------------------------------------------------
result2 <- result1 %>%
  exclude(
excl = identify_row(claim_db, starts_with("diagx"), "in", "111"),
by = "clnt_id"
  )

## -----------------------------------------------------------------------------
result3 <- result2 %>% restrict_n(
  clnt_id = clnt_id,
  n_per_clnt = 2,
  count_by = dates,
  # here we use filter mode to remove records that failed the restriction
  mode = "filter"
)

## -----------------------------------------------------------------------------
result4 <- result3 %>% restrict_date(
  clnt_id = clnt_id,
  date_var = dates,
  n = 2,
  within = 365,
  uid = uid,
  # here we use flag mode to flag records that met the restriction instead of removing those
  mode = "flag"
)

## -----------------------------------------------------------------------------
# Class of result4
class(result4)

# execute query and download the result
result_df <- result4 %>% collect()

# Number of rows in source
nrow(claim_db %>% collect())

# Number of rows in the current result
nrow(result_df)

## -----------------------------------------------------------------------------
# make two look up tables
age_tab <- data.frame(
  clnt_id = 1:50,
  age = sample(1:90, 50),
  sex = sample(c("F", "M"), 50, replace = TRUE)
)
address_tab <- data.frame(
  clnt_id = rep(1:50, 5), year = rep(2016:2020, each = 50),
  area_code = sample(0:200, 50, replace = TRUE)
)

# get year from dates for matching

result_df <- result_df %>% mutate(year = lubridate::year(as.Date(dates, origin = "1970-01-01")))

# note that keys must be present in all tables
result_df %>%
  fetch_var(
keys = c(clnt_id, year),
linkage = list(
  # the formula means from_table ~ get_variable
  # |clnt_id means matching on clnt_id only
  age_tab ~ c(age, sex) | clnt_id,
  address_tab ~ area_code
)
  ) %>%
  select(uid, clnt_id, dates, age, sex, area_code) %>% 
  head()

## -----------------------------------------------------------------------------
# build the full definition of SUD
sud_def <- build_def(
  # name of definition
  def_lab = "SUD",
  # place holder names for sources
  src_labs = c("claim", "hosp"),
  def_fn = define_case, # you could alter it and supply your own function
  # below are argumets of define_case
  fn_args = list(
    # if length = 1, the single element will be use for every source
    vars = list(starts_with("diagx")),
    match = "start", # match ICD starts with vals
    vals = list(c(291:292, 303:305), glue("F{10:19}")),
    clnt_id = clnt_id,
    n_per_clnt = c(2, 1),
    date_var = dates,
    within = c(365, NULL),
    uid = uid,
    mode = "flag"
  )
)

sud_def

## -----------------------------------------------------------------------------
sud_def$fn_call

## -----------------------------------------------------------------------------
# execute the definition
result_list <- sud_def %>%
  execute_def(with_data = list(
    claim = claim_db,
    hosp = hosp_df
  ))

## -----------------------------------------------------------------------------
# view the results
purrr::walk(result_list, ~ head(.) %>% print())

## -----------------------------------------------------------------------------
bind_source(result_list,
  # output_name = c(names in the list elements)
  src = "src",
  uid = "uid",
  clnt_id = "clnt_id",
  flag = c("flag_restrict_date", NA),
  # force_proceed is needed to collect remote tables to local memory
  force_proceed = TRUE
)

## -----------------------------------------------------------------------------
pool_case(result_list,
          def = sud_def,
          # your could skip summary with output_lvl = "raw"
          output_lvl = "clnt",
          # include records only from sources having valid records, see function documentation for more detail and other options
          include_src = "has_valid",
          force_proceed = TRUE)