# This code QAs the stage mcaid CCW table
#
# It is designed to be run as part of the master Medicaid script:
# https://github.com/PHSKC-APDE/claims_data/blob/main/claims_db/db_loader/mcaid/master_mcaid_analytic.R
#
# 2019-08-12
# Alastair Matheson, adapted from Eli Kern's SQL script
### Function elements
# conn = database connection
# server = whether we are working in HHSAW or PHClaims
# config = the YAML config file. Can be either an object already loaded into
# R or a URL that should be used
# get_config = if a URL is supplied, set this to T so the YAML file is loaded
# skip_review = if you do not want to manually review comparison to APCD estimates
# (set to T because it holds up automated monthly runs)
qa_stage_mcaid_claim_ccw_f <- function(conn = NULL,
server = c("hhsaw", "phclaims"),
config = NULL,
get_config = F,
skip_review = T) {
# Set up variables specific to the server
server <- match.arg(server)
if (get_config == T){
if (stringr::str_detect(config, "^http")) {
config <- yaml::yaml.load(getURL(config))
} else{
stop("A URL must be specified in config if using get_config = T")
}
}
to_schema <- config[[server]][["to_schema"]]
to_table <- config[[server]][["to_table"]]
final_schema <- config[[server]][["final_schema"]]
final_table <- ifelse(is.null(config[[server]][["final_table"]]), '',
config[[server]][["final_table"]])
final_table_pre <- ifelse(is.null(config[[server]][["final_table_pre"]]), '',
config[[server]][["final_table_pre"]])
qa_schema <- config[[server]][["qa_schema"]]
qa_table_pre <- ifelse(is.null(config[[server]][["qa_table_pre"]]), '',
config[[server]][["qa_table_pre"]])
message("Running QA on ", to_schema, ".", to_table)
#### PULL OUT VALUES NEEDED MULTIPLE TIMES ####
last_run <- as.POSIXct(odbc::dbGetQuery(
conn, glue::glue_sql("SELECT MAX (last_run) FROM {`to_schema`}.{`to_table`}",
.con = conn))[[1]])
#### SET UP EMPTY DATA FRAME TO TRACK RESULTS ####
ccw_qa <- data.frame(etl_batch_id = integer(),
last_run = as.Date(character()),
table_name = character(),
qa_item = character(),
qa_result = character(),
qa_date = as.Date(character()),
note = character())
#### STEP 1: TABLE-WIDE CHECKS ####
#### COUNT # CONDITIONS RUN ####
distinct_cond <- as.integer(dbGetQuery(
conn,
glue::glue_sql("SELECT count(distinct ccw_code) as cond_count FROM {`to_schema`}.{`to_table`}",
.con = conn)))
# See how many are in the final table
distinct_cond_final <- as.integer(dbGetQuery(
conn,
glue::glue_sql("SELECT count(distinct ccw_code) as cond_count FROM {`final_schema`}.{`final_table`}",
.con = conn)))
if (distinct_cond >= distinct_cond_final) {
ccw_qa <- rbind(ccw_qa,
data.frame(etl_batch_id = NA_integer_,
last_run = last_run,
table_name = paste0(to_schema, ".", to_table),
qa_item = "# distinct conditions",
qa_result = "PASS",
qa_date = Sys.time(),
note = glue("There were {distinct_cond} conditions analyzed")))
} else {
ccw_qa <- rbind(ccw_qa,
data.frame(etl_batch_id = NA_integer_,
last_run = last_run,
table_name = paste0(to_schema, ".", to_table),
qa_item = "# distinct conditions",
qa_result = "FAIL",
qa_date = Sys.time(),
note = glue("There were {distinct_cond} conditions analyzed, but there are ",
"{distinct_cond_final} conditions in the final table")))
}
#### COUNT NUMBER + PERCENT OF DISTINCT PEOPLE BY CONDITION ####
distinct_id_ccw <- dbGetQuery(
conn,
glue::glue_sql("SELECT ccw_code, ccw_desc, count(distinct id_mcaid) as id_dcount
FROM {`to_schema`}.{`to_table`}
WHERE year(from_date) <= 2017 and year(to_date) >= 2017
GROUP BY ccw_code, ccw_desc
ORDER BY ccw_code",
.con = conn))
distinct_id_pop <- as.integer(dbGetQuery(
conn,
glue::glue_sql("SELECT count(distinct id_mcaid) as id_dcount
FROM {`final_schema`}.{DBI::SQL(final_table_pre)}mcaid_elig_timevar
WHERE year(from_date) <= 2017 and year(to_date) >= 2017",
.con = conn)))
distinct_id_chk <- distinct_id_ccw %>%
mutate(prop = id_dcount / distinct_id_pop * 100)
# Compare to APCD-derived data
apcd_prop <- data.frame(
ccw_desc = c("ccw_anemia", "ccw_asthma", "ccw_cancer_breast", "ccw_copd",
"ccw_depression", "ccw_diabetes", "ccw_hypertension",
"ccw_hypothyroid", "ccw_mi"),
apcd_2017_all = c(8.6, 5.3, 0.5, 2.6, 14.5, 6.5, 12.4, 2.7, 0.3),
apcd_2017_7mth = c(6.9, 5.3, 0.3, 1.5, 12.5, 3.8, 7.3, 1.6, 0.2))
distinct_id_chk <- left_join(distinct_id_chk, apcd_prop, by = "ccw_desc")
distinct_id_chk <- distinct_id_chk %>%
mutate(abs_diff = prop - apcd_2017_all,
per_diff = abs_diff / prop * 100)
# Show results for review
print(distinct_id_chk %>% filter(!is.na(abs_diff)))
if (skip_review == F) {
prop_chk <- askYesNo(msg = glue("Are the deviations from the APCD estimates ",
"within acceptable parameters? Ideally a small ",
"percentage difference (<10%) but for small estimates, ",
"a small absolute difference is ok (<0.5)."))
if (is.na(prop_chk)) {
stop("QA process aborted at proportion checking step")
} else if (prop_chk == T) {
ccw_qa <- rbind(ccw_qa,
data.frame(etl_batch_id = NA_integer_,
last_run = last_run,
table_name = paste0(to_schema, ".", to_table),
qa_item = "Overall proportion with each condition (compared to APCD)",
qa_result = "PASS",
qa_date = Sys.time(),
note = glue("Most conditions are close to APCD-derived estimates")))
} else if (prop_chk == F) {
ccw_qa <- rbind(ccw_qa,
data.frame(etl_batch_id = NA_integer_,
last_run = last_run,
table_name = paste0(to_schema, ".", to_table),
qa_item = "Overall proportion with each condition (compared to APCD)",
qa_result = "FAIL",
qa_date = Sys.time(),
note = glue("One or more conditions deviate from expected proportions")))
}
}
#### CHECK AGE DISTRIBUTION BY CONDITION FOR A GIVEN YEAR ####
age_dist_cond_f <- function(year = 2017) {
if (lubridate::leap_year(year)) {
pt <- 366
} else {
pt <- 365
}
sql_call <- glue_sql(
"SELECT c.ccw_code, c.ccw_desc, c.age_grp7, count(distinct id_mcaid) as id_dcount
FROM (
SELECT a.id_mcaid, a.ccw_code, a.ccw_desc,
case
when b.age >= 0 and b.age < 5 then '00-04'
when b.age >= 5 and b.age < 12 then '05-11'
when b.age >= 12 and b.age < 18 then '12-17'
when b.age >= 18 and b.age < 25 then '18-24'
when b.age >= 25 and b.age < 45 then '25-44'
when b.age >= 45 and b.age < 65 then '45-64'
when b.age >= 65 then '65 and over'
end as age_grp7
FROM (
SELECT distinct id_mcaid, ccw_code, ccw_desc
FROM {`to_schema`}.{`to_table`}
where year(from_date) <= {year} and year(to_date) >= {year}
) as a
left join (
SELECT id_mcaid,
case
when datediff(day, dob, '{year}-12-31') >= 0 then floor((datediff(day, dob, '{year}-12-31') + 1) / {pt})
when datediff(day, dob, '{year}-12-31') < 0 then NULL
end as age
FROM {`final_schema`}.{DBI::SQL(final_table_pre)}mcaid_elig_demo
) as b
on a.id_mcaid = b.id_mcaid
) as c
where c.age_grp7 is not null
group by c.ccw_code, c.ccw_desc, c.age_grp7
order by c.ccw_code, c.age_grp7",
.con = conn
)
output <- dbGetQuery(conn, sql_call)
return(output)
}
age_dist_pop_f <- function(year = 2017) {
if (lubridate::leap_year(year)) {
pt <- 366
} else {
pt <- 365
}
sql_call <- glue_sql(
"SELECT age_grp7, count(distinct id_mcaid) as pop
FROM (
SELECT a.id_mcaid,
case
when b.age >= 0 and b.age < 5 then '00-04'
when b.age >= 5 and b.age < 12 then '05-11'
when b.age >= 12 and b.age < 18 then '12-17'
when b.age >= 18 and b.age < 25 then '18-24'
when b.age >= 25 and b.age < 45 then '25-44'
when b.age >= 45 and b.age < 65 then '45-64'
when b.age >= 65 then '65 and over'
end as age_grp7
FROM (
SELECT id_mcaid
FROM {`final_schema`}.{DBI::SQL(final_table_pre)}mcaid_elig_timevar
where year(from_date) <= {year} and year(to_date) >= {year}
) as a
left join (
SELECT id_mcaid,
case
when datediff(day, dob, '{year}-12-31') >= 0 then floor((datediff(day, dob, '{year}-12-31') + 1) / {pt})
when datediff(day, dob, '{year}-12-31') < 0 then NULL
end as age
FROM {`final_schema`}.{DBI::SQL(final_table_pre)}mcaid_elig_demo
) as b
on a.id_mcaid = b.id_mcaid
) as c
where c.age_grp7 is not null
group by c.age_grp7
order by c.age_grp7",
.con = conn
)
output <- dbGetQuery(conn, sql_call)
return(output)
}
age_dist_cond_chk <- age_dist_cond_f(year = 2018)
age_dist_pop_chk <- age_dist_pop_f(year = 2018)
age_dist_cond_chk <- left_join(age_dist_cond_chk, age_dist_pop_chk,
by = "age_grp7") %>%
mutate(prev = id_dcount / pop * 100)
# Plot results for visual inspection
win.graph(width = 16, height = 10)
ggplot(data = age_dist_cond_chk,
aes(x = age_grp7, y = prev, group = ccw_desc)) +
geom_line() +
geom_point() +
facet_wrap( ~ ccw_desc, ncol = 4, scales = "free")
if (skip_review == F) {
# Seek user input on whether or not patterns match what is expected
# NB. It would be nice to quantify this but human inspection will do for now
age_dist_chk <- askYesNo(
msg = glue("Do the age distributions look to be what is expected ",
"(generally increasing with age but drop offs after 65 not unusual)?")
)
if (is.na(age_dist_chk)) {
stop("QA process aborted at age distribution step")
} else if (age_dist_chk == T) {
ccw_qa <- rbind(ccw_qa,
data.frame(etl_batch_id = NA_integer_,
last_run = last_run,
table_name = paste0(to_schema, ".", to_table),
qa_item = "Patterns by age group",
qa_result = "PASS",
qa_date = Sys.time(),
note = glue("Most conditions increased with age as expected")))
} else if (age_dist_chk == F) {
ccw_qa <- rbind(ccw_qa,
data.frame(etl_batch_id = NA_integer_,
last_run = last_run,
table_name = paste0(to_schema, ".", to_table),
qa_item = "Patterns by age group",
qa_result = "FAIL",
qa_date = Sys.time(),
note = glue("One or more conditions had unusual age patterns")))
}
}
#### CHECK DISTRIBUTION BY CONDITION BY YEAR ####
year_dist_cond <- map_df(seq(2014, 2021), function(x) {
yr_start <- paste0(x, "-01-01")
yr_end <- paste0(x, "-12-31")
sql_call <- glue_sql(
"SELECT a.ccw_code, a.ccw_desc, count(distinct a.id_mcaid) as id_dcount
FROM (
SELECT distinct id_mcaid, ccw_code, ccw_desc
FROM {`to_schema`}.{`to_table`}
WHERE from_date <= {yr_end} and to_date >= {yr_start}
) as a
group by a.ccw_code, a.ccw_desc
order by a.ccw_code",
.con = conn
)
output <- dbGetQuery(conn, sql_call) %>%
mutate(year = x)
output
})
ggplot(data = year_dist_cond,
aes(x = year, y = id_dcount, group = ccw_desc)) +
geom_line() +
geom_point() +
facet_wrap( ~ ccw_desc, ncol = 4, scales = "free")
#### STEP 2: VALIDATE STATUS OF ONE PERSON PER CONDITION WITH 2+ TIME PERIODS ####
### Only run this when checking manually because end dates in the csv file get
# out of date.
# # Bring in csv file with specific individuals
# ids_csv <- read.csv(file = "//dchs-shares01/dchsdata/DCHSPHClaimsData/Data/QA_specific/stage.mcaid_claim_ccw_qa_ind.csv",
# stringsAsFactors = F)
#
#
# # Restrict to relevant columns
# ids_csv <- ids_csv %>% select(id_mcaid, ccw_desc, from_date, to_date) %>%
# arrange(id_mcaid, from_date)
#
# # Pull relevant people from ccw table
# # Note, need to use glue instead of glue_sql to get quotes to work in collapse
# ids_ccw <- dbGetQuery(
# conn,
# glue_sql("SELECT id_mcaid, ccw_desc, from_date, to_date
# FROM {`to_schema`}.{`to_table`}
# WHERE {DBI::SQL(
# glue_collapse(
# glue_data_sql(
# ids_csv,
# '(id_mcaid = {id_mcaid} and ccw_desc = {ccw_desc})', .con = conn),
# sep = ' OR '))}
# ORDER BY id_mcaid, from_date",
# .con = conn))
#
#
# if (isTRUE(all_equal(ids_csv, ids_ccw))) {
# ccw_qa <- rbind(ccw_qa,
# data.frame(etl_batch_id = NA_integer_,
# last_run = last_run,
# table_name = paste0(to_schema, ".", to_table),
# qa_item = "Specific individuals",
# qa_result = "PASS",
# qa_date = Sys.time(),
# note = glue("From/to dates matched what was expected")))
# } else {
# ccw_qa <- rbind(ccw_qa,
# data.frame(etl_batch_id = NA_integer_,
# last_run = last_run,
# table_name = paste0(to_schema, ".", to_table),
# qa_item = "Specific individuals",
# qa_result = "FAIL",
# qa_date = Sys.time(),
# note = glue("From/to dates DID NOT match what was expected")))
# }
#### STEP 3: LOAD QA RESULTS TO SQL AND RETURN RESULT ####
DBI::dbExecute(
conn,
glue::glue_sql("INSERT INTO {`qa_schema`}.{DBI::SQL(qa_table_pre)}qa_mcaid
(etl_batch_id, last_run, table_name, qa_item, qa_result, qa_date, note)
VALUES
{DBI::SQL(glue_collapse(
glue_data_sql(ccw_qa,
'({etl_batch_id}, {format(last_run, usetz = FALSE)}, {table_name}, {qa_item},
{qa_result}, {format(qa_date, usetz = FALSE)}, {note})',
.con = conn),
sep = ', ')
)};",
.con = conn))
if (max(str_detect(ccw_qa$qa_result, "FAIL")) == 0) {
ccw_qa_fail <- 0L
} else {
ccw_qa_fail <- 1L
}
message(glue::glue("QA of stage.mcaid_claim_ccw complete. Result: {min(ccw_qa$qa_result)}"))
return(ccw_qa_fail)
}
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.