#' @title Medicare Stage Table Creation
#'
#' @description Code for creating stage enrollment tables for Medicare FFS data
#'
#' @details Creates stg tables for medicare enrollment, names, and ssns from
#' other tables as part of the Medicare extraction process.
#'
# ---- Load packages ----
pacman::p_load(DBI, dplyr, ggplot2, glue, lubridate, odbc, stringr)
devtools::source_url("https://raw.githubusercontent.com/PHSKC-APDE/apde/main/R/create_db_connection.R")
# ---- STEP 1: ----
# Code to create state_mcare_bene_enrollment table from MBSF tables
# Columns selected based on review of Danny's original 2019 scripts used to
# create elig_timevar and elig_demo tables
# Reference: https://kc1.sharepoint.com/:x:/r/teams/DPH-KCCross-SectorData/Shared%20Documents/General/References/Medicare/ResDAC%20file%20layouts/apde_mbsf_columns_needed_2024.xlsx?d=w4d88d662b43a4097811423fc7813313b&csf=1&web=1&e=1CBks8
db_hhsaw <- create_db_connection("hhsaw", interactive = F, prod = T)
bene_enrollment_sql <- glue::glue_sql(
"drop table if exists claims.stage_mcare_bene_enrollment
select
[etl_batch_id]
,cast(trim(bene_id) as varchar(255)) collate SQL_Latin1_General_CP1_CS_AS as bene_id
,[bene_enrollmt_ref_yr]
,[zip_cd]
,[bene_birth_dt]
,[bene_death_dt]
,[sex_ident_cd]
,[bene_race_cd]
,[rti_race_cd]
,[mdcr_entlmt_buyin_ind_01]
,[mdcr_entlmt_buyin_ind_02]
,[mdcr_entlmt_buyin_ind_03]
,[mdcr_entlmt_buyin_ind_04]
,[mdcr_entlmt_buyin_ind_05]
,[mdcr_entlmt_buyin_ind_06]
,[mdcr_entlmt_buyin_ind_07]
,[mdcr_entlmt_buyin_ind_08]
,[mdcr_entlmt_buyin_ind_09]
,[mdcr_entlmt_buyin_ind_10]
,[mdcr_entlmt_buyin_ind_11]
,[mdcr_entlmt_buyin_ind_12]
,[hmo_ind_01]
,[hmo_ind_02]
,[hmo_ind_03]
,[hmo_ind_04]
,[hmo_ind_05]
,[hmo_ind_06]
,[hmo_ind_07]
,[hmo_ind_08]
,[hmo_ind_09]
,[hmo_ind_10]
,[hmo_ind_11]
,[hmo_ind_12]
,[ptd_cntrct_id_01]
,[ptd_cntrct_id_02]
,[ptd_cntrct_id_03]
,[ptd_cntrct_id_04]
,[ptd_cntrct_id_05]
,[ptd_cntrct_id_06]
,[ptd_cntrct_id_07]
,[ptd_cntrct_id_08]
,[ptd_cntrct_id_09]
,[ptd_cntrct_id_10]
,[ptd_cntrct_id_11]
,[ptd_cntrct_id_12]
,[dual_stus_cd_01]
,[dual_stus_cd_02]
,[dual_stus_cd_03]
,[dual_stus_cd_04]
,[dual_stus_cd_05]
,[dual_stus_cd_06]
,[dual_stus_cd_07]
,[dual_stus_cd_08]
,[dual_stus_cd_09]
,[dual_stus_cd_10]
,[dual_stus_cd_11]
,[dual_stus_cd_12]
into claims.stage_mcare_bene_enrollment
from [claims].[stage_mcare_mbsf_abcd_summary]
-- Union to 2014 data
-- Join Part ABC and Part D data for 2014
-- QA below confirms left join is appropriate because no one has Part D but not Part AB
-- Union command will drop any duplicate rows within years
UNION SELECT
a.[etl_batch_id]
,cast(trim(a.bene_id) as varchar(255)) collate SQL_Latin1_General_CP1_CS_AS as bene_id
,a.[bene_enrollmt_ref_yr]
,a.[bene_zip_cd]
,a.[bene_birth_dt]
,a.[bene_death_dt]
,a.[bene_sex_ident_cd]
,a.[bene_race_cd]
,a.[rti_race_cd]
,a.[bene_mdcr_entlmt_buyin_ind_01]
,a.[bene_mdcr_entlmt_buyin_ind_02]
,a.[bene_mdcr_entlmt_buyin_ind_03]
,a.[bene_mdcr_entlmt_buyin_ind_04]
,a.[bene_mdcr_entlmt_buyin_ind_05]
,a.[bene_mdcr_entlmt_buyin_ind_06]
,a.[bene_mdcr_entlmt_buyin_ind_07]
,a.[bene_mdcr_entlmt_buyin_ind_08]
,a.[bene_mdcr_entlmt_buyin_ind_09]
,a.[bene_mdcr_entlmt_buyin_ind_10]
,a.[bene_mdcr_entlmt_buyin_ind_11]
,a.[bene_mdcr_entlmt_buyin_ind_12]
,a.[bene_hmo_ind_01]
,a.[bene_hmo_ind_02]
,a.[bene_hmo_ind_03]
,a.[bene_hmo_ind_04]
,a.[bene_hmo_ind_05]
,a.[bene_hmo_ind_06]
,a.[bene_hmo_ind_07]
,a.[bene_hmo_ind_08]
,a.[bene_hmo_ind_09]
,a.[bene_hmo_ind_10]
,a.[bene_hmo_ind_11]
,a.[bene_hmo_ind_12]
,b.[ptd_cntrct_id_01]
,b.[ptd_cntrct_id_02]
,b.[ptd_cntrct_id_03]
,b.[ptd_cntrct_id_04]
,b.[ptd_cntrct_id_05]
,b.[ptd_cntrct_id_06]
,b.[ptd_cntrct_id_07]
,b.[ptd_cntrct_id_08]
,b.[ptd_cntrct_id_09]
,b.[ptd_cntrct_id_10]
,b.[ptd_cntrct_id_11]
,b.[ptd_cntrct_id_12]
,b.[dual_stus_cd_01]
,b.[dual_stus_cd_02]
,b.[dual_stus_cd_03]
,b.[dual_stus_cd_04]
,b.[dual_stus_cd_05]
,b.[dual_stus_cd_06]
,b.[dual_stus_cd_07]
,b.[dual_stus_cd_08]
,b.[dual_stus_cd_09]
,b.[dual_stus_cd_10]
,b.[dual_stus_cd_11]
,b.[dual_stus_cd_12]
from [claims].[stage_mcare_mbsf_ab_summary] as a
left join [claims].[stage_mcare_mbsf_d_cmpnts] as b
on a.bene_id = b.bene_id",
.con = db_hhsaw)
odbc::dbSendQuery(conn = db_hhsaw, bene_enrollment_sql)
# Confirm that no one has Part D coverage but not Part AB coverage in 2014
# Confirm no bene_id in mbsf_d table that doesn't exist in mbsf_ab table
# PASS condition: Expect 0
bene_check_sql <- glue::glue_sql("
select count(a.bene_id) as row_count
from [claims].[stage_mcare_mbsf_d_cmpnts] as a
left join [claims].[stage_mcare_mbsf_ab_summary] as b
on a.bene_id = b.bene_id
where b.bene_id is null",
.con = db_hhsaw)
odbc::dbGetQuery(conn = db_hhsaw, bene_check_sql)
# ---- STEP 2: ----
# Code to create NAMES and SSN tables for person linkage
# Note that HIC (alternate unique ID table is not used for linkage and thus
# not loaded here)
bene_names_sql <- glue::glue_sql("
drop table if exists claims.stage_mcare_bene_names;
select distinct
cast(trim(bene_id) as varchar(255)) collate SQL_Latin1_General_CP1_CS_AS as bene_id,
bene_srnm_name,
bene_gvn_name,
bene_mdl_name
into claims.stage_mcare_bene_names
from [claims].[stage_mcare_edb_user_view]",
.con = db_hhsaw)
odbc::dbSendQuery(conn = db_hhsaw, bene_names_sql)
bene_ssn_sql <- glue::glue_sql("
drop table if exists claims.stage_mcare_bene_ssn;
select distinct
cast(trim(bene_id) as varchar(255)) collate SQL_Latin1_General_CP1_CS_AS as bene_id,
ssn
into claims.stage_mcare_bene_ssn
from [claims].[stage_mcare_bene_ssn_xwalk]",
.con = db_hhsaw)
odbc::dbSendQuery(conn = db_hhsaw, bene_ssn_sql)
# ---- QA Step 2: ----
# Ensure case-sensitivity is set for bene_id variable (expect CS trimmed varchar count > CI trimmed varchar)
bene_names_casetest_sql <- glue::glue_sql("
with temp1 as (select distinct
cast(trim(bene_id) as varchar(255)) collate SQL_Latin1_General_CP1_CS_AS as bene_id_cs,
cast(trim(bene_id) as varchar(255)) collate SQL_Latin1_General_CP1_CI_AS as bene_id_ci,
lower(trim(bene_id)) as bene_id_lowercase
from claims.stage_mcare_bene_names
)
select 'case-sensitive trimmed varchar' as column_type, count(distinct bene_id_cs) as bene_id_dcount from temp1
union select 'case-insensitive trimmed varchar' as column_type, count(distinct bene_id_ci) as bene_id_dcount from temp1
union select 'lowercase trimmed varchar' as column_type,count(distinct bene_id_lowercase) as bene_id_dcount from temp1
order by bene_id_dcount desc;",
.con = db_hhsaw)
bene_names_casetest <- odbc::dbGetQuery(conn = db_hhsaw, bene_names_casetest_sql)
print(bene_names_casetest)
bene_ssn_casetest_sql <- glue::glue_sql("
with temp1 as (select distinct
cast(trim(bene_id) as varchar(255)) collate SQL_Latin1_General_CP1_CS_AS as bene_id_cs,
cast(trim(bene_id) as varchar(255)) collate SQL_Latin1_General_CP1_CI_AS as bene_id_ci,
lower(trim(bene_id)) as bene_id_lowercase
from claims.stage_mcare_bene_ssn
)
select 'case-sensitive trimmed varchar' as column_type, count(distinct bene_id_cs) as bene_id_dcount from temp1
union select 'case-insensitive trimmed varchar' as column_type, count(distinct bene_id_ci) as bene_id_dcount from temp1
union select 'lowercase trimmed varchar' as column_type,count(distinct bene_id_lowercase) as bene_id_dcount from temp1
order by bene_id_dcount desc;",
.con = db_hhsaw)
bene_ssn_casetest <- odbc::dbGetQuery(conn = db_hhsaw, bene_ssn_casetest_sql)
print(bene_ssn_casetest)
# Ensure there are very few bene_id values in the [claims].[stage_mcare_bene_enrollment]
# table that do not match to the [claims].[stage_mcare_bene_names] or [claims].[stage_mcare_bene_ssn] tables
bene_id_test_names_sql <- glue::glue_sql("
SELECT a.bene_id
FROM claims.stage_mcare_bene_enrollment a
LEFT JOIN claims.stage_mcare_bene_names b
ON a.bene_id = b.bene_id
WHERE b.bene_id IS NULL
",
.con = db_hhsaw)
bene_id_test_names <- odbc::dbGetQuery(conn = db_hhsaw, bene_id_test_names_sql)
nrow(bene_id_test_names)
bene_id_test_ssn_sql <- glue::glue_sql("
SELECT a.bene_id
FROM claims.stage_mcare_bene_enrollment a
LEFT JOIN claims.stage_mcare_bene_ssn b
ON a.bene_id = b.bene_id
WHERE b.bene_id IS NULL
",
.con = db_hhsaw)
bene_id_test_ssn <- odbc::dbGetQuery(conn = db_hhsaw, bene_id_test_ssn_sql)
nrow(bene_id_test_ssn)
bene_id_test_both_sql <- glue::glue_sql("
SELECT a.bene_id
FROM claims.stage_mcare_bene_enrollment a
LEFT JOIN (
SELECT DISTINCT bene_id FROM claims.stage_mcare_bene_names
UNION
SELECT DISTINCT bene_id FROM claims.stage_mcare_bene_ssn
) b ON a.bene_id = b.bene_id
WHERE b.bene_id IS NULL
",
.con = db_hhsaw)
bene_id_test_both <- odbc::dbGetQuery(conn = db_hhsaw, bene_id_test_both_sql)
nrow(bene_id_test_both)
# Ensure that the date of birth variable in the [claims].[stage_mcare_bene_enrollment]
# is no longer garbled for 2020 and 2021 data (this could just be a one-time check,
# although it would be prudent for us to have QA here that aims to find issues
# with any date variables. For example, you would never want to see that dates
# of birth are mostly clustered in a given year/month for a given enrollment_year,
# as this would indicate that something went wrong with how these dates were formatted/parsed along the way.
date_check_sql <- glue::glue_sql("
SELECT bene_enrollmt_ref_yr, bene_id, bene_birth_dt, bene_death_dt
FROM claims.stage_mcare_bene_enrollment
",
.con = db_hhsaw)
date_check <- odbc::dbGetQuery(conn = db_hhsaw, date_check_sql)
# Assert dates are length 10
invalid_date_length <- date_check[str_count(date_check$bene_birth_dt) != 10,]
nrow(invalid_date_length)
# convert birth dates that are length 10 to lubridates
date_distribution <- date_check[str_count(date_check$bene_birth_dt) == 10,]
date_distribution$bene_birth_dt <- as_date(date_distribution$bene_birth_dt)
# Check min and max years
min(date_distribution$year)
max(date_distribution$year)
# bar chart of birthdays by year
ggplot(date_distribution, aes(x=bene_birth_dt))+
geom_histogram(binwidth=30, colour="purple") +
scale_x_date(labels = date_format("%Y")) +
ylab("Frequency") + xlab("Year") +
theme_bw()
# Facet year-month histogram
date_distribution <- date_distribution %>%
# Get the year value to use it for the facetted plot
mutate(year = year(bene_birth_dt),
# Get the month-day dates and set all dates with a dummy year (2021 in this case)
# This will get all your dates in a common x axis scale
month_day = as_date(paste(3000,month(bene_birth_dt),day(bene_birth_dt), sep = "-")))
ggplot(date_distribution, aes(x = month_day)) +
geom_histogram() +
labs(x = "Month",
y = "Count",
title = "Birthdays per Month") +
scale_x_date(labels = scales::date_format("%b"),
breaks = "1 month") +
facet_wrap(~year) +
theme(axis.text.x = element_text(angle = 90,
vjust = 0.5,
hjust = 1))
# Zoom in to main years of interest
ggplot(date_distribution[(date_distribution$year > 1924) & (date_distribution$year < 1970),], aes(x = month_day)) +
geom_histogram() +
labs(x = "Month",
y = "Count",
title = "Birthdays per Month") +
scale_x_date(labels = scales::date_format("%b"),
breaks = "1 month") +
facet_wrap(~year) +
theme(axis.text.x = element_text(angle = 90,
vjust = 0.5,
hjust = 1))
# ---- STEP 3: Header-level medical claim concepts ----
# Code to create stage_mcare_claim_header_prep table
# Columns selected based on concepts/columns needed to create analytic-ready claim tables
# Reference: https://kc1.sharepoint.com/:x:/r/teams/DPH-KCCross-SectorData/Shared%20Documents/General/References/Medicare/ResDAC%20file%20layouts/apde_claims_columns_needed_2024.xlsx?d=w157ab8c163714967847462a7271df79a&csf=1&web=1&e=uweatj
# Exclude denied claims per ResDAC guidance
# Exclude claims among people with no enrollment data
# Trim white space for id_mcare, claim_header_id
db_hhsaw <- create_db_connection("hhsaw", interactive = F, prod = T)
claim_header_prep_sql <- glue::glue_sql(
"drop table if exists claims.stage_mcare_claim_header_prep;
--BLANK--
",
.con = db_hhsaw)
odbc::dbSendQuery(conn = db_hhsaw, claim_header_prep_sql)
# ---- Extra: ----
# Demonstrate need for case sensitivity
tables_sql <- glue::glue_sql("
--View collations settings of tables used in this demonstration
--Collation name must be set as SQL_Latin1_General_CP1_CS_AS for case sensitivity to be respected
SELECT t.name, c.name, c.collation_name
FROM SYS.COLUMNS c
JOIN SYS.TABLES t ON t.object_id = c.object_id
WHERE t.name in ('stage_mcare_edb_user_view', 'stage_mcare_bcarrier_claims')
and c.name in ('bene_id', 'clm_id')
order by t.name desc, c.name;",
.con = db_hhsaw)
test1 <- odbc::dbGetQuery(conn = db_hhsaw, tables_sql)
bene_ssn_sql <- glue::glue_sql("
--Demonstrate case-sensitive nature of bene_id variable
with temp1 as (select distinct
cast(trim(bene_id) as varchar(255)) collate SQL_Latin1_General_CP1_CS_AS as bene_id_cs,
cast(trim(bene_id) as varchar(255)) collate SQL_Latin1_General_CP1_CI_AS as bene_id_ci,
lower(trim(bene_id)) as bene_id_lowercase
from claims.stage_mcare_edb_user_view
)
select 'case-sensitive trimmed varchar' as column_type, count(distinct bene_id_cs) as bene_id_dcount from temp1
union select 'case-insensitive trimmed varchar' as column_type, count(distinct bene_id_ci) as bene_id_dcount from temp1
union select 'lowercase trimmed varchar' as column_type,count(distinct bene_id_lowercase) as bene_id_dcount from temp1
order by bene_id_dcount desc;",
.con = db_hhsaw)
test2 <- odbc::dbGetQuery(conn = db_hhsaw, bene_ssn_sql)
clm_id_sql <- glue::glue_sql("
--Demonstrate case-sensitive nature of clm_id variable
with temp2 as (select distinct
cast(trim(clm_id) as varchar(255)) collate SQL_Latin1_General_CP1_CS_AS as clm_id_cs,
cast(trim(clm_id) as varchar(255)) collate SQL_Latin1_General_CP1_CI_AS as clm_id_ci,
lower(trim(clm_id)) as clm_id_lowercase
from claims.stage_mcare_bcarrier_claims
)
select 'case-sensitive trimmed varchar' as column_type, count(distinct clm_id_cs) as clm_id_dcount from temp2
union select 'case-insensitive trimmed varchar' as column_type, count(distinct clm_id_ci) as clm_id_dcount from temp2
union select 'lowercase trimmed varchar' as column_type,count(distinct clm_id_lowercase) as clm_id_dcount from temp2
order by clm_id_dcount desc;",
.con = db_hhsaw)
test3 <- odbc::dbGetQuery(conn = db_hhsaw, clm_id_sql)
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.