library(tidyr)
library(dplyr)
knitr::opts_chunk$set(echo = FALSE,
                message = FALSE,
                warning = FALSE,
                error= TRUE,
                time_it = TRUE)
startTime <- Sys.time()

all_times <- list()
knitr::knit_hooks$set(time_it = local({
  now <- NULL
  function(before, options) {
    if (before) {
      now <<- Sys.time()
    } else {
      res <- difftime(Sys.time(), now)

      all_times[[options$label]] <<- c(options$label, format(round(res, 3)))

      #Only prints times greater than 5 minutes.  
      if(res > as.difftime("00:05", format="%H:%M")){
        paste("Time for the chunk", options$label, "to run:", format(round(res, 3)))
      }
    }
  }
}))
connectionString <- getConnectionString(params)
run_query <- function(query_text, ...){
  result <- run_db_query(connectionString, query_text )
  return(result)
}

get_connection <- function(){
  connection <- get_new_connection(connectionString)
  return(connection)
}
tbresult <-runTableReplacements(connectionString)
QA_Alert_Messages <- data.frame('Alert Code'=character(),
                 'Alert Message'=character(), stringsAsFactors = FALSE)

CHORDS QA Report: VDW P3 Tables

The purpose of the data quality program is to conduct data quality checks that correspond to similar data checks carried out by PCORnet (Version 7). The checks cover 4 realms of data quality: data model conformance, data plausibility, data completeness, and data persistence. The program uses a series of SQL queries operationalized using RStudio to produce this report. The program is designed to support tables/figures that provide detailed information about each data quality check as well as an alert if a data error appears to be present (based on criteria developed by PCORnet or the CHORDS team).

This data quality report was generated from CHORDS r params$DBName.

Information about the QA program

Data Partner: Analyst: Query Run Date: r Sys.Date()

# Table to check for quality
tableCheckList <- c(lab_results, encounters, prescribing, procedures, social_history, provider_specialty, diagnoses, vital_signs, census_location, demographics, pro_surveys,  pro_questions, pro_responses, benefit, linkage, death, pharmacy)
# table list in DB
#sqlConnection <- get_connection()
#dbTables <- sqlTables(get_connection(), catalog = params$DBName)
dbTables <- run_query(query_text = 
    "SELECT 
        TABLE_NAME
     FROM     
        INFORMATION_SCHEMA.TABLES
     UNION ALL
     SELECT 
        name AS TABLE_NAME
     FROM   
        sys.synonyms;")
#dbTables <- merge(dbTables, dbSynonyms, by="TABLE_NAME")
#RODBC::odbcClose(sqlConnection)
tabledChecked <- vector(length=length(tableCheckList))
for (i in 1:length(tableCheckList)){
  tabledChecked[i] <- sum(grepl(tableCheckList[i], dbTables$TABLE_NAME, ignore.case = T))>0
}
missingTables <- setdiff(tableCheckList, tableCheckList[tabledChecked])
nonMissingTables <- setdiff(tableCheckList, missingTables)
nonMissingDBTables <- subset(dbTables, grepl(paste(tableCheckList, collapse = '|'), TABLE_NAME, ignore.case = T), select=TABLE_NAME)[,,drop=T]


missingTableMessage <- if(length(missingTables)==0) {
  "All tables accounted for."
} else{
    paste("The following tables are missing:", paste(missingTables, collapse = ','), sep = ' ')
}

  QA_Alert_Messages[nrow(QA_Alert_Messages) + 1,] = c("1.01",missingTableMessage)
## Data Check 1.02: Required tables are not populated

table_N_lst <- list() #vector(length=length(nonMissingDBTables))
for(i in 1:length(nonMissingDBTables)){
  table_N_lst[[i]] <- run_query(paste("select distinct '_aaa_' as _aaa_, count(*) as tabN from ",nonMissingDBTables[i], sep=' '))#[,'tabN',drop=T]
 # print(dbTables[i])
  if (class(table_N_lst[[i]]) !='data.frame') {table_N_lst[[i]] <- NULL}
}
#length(dbTables)
table_N_lst <- do.call('rbind', table_N_lst) [,'tabN',drop=T]
nonPopTables <- nonMissingDBTables[table_N_lst==0]
nonPopMessage <- if(length(nonPopTables)==0) {"All tables populated, among those present."
} else{
    paste("The following tables exist, but are not populated:",  paste(nonPopTables, collapse = ','), sep = ' ')
}
  QA_Alert_Messages[nrow(QA_Alert_Messages) + 1,] = c("1.02",nonPopMessage)
sqlFiles <- system.file('sql', package='chordsTables')
schema_validation_query <- readChar(dir(sqlFiles, "VDW_Schema_Validation.sql", f=TRUE), file.info(dir(sqlFiles, "VDW_Schema_Validation.sql", f=TRUE))$size)#read_file(dir(sqlFiles, "VDW_Schema_Validation.sql", f=TRUE))
connection103 <- get_connection()
schema_validation <- sqlQuery(connection103, schema_validation_query, as.is=TRUE)
odbcClose(connection103)
## Data Check 1.03: Required fields are not present

missing_columns <- subset(schema_validation, Result == "TABLE/VIEW OR COLUMN MISSING")
missing_columnsTables <- select(missing_columns, c("TableName", "ColumnName"))
if (params$QAAlert == TRUE & nrow(missing_columnsTables) > 0){
  missing_columns$message <- paste0(missing_columns$TableName, ".", missing_columns$ColumnName, sep="")
  QA_Alert_Message_103_items <- paste0(missing_columns$message, collapse = ', ')
  QA_Alert_Message_103 <- paste0("The following required fields are not present: ", QA_Alert_Message_103_items)
  QA_Alert_Messages[nrow(QA_Alert_Messages) + 1,] = c("1.03",QA_Alert_Message_103)
}
misconfig <- subset(schema_validation, (Result == "CONFIG-MISMATCH" & ExpectedIsNullable == FoundIsNullable)) 
invalid_specs <- select(misconfig, c("TableName", "ColumnName", "ExpectedNumberPrecision", "FoundNumberPrecision", "ExpectedNumberScale", "FoundNumberScale", "ExpectedCharLength", "FoundCharLength", "ExpectedDatePrecision", "FoundDatePrecision"))
if (params$QAAlert == TRUE & nrow(invalid_specs) > 0){
  misconfig$message <- paste0(misconfig$TableName, ".", misconfig$ColumnName, sep="")
  QA_Alert_Message_104_items <- paste0(misconfig$message, collapse = ', ')
  QA_Alert_Message_104 <- paste0("The following required fields do not conform to data model specifications for data type, length, or name: ",
                                  QA_Alert_Message_104_items)
  QA_Alert_Messages[nrow(QA_Alert_Messages) + 1,] = c("1.04",QA_Alert_Message_104)
}
primary_key_data <- run_query("
SET ANSI_NULLS ON;
SET QUOTED_IDENTIFIER ON;
SET NOCOUNT ON;
SET ANSI_WARNINGS OFF;
BEGIN

    IF OBJECT_ID('tempdb..#PKVALIDATION') IS NOT NULL
    BEGIN
       DROP TABLE #PKVALIDATION;
    END;
    CREATE TABLE #PKVALIDATION (
        Table_Name      VARCHAR(250) ,
        Column_Name     VARCHAR(250)
    );
    IF OBJECT_ID('tempdb..#KeyMatchResult') IS NOT NULL
    BEGIN
       DROP TABLE #KeyMatchResult;
    END;
    IF OBJECT_ID('tempdb..#PKVIOLATIONS') IS NOT NULL
    BEGIN
       DROP TABLE #PKVIOLATIONS;
    END;

    CREATE TABLE #PKVIOLATIONS (
        Table_Name      VARCHAR(250) ,
    Violation_Count BIGINT,
        ErrorMessage varchar(500) DEFAULT NULL
    );
END;
/*****************************************************************************
END TempTable Clearing and Creation
*****************************************************************************/
/*****************************************************************************
BEGIN Table Create Section
*****************************************************************************/
INSERT INTO #PKVALIDATION
VALUES      (
       'CENSUS_DEMOG', 'CENSUS_YEAR'), (
       'CENSUS_DEMOG', 'GEOCODE'), (
       'EVERNDC', 'NDC'), (
       'EVERNDC', 'GENERIC'), (
       'PROVIDER_SPECIALTY', 'PROVIDER'), (
       'DEATH', 'PERSON_ID'), (
       'CAUSE_OF_DEATH', 'PERSON_ID'), (
       'CAUSE_OF_DEATH', 'COD'), (
       'DEMOGRAPHICS', 'PERSON_ID'), (
       'LINKAGE', 'CID'), (
       'LINKAGE', 'LINE'), (
       'BENEFIT', 'BENEFIT_ID'), (
       'ENCOUNTERS', 'ENC_ID'), (
       'DIAGNOSES', 'DIAGNOSES_ID'), (
       'ENROLLMENT', 'PERSON_ID'), (
       'ENROLLMENT', 'ENR_START'), (
       'LAB_RESULTS', 'LAB_RESULTS_ID'), (
       'PRO_SURVEYS', 'PRO_ID'), (
       'PRO_QUESTIONS', 'PRO_ID'), (
       'PRO_QUESTIONS', 'QUESTION_ID'), (
       'PRO_QUESTIONS', 'QUESTION_VER'), (
       'PRO_RESPONSES', 'RESPONSE_ID'), (
       'PHARMACY', 'PHARMACY_ID'), (
       'PRESCRIBING', 'PRESCRIBING_ID'), (
       'PROCEDURES', 'PROCEDURES_ID'), (
       'SOCIAL_HISTORY', 'SOCIAL_HISTORY_ID'), (
       'VITAL_SIGNS', 'VITAL_SIGNS_ID'), (
       'TUMOR', 'TUMOR_ID'), (
       'LANGUAGES', 'PERSON_ID'), (
       'LANGUAGES', 'LANG_ISO'), (
       'CENSUS_LOCATION', 'PERSON_ID'), (
       'CENSUS_LOCATION', 'LOC_START');
/*****************************************************************************
BEGIN Table Name Replacement: If a TableName replacement table exists, it will swap out the 
names in the tables for the correct ones based on how it's mapped in their table.
*****************************************************************************/
BEGIN
IF OBJECT_ID('CHORDS_TABLENAMES') IS NOT NULL
    BEGIN
        UPDATE a
               SET a.Table_Name = b.NEW_NAME
        FROM #PKVALIDATION a JOIN CHORDS_TABLENAMES b ON b.ORG_NAME = a.Table_Name;
    END;
END;
/*****************************************************************************
END Table Name Replacement
*****************************************************************************/
/*****************************************************************************
BEGIN Analysis Section: Compares the partner's primary keys to the expected keys
******************************************************************************/
SELECT * INTO #KeyMatchResult
FROM (
SELECT            
       ExpectKeys.TABLE_NAME, 
       ExpectKeys.COLUMN_NAMES AS Expected_Primary_Key, 
       CurrKeys.COLUMN_NAMES AS Found_Primary_Key,
       CASE
           WHEN ob.type = 'V'
               THEN 'VIEW FOUND'
           WHEN CurrKeys.TABLE_NAME IS NULL
           THEN 'TABLE/VIEW OR KEYS NOT FOUND'
           WHEN ExpectKeys.COLUMN_NAMES != CurrKeys.COLUMN_NAMES
           THEN 'KEY MISMATCH'
           WHEN ExpectKeys.COLUMN_NAMES = CurrKeys.COLUMN_NAMES
           THEN 'OK'
           ELSE 'UNKNOWN ERROR'
       END AS Key_Match_Result
FROM                  
(
    SELECT        
           TABLE_NAME, 
           LEFT(COL, LEN(COL) - 1) AS COLUMN_NAMES
    FROM              
    (
        SELECT DISTINCT    
               TAB.TABLE_NAME TABLE_NAME, 
        (
            SELECT 
                   COL.Column_Name + ', ' AS [text()]
            FROM    
                 #PKVALIDATION COL
            WHERE  COL.Table_Name = TAB.Table_Name
            ORDER BY 
                     COL.Column_Name FOR
            XML PATH('')
        ) COL
        FROM 
             #PKVALIDATION TAB
    ) T
    WHERE T.COL IS NOT NULL
) ExpectKeys
LEFT JOIN
(
    SELECT        
           TABLE_NAME, 
           LEFT(COL, LEN(COL) - 1) AS COLUMN_NAMES
    FROM              
    (
        SELECT DISTINCT    
               TAB.TABLE_NAME TABLE_NAME, 
        (
            SELECT 
                   COL.COLUMN_NAME + ', ' AS [text()]
            FROM    
                 INFORMATION_SCHEMA.CONSTRAINT_COLUMN_USAGE COL
            WHERE  COL.CONSTRAINT_NAME = TAB.CONSTRAINT_NAME
                   AND COL.TABLE_NAME = TAB.TABLE_NAME
                   AND CONSTRAINT_TYPE = 'PRIMARY KEY'
            ORDER BY 
                     COL.COLUMN_NAME FOR
            XML PATH('')
        ) COL
        FROM 
             INFORMATION_SCHEMA.TABLE_CONSTRAINTS TAB
    ) T
    WHERE T.COL IS NOT NULL
) CurrKeys
     ON CurrKeys.TABLE_NAME = ExpectKeys.TABLE_NAME
     LEFT JOIN sys.objects ob ON ob.object_id = OBJECT_ID(ExpectKeys.TABLE_NAME)
     and ob.type in ('U', 'V')) KeyMatch;
BEGIN

    DECLARE @SQL NVARCHAR(3000);
    DECLARE @Table_Name VARCHAR(100);
      DECLARE @Key_Columns VARCHAR(100);

    DECLARE CUR CURSOR
    FOR SELECT
              Table_Name, Expected_Primary_Key
        FROM
             #KeyMatchResult;

    OPEN CUR;

    FETCH NEXT FROM CUR INTO @Table_Name, @Key_Columns;

    WHILE @@FETCH_STATUS = 0
        BEGIN
        SET @SQL = '
            IF OBJECT_ID(''' + @Table_Name + ''') IS NOT NULL
            WITH CTEKEY
                AS (SELECT 
                    COUNT(*) KEYTOT
                    FROM (
                        SELECT DISTINCT  ' + @Key_Columns + '
                        FROM ' + @Table_Name + ') z),
            CTETOT
                AS (SELECT COUNT(*) TABTOT
                    FROM ' + @Table_Name + ')
            INSERT INTO #PKVIOLATIONS
            SELECT  ''' + @Table_Name + ''', CTETOT.TABTOT - CTEKEY.KEYTOT, NULL
            FROM CTETOT, CTEKEY;'
            BEGIN TRY
                EXEC Sp_executesql
                     @SQL;
            END TRY
            BEGIN CATCH
                Declare @ErrorMessageRef as varchar(max);
                SELECT @ErrorMessageRef = ERROR_MESSAGE();
                INSERT INTO #PKVIOLATIONS
                    VALUES(@Table_Name, NULL, @ErrorMessageRef)
            END CATCH
        FETCH NEXT FROM CUR INTO @Table_Name, @Key_Columns;
        END;
    CLOSE CUR;
    DEALLOCATE CUR; 
END;
SELECT DISTINCT 
       z.Table_Name, 
       z.Expected_Primary_Key, 
       z.Found_Primary_Key,
       CASE
           WHEN z.ErrorMessage IS NOT NULL THEN 
                z.ErrorMessage
           WHEN z.Key_Match_Result = 'TABLE/VIEW OR KEYS NOT FOUND'
                AND y.name IS NOT NULL
                AND z.ErrorMessage IS NULL
           THEN 'TABLE SYNONYM FOUND; UNABLE TO AUTOMATICALLY VALIDATE'
           ELSE z.Key_Match_Result
       END AS Key_Match_Result, 
       z.Violation_Count
FROM       
(
    SELECT 
           m.*, 
           v.Violation_Count,
           v.ErrorMessage
    FROM   
         #KeyMatchResult m
         LEFT JOIN #PKVIOLATIONS v
              ON v.Table_Name = m.TABLE_NAME
) z
LEFT JOIN sys.synonyms y
     ON y.name = z.TABLE_NAME;")
violated_primary_keys <- subset(primary_key_data, Key_Match_Result != "OK")
if (params$QAAlert == TRUE & nrow(violated_primary_keys) > 0){
  violated_primary_keys_table <- subset(violated_primary_keys, (Violation_Count > 0 & Key_Match_Result == "VIEW FOUND") | (Violation_Count > 0 & Key_Match_Result == "TABLE SYNONYM FOUND; UNABLE TO AUTOMATICALLY VALIDATE") |  (Key_Match_Result != "VIEW FOUND" & Key_Match_Result != "TABLE SYNONYM FOUND; UNABLE TO AUTOMATICALLY VALIDATE"))
  violated_primary_keys_table$message <- paste0(violated_primary_keys_table$Table_Name, sep="")
  QA_Alert_Message_105_items <- paste0(violated_primary_keys_table$message, collapse = ', ')
  QA_Alert_Message_105 <- paste0("The following tables have primary key definition errors:  ",
                                  QA_Alert_Message_105_items)
  QA_Alert_Messages[nrow(QA_Alert_Messages) + 1,] = c("1.05",QA_Alert_Message_105)
}
sqlFiles <- system.file('sql', package='chordsTables')
data_validation_query <- readChar(dir(sqlFiles, "VDW_DataValidation.sql", f=TRUE), file.info(dir(sqlFiles, "VDW_DataValidation.sql", f=TRUE))$size) #read_file(dir(sqlFiles, "VDW_DataValidation.sql", f=TRUE))
connection106 <- get_connection()
results <- sqlQuery(connection106, data_validation_query, as.is=TRUE)
data_validation <- sqlQuery(connection106, "
  SELECT
       TargetTable,
       TargetColumn,
       CASE 
          WHEN UnexpectedValue = '' THEN '(blank)'
          ELSE UnexpectedValue
        END AS UnexpectedValue,
       Message
  FROM
     #CHORDSDataValueResults
  ORDER BY TargetTable;
  ", as.is = TRUE)
ref_integrity <- sqlQuery(connection106, "
  SELECT
       *
  FROM
     #CHORDSReferentialIntegrityResults
  ORDER BY TargetTable;
  ", as.is = TRUE)
odbcClose(connection106)
if (params$QAAlert == TRUE & nrow(data_validation) > 0){
  data_validation_table <- data.frame(data_validation, stringsAsFactors = FALSE)
  data_validation_table$message <- paste0(data_validation_table$TargetTable, ".", data_validation_table$TargetColumn, sep="")
  QA_Alert_Message_106_items <- paste0(data_validation_table$message, collapse = ', ')
  QA_Alert_Message_106 <- paste0("The following required fields contain values outside of data model specifications: ",
                                  QA_Alert_Message_106_items)
  QA_Alert_Messages[nrow(QA_Alert_Messages) + 1,] = c("1.06", QA_Alert_Message_106)
}
nullableValues = subset(schema_validation, (Result == "CONFIG-MISMATCH" & ExpectedIsNullable != FoundIsNullable))
nullableValues <- select(nullableValues, c("TableName", "ColumnName", "ExpectedIsNullable", "FoundIsNullable"))
if (params$QAAlert == TRUE & nrow(nullableValues) > 0){
  nullableValues_Table <- data.frame(nullableValues, stringsAsFactors = FALSE)
  nullableValues_Table$message <- paste0(nullableValues_Table$TableName, ".", nullableValues_Table$ColumnName, sep="")
  QA_Alert_Message_107_items <- paste0(nullableValues_Table$message, collapse = ', ')
  QA_Alert_Message_107 <- paste0("The following required fields have non-permissible missing values: ",
                                  QA_Alert_Message_107_items)
  QA_Alert_Messages[nrow(QA_Alert_Messages) + 1,] = c("1.07",QA_Alert_Message_107)
}
orphan_person_ids <- subset(ref_integrity, ((TargetColumn == "PERSON_ID" | ReferenceColumn == "PERSON_ID") & ValuesNotFound > 0))
orphan_person_ids_table <- select(orphan_person_ids, c("TargetTable", "TargetColumn", "ReferenceTable", "ReferenceColumn", "ValuesNotFound", "TargetTableDistinctCount", "PercentOfDistinctTargetColumnMissing"))
if (params$QAAlert == TRUE & nrow(orphan_person_ids_table) > 0){
  orphan_person_ids_sub_table <- data.frame(orphan_person_ids_table, stringsAsFactors = FALSE)
  orphan_person_ids_sub_table$message <- paste0(orphan_person_ids_sub_table$TargetTable, sep="")
  QA_Alert_Message_108_items <- paste0(orphan_person_ids_sub_table$message, collapse = ', ')
  QA_Alert_Message_108 <- paste0("The following tables contain orphan PERSON_IDs: ",
                                  QA_Alert_Message_108_items)
  QA_Alert_Messages[nrow(QA_Alert_Messages) + 1,] = c("1.08",QA_Alert_Message_108)
}
orphan_enc_ids <- subset(ref_integrity, (TargetColumn == "ENC_ID" | ReferenceColumn == "ENC_ID") & ValuesNotFound > 0)
orphan_enc_ids_table <- select(orphan_enc_ids, c("TargetTable", "TargetColumn", "ReferenceTable", "ReferenceColumn", "ValuesNotFound", "TargetTableDistinctCount", "PercentOfDistinctTargetColumnMissing"))
if (params$QAAlert == TRUE & nrow(orphan_enc_ids_table) > 0){
  orphan_person_ids_sub_table <- data.frame(orphan_enc_ids_table, stringsAsFactors = FALSE)
  orphan_person_ids_sub_table$message <- paste0(orphan_person_ids_sub_table$TargetTable, sep="")
  QA_Alert_Message_109_items <- paste0(orphan_person_ids_sub_table$message, collapse = ', ')
  QA_Alert_Message_109 <- paste0("The following tables contain orphan ENCOUNTER_IDs: ",
                                  QA_Alert_Message_109_items)
  QA_Alert_Messages[nrow(QA_Alert_Messages) + 1,] = c("1.09",QA_Alert_Message_109)
}
repErrors <- run_query(
                      paste0(
                        "
select 'Diagnosis' as tablename, 
       count(*) as nrows,
       sum(case when a.person_id <> b.person_id then 1 else 0 end) as Person_ID,
       sum(case when a.adate <> b.adate then 1 else 0 end) as Adate,
       sum(case when a.enctype <> b.enctype then 1 else 0 end) as EncType
from ", encounters, " as a inner join (select distinct enc_id, person_id, adate, enctype from ", diagnoses, ") as b on a.enc_id = b.enc_id
union
select 'Procedure' as tablename, 
       count(*) as nrows,
       sum(case when a.person_id <> b.person_id then 1 else 0 end) as  Person,
       sum(case when a.adate <> b.adate then 1 else 0 end) as  Adate,
       sum(case when a.enctype <> b.enctype then 1 else 0 end) as  EncType
from ", encounters, " as a inner join (select distinct enc_id, person_id, adate, enctype from ", procedures, ") as b on a.enc_id = b.enc_id
                        "
                      )) 
repErrors_tx <- tidyr::gather(repErrors, 'field','n_bad',Person_ID, Adate, EncType) %>% arrange(tablename) %>%
  within({
    NP_bad<- paste0(n_bad,' (',round(100*n_bad/nrows, 2),')')
  })


QA_Alert_Message_1.10 <- if(nrow(subset(repErrors_tx, n_bad>0))>0) {"There are Replication errors between the ENCOUNTER, PROCEDURES and DIAGNOSIS tables"} else {
  "There were no Replication errors between the ENCOUNTER, PROCEDURES and DIAGNOSIS tables found."
}
  QA_Alert_Messages[nrow(QA_Alert_Messages) + 1,] = c("1.10",QA_Alert_Message_1.10)
GR1Person <- run_query(
                      paste0(
                        "
select 'Encounter' as tablename, count(*) as n_encid, 
       sum(case when N_person>1 then 1 else 0 end) as N_gr1
from ( select enc_id,
       count(distinct person_id) as N_person
       from ", encounters," 
       group by enc_id
) as qry
"
))
GR1Person <- within(GR1Person, {
  pct <- round(100*N_gr1/n_encid, 2)
})
#if(GR1Person$pct>5){
#  print(paste0("WARNING: The encounter table has more than 5% of encounters assigned to more than 1 person (",GR1Person$pct,"%)"))
#} else{
#  print(paste0("The encounter table has ", GR1Person$pct,"% of encounters assigned to more than 1 person"))
#}
encItoManyMsg <- if(GR1Person$pct>5){
  paste0("WARNING: The encounter table has more than 5% of encounters assigned to more than 1 person (",GR1Person$pct,"%)")
} else{
  paste0("The encounter table has ", GR1Person$pct,"% of encounters assigned to more than 1 person")
}

  QA_Alert_Messages[nrow(QA_Alert_Messages) + 1,] = c("1.11",encItoManyMsg)
orphan_provider_ids <- subset(ref_integrity, ((TargetColumn %in% c("RXMD", "PROVIDER") | ReferenceColumn %in% c("RXMD", "PROVIDER"))) & ValuesNotFound > 0)
orphan_provider_ids_table <- select(orphan_provider_ids, c("TargetTable", "TargetColumn", "ReferenceTable", "ReferenceColumn", "ValuesNotFound", "TargetTableDistinctCount", "PercentOfDistinctTargetColumnMissing"))
if (params$QAAlert == TRUE & nrow(orphan_provider_ids_table) > 0){
  orphan_provider_ids_sub_table <- data.frame(orphan_provider_ids_table, stringsAsFactors = FALSE)
  orphan_provider_ids_sub_table$message <- paste0(orphan_provider_ids_sub_table$TargetTable, ".", orphan_provider_ids_sub_table$TargetColumn, sep="")
  QA_Alert_Message_112_items <- paste0(orphan_provider_ids_sub_table$message, collapse = ', ')
  QA_Alert_Message_112 <- paste0("The following tables contain orphan PROVIDER_IDs: ",
                                  QA_Alert_Message_112_items)
  QA_Alert_Messages[nrow(QA_Alert_Messages) + 1,] = c("1.12",QA_Alert_Message_112)
}
connection113 <- get_connection()
code_conform_table_counts <- sqlQuery(connection113, paste0("
 /*
 - DIAGNOSES:
        *ICD09:length is not between 3-5 OR has alpha characters other than 
         E or V OR has no numeric characters OR first 3 digits (min length) are 0;
        *ICD10: length is not between 3 and 7 OR 1st character is not alpha OR 
         first 3 digits (min length) are 0 or 9 OR has no numeric characters;
 - PROCEDURES:
        *CPT/HCPCS are length <5 OR first 5 are all 0 or 9 OR no numeric;
        *ICD9 is not length 3 or 4 OR any alpha OR all 0;
        *ICD10 is not length 7 OR no numeric OR all 7 are 0s or 9s;
 - PRESCRIBING:
        *flag codes with any alphabetical characters OR a length <2 or >7;
 - PHARMACY:
        *length not 11 OR any alpha OR a string of 0 or 9;
 - LAB_RESULT
        *flag for any alphabetical characters OR a length less than 3 or greater 
         than 7 OR the absence of a dash after the next to last position;
*/
SET NOCOUNT ON;
DROP TABLE IF EXISTS 
     #VdwInvalidCodes;
SELECT        
       *
INTO 
     #VdwInvalidCodes
FROM              
(
    SELECT    
           a.TableName, 
           a.CodeType, 
           a.Code, 
           a.ValidResult
    FROM          
    (
        SELECT 
               'DIAGNOSES' AS TableName, 
               DX AS Code, 
               DX_CODETYPE AS CodeType,
               CASE
                   WHEN LEN(replace(DX, '.', '')) NOT BETWEEN 3 AND 7
                   THEN 'Code Length must be between 3 and 7 characters (excluding \".\")'
                   WHEN DX NOT LIKE '[A-TV-Z]%'
                   THEN 'Starting character must be a letter and not \"U\"'
                   WHEN SUBSTRING(DX, 1, 3) IN('000', '999')
                   THEN 'Invalid Numeric Value Range'
                   WHEN DX NOT LIKE '%[0-9]%'
                   THEN 'Values must include a numerical component'
                   ELSE 'OK'
               END AS ValidResult
        FROM ",    
             diagnoses, "
        WHERE   DX_CODETYPE = '10'
        UNION ALL
        SELECT 
               'DIAGNOSES' AS TableName, 
               DX AS Code, 
               DX_CODETYPE AS CodeType,
               CASE
                   WHEN LEN(replace(DX, '.', '')) NOT BETWEEN 3 AND 5
                   THEN 'Length must be between 3 and 5 characters (excluding \".\")'
                   WHEN(DX NOT LIKE '[EV]%'
                        AND DX NOT LIKE '[0-9]%')
                   THEN 'Starting Character must be \"E\", \"V\", or a number'
                   WHEN SUBSTRING(DX, 1, 3) IN('000')
                   THEN 'Invalid Numeric Value Range'
                   WHEN DX NOT LIKE '%[0-9]%'
                   THEN 'Numeric Values Not Detected'
                   ELSE 'OK'
               END AS ValidResult
        FROM  ",   
             diagnoses, "
        WHERE   DX_CODETYPE = '09'
        UNION ALL
        SELECT 
               'PROCEDURES' AS TableName, 
               PX AS Code, 
               PX_CODETYPE AS CodeType,
               CASE
                   WHEN LEN(PX) < 5
                   THEN 'Code length must be less than 5 characters'
                   WHEN SUBSTRING(PX, 1, 5) IN('00000', '99999')
                   THEN 'Invalid Numeric Value'
                   WHEN TRY_PARSE(PX AS INT) IS NULL
                   THEN 'Non-numeric Characters Detected'
                   ELSE 'OK'
               END AS ValidResult
        FROM  ",   
             procedures, "
        WHERE   PX_CODETYPE = 'C4'
        UNION ALL
        SELECT 
               'PROCEDURES' AS TableName, 
               PX AS Code, 
               PX_CODETYPE AS CodeType,
               CASE
                   WHEN LEN(replace(PX, '.', '')) NOT BETWEEN 3 AND 4
                   THEN 'Code length must be between 3 and 4 characters'
                   WHEN PX NOT LIKE '[0-9]%'
                   THEN 'Invalid Start Character'
                   WHEN PX IN('00000')
                   THEN 'Invalid Numeric Value'
                   ELSE 'OK'
               END AS ValidResult
        FROM  ",
             procedures, "
        WHERE   PX_CODETYPE = '09'
        UNION ALL
        SELECT 
               'PROCEDURES' AS TableName, 
               PX AS Code, 
               PX_CODETYPE AS CodeType,
               CASE
                   WHEN LEN(PX) != 7
                   THEN 'Code Length Must Equal 7'
                   WHEN PX NOT LIKE '%[0-9]%'
                   THEN 'Code values must include a numerical component'
                   WHEN PX IN('0000000', '9999999')
                   THEN 'Invalid Numeric Value'
                   ELSE 'OK'
               END AS ValidResult
        FROM  ",   
             procedures,"
        WHERE   PX_CODETYPE = '10'
        UNION ALL
        SELECT 
               'PRESCRIBING' AS TableName, 
               RXNORM AS Code, 
               'RXNORM' AS CodeType,
               CASE
                   WHEN RXNORM LIKE '%[A-Z]%'
                   THEN 'Code cannot contain alphabetical characters'
                   WHEN LEN(RXNORM) NOT BETWEEN 2 AND 7
                   THEN 'Code length must be between 2 and 7 characters'
                   ELSE 'OK'
               END AS ValidResult
        FROM  ",   
             prescribing, "
        UNION ALL
        SELECT 
               'PHARMACY' AS TableName, 
               NDC AS Code, 
               'NDC' AS CodeType,
               CASE
                   WHEN NDC LIKE '%[A-Z]%'
                   THEN 'Code cannot contain alphabetical characters'
                   WHEN LEN(NDC) != 11
                   THEN 'Code length must be 11 characters'
                   WHEN NDC IN('00000000000', '99999999999')
                   THEN 'Invalid Numeric Value'
                   ELSE 'OK'
               END AS ValidResult
        FROM  ",   
             pharmacy, "
        UNION ALL
        SELECT 
               'LAB_RESULTS' AS TableName, 
               LOINC AS Code, 
               'LOINC' AS CodeType,
               CASE
                   WHEN LOINC LIKE '%[A-Z]%'
                   THEN 'Code cannot contain alphabetical characters'
                   WHEN LEN(LOINC) NOT BETWEEN 3 AND 7
                   THEN 'Code length must be between 2 and 7 characters'
                   WHEN SUBSTRING(LEFT(REVERSE(RTRIM(LTRIM(LOINC))), 2), 2, 2) != '-'
                   THEN 'No hyphen character in the second to last position'
                   ELSE 'OK'
               END AS ValidResult
        FROM ",  
             lab_results, "
    ) a
    WHERE a.validResult != 'OK'
) InvalidCodes;
WITH CTE_CodeCounts
     AS (SELECT 
                'DIAGNOSES' AS TableName, 
                DX_CODETYPE AS CodeType, 
                COUNT(1) CountAll
         FROM ",    
              diagnoses, "
         GROUP BY 
                  DX_CODETYPE
         UNION ALL
         SELECT 
                'PROCEDURES' AS TableName, 
                PX_CodeType AS CodeType, 
                COUNT(1) CountAll
         FROM ",    
              procedures, " 
         GROUP BY 
                  PX_CodeType
         UNION ALL
         SELECT 
                'PRESCRIBING' AS TableName, 
                'RXNORM' AS CodeType, 
                COUNT(1) CountAll
         FROM ",    
              prescribing, "
         UNION ALL
         SELECT 
                'PHARMACY' AS TableName, 
                'NDC' AS CodeType, 
                COUNT(1) CountAll
         FROM ",    
              pharmacy, "
         UNION ALL
         SELECT 
                'LAB_RESULTS' AS TableName, 
                'LOINC' AS CodeType, 
                COUNT(1) CountAll
         FROM ",   
              lab_results, "
    )
     SELECT 
            b.TableName, 
            b.CodeType, 
            cc.CountAll, 
            b.CountInvalid AS CountInvalid, 
            ROUND(CAST(b.CountInvalid AS FLOAT) / CAST(cc.CountAll AS FLOAT) * CAST(100.0 AS FLOAT), 3)
            AS PercentInvalid
     FROM       
     (
         SELECT 
                z.TableName, 
                z.CodeType, 
                COUNT(1) AS CountInvalid
         FROM   
              #VdwInvalidCodes z
         GROUP BY 
                  z.TableName, 
                  z.CodeType
     ) b
     JOIN CTE_CodeCounts cc
          ON cc.TableName = b.TableName
             AND cc.CodeType = b.CodeType
     ORDER BY 
              b.TableName, 
              b.CodeType;
"))
top_50_invalid <- sqlQuery(connection113, "
SELECT 
        TableName, 
    CodeType, 
    CASE 
      WHEN Code = '' THEN '(blank)'
      ELSE Code
    END AS Code, 
    ValidResult, 
        CountInvalid
FROM       
(
    SELECT 
           TableName, 
           CodeType, 
           Code, 
           ValidResult, 
           CountInvalid, 
           ROW_NUMBER() OVER(PARTITION BY TableName
           ORDER BY 
                    CountInvalid DESC) RowNum
    FROM       
    (
        SELECT 
               z.TableName, 
               z.CodeType, 
               z.Code, 
               z.ValidResult, 
               COUNT(1) AS CountInvalid
        FROM   
             #VdwInvalidCodes z
        GROUP BY 
                 z.TableName, 
                 z.CodeType, 
                 z.Code, 
                 z.ValidResult
    ) InvalidCount
) Top50
WHERE  RowNum <= 50
ORDER BY TableName, CodeType, CountInvalid Desc;
")
odbcClose(connection113)
if (params$QAAlert == TRUE){
  code_conform_table_counts_table <- subset(code_conform_table_counts, PercentInvalid > 5.0)
  if (!(length(code_conform_table_counts_table)==0)){
    code_conform_table_counts_table$message <- paste0(code_conform_table_counts_table$TableName, "-", code_conform_table_counts_table$CodeType, " (",code_conform_table_counts_table$PercentInvalid , "%)", sep="")
    QA_Alert_Message_113_items <- paste0(code_conform_table_counts_table$message, collapse = ', ')
    QA_Alert_Message_113 <- paste0("More than 5% of ICD, CPT, LOINC, RXCUI, or NDC codes do not conform to the expected length or content: ",
                                  QA_Alert_Message_113_items)
    QA_Alert_Messages[nrow(QA_Alert_Messages) + 1,] = c("1.13",QA_Alert_Message_113)
  }
}
# DC 2.01

tabDates <- run_query("
SELECT DISTINCT 
       case 
            when s.name is not null then s.name else a.name end AS tabname, 
       b.name AS colname
FROM   
     sys.objects a
     INNER JOIN sys.columns b
          ON a.object_id = b.object_id
     left join sys.synonyms s on 
        replace(replace( s.base_object_name, '[', ''), ']', '') = a.name
WHERE a.type IN('U', 'V')
and b.name like '%date%'
ORDER BY 
        tabname;")

modDates <- run_query("
SELECT  DISTINCT
       case 
            when s.name is not null then s.name else a.name end AS name, 
      a.modify_date
FROM   
     sys.objects a
     INNER JOIN sys.columns b
          ON a.object_id = b.object_id
     left join sys.synonyms s on 
        replace(replace( s.base_object_name, '[', ''), ']', '') = a.name
WHERE a.type IN('U', 'V')")

futureDateCode <- merge(tabDates, modDates, by.x='tabname', by.y='name') %>%
  within({
    sqlcode <- paste0("
                      select '",tabname, "' as tableName, '", colname,"' as dateName ,count(*) as nrows, sum(case when a.",colname,">b.modify_date then 1 else 0 end) as futureDate from ",
tabname, " as a , (select modify_date from (SELECT  
       case 
            when s.name is not null then s.name else a.name end AS name, 
       a.modify_date
FROM   
     sys.objects a
     INNER JOIN sys.columns b
          ON a.object_id = b.object_id
     left join sys.synonyms s on 
        replace(replace( s.base_object_name, '[', ''), ']', '') = a.name
WHERE a.type IN('U', 'V')) a  WHERE a.name ='",tabname,"') as b"
)
  })
fdatelst <- list()
for (i in 1:nrow(futureDateCode)){
  fdatelst[[i]] <- run_query(futureDateCode[i, "sqlcode"])
}
fdatesdone <- do.call('rbind', fdatelst) %>%
  within(.,{
    pctFuture <- round(100*futureDate/nrows, 4)
  })

### add message if no table is a problem 
futureDataMsg <- NULL
futureDataMsg <- if(nrow(subset(fdatesdone, pctFuture>=5))>0  ){
  futureDataMsgTable <- subset(fdatesdone, pctFuture>=5)
  paste("The following tables exceeded the 5% limit on future dates:",
        paste0(futureDataMsgTable$tableName, futureDataMsgTable$dateName, sep="."), sep = ', ')
} else if(nrow(subset(fdatesdone, pctFuture>=5))==0  ) {
                   "No table exceeds the 5% limit on future dates."
}  

  QA_Alert_Messages[nrow(QA_Alert_Messages) + 1,] = c("2.01",futureDataMsg)
# select count(*) as personCount, sum(case when (minage not between 0 and 89) or (maxage not between 0 and 89) then 1 else 0 end ) as N_outOfRange
patAgeRange <- run_query(paste0("
select count(*) as nrows, 
       sum(case when (minage <0) or (maxage <0) then 1 else 0 end ) as N_low,
       sum(case when (minage >=90) or (maxage >=90) then 1 else 0 end ) as N_high
from
(
select distinct  b.*, datediff(YY, a.birth_date, b.first_dt) as minage,  datediff(YY, a.birth_date, b.last_dt) as maxage 
from ",demographics, " as a inner join 
  (select person_id, min(adate) as first_dt, max(adate) as last_dt from ",encounters, "  group by person_id  ) as b on a.person_id = b.person_id    
where a.birth_date is not null
) as qry
                        "))
htOutOfRange <- run_query(paste0("

select count(*) as nrows,
       sum(case when ht <0.0 then 1 else 0 end ) as N_low,
       sum(case when ht>=95.0 then 1 else 0 end ) as N_high
from ", vital_signs, " 
 where ht is not null
                             "))
wtOutOfRange <- run_query(paste0("

select count(*) as nrows,
       sum(case when wt <0.0 then 1 else 0 end ) as N_low,
       sum(case when wt>350 then 1 else 0 end ) as N_high
from ", vital_signs, " 
 where wt is not null
                             "))
dbpOutOfRange <- run_query(paste0("

select count(*) as nrows,
       sum(case when diastolic <40 then 1 else 0 end ) as N_low,
       sum(case when diastolic>120 then 1 else 0 end ) as N_high
from ", vital_signs, " 
 where diastolic is not null
                             "))
sbpOutOfRange <- run_query(paste0("

select count(*) as nrows,
       sum(case when systolic <40 then 1 else 0 end ) as N_low,
       sum(case when systolic>210 then 1 else 0 end ) as N_high
from ", vital_signs, " 
 where systolic is not null
                             "))

daysSupplyOutOfRange <- run_query(paste0("

select count(*) as nrows,
       sum(case when rx_days_supply <1 then 1 else 0 end ) as N_low,
       sum(case when rx_days_supply>90 then 1 else 0 end ) as N_high
from ", prescribing, " 
 where rx_days_supply is not null
                             "))
allOutOfRange <- 
  rbind(
    cbind(table='Demographic/Encounter', item='Age (people)', low='< 0 yrs',high='> 89 yrs', patAgeRange),
    cbind(table='Vital_signs',item='Height (records)',low= '< 0 inches',high='> 0 inches', htOutOfRange),
    cbind(table='Vital_signs',item='Weight (records)',low= '< 0 pounds',high='> 350 pounds', wtOutOfRange),
    cbind(table='Vital_signs',item='Diastolic BP (records)',low= '< 40 mgHg',high='> 120 mgHg', dbpOutOfRange),
    cbind(table='Vital_signs',item='Systolic BP (records)',low= '< 40 mgHg',high='> 210 mgHg', sbpOutOfRange),
    cbind(table='Prescribing',item='Prescribed days supply (records)',low= '< 1 day',high='> 90 days', daysSupplyOutOfRange)
)
allOutOfRange <- within(allOutOfRange, {
  NP_low = paste0(N_low,' (',round(100*N_low/nrows, 2),')')
  NP_high = paste0(N_high,' (',round(100*N_high/nrows, 2),')')
})
#knitr::kable(
#subset(allOutOfRange, select=c(table,item, low, high, nrows, NP_low, NP_high)),
#col.names = c('Table','Field','Check Low','High','N','Low values N (%)','High values N (%)')
#)

outOfRangeMsg <- NULL
outOfRangeMsg <- if(nrow(subset(allOutOfRange, (N_low+N_high)/nrows>=0.1))>0 ){

  paste('More than 10% of records fall into the lowest or highest categories of age, height, weight, diastolic,  blood pressure, systolic blood pressure, or dispensed days supply:', subset(allOutOfRange, (N_low+N_high)/nrows>=0.1)$item, sep=' ')
} else if(nrow(subset(allOutOfRange, (N_low+N_high)/nrows>=0.1))==0 ) {
                   "No table exceeds the 5% limit."
}
  QA_Alert_Messages[nrow(QA_Alert_Messages) + 1,] = c("2.02",futureDataMsg)
encTabBirth2.03 <- run_query(paste0("
 select sum(dobPostAdate) as dobPostAdate,
         sum(dobPostDdate) as dobPostDdate
  from (
                       select a.person_id, 
                              max(case when datediff(day, a.birth_Date, b.adate)<0 and b.adate is not null then 1 else 0 end) as dobPostAdate,
                              max(case when datediff(day, a.birth_Date, b.ddate)<0 and b.ddate is not null then 1 else 0 end) as dobPostDdate
                       from (select distinct person_id, birth_date from ",demographics, " where birth_date is not null) as a inner join
                            ", encounters, " as b on a.person_id = b.person_id group by a.person_id
) qry


                       ") )
encTabDeath2.03 <- run_query(paste0("
 select sum(deathPreAdate) as deathPreAdate,
         sum(deathPreDdate) as deathPreDdate
  from (
                      select a.person_id, 
                              max(case when datediff(day, a.deathdt, b.adate)>0 and b.adate is not null then 1 else 0 end) as deathPreAdate,
                              max(case when datediff(day, a.deathdt, b.ddate)>0 and b.ddate is not null then 1 else 0 end) as deathPreDdate
                       from (select distinct person_id, deathdt from ",death, " where deathdt is not null) as a inner join
                            ", encounters, " as b on a.person_id = b.person_id group by a.person_id
) qry


                       ") )
procTabBirth2.03 <- run_query(paste0("
 select sum(dobPostProc) as dobPostProc 
  from (
                       select a.person_id, 
                              max(case when datediff(day, a.birth_Date, procdate)<0 and b.procdate is not null then 1 else 0 end) as dobPostProc
                       from (select distinct person_id, birth_date from ",demographics, " where birth_date is not null) as a inner join
                            ", procedures, " as b on a.person_id = b.person_id group by a.person_id
) qry


                       ") )
procTabDeath2.03 <- run_query(paste0("
 select sum(deathPreProc) as deathPreProc
  from (
                       select a.person_id, 
                              max(case when datediff(day, a.deathdt, b.procdate)>0 and b.procdate is not null then 1 else 0 end) as deathPreProc
                       from (select distinct person_id, deathdt from ",death, " where deathdt is not null) as a inner join
                            ", procedures, " as b on a.person_id = b.person_id group by a.person_id
) qry


                       ") )
vitalsTabBirth2.03 <- run_query(paste0("
 select sum(dobPostMeasure) as dobPostMeasure 
  from (
                       select a.person_id, 
                              max(case when datediff(day, a.birth_Date, b.measure_date)<0 and b.measure_date is not null then 1 else 0 end) as dobPostMeasure
                       from (select distinct person_id, birth_date from ",demographics, " where birth_date is not null) as a inner join
                            ", vital_signs, " as b on a.person_id = b.person_id group by a.person_id
) qry


                       ") )
vitalsTabDeath2.03 <- run_query(paste0("
 select sum(deathPreMeasure) as deathPreMeasure
  from (
                       select a.person_id, 
                              max(case when datediff(day, a.deathdt, b.measure_date)>0 and b.measure_date is not null then 1 else 0 end) as deathPreMeasure
                       from (select distinct person_id, deathdt from ",death, " where deathdt is not null) as a inner join
                            ", vital_signs, " as b on a.person_id = b.person_id group by a.person_id
) qry


                       ") )
pharmTabBirth2.03 <- run_query(paste0("
 select sum(dobPostRx) as dobPostRx 
  from (
                       select a.person_id, 
                              max(case when datediff(day, a.birth_Date, b.rxdate)<0 and b.rxdate is not null then 1 else 0 end) as dobPostRx
                       from (select distinct person_id, birth_date from ",demographics, " where birth_date is not null) as a inner join
                            ", pharmacy, " as b on a.person_id = b.person_id group by a.person_id
) qry


                       ") )
pharmTabDeath2.03 <- run_query(paste0("
 select sum(deathPreRx) as deathPreRx
  from (
                       select a.person_id, 
                              max(case when datediff(day, a.deathdt, b.rxdate)>0 and b.rxdate is not null then 1 else 0 end) as deathPreRx
                       from (select distinct person_id, deathdt from ",death, " where deathdt is not null) as a inner join
                            ", pharmacy, " as b on a.person_id = b.person_id group by a.person_id
) qry


                       ") )

prescribTabBirth2.03 <- run_query(paste0("
 select sum(dobPostRxStart) as dobPostRxStart 
  from (
                       select a.person_id, 
                              max(case when datediff(day, a.birth_Date, b.rx_start_date)<0 and b.rx_start_date is not null then 1 else 0 end) as dobPostRxStart
                       from (select distinct person_id, birth_date from ",demographics, " where birth_date is not null) as a inner join
                            ", prescribing, " as b on a.person_id = b.person_id group by a.person_id
) qry


                       ") )
prescribTabDeath2.03 <- run_query(paste0("
 select sum(deathPreRxStart) as deathPreRxStart
  from (
                       select a.person_id, 
                              max(case when datediff(day, a.deathdt, b.rx_start_date)>0 and b.rx_start_date is not null then 1 else 0 end) as deathPreRxStart
                       from (select distinct person_id, deathdt from ",death, " where deathdt is not null) as a inner join
                            ", prescribing , " as b on a.person_id = b.person_id group by a.person_id
) qry


                       ") )
labTabBirth2.03 <- run_query(paste0("
 select sum(dobPostResult) as dobPostResult 
  from (
                       select a.person_id, 
                              max(case when datediff(day, a.birth_Date, b.result_dt)<0 and b.result_dt is not null then 1 else 0 end) as dobPostResult
                       from (select distinct person_id, birth_date from ",demographics, " where birth_date is not null) as a inner join
                            ", lab_results, " as b on a.person_id = b.person_id group by a.person_id
) qry


                       ") )
labTabDeath2.03 <- run_query(paste0("
 select sum(deathPreResult) as deathPreResult
  from (
                       select a.person_id, 
                              max(case when datediff(day, a.deathdt, b.result_dt)>0 and b.result_dt is not null then 1 else 0 end) as deathPreResult
                       from (select distinct person_id, deathdt from ",death, " where deathdt is not null) as a inner join
                            ", lab_results, " as b on a.person_id = b.person_id group by a.person_id
) qry


                       ") )
DeathBeforeBirth <- run_query(paste0("
 select sum(deathPreBirth) as deathPreBirth
  from (
                       select a.person_id, 
                              max(case when datediff(day, a.deathdt, b.birth_date)>0  then 1 else 0 end) as deathPreBirth
                       from (select distinct person_id, deathdt from ",death, " where deathdt is not null) as a inner join
                             (select distinct person_id, birth_date from ",demographics, " where birth_date is not null) as b on a.person_id = b.person_id group by a.person_id
) qry


                       ") )
patsInEncTab <- run_query( paste0("select count(distinct person_id) as n from ",encounters))
table2.03 <- cbind(
encTabBirth2.03,
encTabDeath2.03,
procTabBirth2.03,
procTabDeath2.03,
vitalsTabBirth2.03,
vitalsTabDeath2.03,
pharmTabBirth2.03,
pharmTabDeath2.03,
prescribTabBirth2.03,
prescribTabDeath2.03,
labTabBirth2.03,
labTabDeath2.03,
DeathBeforeBirth
)
labels2.03 <- data.frame(name=c( "dobPostAdate","dobPostDdate","deathPreAdate","deathPreDdate","dobPostProc","deathPreProc","dobPostMeasure","deathPreMeasure","dobPostRx","deathPreRx","dobPostRxStart","deathPreRxStart","dobPostResult","deathPreResult","deathPreBirth" ),
                        newlabel = c('Adate<birth_date','Ddate<birth_date','Deathdt<adate','Deathdt<ddate','Procdate<birth_date','ProcDate>deathdt','Measure_date<birth_date','Measure_date>deathdt','rxdate<birth_date','rxdate>deathdt','rx_start_dt<birth_date','rx_start_dt>deathdt','Result_date<birth_date','Result_date>deathdt','Deathdt<birth_date')   ,
                        srctab = c('Encounters','Encounters','Encounters','Encounters','Procedures','Procedures','Vital_signs','Vital_signs','Pharmacy','Pharmacy','Prescribing','Prescribing','lab_results','lab_results','Demographics and death')
)

table2.03_tx <- tidyr::gather(table2.03, "comp", 'n') %>%
  within({
    pct <- round(100*n/patsInEncTab$n, 2)
  }) 
table2.03_tx$ord <- 1:nrow(table2.03_tx)
table2.03_tx <- table2.03_tx%>%
  merge(labels2.03, by.x='comp',by.y='name', all.x=T) %>%
  arrange(ord)
#knitr::kable(table2.03_tx[,c('ord','comp','n','pct', 'srctab')], col.names = c(' ','Date comparison','N Patients','% of unique patients in the encounter table','Source table'))
#params$QAAlert <- T

illogicalDatesMsg <- NULL
illogicalDatesMsg <- if(nrow(subset(table2.03_tx, pct>=5))>0){

  paste("More than 5% of patients have illogical date relationships:", subset(table2.03_tx, pct>=5)$comp, sep=' ')
} else if(nrow(subset(fdatesdone, pctFuture>=5))==0 ) {
                   "No table exceeds the 5% limit on illogical dates."
} 

  QA_Alert_Messages[nrow(QA_Alert_Messages) + 1,] = c("2.03",illogicalDatesMsg)
# to implement a new check add a new row to the following table:
# table = the standard name of the table in the database (it will automaticaly correct for you if the actual table names are not standard)
# var = the name of the variable to be chacked
# condition = a valid logical condition ( in MS sql server language) which identifies a bad value
# where = a valid where clause to subset the table, be sure to start with "WHERE", see example for ddate
check3.03 <-
rbind( 
  data.frame(table='demographics', disp='birth_date',   var='birth_date',            condition='is null', where=' '),
  data.frame(table='demographics', disp='gender',   var='gender',                condition="not in ('M','F','O')", where=' '),
  data.frame(table='encounters', disp='discharge_disposition',     var='discharge_disposition', condition="not in ('A','E')", where=' '),
  data.frame(table='encounters', disp='ddate',    var='ddate',                 condition="is null", where="where enctype = 'IP'"),
  data.frame(table='procedures', disp='procdate',    var='procdate',              condition='is null', where=' '),
  data.frame(table='prescribing',disp='rx_order_date',   var='rx_order_date',         condition='is null', where=' '),
  data.frame(table='pharmacy', disp='rxsup and/or rxamt', var='not(rxsup>1 or rxamt>1)',       condition=' ', where=' '), # compicated by it depending on two variables
  #data.frame(table='cause_of_death', var='source',                condition="not in ('S', 'N', 'T', 'B', 'L', 'U', 'O')"),
  data.frame(table='diagnoses',disp='dx_origin',      var='dx_origin',             condition="not in ('OD', 'BI', 'CL', 'PR', 'NI', 'OT')", where=' '),
  data.frame(table='diagnoses', disp='enc_id',      var='enc_id',                condition="is null", where=' '),
  data.frame(table='procedures', disp='enc_id',     var='enc_id',                condition="is null", where=' '),
  data.frame(table='vital_signs', disp='enc_id',    var='enc_id',                condition="is null", where=' ')

)
res3.03 <- list()
for(i in 1:nrow(check3.03)){


  res3.03[[i]] <- with(check3.03, {run_query( 
                                            paste("select '",table[i],"' as dataTable,'",var[i],"' as variable, '",disp[i],"' as display, count(*) as N_rows, sum(case when ",var[i],condition[i],"then 1 else 0 end) as N_bad",
                                                  "from",eval(as.name(tolower(table[i]))), where[i]
                                                  ,
                                                  sep=' '
                                                  )
  )}
  )
}
result3.03 <- do.call('rbind', res3.03) %>%
  within(.,{
    pctBad <- round(100*N_bad/N_rows, 2)
  })
check3.03Msg <- NULL
check3.03Msg <- if(nrow(subset(result3.03, pctBad>=10))>0){
  result3.03gt10 <- subset(result3.03, pctBad>=10)
  paste("More than 10% of records have missing or unknown values for the following table fields:  ",
        paste(paste(trimws(result3.03gt10$dataTable), ": ", trimws(result3.03gt10$display), sep=''), sep = ", "), sep="")
} else if(nrow(subset(result3.03, pctFuture>=10))==0 ) {
                   "No check exceeds the 10% limit on bad values."
}

  QA_Alert_Messages[nrow(QA_Alert_Messages) + 1,] = c("3.03",check3.03Msg)
patsWithEncs <- run_query(paste0("WITH cte_patWEncDiag AS (
    SELECT count(DISTINCT di.person_id) patsWEncDiag
    FROM ", demographics, " de
    JOIN ", encounters ," e on e.person_id = de.person_id
    JOIN ", diagnoses ," di ON de.person_id = di.person_id
    ),
cte_patWEncProc AS (
    SELECT count(DISTINCT p.person_id) AS patsWEncProc
    FROM ", demographics, " de
    JOIN ", encounters, " e on e.person_id = de.person_id
    JOIN ", procedures, " p ON de.person_id = p.person_id
    ),
cte_pats AS (
    SELECT count(DISTINCT de.person_id) nrows
    FROM ", demographics, " de
    JOIN ", encounters, " e on e.person_id = de.person_id
    )
SELECT nrows, patsWEncDiag, patsWEncProc
FROM cte_patWEncDiag
    ,cte_patWEncProc
    ,cte_pats") )
patsWithEncs <- within(patsWithEncs, {
  pctWEncDiag <- 100.0* patsWEncDiag/nrows
  pctWEncProc <- 100.0* patsWEncProc/nrows
})
patDiagMsg <- with (patsWithEncs,{
  if(pctWEncDiag<50.0 ){
  paste0('WARNING: Only ',round(pctWEncDiag,1),'% of patients with encounters have diagnoses') 
} else { 
  paste0('NOTE: ',round(pctWEncDiag,1),'% of patients with encounters have diagnoses') 
}
})
patProcMsg <- with (patsWithEncs,{
  if(pctWEncProc<50.0 ){
  paste0('WARNING: Only ',round(pctWEncProc,1),'% of patients with encounters have procedures') 
} else { 
  paste0('NOTE: ',round(pctWEncProc,1),'% of patients with encounters have procedures') 
}
})

  QA_Alert_Messages[nrow(QA_Alert_Messages) + 1,] = c("3.04",patProcMsg)
  QA_Alert_Messages[nrow(QA_Alert_Messages) + 1,] = c("3.05",patDiagMsg)
ipedei_no_principal_diag <- run_query(paste0("SELECT        
       InpatientEnc, 
       InpatientEnc - InpatientEncWPrincipalDiag AS InpatientEncWOPrincipalDiag, 
       CAST(ROUND((1.0 - CAST(InpatientEncWPrincipalDiag AS FLOAT) / CAST(InpatientEnc AS FLOAT)) *
       100.0, 2) as DECIMAL(5,2)) AS InpatientEncWOPrincipalDiagPcnt
FROM              
(
    SELECT    
           COUNT(e.ENC_ID) AS InpatientEnc, 
           SUM(IIF(d.ENC_ID IS NOT NULL, 1, 0)) AS InpatientEncWPrincipalDiag
    FROM ",     
         encounters ," e
         LEFT JOIN
    (
        SELECT DISTINCT 
               ENC_ID
        FROM  ", 
             diagnoses, " d
        WHERE  d.PRINCIPAL_DX = 'P'
               AND d.DX_ORIGIN != 'PR'
    ) d
              ON d.ENC_ID = e.ENC_ID
    WHERE e.ENCTYPE IN('IP', 'EI')
) PxDiags;"))
if (params$QAAlert == TRUE){
  ipedei_no_principal_diag_table <- subset(ipedei_no_principal_diag, InpatientEncWOPrincipalDiagPcnt > 10.0)
  if (!(length(ipedei_no_principal_diag_table)==0) & nrow(ipedei_no_principal_diag_table) > 0){
    QA_Alert_Message_306 <- paste0("More than 10% of IP (inpatient) encounters with any diagnosis don't have a principal diagnosis")
    QA_Alert_Messages[nrow(QA_Alert_Messages) + 1,] = c("3.06",QA_Alert_Message_306)
  }
}
benchmark_start <- run_query(paste0("
  SET NOCOUNT ON;
  DECLARE @BenchmarkStartDate DATE;
  SET @BenchmarkStartDate =
  (
      SELECT 
             IIF(MAX(x.ADATE) <= GETDATE(), MAX(x.ADATE), GETDATE())
      FROM ",   
           encounters, " x
  );
  SELECT @BenchmarkStartDate;
"))
data_result_75_complete <- run_query(gsub( "@BenchmarkStartDate", paste0('\'', benchmark_start[1,1], '\'', sep=''), paste0("
  SET NOCOUNT ON;
  DECLARE @BenchmarkPriorYearAvg INT;
  SET @BenchmarkPriorYearAvg =
  (
      SELECT 
             COUNT(1) / 12
      FROM  ", 
           encounters, " e
      WHERE  e.ENCTYPE IN('IP', 'AV', 'ED', 'EI')
      AND e.ADATE >= DATEADD(MONTH, -24, @BenchmarkStartDate)
      AND e.ADATE <= DATEADD(MONTH, -12, @BenchmarkStartDate)
  );

  SELECT 
         FORMAT(ADATE, 'MM-MMM') Month, 
         @BenchmarkPriorYearAvg AS BenchmarkCount, 
         COUNT(1) PriorMonthCount, 
         IIF(@BenchmarkPriorYearAvg <> 0, CAST(ROUND((CAST(COUNT(1) AS FLOAT) / @BenchmarkPriorYearAvg * 100.0), 2) as DECIMAL(5,2)), 0)  AS PercentofBenchMark
  FROM ",  
       encounters, " e
  WHERE  e.ENCTYPE IN('IP', 'AV', 'ED', 'EI')
  AND e.ADATE >= DATEADD(MONTH, -12, @BenchmarkStartDate)
  GROUP BY 
           FORMAT(ADATE, 'MM-MMM')
  ORDER BY 
           FORMAT(ADATE, 'MM-MMM');                          

")))
data_result_75_complete$PercentofBenchMark <- as.numeric(as.character(data_result_75_complete$PercentofBenchMark)) 

QA Alerts

knitr::kable(QA_Alert_Messages, row.names = FALSE)

\pagebreak

Data Check 1.03: Required fields are not present

knitr::kable(missing_columnsTables, row.names = FALSE, col.names = c("Table Name", "Column Name"))

Data Check 1.04: Required fields do not conform to data model specifications for data type, length, or name

invalid_specs[is.na(invalid_specs)] = ""
knitr::kable(invalid_specs, row.names = FALSE, col.names = c("Table Name", "Column Name", "Expected Number Precision", "Found Number Precision", "Expected Number Scale", "Found Number Scale", "Expected Char Length", "Found Char Length", "Expected Date Precision", "Found Date Precision"))

Data Check 1.05: Tables have primary key definition errors

violated_primary_keys[is.na(violated_primary_keys)] = ""
knitr::kable(violated_primary_keys, row.names = FALSE, digits = 2, format.args = list(big.mark = ",", scientific = FALSE))

Data Check 1.06: Required fields contain values outside of data model specifications

data_validation[is.na(data_validation)] = ""
knitr::kable(data_validation, digits = 2, format.args = list(big.mark = ",", scientific = FALSE))

Data Check 1.07: Required fields have non-permissible missing values

knitr::kable(nullableValues, row.names = FALSE, col.names = c("Table Name", "Column Name", "Expected Field Allows Null Value", "Found Field Allows Null Value"), digits = 2, format.args = list(big.mark = ",", scientific = FALSE))

Data Check 1.08: Tables contain orphan PERSON_IDs

knitr::kable(orphan_person_ids_table, row.names = FALSE, col.names = c("Target Table", "Target Column", "Reference Table", "Reference Column", "Count of Values Not Found", "Distinct Total in Target Table", "Percentage of Missing to Distinct Total"), digits = 2, format.args = list(big.mark = ",", scientific = FALSE))

Data Check 1.09: Tables contain orphan ENCOUNTER_IDs

knitr::kable(orphan_enc_ids_table, row.names = FALSE, col.names = c("Target Table", "Target Column", "Reference Table", "Reference Column", "Count of Values Not Found", "Distinct Total in Target Table", "Percentage of Missing to Distinct Total"), digits = 2, format.args = list(big.mark = ",", scientific = FALSE))

Data Check 1.10: Replication errors between the ENCOUNTER, PROCEDURES and DIAGNOSIS tables

knitr::kable(subset(repErrors_tx, select=c(tablename, nrows, field, NP_bad)),
             col.names = c('Table','N','Field','Errors N(%)'), digits = 2, format.args = list(big.mark = ",", scientific = FALSE))

Data Check 1.11: Encounters assigned to more than one person_id in ENCOUNTERS table

knitr::kable(GR1Person, col.names = c('Table','Number of Encounters','Encounters Assinged to More Than One Person', "Percentage"), row.names = FALSE, digits = 2, format.args = list(big.mark = ",", scientific = FALSE))

Data Check 1.12: Tables contain orphan PROVIDER_IDs

knitr::kable(orphan_provider_ids_table, row.names = FALSE, col.names = c("Target Table", "Target Column", "Reference Table", "Reference Column", "Count of Values Not Found", "Distinct Total in Target Table", "Percentage of Missing to Distinct Total"), digits = 2, format.args = list(big.mark = ",", scientific = FALSE))

Data Check 1.13: More than 5% of ICD, CPT, LOINC, RXCUI, or NDC codes do not conform to the expected length or content

knitr::kable(code_conform_table_counts, row.names = FALSE, col.names = c("Table Name", "Code Type", "Total Codes in Table", "Total Invalid Codes", "Percent Invalid"), digits = 3, format.args = list(big.mark = ",", scientific = FALSE))
cat(" \n")  
cat("Top 50 Invalid Codes")
top_50_invalid$Code[top_50_invalid$Code==""] <- "\'blank\'"
knitr::kable(top_50_invalid, row.names = FALSE, col.names = c("Table Name", "Code Type", "Code", "Valid Result", "Count Invalid"), digits = 2, format.args = list(big.mark = ",", scientific = FALSE))

Data Check 2.01: More than 5% of records have future dates

  futureDataMsg <- if(nrow(subset(fdatesdone, pctFuture>=5))>0 & params$QAAlert==F){
    knitr::kable(subset(fdatesdone, pctFuture>=5), col.names = c('Table','Date name','Total rows','N future dates','%'), digits = 2, format.args = list(big.mark = ",", scientific = FALSE))
} else if(nrow(subset(fdatesdone, pctFuture>=5))==0 & params$QAAlert==F) {
  "No table exceeds the 5% limit on future dates."
} else if ( params$QAAlert==T) {
  knitr::kable(fdatesdone, col.names = c('Table','Date name','Total rows','N future dates','%'), digits = 2, format.args = list(big.mark = ",", scientific = FALSE))
}

futureDataMsg

Data Check 2.02: More than 10% of records fall into the lowest or highest categories of age, height, weight, diastolic bp, systolic bp, or dispensed days supply

outOfRangeMsg <- NULL
outOfRangeMsg <- if(nrow(subset(allOutOfRange, (N_low+N_high)/nrows>=0.1))>0 & params$QAAlert==F){
  allOutOfRange <-  subset(subset(allOutOfRange, (N_low+N_high)/nrows>=0.1), select=c(table,item, low, high, nrows, NP_low, NP_high))
  knitr::kable(allOutOfRange, col.names = c('Table','Field','Check Low','High','N','Low values N (%)','High values N (%)')
, digits = 2, format.args = list(big.mark = ",", scientific = FALSE)) 
} else if(nrow(subset(allOutOfRange, (N_low+N_high)/nrows>=0.1))==0 & params$QAAlert==F) {
                   "No table exceeds the 10% limit."
} else if ( params$QAAlert==T) {
  allOutOfRange <- subset(allOutOfRange, select=c(table,item, low, high, nrows, NP_low, NP_high))
  knitr::kable(allOutOfRange, col.names = c('Table','Field','Check Low','High','N','Low values N(%)','High values N (%)'), digits = 2, format.args = list(big.mark = ",", scientific = FALSE))
}
outOfRangeMsg

Data Check 2.03: More than 5% of patients have illogical date relationships

illogicalDatesMsg <- NULL
illogicalDatesMsg <- if(nrow(subset(table2.03_tx, pct>=5))>0 & params$QAAlert==F){
  knitr::kable(subset(table2.03_tx, pct>=5)[,c('ord','comp','n','pct', 'srctab')], 
               col.names = c(' ','Date comparison','N Patients','% of unique patients in the encounter table','Source table'))
} else if(nrow(subset(table2.03_tx, pct>=5))==0 & params$QAAlert==F) {
  "No table exceeds the 5% limit on illogical dates."
} else if ( params$QAAlert==T) {
  knitr::kable(table2.03_tx[,c('ord','comp','n','pct', 'srctab')], 
               col.names = c(' ','Date comparison','N Patients','% of unique patients in the encounter table','Source table'))
}
illogicalDatesMsg

Data Check 3.03: More than 10% of records have missing or unknown values for the following fields:

  check3.03Msg <- NULL
check3.03Msg <- if(nrow(subset(result3.03, pctBad>=10.0))>0 & params$QAAlert==F){
  knitr::kable(subset(result3.03, pctBad>=10.0)[,c('dataTable','display','N_rows','N_bad', 'pctBad')],
               col.names = c('Table','Variable','N Rows','N with bad values','%'))
} else if(nrow(subset(result3.03, pctBad>=10.0))==0 & params$QAAlert==F) {
  "No check exceeds the 10% limit on bad values."
} else if ( params$QAAlert==T) {
  knitr::kable(result3.03[,c('dataTable','display','N_rows','N_bad', 'pctBad')], 
               col.names = c('Table','Variable','N Rows','N with bad values','%'), digits = 2, format.args = list(big.mark = ",", scientific = FALSE))
}
check3.03Msg

Data Check 3.04 Patients with Encounters and Diagnoses

check3.04 <- data.frame(patsWithEncs[,c('nrows','patsWEncDiag','pctWEncDiag')]) %>% format(digits = 2, big.mark = "," )
check3.04$patsWEncDiag <- paste(check3.04$patsWEncDiag," (",check3.04$pctWEncDiag, "%)", sep="")

knitr::kable(check3.04[,c('nrows','patsWEncDiag')], row.names = FALSE, col.names = c("Patients w/ Encounters", "Patients with Encounters and Diagnosis Records"))

Data Check 3.05 Patients with Encounters and Procedures

check3.05 <- data.frame(patsWithEncs[,c('nrows','patsWEncProc','pctWEncProc')]) %>% format(digits = 2, big.mark = "," )
check3.05$patsWEncProc <- paste(check3.05$patsWEncDiag," (",check3.05$pctWEncProc, "%)")

knitr::kable(check3.05[,c('nrows','patsWEncProc')], row.names = FALSE, col.names = c("Patients w/ Encounters", "Patients with Encounters and Procedure Records"))

Data Check 3.06: More than 10% of IP (inpatient) or ED to inpatient (EI) encounters with any diagnosis don't have a principal diagnosis

knitr::kable(ipedei_no_principal_diag, row.names = FALSE, col.names = c("In-Patient Type Encounters", "In-Patient Encounters W/O Principal Diagnoses", "% of In-Patient Type Encounters W/O Principal Diagnoses"), digits = 2, format.args = list(big.mark = ",", scientific = FALSE))

Data Check 3.07: Encounters, diagnoses, or procedures in an ambulatory (AV), emergency department (ED), ED to inpatient (EI), or inpatient (IP) setting are less than 75% complete three months prior to the current month

cat(paste0('Benchmark Start Date: ', benchmark_start[1,1]))
if (params$QAAlert == TRUE){
  data_result_75_complete_table <- subset(data_result_75_complete, PercentofBenchMark < 75 )
  if (!(length(data_result_75_complete_table)==0) & nrow(data_result_75_complete_table) > 0){
    QA_Alert_Message_307 <- paste0("Encounters, diagnoses, or procedures in an ambulatory (AV), emergency department (ED), or inpatient (IP) setting are less than 75% complete three months prior to the current month")
    QA_Alert_Messages[nrow(QA_Alert_Messages) + 1,] = c("3.07",QA_Alert_Message_307)
  }
}
knitr::kable(data_result_75_complete,row.names = FALSE, col.names = c("Month", "Benchnmark Count", "Prior Month Count", "% of Benchmark"), digits = 2, format.args = list(big.mark = ",", scientific = FALSE))

Total program run time:

# close odbc
odbcCloseAll()
endTime <- Sys.time()
runtime <- endTime - startTime

Query run time = r runtime minutes



UCCC/CHORDS-QA documentation built on July 18, 2021, 6:39 a.m.