library(tidyr) library(dplyr) knitr::opts_chunk$set(echo = FALSE, message = FALSE, warning = FALSE, error= TRUE, time_it = TRUE) startTime <- Sys.time() all_times <- list() knitr::knit_hooks$set(time_it = local({ now <- NULL function(before, options) { if (before) { now <<- Sys.time() } else { res <- difftime(Sys.time(), now) all_times[[options$label]] <<- c(options$label, format(round(res, 3))) #Only prints times greater than 5 minutes. if(res > as.difftime("00:05", format="%H:%M")){ paste("Time for the chunk", options$label, "to run:", format(round(res, 3))) } } } }))
connectionString <- getConnectionString(params)
run_query <- function(query_text, ...){ result <- run_db_query(connectionString, query_text ) return(result) } get_connection <- function(){ connection <- get_new_connection(connectionString) return(connection) }
tbresult <-runTableReplacements(connectionString)
QA_Alert_Messages <- data.frame('Alert Code'=character(), 'Alert Message'=character(), stringsAsFactors = FALSE)
The purpose of the data quality program is to conduct data quality checks that correspond to similar data checks carried out by PCORnet (Version 7). The checks cover 4 realms of data quality: data model conformance, data plausibility, data completeness, and data persistence. The program uses a series of SQL queries operationalized using RStudio to produce this report. The program is designed to support tables/figures that provide detailed information about each data quality check as well as an alert if a data error appears to be present (based on criteria developed by PCORnet or the CHORDS team).
This data quality report was generated from CHORDS r params$DBName
.
Data Partner:
Analyst:
Query Run Date: r Sys.Date()
# Table to check for quality tableCheckList <- c(lab_results, encounters, prescribing, procedures, social_history, provider_specialty, diagnoses, vital_signs, census_location, demographics, pro_surveys, pro_questions, pro_responses, benefit, linkage, death, pharmacy) # table list in DB #sqlConnection <- get_connection() #dbTables <- sqlTables(get_connection(), catalog = params$DBName) dbTables <- run_query(query_text = "SELECT TABLE_NAME FROM INFORMATION_SCHEMA.TABLES UNION ALL SELECT name AS TABLE_NAME FROM sys.synonyms;") #dbTables <- merge(dbTables, dbSynonyms, by="TABLE_NAME") #RODBC::odbcClose(sqlConnection) tabledChecked <- vector(length=length(tableCheckList)) for (i in 1:length(tableCheckList)){ tabledChecked[i] <- sum(grepl(tableCheckList[i], dbTables$TABLE_NAME, ignore.case = T))>0 } missingTables <- setdiff(tableCheckList, tableCheckList[tabledChecked]) nonMissingTables <- setdiff(tableCheckList, missingTables) nonMissingDBTables <- subset(dbTables, grepl(paste(tableCheckList, collapse = '|'), TABLE_NAME, ignore.case = T), select=TABLE_NAME)[,,drop=T] missingTableMessage <- if(length(missingTables)==0) { "All tables accounted for." } else{ paste("The following tables are missing:", paste(missingTables, collapse = ','), sep = ' ') } QA_Alert_Messages[nrow(QA_Alert_Messages) + 1,] = c("1.01",missingTableMessage)
## Data Check 1.02: Required tables are not populated table_N_lst <- list() #vector(length=length(nonMissingDBTables)) for(i in 1:length(nonMissingDBTables)){ table_N_lst[[i]] <- run_query(paste("select distinct '_aaa_' as _aaa_, count(*) as tabN from ",nonMissingDBTables[i], sep=' '))#[,'tabN',drop=T] # print(dbTables[i]) if (class(table_N_lst[[i]]) !='data.frame') {table_N_lst[[i]] <- NULL} } #length(dbTables) table_N_lst <- do.call('rbind', table_N_lst) [,'tabN',drop=T] nonPopTables <- nonMissingDBTables[table_N_lst==0] nonPopMessage <- if(length(nonPopTables)==0) {"All tables populated, among those present." } else{ paste("The following tables exist, but are not populated:", paste(nonPopTables, collapse = ','), sep = ' ') } QA_Alert_Messages[nrow(QA_Alert_Messages) + 1,] = c("1.02",nonPopMessage)
sqlFiles <- system.file('sql', package='chordsTables') schema_validation_query <- readChar(dir(sqlFiles, "VDW_Schema_Validation.sql", f=TRUE), file.info(dir(sqlFiles, "VDW_Schema_Validation.sql", f=TRUE))$size)#read_file(dir(sqlFiles, "VDW_Schema_Validation.sql", f=TRUE)) connection103 <- get_connection() schema_validation <- sqlQuery(connection103, schema_validation_query, as.is=TRUE) odbcClose(connection103)
## Data Check 1.03: Required fields are not present missing_columns <- subset(schema_validation, Result == "TABLE/VIEW OR COLUMN MISSING") missing_columnsTables <- select(missing_columns, c("TableName", "ColumnName")) if (params$QAAlert == TRUE & nrow(missing_columnsTables) > 0){ missing_columns$message <- paste0(missing_columns$TableName, ".", missing_columns$ColumnName, sep="") QA_Alert_Message_103_items <- paste0(missing_columns$message, collapse = ', ') QA_Alert_Message_103 <- paste0("The following required fields are not present: ", QA_Alert_Message_103_items) QA_Alert_Messages[nrow(QA_Alert_Messages) + 1,] = c("1.03",QA_Alert_Message_103) }
misconfig <- subset(schema_validation, (Result == "CONFIG-MISMATCH" & ExpectedIsNullable == FoundIsNullable)) invalid_specs <- select(misconfig, c("TableName", "ColumnName", "ExpectedNumberPrecision", "FoundNumberPrecision", "ExpectedNumberScale", "FoundNumberScale", "ExpectedCharLength", "FoundCharLength", "ExpectedDatePrecision", "FoundDatePrecision")) if (params$QAAlert == TRUE & nrow(invalid_specs) > 0){ misconfig$message <- paste0(misconfig$TableName, ".", misconfig$ColumnName, sep="") QA_Alert_Message_104_items <- paste0(misconfig$message, collapse = ', ') QA_Alert_Message_104 <- paste0("The following required fields do not conform to data model specifications for data type, length, or name: ", QA_Alert_Message_104_items) QA_Alert_Messages[nrow(QA_Alert_Messages) + 1,] = c("1.04",QA_Alert_Message_104) }
primary_key_data <- run_query(" SET ANSI_NULLS ON; SET QUOTED_IDENTIFIER ON; SET NOCOUNT ON; SET ANSI_WARNINGS OFF; BEGIN IF OBJECT_ID('tempdb..#PKVALIDATION') IS NOT NULL BEGIN DROP TABLE #PKVALIDATION; END; CREATE TABLE #PKVALIDATION ( Table_Name VARCHAR(250) , Column_Name VARCHAR(250) ); IF OBJECT_ID('tempdb..#KeyMatchResult') IS NOT NULL BEGIN DROP TABLE #KeyMatchResult; END; IF OBJECT_ID('tempdb..#PKVIOLATIONS') IS NOT NULL BEGIN DROP TABLE #PKVIOLATIONS; END; CREATE TABLE #PKVIOLATIONS ( Table_Name VARCHAR(250) , Violation_Count BIGINT, ErrorMessage varchar(500) DEFAULT NULL ); END; /***************************************************************************** END TempTable Clearing and Creation *****************************************************************************/ /***************************************************************************** BEGIN Table Create Section *****************************************************************************/ INSERT INTO #PKVALIDATION VALUES ( 'CENSUS_DEMOG', 'CENSUS_YEAR'), ( 'CENSUS_DEMOG', 'GEOCODE'), ( 'EVERNDC', 'NDC'), ( 'EVERNDC', 'GENERIC'), ( 'PROVIDER_SPECIALTY', 'PROVIDER'), ( 'DEATH', 'PERSON_ID'), ( 'CAUSE_OF_DEATH', 'PERSON_ID'), ( 'CAUSE_OF_DEATH', 'COD'), ( 'DEMOGRAPHICS', 'PERSON_ID'), ( 'LINKAGE', 'CID'), ( 'LINKAGE', 'LINE'), ( 'BENEFIT', 'BENEFIT_ID'), ( 'ENCOUNTERS', 'ENC_ID'), ( 'DIAGNOSES', 'DIAGNOSES_ID'), ( 'ENROLLMENT', 'PERSON_ID'), ( 'ENROLLMENT', 'ENR_START'), ( 'LAB_RESULTS', 'LAB_RESULTS_ID'), ( 'PRO_SURVEYS', 'PRO_ID'), ( 'PRO_QUESTIONS', 'PRO_ID'), ( 'PRO_QUESTIONS', 'QUESTION_ID'), ( 'PRO_QUESTIONS', 'QUESTION_VER'), ( 'PRO_RESPONSES', 'RESPONSE_ID'), ( 'PHARMACY', 'PHARMACY_ID'), ( 'PRESCRIBING', 'PRESCRIBING_ID'), ( 'PROCEDURES', 'PROCEDURES_ID'), ( 'SOCIAL_HISTORY', 'SOCIAL_HISTORY_ID'), ( 'VITAL_SIGNS', 'VITAL_SIGNS_ID'), ( 'TUMOR', 'TUMOR_ID'), ( 'LANGUAGES', 'PERSON_ID'), ( 'LANGUAGES', 'LANG_ISO'), ( 'CENSUS_LOCATION', 'PERSON_ID'), ( 'CENSUS_LOCATION', 'LOC_START'); /***************************************************************************** BEGIN Table Name Replacement: If a TableName replacement table exists, it will swap out the names in the tables for the correct ones based on how it's mapped in their table. *****************************************************************************/ BEGIN IF OBJECT_ID('CHORDS_TABLENAMES') IS NOT NULL BEGIN UPDATE a SET a.Table_Name = b.NEW_NAME FROM #PKVALIDATION a JOIN CHORDS_TABLENAMES b ON b.ORG_NAME = a.Table_Name; END; END; /***************************************************************************** END Table Name Replacement *****************************************************************************/ /***************************************************************************** BEGIN Analysis Section: Compares the partner's primary keys to the expected keys ******************************************************************************/ SELECT * INTO #KeyMatchResult FROM ( SELECT ExpectKeys.TABLE_NAME, ExpectKeys.COLUMN_NAMES AS Expected_Primary_Key, CurrKeys.COLUMN_NAMES AS Found_Primary_Key, CASE WHEN ob.type = 'V' THEN 'VIEW FOUND' WHEN CurrKeys.TABLE_NAME IS NULL THEN 'TABLE/VIEW OR KEYS NOT FOUND' WHEN ExpectKeys.COLUMN_NAMES != CurrKeys.COLUMN_NAMES THEN 'KEY MISMATCH' WHEN ExpectKeys.COLUMN_NAMES = CurrKeys.COLUMN_NAMES THEN 'OK' ELSE 'UNKNOWN ERROR' END AS Key_Match_Result FROM ( SELECT TABLE_NAME, LEFT(COL, LEN(COL) - 1) AS COLUMN_NAMES FROM ( SELECT DISTINCT TAB.TABLE_NAME TABLE_NAME, ( SELECT COL.Column_Name + ', ' AS [text()] FROM #PKVALIDATION COL WHERE COL.Table_Name = TAB.Table_Name ORDER BY COL.Column_Name FOR XML PATH('') ) COL FROM #PKVALIDATION TAB ) T WHERE T.COL IS NOT NULL ) ExpectKeys LEFT JOIN ( SELECT TABLE_NAME, LEFT(COL, LEN(COL) - 1) AS COLUMN_NAMES FROM ( SELECT DISTINCT TAB.TABLE_NAME TABLE_NAME, ( SELECT COL.COLUMN_NAME + ', ' AS [text()] FROM INFORMATION_SCHEMA.CONSTRAINT_COLUMN_USAGE COL WHERE COL.CONSTRAINT_NAME = TAB.CONSTRAINT_NAME AND COL.TABLE_NAME = TAB.TABLE_NAME AND CONSTRAINT_TYPE = 'PRIMARY KEY' ORDER BY COL.COLUMN_NAME FOR XML PATH('') ) COL FROM INFORMATION_SCHEMA.TABLE_CONSTRAINTS TAB ) T WHERE T.COL IS NOT NULL ) CurrKeys ON CurrKeys.TABLE_NAME = ExpectKeys.TABLE_NAME LEFT JOIN sys.objects ob ON ob.object_id = OBJECT_ID(ExpectKeys.TABLE_NAME) and ob.type in ('U', 'V')) KeyMatch; BEGIN DECLARE @SQL NVARCHAR(3000); DECLARE @Table_Name VARCHAR(100); DECLARE @Key_Columns VARCHAR(100); DECLARE CUR CURSOR FOR SELECT Table_Name, Expected_Primary_Key FROM #KeyMatchResult; OPEN CUR; FETCH NEXT FROM CUR INTO @Table_Name, @Key_Columns; WHILE @@FETCH_STATUS = 0 BEGIN SET @SQL = ' IF OBJECT_ID(''' + @Table_Name + ''') IS NOT NULL WITH CTEKEY AS (SELECT COUNT(*) KEYTOT FROM ( SELECT DISTINCT ' + @Key_Columns + ' FROM ' + @Table_Name + ') z), CTETOT AS (SELECT COUNT(*) TABTOT FROM ' + @Table_Name + ') INSERT INTO #PKVIOLATIONS SELECT ''' + @Table_Name + ''', CTETOT.TABTOT - CTEKEY.KEYTOT, NULL FROM CTETOT, CTEKEY;' BEGIN TRY EXEC Sp_executesql @SQL; END TRY BEGIN CATCH Declare @ErrorMessageRef as varchar(max); SELECT @ErrorMessageRef = ERROR_MESSAGE(); INSERT INTO #PKVIOLATIONS VALUES(@Table_Name, NULL, @ErrorMessageRef) END CATCH FETCH NEXT FROM CUR INTO @Table_Name, @Key_Columns; END; CLOSE CUR; DEALLOCATE CUR; END; SELECT DISTINCT z.Table_Name, z.Expected_Primary_Key, z.Found_Primary_Key, CASE WHEN z.ErrorMessage IS NOT NULL THEN z.ErrorMessage WHEN z.Key_Match_Result = 'TABLE/VIEW OR KEYS NOT FOUND' AND y.name IS NOT NULL AND z.ErrorMessage IS NULL THEN 'TABLE SYNONYM FOUND; UNABLE TO AUTOMATICALLY VALIDATE' ELSE z.Key_Match_Result END AS Key_Match_Result, z.Violation_Count FROM ( SELECT m.*, v.Violation_Count, v.ErrorMessage FROM #KeyMatchResult m LEFT JOIN #PKVIOLATIONS v ON v.Table_Name = m.TABLE_NAME ) z LEFT JOIN sys.synonyms y ON y.name = z.TABLE_NAME;")
violated_primary_keys <- subset(primary_key_data, Key_Match_Result != "OK") if (params$QAAlert == TRUE & nrow(violated_primary_keys) > 0){ violated_primary_keys_table <- subset(violated_primary_keys, (Violation_Count > 0 & Key_Match_Result == "VIEW FOUND") | (Violation_Count > 0 & Key_Match_Result == "TABLE SYNONYM FOUND; UNABLE TO AUTOMATICALLY VALIDATE") | (Key_Match_Result != "VIEW FOUND" & Key_Match_Result != "TABLE SYNONYM FOUND; UNABLE TO AUTOMATICALLY VALIDATE")) violated_primary_keys_table$message <- paste0(violated_primary_keys_table$Table_Name, sep="") QA_Alert_Message_105_items <- paste0(violated_primary_keys_table$message, collapse = ', ') QA_Alert_Message_105 <- paste0("The following tables have primary key definition errors: ", QA_Alert_Message_105_items) QA_Alert_Messages[nrow(QA_Alert_Messages) + 1,] = c("1.05",QA_Alert_Message_105) }
sqlFiles <- system.file('sql', package='chordsTables') data_validation_query <- readChar(dir(sqlFiles, "VDW_DataValidation.sql", f=TRUE), file.info(dir(sqlFiles, "VDW_DataValidation.sql", f=TRUE))$size) #read_file(dir(sqlFiles, "VDW_DataValidation.sql", f=TRUE)) connection106 <- get_connection() results <- sqlQuery(connection106, data_validation_query, as.is=TRUE) data_validation <- sqlQuery(connection106, " SELECT TargetTable, TargetColumn, CASE WHEN UnexpectedValue = '' THEN '(blank)' ELSE UnexpectedValue END AS UnexpectedValue, Message FROM #CHORDSDataValueResults ORDER BY TargetTable; ", as.is = TRUE) ref_integrity <- sqlQuery(connection106, " SELECT * FROM #CHORDSReferentialIntegrityResults ORDER BY TargetTable; ", as.is = TRUE) odbcClose(connection106) if (params$QAAlert == TRUE & nrow(data_validation) > 0){ data_validation_table <- data.frame(data_validation, stringsAsFactors = FALSE) data_validation_table$message <- paste0(data_validation_table$TargetTable, ".", data_validation_table$TargetColumn, sep="") QA_Alert_Message_106_items <- paste0(data_validation_table$message, collapse = ', ') QA_Alert_Message_106 <- paste0("The following required fields contain values outside of data model specifications: ", QA_Alert_Message_106_items) QA_Alert_Messages[nrow(QA_Alert_Messages) + 1,] = c("1.06", QA_Alert_Message_106) }
nullableValues = subset(schema_validation, (Result == "CONFIG-MISMATCH" & ExpectedIsNullable != FoundIsNullable)) nullableValues <- select(nullableValues, c("TableName", "ColumnName", "ExpectedIsNullable", "FoundIsNullable")) if (params$QAAlert == TRUE & nrow(nullableValues) > 0){ nullableValues_Table <- data.frame(nullableValues, stringsAsFactors = FALSE) nullableValues_Table$message <- paste0(nullableValues_Table$TableName, ".", nullableValues_Table$ColumnName, sep="") QA_Alert_Message_107_items <- paste0(nullableValues_Table$message, collapse = ', ') QA_Alert_Message_107 <- paste0("The following required fields have non-permissible missing values: ", QA_Alert_Message_107_items) QA_Alert_Messages[nrow(QA_Alert_Messages) + 1,] = c("1.07",QA_Alert_Message_107) }
orphan_person_ids <- subset(ref_integrity, ((TargetColumn == "PERSON_ID" | ReferenceColumn == "PERSON_ID") & ValuesNotFound > 0)) orphan_person_ids_table <- select(orphan_person_ids, c("TargetTable", "TargetColumn", "ReferenceTable", "ReferenceColumn", "ValuesNotFound", "TargetTableDistinctCount", "PercentOfDistinctTargetColumnMissing")) if (params$QAAlert == TRUE & nrow(orphan_person_ids_table) > 0){ orphan_person_ids_sub_table <- data.frame(orphan_person_ids_table, stringsAsFactors = FALSE) orphan_person_ids_sub_table$message <- paste0(orphan_person_ids_sub_table$TargetTable, sep="") QA_Alert_Message_108_items <- paste0(orphan_person_ids_sub_table$message, collapse = ', ') QA_Alert_Message_108 <- paste0("The following tables contain orphan PERSON_IDs: ", QA_Alert_Message_108_items) QA_Alert_Messages[nrow(QA_Alert_Messages) + 1,] = c("1.08",QA_Alert_Message_108) }
orphan_enc_ids <- subset(ref_integrity, (TargetColumn == "ENC_ID" | ReferenceColumn == "ENC_ID") & ValuesNotFound > 0) orphan_enc_ids_table <- select(orphan_enc_ids, c("TargetTable", "TargetColumn", "ReferenceTable", "ReferenceColumn", "ValuesNotFound", "TargetTableDistinctCount", "PercentOfDistinctTargetColumnMissing")) if (params$QAAlert == TRUE & nrow(orphan_enc_ids_table) > 0){ orphan_person_ids_sub_table <- data.frame(orphan_enc_ids_table, stringsAsFactors = FALSE) orphan_person_ids_sub_table$message <- paste0(orphan_person_ids_sub_table$TargetTable, sep="") QA_Alert_Message_109_items <- paste0(orphan_person_ids_sub_table$message, collapse = ', ') QA_Alert_Message_109 <- paste0("The following tables contain orphan ENCOUNTER_IDs: ", QA_Alert_Message_109_items) QA_Alert_Messages[nrow(QA_Alert_Messages) + 1,] = c("1.09",QA_Alert_Message_109) }
repErrors <- run_query( paste0( " select 'Diagnosis' as tablename, count(*) as nrows, sum(case when a.person_id <> b.person_id then 1 else 0 end) as Person_ID, sum(case when a.adate <> b.adate then 1 else 0 end) as Adate, sum(case when a.enctype <> b.enctype then 1 else 0 end) as EncType from ", encounters, " as a inner join (select distinct enc_id, person_id, adate, enctype from ", diagnoses, ") as b on a.enc_id = b.enc_id union select 'Procedure' as tablename, count(*) as nrows, sum(case when a.person_id <> b.person_id then 1 else 0 end) as Person, sum(case when a.adate <> b.adate then 1 else 0 end) as Adate, sum(case when a.enctype <> b.enctype then 1 else 0 end) as EncType from ", encounters, " as a inner join (select distinct enc_id, person_id, adate, enctype from ", procedures, ") as b on a.enc_id = b.enc_id " )) repErrors_tx <- tidyr::gather(repErrors, 'field','n_bad',Person_ID, Adate, EncType) %>% arrange(tablename) %>% within({ NP_bad<- paste0(n_bad,' (',round(100*n_bad/nrows, 2),')') }) QA_Alert_Message_1.10 <- if(nrow(subset(repErrors_tx, n_bad>0))>0) {"There are Replication errors between the ENCOUNTER, PROCEDURES and DIAGNOSIS tables"} else { "There were no Replication errors between the ENCOUNTER, PROCEDURES and DIAGNOSIS tables found." } QA_Alert_Messages[nrow(QA_Alert_Messages) + 1,] = c("1.10",QA_Alert_Message_1.10)
GR1Person <- run_query( paste0( " select 'Encounter' as tablename, count(*) as n_encid, sum(case when N_person>1 then 1 else 0 end) as N_gr1 from ( select enc_id, count(distinct person_id) as N_person from ", encounters," group by enc_id ) as qry " )) GR1Person <- within(GR1Person, { pct <- round(100*N_gr1/n_encid, 2) }) #if(GR1Person$pct>5){ # print(paste0("WARNING: The encounter table has more than 5% of encounters assigned to more than 1 person (",GR1Person$pct,"%)")) #} else{ # print(paste0("The encounter table has ", GR1Person$pct,"% of encounters assigned to more than 1 person")) #} encItoManyMsg <- if(GR1Person$pct>5){ paste0("WARNING: The encounter table has more than 5% of encounters assigned to more than 1 person (",GR1Person$pct,"%)") } else{ paste0("The encounter table has ", GR1Person$pct,"% of encounters assigned to more than 1 person") } QA_Alert_Messages[nrow(QA_Alert_Messages) + 1,] = c("1.11",encItoManyMsg)
orphan_provider_ids <- subset(ref_integrity, ((TargetColumn %in% c("RXMD", "PROVIDER") | ReferenceColumn %in% c("RXMD", "PROVIDER"))) & ValuesNotFound > 0) orphan_provider_ids_table <- select(orphan_provider_ids, c("TargetTable", "TargetColumn", "ReferenceTable", "ReferenceColumn", "ValuesNotFound", "TargetTableDistinctCount", "PercentOfDistinctTargetColumnMissing")) if (params$QAAlert == TRUE & nrow(orphan_provider_ids_table) > 0){ orphan_provider_ids_sub_table <- data.frame(orphan_provider_ids_table, stringsAsFactors = FALSE) orphan_provider_ids_sub_table$message <- paste0(orphan_provider_ids_sub_table$TargetTable, ".", orphan_provider_ids_sub_table$TargetColumn, sep="") QA_Alert_Message_112_items <- paste0(orphan_provider_ids_sub_table$message, collapse = ', ') QA_Alert_Message_112 <- paste0("The following tables contain orphan PROVIDER_IDs: ", QA_Alert_Message_112_items) QA_Alert_Messages[nrow(QA_Alert_Messages) + 1,] = c("1.12",QA_Alert_Message_112) }
connection113 <- get_connection() code_conform_table_counts <- sqlQuery(connection113, paste0(" /* - DIAGNOSES: *ICD09:length is not between 3-5 OR has alpha characters other than E or V OR has no numeric characters OR first 3 digits (min length) are 0; *ICD10: length is not between 3 and 7 OR 1st character is not alpha OR first 3 digits (min length) are 0 or 9 OR has no numeric characters; - PROCEDURES: *CPT/HCPCS are length <5 OR first 5 are all 0 or 9 OR no numeric; *ICD9 is not length 3 or 4 OR any alpha OR all 0; *ICD10 is not length 7 OR no numeric OR all 7 are 0s or 9s; - PRESCRIBING: *flag codes with any alphabetical characters OR a length <2 or >7; - PHARMACY: *length not 11 OR any alpha OR a string of 0 or 9; - LAB_RESULT *flag for any alphabetical characters OR a length less than 3 or greater than 7 OR the absence of a dash after the next to last position; */ SET NOCOUNT ON; DROP TABLE IF EXISTS #VdwInvalidCodes; SELECT * INTO #VdwInvalidCodes FROM ( SELECT a.TableName, a.CodeType, a.Code, a.ValidResult FROM ( SELECT 'DIAGNOSES' AS TableName, DX AS Code, DX_CODETYPE AS CodeType, CASE WHEN LEN(replace(DX, '.', '')) NOT BETWEEN 3 AND 7 THEN 'Code Length must be between 3 and 7 characters (excluding \".\")' WHEN DX NOT LIKE '[A-TV-Z]%' THEN 'Starting character must be a letter and not \"U\"' WHEN SUBSTRING(DX, 1, 3) IN('000', '999') THEN 'Invalid Numeric Value Range' WHEN DX NOT LIKE '%[0-9]%' THEN 'Values must include a numerical component' ELSE 'OK' END AS ValidResult FROM ", diagnoses, " WHERE DX_CODETYPE = '10' UNION ALL SELECT 'DIAGNOSES' AS TableName, DX AS Code, DX_CODETYPE AS CodeType, CASE WHEN LEN(replace(DX, '.', '')) NOT BETWEEN 3 AND 5 THEN 'Length must be between 3 and 5 characters (excluding \".\")' WHEN(DX NOT LIKE '[EV]%' AND DX NOT LIKE '[0-9]%') THEN 'Starting Character must be \"E\", \"V\", or a number' WHEN SUBSTRING(DX, 1, 3) IN('000') THEN 'Invalid Numeric Value Range' WHEN DX NOT LIKE '%[0-9]%' THEN 'Numeric Values Not Detected' ELSE 'OK' END AS ValidResult FROM ", diagnoses, " WHERE DX_CODETYPE = '09' UNION ALL SELECT 'PROCEDURES' AS TableName, PX AS Code, PX_CODETYPE AS CodeType, CASE WHEN LEN(PX) < 5 THEN 'Code length must be less than 5 characters' WHEN SUBSTRING(PX, 1, 5) IN('00000', '99999') THEN 'Invalid Numeric Value' WHEN TRY_PARSE(PX AS INT) IS NULL THEN 'Non-numeric Characters Detected' ELSE 'OK' END AS ValidResult FROM ", procedures, " WHERE PX_CODETYPE = 'C4' UNION ALL SELECT 'PROCEDURES' AS TableName, PX AS Code, PX_CODETYPE AS CodeType, CASE WHEN LEN(replace(PX, '.', '')) NOT BETWEEN 3 AND 4 THEN 'Code length must be between 3 and 4 characters' WHEN PX NOT LIKE '[0-9]%' THEN 'Invalid Start Character' WHEN PX IN('00000') THEN 'Invalid Numeric Value' ELSE 'OK' END AS ValidResult FROM ", procedures, " WHERE PX_CODETYPE = '09' UNION ALL SELECT 'PROCEDURES' AS TableName, PX AS Code, PX_CODETYPE AS CodeType, CASE WHEN LEN(PX) != 7 THEN 'Code Length Must Equal 7' WHEN PX NOT LIKE '%[0-9]%' THEN 'Code values must include a numerical component' WHEN PX IN('0000000', '9999999') THEN 'Invalid Numeric Value' ELSE 'OK' END AS ValidResult FROM ", procedures," WHERE PX_CODETYPE = '10' UNION ALL SELECT 'PRESCRIBING' AS TableName, RXNORM AS Code, 'RXNORM' AS CodeType, CASE WHEN RXNORM LIKE '%[A-Z]%' THEN 'Code cannot contain alphabetical characters' WHEN LEN(RXNORM) NOT BETWEEN 2 AND 7 THEN 'Code length must be between 2 and 7 characters' ELSE 'OK' END AS ValidResult FROM ", prescribing, " UNION ALL SELECT 'PHARMACY' AS TableName, NDC AS Code, 'NDC' AS CodeType, CASE WHEN NDC LIKE '%[A-Z]%' THEN 'Code cannot contain alphabetical characters' WHEN LEN(NDC) != 11 THEN 'Code length must be 11 characters' WHEN NDC IN('00000000000', '99999999999') THEN 'Invalid Numeric Value' ELSE 'OK' END AS ValidResult FROM ", pharmacy, " UNION ALL SELECT 'LAB_RESULTS' AS TableName, LOINC AS Code, 'LOINC' AS CodeType, CASE WHEN LOINC LIKE '%[A-Z]%' THEN 'Code cannot contain alphabetical characters' WHEN LEN(LOINC) NOT BETWEEN 3 AND 7 THEN 'Code length must be between 2 and 7 characters' WHEN SUBSTRING(LEFT(REVERSE(RTRIM(LTRIM(LOINC))), 2), 2, 2) != '-' THEN 'No hyphen character in the second to last position' ELSE 'OK' END AS ValidResult FROM ", lab_results, " ) a WHERE a.validResult != 'OK' ) InvalidCodes; WITH CTE_CodeCounts AS (SELECT 'DIAGNOSES' AS TableName, DX_CODETYPE AS CodeType, COUNT(1) CountAll FROM ", diagnoses, " GROUP BY DX_CODETYPE UNION ALL SELECT 'PROCEDURES' AS TableName, PX_CodeType AS CodeType, COUNT(1) CountAll FROM ", procedures, " GROUP BY PX_CodeType UNION ALL SELECT 'PRESCRIBING' AS TableName, 'RXNORM' AS CodeType, COUNT(1) CountAll FROM ", prescribing, " UNION ALL SELECT 'PHARMACY' AS TableName, 'NDC' AS CodeType, COUNT(1) CountAll FROM ", pharmacy, " UNION ALL SELECT 'LAB_RESULTS' AS TableName, 'LOINC' AS CodeType, COUNT(1) CountAll FROM ", lab_results, " ) SELECT b.TableName, b.CodeType, cc.CountAll, b.CountInvalid AS CountInvalid, ROUND(CAST(b.CountInvalid AS FLOAT) / CAST(cc.CountAll AS FLOAT) * CAST(100.0 AS FLOAT), 3) AS PercentInvalid FROM ( SELECT z.TableName, z.CodeType, COUNT(1) AS CountInvalid FROM #VdwInvalidCodes z GROUP BY z.TableName, z.CodeType ) b JOIN CTE_CodeCounts cc ON cc.TableName = b.TableName AND cc.CodeType = b.CodeType ORDER BY b.TableName, b.CodeType; ")) top_50_invalid <- sqlQuery(connection113, " SELECT TableName, CodeType, CASE WHEN Code = '' THEN '(blank)' ELSE Code END AS Code, ValidResult, CountInvalid FROM ( SELECT TableName, CodeType, Code, ValidResult, CountInvalid, ROW_NUMBER() OVER(PARTITION BY TableName ORDER BY CountInvalid DESC) RowNum FROM ( SELECT z.TableName, z.CodeType, z.Code, z.ValidResult, COUNT(1) AS CountInvalid FROM #VdwInvalidCodes z GROUP BY z.TableName, z.CodeType, z.Code, z.ValidResult ) InvalidCount ) Top50 WHERE RowNum <= 50 ORDER BY TableName, CodeType, CountInvalid Desc; ") odbcClose(connection113) if (params$QAAlert == TRUE){ code_conform_table_counts_table <- subset(code_conform_table_counts, PercentInvalid > 5.0) if (!(length(code_conform_table_counts_table)==0)){ code_conform_table_counts_table$message <- paste0(code_conform_table_counts_table$TableName, "-", code_conform_table_counts_table$CodeType, " (",code_conform_table_counts_table$PercentInvalid , "%)", sep="") QA_Alert_Message_113_items <- paste0(code_conform_table_counts_table$message, collapse = ', ') QA_Alert_Message_113 <- paste0("More than 5% of ICD, CPT, LOINC, RXCUI, or NDC codes do not conform to the expected length or content: ", QA_Alert_Message_113_items) QA_Alert_Messages[nrow(QA_Alert_Messages) + 1,] = c("1.13",QA_Alert_Message_113) } }
# DC 2.01 tabDates <- run_query(" SELECT DISTINCT case when s.name is not null then s.name else a.name end AS tabname, b.name AS colname FROM sys.objects a INNER JOIN sys.columns b ON a.object_id = b.object_id left join sys.synonyms s on replace(replace( s.base_object_name, '[', ''), ']', '') = a.name WHERE a.type IN('U', 'V') and b.name like '%date%' ORDER BY tabname;") modDates <- run_query(" SELECT DISTINCT case when s.name is not null then s.name else a.name end AS name, a.modify_date FROM sys.objects a INNER JOIN sys.columns b ON a.object_id = b.object_id left join sys.synonyms s on replace(replace( s.base_object_name, '[', ''), ']', '') = a.name WHERE a.type IN('U', 'V')") futureDateCode <- merge(tabDates, modDates, by.x='tabname', by.y='name') %>% within({ sqlcode <- paste0(" select '",tabname, "' as tableName, '", colname,"' as dateName ,count(*) as nrows, sum(case when a.",colname,">b.modify_date then 1 else 0 end) as futureDate from ", tabname, " as a , (select modify_date from (SELECT case when s.name is not null then s.name else a.name end AS name, a.modify_date FROM sys.objects a INNER JOIN sys.columns b ON a.object_id = b.object_id left join sys.synonyms s on replace(replace( s.base_object_name, '[', ''), ']', '') = a.name WHERE a.type IN('U', 'V')) a WHERE a.name ='",tabname,"') as b" ) }) fdatelst <- list() for (i in 1:nrow(futureDateCode)){ fdatelst[[i]] <- run_query(futureDateCode[i, "sqlcode"]) } fdatesdone <- do.call('rbind', fdatelst) %>% within(.,{ pctFuture <- round(100*futureDate/nrows, 4) }) ### add message if no table is a problem futureDataMsg <- NULL futureDataMsg <- if(nrow(subset(fdatesdone, pctFuture>=5))>0 ){ futureDataMsgTable <- subset(fdatesdone, pctFuture>=5) paste("The following tables exceeded the 5% limit on future dates:", paste0(futureDataMsgTable$tableName, futureDataMsgTable$dateName, sep="."), sep = ', ') } else if(nrow(subset(fdatesdone, pctFuture>=5))==0 ) { "No table exceeds the 5% limit on future dates." } QA_Alert_Messages[nrow(QA_Alert_Messages) + 1,] = c("2.01",futureDataMsg)
# select count(*) as personCount, sum(case when (minage not between 0 and 89) or (maxage not between 0 and 89) then 1 else 0 end ) as N_outOfRange patAgeRange <- run_query(paste0(" select count(*) as nrows, sum(case when (minage <0) or (maxage <0) then 1 else 0 end ) as N_low, sum(case when (minage >=90) or (maxage >=90) then 1 else 0 end ) as N_high from ( select distinct b.*, datediff(YY, a.birth_date, b.first_dt) as minage, datediff(YY, a.birth_date, b.last_dt) as maxage from ",demographics, " as a inner join (select person_id, min(adate) as first_dt, max(adate) as last_dt from ",encounters, " group by person_id ) as b on a.person_id = b.person_id where a.birth_date is not null ) as qry ")) htOutOfRange <- run_query(paste0(" select count(*) as nrows, sum(case when ht <0.0 then 1 else 0 end ) as N_low, sum(case when ht>=95.0 then 1 else 0 end ) as N_high from ", vital_signs, " where ht is not null ")) wtOutOfRange <- run_query(paste0(" select count(*) as nrows, sum(case when wt <0.0 then 1 else 0 end ) as N_low, sum(case when wt>350 then 1 else 0 end ) as N_high from ", vital_signs, " where wt is not null ")) dbpOutOfRange <- run_query(paste0(" select count(*) as nrows, sum(case when diastolic <40 then 1 else 0 end ) as N_low, sum(case when diastolic>120 then 1 else 0 end ) as N_high from ", vital_signs, " where diastolic is not null ")) sbpOutOfRange <- run_query(paste0(" select count(*) as nrows, sum(case when systolic <40 then 1 else 0 end ) as N_low, sum(case when systolic>210 then 1 else 0 end ) as N_high from ", vital_signs, " where systolic is not null ")) daysSupplyOutOfRange <- run_query(paste0(" select count(*) as nrows, sum(case when rx_days_supply <1 then 1 else 0 end ) as N_low, sum(case when rx_days_supply>90 then 1 else 0 end ) as N_high from ", prescribing, " where rx_days_supply is not null ")) allOutOfRange <- rbind( cbind(table='Demographic/Encounter', item='Age (people)', low='< 0 yrs',high='> 89 yrs', patAgeRange), cbind(table='Vital_signs',item='Height (records)',low= '< 0 inches',high='> 0 inches', htOutOfRange), cbind(table='Vital_signs',item='Weight (records)',low= '< 0 pounds',high='> 350 pounds', wtOutOfRange), cbind(table='Vital_signs',item='Diastolic BP (records)',low= '< 40 mgHg',high='> 120 mgHg', dbpOutOfRange), cbind(table='Vital_signs',item='Systolic BP (records)',low= '< 40 mgHg',high='> 210 mgHg', sbpOutOfRange), cbind(table='Prescribing',item='Prescribed days supply (records)',low= '< 1 day',high='> 90 days', daysSupplyOutOfRange) ) allOutOfRange <- within(allOutOfRange, { NP_low = paste0(N_low,' (',round(100*N_low/nrows, 2),')') NP_high = paste0(N_high,' (',round(100*N_high/nrows, 2),')') }) #knitr::kable( #subset(allOutOfRange, select=c(table,item, low, high, nrows, NP_low, NP_high)), #col.names = c('Table','Field','Check Low','High','N','Low values N (%)','High values N (%)') #) outOfRangeMsg <- NULL outOfRangeMsg <- if(nrow(subset(allOutOfRange, (N_low+N_high)/nrows>=0.1))>0 ){ paste('More than 10% of records fall into the lowest or highest categories of age, height, weight, diastolic, blood pressure, systolic blood pressure, or dispensed days supply:', subset(allOutOfRange, (N_low+N_high)/nrows>=0.1)$item, sep=' ') } else if(nrow(subset(allOutOfRange, (N_low+N_high)/nrows>=0.1))==0 ) { "No table exceeds the 5% limit." } QA_Alert_Messages[nrow(QA_Alert_Messages) + 1,] = c("2.02",futureDataMsg)
encTabBirth2.03 <- run_query(paste0(" select sum(dobPostAdate) as dobPostAdate, sum(dobPostDdate) as dobPostDdate from ( select a.person_id, max(case when datediff(day, a.birth_Date, b.adate)<0 and b.adate is not null then 1 else 0 end) as dobPostAdate, max(case when datediff(day, a.birth_Date, b.ddate)<0 and b.ddate is not null then 1 else 0 end) as dobPostDdate from (select distinct person_id, birth_date from ",demographics, " where birth_date is not null) as a inner join ", encounters, " as b on a.person_id = b.person_id group by a.person_id ) qry ") ) encTabDeath2.03 <- run_query(paste0(" select sum(deathPreAdate) as deathPreAdate, sum(deathPreDdate) as deathPreDdate from ( select a.person_id, max(case when datediff(day, a.deathdt, b.adate)>0 and b.adate is not null then 1 else 0 end) as deathPreAdate, max(case when datediff(day, a.deathdt, b.ddate)>0 and b.ddate is not null then 1 else 0 end) as deathPreDdate from (select distinct person_id, deathdt from ",death, " where deathdt is not null) as a inner join ", encounters, " as b on a.person_id = b.person_id group by a.person_id ) qry ") ) procTabBirth2.03 <- run_query(paste0(" select sum(dobPostProc) as dobPostProc from ( select a.person_id, max(case when datediff(day, a.birth_Date, procdate)<0 and b.procdate is not null then 1 else 0 end) as dobPostProc from (select distinct person_id, birth_date from ",demographics, " where birth_date is not null) as a inner join ", procedures, " as b on a.person_id = b.person_id group by a.person_id ) qry ") ) procTabDeath2.03 <- run_query(paste0(" select sum(deathPreProc) as deathPreProc from ( select a.person_id, max(case when datediff(day, a.deathdt, b.procdate)>0 and b.procdate is not null then 1 else 0 end) as deathPreProc from (select distinct person_id, deathdt from ",death, " where deathdt is not null) as a inner join ", procedures, " as b on a.person_id = b.person_id group by a.person_id ) qry ") ) vitalsTabBirth2.03 <- run_query(paste0(" select sum(dobPostMeasure) as dobPostMeasure from ( select a.person_id, max(case when datediff(day, a.birth_Date, b.measure_date)<0 and b.measure_date is not null then 1 else 0 end) as dobPostMeasure from (select distinct person_id, birth_date from ",demographics, " where birth_date is not null) as a inner join ", vital_signs, " as b on a.person_id = b.person_id group by a.person_id ) qry ") ) vitalsTabDeath2.03 <- run_query(paste0(" select sum(deathPreMeasure) as deathPreMeasure from ( select a.person_id, max(case when datediff(day, a.deathdt, b.measure_date)>0 and b.measure_date is not null then 1 else 0 end) as deathPreMeasure from (select distinct person_id, deathdt from ",death, " where deathdt is not null) as a inner join ", vital_signs, " as b on a.person_id = b.person_id group by a.person_id ) qry ") ) pharmTabBirth2.03 <- run_query(paste0(" select sum(dobPostRx) as dobPostRx from ( select a.person_id, max(case when datediff(day, a.birth_Date, b.rxdate)<0 and b.rxdate is not null then 1 else 0 end) as dobPostRx from (select distinct person_id, birth_date from ",demographics, " where birth_date is not null) as a inner join ", pharmacy, " as b on a.person_id = b.person_id group by a.person_id ) qry ") ) pharmTabDeath2.03 <- run_query(paste0(" select sum(deathPreRx) as deathPreRx from ( select a.person_id, max(case when datediff(day, a.deathdt, b.rxdate)>0 and b.rxdate is not null then 1 else 0 end) as deathPreRx from (select distinct person_id, deathdt from ",death, " where deathdt is not null) as a inner join ", pharmacy, " as b on a.person_id = b.person_id group by a.person_id ) qry ") ) prescribTabBirth2.03 <- run_query(paste0(" select sum(dobPostRxStart) as dobPostRxStart from ( select a.person_id, max(case when datediff(day, a.birth_Date, b.rx_start_date)<0 and b.rx_start_date is not null then 1 else 0 end) as dobPostRxStart from (select distinct person_id, birth_date from ",demographics, " where birth_date is not null) as a inner join ", prescribing, " as b on a.person_id = b.person_id group by a.person_id ) qry ") ) prescribTabDeath2.03 <- run_query(paste0(" select sum(deathPreRxStart) as deathPreRxStart from ( select a.person_id, max(case when datediff(day, a.deathdt, b.rx_start_date)>0 and b.rx_start_date is not null then 1 else 0 end) as deathPreRxStart from (select distinct person_id, deathdt from ",death, " where deathdt is not null) as a inner join ", prescribing , " as b on a.person_id = b.person_id group by a.person_id ) qry ") ) labTabBirth2.03 <- run_query(paste0(" select sum(dobPostResult) as dobPostResult from ( select a.person_id, max(case when datediff(day, a.birth_Date, b.result_dt)<0 and b.result_dt is not null then 1 else 0 end) as dobPostResult from (select distinct person_id, birth_date from ",demographics, " where birth_date is not null) as a inner join ", lab_results, " as b on a.person_id = b.person_id group by a.person_id ) qry ") ) labTabDeath2.03 <- run_query(paste0(" select sum(deathPreResult) as deathPreResult from ( select a.person_id, max(case when datediff(day, a.deathdt, b.result_dt)>0 and b.result_dt is not null then 1 else 0 end) as deathPreResult from (select distinct person_id, deathdt from ",death, " where deathdt is not null) as a inner join ", lab_results, " as b on a.person_id = b.person_id group by a.person_id ) qry ") ) DeathBeforeBirth <- run_query(paste0(" select sum(deathPreBirth) as deathPreBirth from ( select a.person_id, max(case when datediff(day, a.deathdt, b.birth_date)>0 then 1 else 0 end) as deathPreBirth from (select distinct person_id, deathdt from ",death, " where deathdt is not null) as a inner join (select distinct person_id, birth_date from ",demographics, " where birth_date is not null) as b on a.person_id = b.person_id group by a.person_id ) qry ") ) patsInEncTab <- run_query( paste0("select count(distinct person_id) as n from ",encounters)) table2.03 <- cbind( encTabBirth2.03, encTabDeath2.03, procTabBirth2.03, procTabDeath2.03, vitalsTabBirth2.03, vitalsTabDeath2.03, pharmTabBirth2.03, pharmTabDeath2.03, prescribTabBirth2.03, prescribTabDeath2.03, labTabBirth2.03, labTabDeath2.03, DeathBeforeBirth ) labels2.03 <- data.frame(name=c( "dobPostAdate","dobPostDdate","deathPreAdate","deathPreDdate","dobPostProc","deathPreProc","dobPostMeasure","deathPreMeasure","dobPostRx","deathPreRx","dobPostRxStart","deathPreRxStart","dobPostResult","deathPreResult","deathPreBirth" ), newlabel = c('Adate<birth_date','Ddate<birth_date','Deathdt<adate','Deathdt<ddate','Procdate<birth_date','ProcDate>deathdt','Measure_date<birth_date','Measure_date>deathdt','rxdate<birth_date','rxdate>deathdt','rx_start_dt<birth_date','rx_start_dt>deathdt','Result_date<birth_date','Result_date>deathdt','Deathdt<birth_date') , srctab = c('Encounters','Encounters','Encounters','Encounters','Procedures','Procedures','Vital_signs','Vital_signs','Pharmacy','Pharmacy','Prescribing','Prescribing','lab_results','lab_results','Demographics and death') ) table2.03_tx <- tidyr::gather(table2.03, "comp", 'n') %>% within({ pct <- round(100*n/patsInEncTab$n, 2) }) table2.03_tx$ord <- 1:nrow(table2.03_tx) table2.03_tx <- table2.03_tx%>% merge(labels2.03, by.x='comp',by.y='name', all.x=T) %>% arrange(ord) #knitr::kable(table2.03_tx[,c('ord','comp','n','pct', 'srctab')], col.names = c(' ','Date comparison','N Patients','% of unique patients in the encounter table','Source table')) #params$QAAlert <- T illogicalDatesMsg <- NULL illogicalDatesMsg <- if(nrow(subset(table2.03_tx, pct>=5))>0){ paste("More than 5% of patients have illogical date relationships:", subset(table2.03_tx, pct>=5)$comp, sep=' ') } else if(nrow(subset(fdatesdone, pctFuture>=5))==0 ) { "No table exceeds the 5% limit on illogical dates." } QA_Alert_Messages[nrow(QA_Alert_Messages) + 1,] = c("2.03",illogicalDatesMsg)
# to implement a new check add a new row to the following table: # table = the standard name of the table in the database (it will automaticaly correct for you if the actual table names are not standard) # var = the name of the variable to be chacked # condition = a valid logical condition ( in MS sql server language) which identifies a bad value # where = a valid where clause to subset the table, be sure to start with "WHERE", see example for ddate check3.03 <- rbind( data.frame(table='demographics', disp='birth_date', var='birth_date', condition='is null', where=' '), data.frame(table='demographics', disp='gender', var='gender', condition="not in ('M','F','O')", where=' '), data.frame(table='encounters', disp='discharge_disposition', var='discharge_disposition', condition="not in ('A','E')", where=' '), data.frame(table='encounters', disp='ddate', var='ddate', condition="is null", where="where enctype = 'IP'"), data.frame(table='procedures', disp='procdate', var='procdate', condition='is null', where=' '), data.frame(table='prescribing',disp='rx_order_date', var='rx_order_date', condition='is null', where=' '), data.frame(table='pharmacy', disp='rxsup and/or rxamt', var='not(rxsup>1 or rxamt>1)', condition=' ', where=' '), # compicated by it depending on two variables #data.frame(table='cause_of_death', var='source', condition="not in ('S', 'N', 'T', 'B', 'L', 'U', 'O')"), data.frame(table='diagnoses',disp='dx_origin', var='dx_origin', condition="not in ('OD', 'BI', 'CL', 'PR', 'NI', 'OT')", where=' '), data.frame(table='diagnoses', disp='enc_id', var='enc_id', condition="is null", where=' '), data.frame(table='procedures', disp='enc_id', var='enc_id', condition="is null", where=' '), data.frame(table='vital_signs', disp='enc_id', var='enc_id', condition="is null", where=' ') ) res3.03 <- list() for(i in 1:nrow(check3.03)){ res3.03[[i]] <- with(check3.03, {run_query( paste("select '",table[i],"' as dataTable,'",var[i],"' as variable, '",disp[i],"' as display, count(*) as N_rows, sum(case when ",var[i],condition[i],"then 1 else 0 end) as N_bad", "from",eval(as.name(tolower(table[i]))), where[i] , sep=' ' ) )} ) } result3.03 <- do.call('rbind', res3.03) %>% within(.,{ pctBad <- round(100*N_bad/N_rows, 2) }) check3.03Msg <- NULL check3.03Msg <- if(nrow(subset(result3.03, pctBad>=10))>0){ result3.03gt10 <- subset(result3.03, pctBad>=10) paste("More than 10% of records have missing or unknown values for the following table fields: ", paste(paste(trimws(result3.03gt10$dataTable), ": ", trimws(result3.03gt10$display), sep=''), sep = ", "), sep="") } else if(nrow(subset(result3.03, pctFuture>=10))==0 ) { "No check exceeds the 10% limit on bad values." } QA_Alert_Messages[nrow(QA_Alert_Messages) + 1,] = c("3.03",check3.03Msg)
patsWithEncs <- run_query(paste0("WITH cte_patWEncDiag AS ( SELECT count(DISTINCT di.person_id) patsWEncDiag FROM ", demographics, " de JOIN ", encounters ," e on e.person_id = de.person_id JOIN ", diagnoses ," di ON de.person_id = di.person_id ), cte_patWEncProc AS ( SELECT count(DISTINCT p.person_id) AS patsWEncProc FROM ", demographics, " de JOIN ", encounters, " e on e.person_id = de.person_id JOIN ", procedures, " p ON de.person_id = p.person_id ), cte_pats AS ( SELECT count(DISTINCT de.person_id) nrows FROM ", demographics, " de JOIN ", encounters, " e on e.person_id = de.person_id ) SELECT nrows, patsWEncDiag, patsWEncProc FROM cte_patWEncDiag ,cte_patWEncProc ,cte_pats") ) patsWithEncs <- within(patsWithEncs, { pctWEncDiag <- 100.0* patsWEncDiag/nrows pctWEncProc <- 100.0* patsWEncProc/nrows }) patDiagMsg <- with (patsWithEncs,{ if(pctWEncDiag<50.0 ){ paste0('WARNING: Only ',round(pctWEncDiag,1),'% of patients with encounters have diagnoses') } else { paste0('NOTE: ',round(pctWEncDiag,1),'% of patients with encounters have diagnoses') } }) patProcMsg <- with (patsWithEncs,{ if(pctWEncProc<50.0 ){ paste0('WARNING: Only ',round(pctWEncProc,1),'% of patients with encounters have procedures') } else { paste0('NOTE: ',round(pctWEncProc,1),'% of patients with encounters have procedures') } }) QA_Alert_Messages[nrow(QA_Alert_Messages) + 1,] = c("3.04",patProcMsg) QA_Alert_Messages[nrow(QA_Alert_Messages) + 1,] = c("3.05",patDiagMsg)
ipedei_no_principal_diag <- run_query(paste0("SELECT InpatientEnc, InpatientEnc - InpatientEncWPrincipalDiag AS InpatientEncWOPrincipalDiag, CAST(ROUND((1.0 - CAST(InpatientEncWPrincipalDiag AS FLOAT) / CAST(InpatientEnc AS FLOAT)) * 100.0, 2) as DECIMAL(5,2)) AS InpatientEncWOPrincipalDiagPcnt FROM ( SELECT COUNT(e.ENC_ID) AS InpatientEnc, SUM(IIF(d.ENC_ID IS NOT NULL, 1, 0)) AS InpatientEncWPrincipalDiag FROM ", encounters ," e LEFT JOIN ( SELECT DISTINCT ENC_ID FROM ", diagnoses, " d WHERE d.PRINCIPAL_DX = 'P' AND d.DX_ORIGIN != 'PR' ) d ON d.ENC_ID = e.ENC_ID WHERE e.ENCTYPE IN('IP', 'EI') ) PxDiags;")) if (params$QAAlert == TRUE){ ipedei_no_principal_diag_table <- subset(ipedei_no_principal_diag, InpatientEncWOPrincipalDiagPcnt > 10.0) if (!(length(ipedei_no_principal_diag_table)==0) & nrow(ipedei_no_principal_diag_table) > 0){ QA_Alert_Message_306 <- paste0("More than 10% of IP (inpatient) encounters with any diagnosis don't have a principal diagnosis") QA_Alert_Messages[nrow(QA_Alert_Messages) + 1,] = c("3.06",QA_Alert_Message_306) } }
benchmark_start <- run_query(paste0(" SET NOCOUNT ON; DECLARE @BenchmarkStartDate DATE; SET @BenchmarkStartDate = ( SELECT IIF(MAX(x.ADATE) <= GETDATE(), MAX(x.ADATE), GETDATE()) FROM ", encounters, " x ); SELECT @BenchmarkStartDate; ")) data_result_75_complete <- run_query(gsub( "@BenchmarkStartDate", paste0('\'', benchmark_start[1,1], '\'', sep=''), paste0(" SET NOCOUNT ON; DECLARE @BenchmarkPriorYearAvg INT; SET @BenchmarkPriorYearAvg = ( SELECT COUNT(1) / 12 FROM ", encounters, " e WHERE e.ENCTYPE IN('IP', 'AV', 'ED', 'EI') AND e.ADATE >= DATEADD(MONTH, -24, @BenchmarkStartDate) AND e.ADATE <= DATEADD(MONTH, -12, @BenchmarkStartDate) ); SELECT FORMAT(ADATE, 'MM-MMM') Month, @BenchmarkPriorYearAvg AS BenchmarkCount, COUNT(1) PriorMonthCount, IIF(@BenchmarkPriorYearAvg <> 0, CAST(ROUND((CAST(COUNT(1) AS FLOAT) / @BenchmarkPriorYearAvg * 100.0), 2) as DECIMAL(5,2)), 0) AS PercentofBenchMark FROM ", encounters, " e WHERE e.ENCTYPE IN('IP', 'AV', 'ED', 'EI') AND e.ADATE >= DATEADD(MONTH, -12, @BenchmarkStartDate) GROUP BY FORMAT(ADATE, 'MM-MMM') ORDER BY FORMAT(ADATE, 'MM-MMM'); "))) data_result_75_complete$PercentofBenchMark <- as.numeric(as.character(data_result_75_complete$PercentofBenchMark))
knitr::kable(QA_Alert_Messages, row.names = FALSE)
\pagebreak
knitr::kable(missing_columnsTables, row.names = FALSE, col.names = c("Table Name", "Column Name"))
invalid_specs[is.na(invalid_specs)] = "" knitr::kable(invalid_specs, row.names = FALSE, col.names = c("Table Name", "Column Name", "Expected Number Precision", "Found Number Precision", "Expected Number Scale", "Found Number Scale", "Expected Char Length", "Found Char Length", "Expected Date Precision", "Found Date Precision"))
violated_primary_keys[is.na(violated_primary_keys)] = "" knitr::kable(violated_primary_keys, row.names = FALSE, digits = 2, format.args = list(big.mark = ",", scientific = FALSE))
data_validation[is.na(data_validation)] = "" knitr::kable(data_validation, digits = 2, format.args = list(big.mark = ",", scientific = FALSE))
knitr::kable(nullableValues, row.names = FALSE, col.names = c("Table Name", "Column Name", "Expected Field Allows Null Value", "Found Field Allows Null Value"), digits = 2, format.args = list(big.mark = ",", scientific = FALSE))
knitr::kable(orphan_person_ids_table, row.names = FALSE, col.names = c("Target Table", "Target Column", "Reference Table", "Reference Column", "Count of Values Not Found", "Distinct Total in Target Table", "Percentage of Missing to Distinct Total"), digits = 2, format.args = list(big.mark = ",", scientific = FALSE))
knitr::kable(orphan_enc_ids_table, row.names = FALSE, col.names = c("Target Table", "Target Column", "Reference Table", "Reference Column", "Count of Values Not Found", "Distinct Total in Target Table", "Percentage of Missing to Distinct Total"), digits = 2, format.args = list(big.mark = ",", scientific = FALSE))
knitr::kable(subset(repErrors_tx, select=c(tablename, nrows, field, NP_bad)), col.names = c('Table','N','Field','Errors N(%)'), digits = 2, format.args = list(big.mark = ",", scientific = FALSE))
knitr::kable(GR1Person, col.names = c('Table','Number of Encounters','Encounters Assinged to More Than One Person', "Percentage"), row.names = FALSE, digits = 2, format.args = list(big.mark = ",", scientific = FALSE))
knitr::kable(orphan_provider_ids_table, row.names = FALSE, col.names = c("Target Table", "Target Column", "Reference Table", "Reference Column", "Count of Values Not Found", "Distinct Total in Target Table", "Percentage of Missing to Distinct Total"), digits = 2, format.args = list(big.mark = ",", scientific = FALSE))
knitr::kable(code_conform_table_counts, row.names = FALSE, col.names = c("Table Name", "Code Type", "Total Codes in Table", "Total Invalid Codes", "Percent Invalid"), digits = 3, format.args = list(big.mark = ",", scientific = FALSE)) cat(" \n") cat("Top 50 Invalid Codes") top_50_invalid$Code[top_50_invalid$Code==""] <- "\'blank\'" knitr::kable(top_50_invalid, row.names = FALSE, col.names = c("Table Name", "Code Type", "Code", "Valid Result", "Count Invalid"), digits = 2, format.args = list(big.mark = ",", scientific = FALSE))
futureDataMsg <- if(nrow(subset(fdatesdone, pctFuture>=5))>0 & params$QAAlert==F){ knitr::kable(subset(fdatesdone, pctFuture>=5), col.names = c('Table','Date name','Total rows','N future dates','%'), digits = 2, format.args = list(big.mark = ",", scientific = FALSE)) } else if(nrow(subset(fdatesdone, pctFuture>=5))==0 & params$QAAlert==F) { "No table exceeds the 5% limit on future dates." } else if ( params$QAAlert==T) { knitr::kable(fdatesdone, col.names = c('Table','Date name','Total rows','N future dates','%'), digits = 2, format.args = list(big.mark = ",", scientific = FALSE)) } futureDataMsg
outOfRangeMsg <- NULL outOfRangeMsg <- if(nrow(subset(allOutOfRange, (N_low+N_high)/nrows>=0.1))>0 & params$QAAlert==F){ allOutOfRange <- subset(subset(allOutOfRange, (N_low+N_high)/nrows>=0.1), select=c(table,item, low, high, nrows, NP_low, NP_high)) knitr::kable(allOutOfRange, col.names = c('Table','Field','Check Low','High','N','Low values N (%)','High values N (%)') , digits = 2, format.args = list(big.mark = ",", scientific = FALSE)) } else if(nrow(subset(allOutOfRange, (N_low+N_high)/nrows>=0.1))==0 & params$QAAlert==F) { "No table exceeds the 10% limit." } else if ( params$QAAlert==T) { allOutOfRange <- subset(allOutOfRange, select=c(table,item, low, high, nrows, NP_low, NP_high)) knitr::kable(allOutOfRange, col.names = c('Table','Field','Check Low','High','N','Low values N(%)','High values N (%)'), digits = 2, format.args = list(big.mark = ",", scientific = FALSE)) } outOfRangeMsg
illogicalDatesMsg <- NULL illogicalDatesMsg <- if(nrow(subset(table2.03_tx, pct>=5))>0 & params$QAAlert==F){ knitr::kable(subset(table2.03_tx, pct>=5)[,c('ord','comp','n','pct', 'srctab')], col.names = c(' ','Date comparison','N Patients','% of unique patients in the encounter table','Source table')) } else if(nrow(subset(table2.03_tx, pct>=5))==0 & params$QAAlert==F) { "No table exceeds the 5% limit on illogical dates." } else if ( params$QAAlert==T) { knitr::kable(table2.03_tx[,c('ord','comp','n','pct', 'srctab')], col.names = c(' ','Date comparison','N Patients','% of unique patients in the encounter table','Source table')) } illogicalDatesMsg
check3.03Msg <- NULL check3.03Msg <- if(nrow(subset(result3.03, pctBad>=10.0))>0 & params$QAAlert==F){ knitr::kable(subset(result3.03, pctBad>=10.0)[,c('dataTable','display','N_rows','N_bad', 'pctBad')], col.names = c('Table','Variable','N Rows','N with bad values','%')) } else if(nrow(subset(result3.03, pctBad>=10.0))==0 & params$QAAlert==F) { "No check exceeds the 10% limit on bad values." } else if ( params$QAAlert==T) { knitr::kable(result3.03[,c('dataTable','display','N_rows','N_bad', 'pctBad')], col.names = c('Table','Variable','N Rows','N with bad values','%'), digits = 2, format.args = list(big.mark = ",", scientific = FALSE)) } check3.03Msg
check3.04 <- data.frame(patsWithEncs[,c('nrows','patsWEncDiag','pctWEncDiag')]) %>% format(digits = 2, big.mark = "," ) check3.04$patsWEncDiag <- paste(check3.04$patsWEncDiag," (",check3.04$pctWEncDiag, "%)", sep="") knitr::kable(check3.04[,c('nrows','patsWEncDiag')], row.names = FALSE, col.names = c("Patients w/ Encounters", "Patients with Encounters and Diagnosis Records"))
check3.05 <- data.frame(patsWithEncs[,c('nrows','patsWEncProc','pctWEncProc')]) %>% format(digits = 2, big.mark = "," ) check3.05$patsWEncProc <- paste(check3.05$patsWEncDiag," (",check3.05$pctWEncProc, "%)") knitr::kable(check3.05[,c('nrows','patsWEncProc')], row.names = FALSE, col.names = c("Patients w/ Encounters", "Patients with Encounters and Procedure Records"))
knitr::kable(ipedei_no_principal_diag, row.names = FALSE, col.names = c("In-Patient Type Encounters", "In-Patient Encounters W/O Principal Diagnoses", "% of In-Patient Type Encounters W/O Principal Diagnoses"), digits = 2, format.args = list(big.mark = ",", scientific = FALSE))
cat(paste0('Benchmark Start Date: ', benchmark_start[1,1])) if (params$QAAlert == TRUE){ data_result_75_complete_table <- subset(data_result_75_complete, PercentofBenchMark < 75 ) if (!(length(data_result_75_complete_table)==0) & nrow(data_result_75_complete_table) > 0){ QA_Alert_Message_307 <- paste0("Encounters, diagnoses, or procedures in an ambulatory (AV), emergency department (ED), or inpatient (IP) setting are less than 75% complete three months prior to the current month") QA_Alert_Messages[nrow(QA_Alert_Messages) + 1,] = c("3.07",QA_Alert_Message_307) } } knitr::kable(data_result_75_complete,row.names = FALSE, col.names = c("Month", "Benchnmark Count", "Prior Month Count", "% of Benchmark"), digits = 2, format.args = list(big.mark = ",", scientific = FALSE))
# close odbc odbcCloseAll() endTime <- Sys.time() runtime <- endTime - startTime
Query run time = r runtime
minutes
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.