R/DatasetDQ.R

Defines functions createMIAD

Documented in createMIAD

#' Level one data about dataset MIAD stands for minimum information about a dataset
#' percentage measures are used that hide exact patient counts
#' @param connectionDetails connection
#' @param cdmDatabaseSchema schema
#' @param workDatabaseSchema work schema
#' @param outputFolder where to put output
#' @param level 1 2 or 3, indicates level of detail
#' @export
createMIAD <- function(connectionDetails,
                       cdmDatabaseSchema,
                       workDatabaseSchema = cdmDatabaseSchema,
                       outputFolder,
                       level=3) {
  
  #assumption  outputFolder is where study output is (and exists)
  #now not needed - assumption outputFolder has subfolder export 
  
  #assuming that MIAD data  would be added to packageresults
  
  #workDatabaseSchema = "results" and the same wordDatabaseSchema happens to also have
  #achilles results data
  #the function above requires knowing workDatabaseSchema (currently package data does not have that)
  #so either would have to be added or executed during study execution or feasibility
  
  
  
  exportFolder<-file.path(outputFolder,"export")
  
  
  if (!file.exists(exportFolder))
    dir.create(exportFolder)
  
  
  
  
  
  
  
  #check if Achilles_results tables are in workDatabaseSchema (hoping they would)
  conn <- DatabaseConnector::connect(connectionDetails)    
  
  
  #not truly used at the moment
  tables<-getTableNames(conn,workDatabaseSchema) #requires certain version of DatabaseConnector
  tables
  tables<-toupper(tables)
  if (!("ACHILLES_RESULTS_DERIVED" %in% tables)) hasAchillesDerivedTable=FALSE else hasAchillesDerivedTable=TRUE
  
  
  #fix for Redshift bug
  hasAchillesDerivedTable = TRUE
  
  #testing just one table. if achilles_results_derived table is present, it is assumed that all achilles tables are present
  if (hasAchillesDerivedTable) {
    
    
    
    
    
    #assuming colum names will be upper case for all outputs
    
    #1 derived measures 
    
    
    sql <- "select measure_id, stratum_1,stratum_2, statistic_value as value from @results_database_schema.achilles_results_derived where measure_id not like '%PersonCnt%' order by measure_id,stratum_1"
    
    #old query (smaller was)
    # select * from @results_database_schema.achilles_results_derived r where measure_id in ('ach_2000:Percentage',
    #                                      'ach_2001:Percentage','ach_2002:Percentage','ach_2003:Percentage')
    
    sql <- SqlRender::renderSql(sql,results_database_schema = workDatabaseSchema)$sql
    sql <- SqlRender::translateSql(sql, targetDialect = connectionDetails$dbms)$sql
    dataDerived <- DatabaseConnector::querySql(conn, sql)
    
    
    
    
    #2 ------dist results table section
    
    
    sql <- "select d.analysis_id, stratum_1, stratum_2,count_value,avg_value, median_value, a.analysis_name,
    a.stratum_1_name,stratum_2_name, stdev_value,p10_value,p25_value,p75_value,p90_value from @results_database_schema.achilles_results_dist d 
    join @results_database_schema.achilles_analysis a on d.analysis_id = a.analysis_id
    where d.analysis_id 
    in (103,104,105,106,107,203,206,211,403,506,511,512,513,514,515,603,703,803,903,1003,1803) order by analysis_id"
    
    
    sql <- SqlRender::renderSql(sql,results_database_schema = workDatabaseSchema)$sql
    sql <- SqlRender::translateSql(sql, targetDialect = connectionDetails$dbms)$sql
    dataDist <- DatabaseConnector::querySql(conn, sql)
    
    
    
    # write.csv(dataDist,file = file.path(exportFolder,'SelectedAchillesResultsDistMeasures.csv'),row.names = F)
    
    
    
    
    #4 ------Achilles  results  table section (selected measures) (recomputed as percentages of all patients)
    
    #treshold on patient count was added (in addition to achilles default filtering)
    sql <- "select analysis_id as measure_id,stratum_1,stratum_2,stratum_3,count_value from @results_database_schema.achilles_results a 
    where analysis_id in (0,1,2,4,5,10,11,12,109,113,212,200,505)
    and count_value >10
    "
    
    # where analysis_id in (0,1,2,4,5,10,11,12,109,113,212,200,505)
    
    sql <- SqlRender::renderSql(sql,results_database_schema = workDatabaseSchema)$sql
    sql <- SqlRender::translateSql(sql, targetDialect = connectionDetails$dbms)$sql
    data <- DatabaseConnector::querySql(conn, sql)
    
    
    
    
    #get person count
    persons<-data$COUNT_VALUE[data$MEASURE_ID == 1]
    
    data$VALUE <- data$COUNT_VALUE/persons
    data$COUNT_VALUE <- NULL
    
    #create fuzzy count
    if (persons > 100000000) personsFuzzy<-'>100M'  
    if (persons < 100000000) personsFuzzy<-'40-100M'  
    if (persons < 40000000) personsFuzzy<-'20-40M'  
    if (persons < 20000000) personsFuzzy<-'10-20M'  
    if (persons < 10000000) personsFuzzy<-'5-10M'  
    if (persons < 50000000) personsFuzzy<-'1-5M'  
    if (persons < 1000000) personsFuzzy<-'100k-1M'
    if (persons < 100000) personsFuzzy<-'10-100k'
    if (persons < 10000)  personsFuzzy<-'<10k'
    
    #binned size of dataset
    newrow<-data.frame(MEASURE_ID = 99,STRATUM_1='',STRATUM_2='',STRATUM_3='',VALUE=as.character(personsFuzzy))
    
    data<-rbind(data,newrow)
    
    
    
    
    
    
    
    # write.csv(data,file = file.path(exportFolder,'SelectedAchillesResultsMeasuresPerc.csv'),row.names = F)
    
    
    #5 ------Achilles  results  table section (not person dependent)
    
    
    sql <- "select analysis_id,stratum_1,count_value from @results_database_schema.achilles_results a where analysis_id in (201)"
    
    
    sql <- SqlRender::renderSql(sql,results_database_schema = workDatabaseSchema)$sql
    sql <- SqlRender::translateSql(sql, targetDialect = connectionDetails$dbms)$sql
    dataAchillesResults <- DatabaseConnector::querySql(conn, sql)
    
    
    
    
    #combine data from variouse streams
    dataDerived2<-data.frame(MEASURE_ID=dataDerived$MEASURE_ID,STRATUM_1=dataDerived$STRATUM_1,STRATUM_2=dataDerived$STRATUM_2,STRATUM_3=NA,VALUE=dataDerived$VALUE)
    export<-rbind(data,dataDerived2)
    
    
    #further prune the measures
    
    
    export <- dplyr::arrange(export,MEASURE_ID,STRATUM_1,STRATUM_2,STRATUM_3)
    
    LEVEL_1 <- c('0','2','4','99')
    LEVEL_2 <- c(LEVEL_1,'Achilles:byAnalysis:RowCnt')
    
    #restrict data to a given level
    #level 1 is most restrictive, level 3 has most data
    if (level==1) {
      writeLines(paste('Using level',level))
      export <-  dplyr::filter(export,MEASURE_ID %in% LEVEL_1)
    }
    
    if (level==2) {
      writeLines(paste('Using level',level))
      export <-  dplyr::filter(export,MEASURE_ID %in% LEVEL_2)
    }
    
    #add description column
    #this makes it dependand on Achilles (not ideal, but for the time beeing OK)
    lkup_analyses<-Achilles::getAnalysisDetails()
    
    
    lkup_analyses <- dplyr::select(lkup_analyses,MEASURE_ID=ANALYSIS_ID,MEASURE_NAME=ANALYSIS_NAME)
    lkup_analyses$MEASURE_ID <- as.character(lkup_analyses$MEASURE_ID)
    export<-dplyr::left_join(export,lkup_analyses,by='MEASURE_ID')
    
    
    
    #export data
    write.csv(export,file = file.path(exportFolder,'DatasetMetadata.csv'),row.names = F)
    
    # Clean up
    DatabaseConnector::disconnect(conn)
    
    
    
  }
  writeLines('Done generating MIAD')
  #return value
  return(export)
}
vojtechhuser/DataQuality documentation built on May 10, 2020, 8:31 a.m.