# load packages 
library(data.table)
library(DT)
library(easimple)
library(gridExtra)
library(ggplot2)

# create copy of dems data
in_demo <- copy(params$file_name)

###################
# save parameters #
###################

  # interval parameter
  outlier_parm <- params$outlier_parm

  # duplicate parameter
  dup_parm <- params$duplicate_parm

  # threshold for % of missing...
  missing_parm <- params$missing_parm

  # set colors
  c_good    <- "white"
  c_warning <- "#FFEB9C"
  c_problem <- "#FFC7CE"

######################
# general formatting #
######################

  # create list of all dummy vars
  dummy_vars <- grep("^(d_)", colnames(in_demo), value = TRUE)

  # convert dummy vars to numeric
  in_demo[, dummy_vars] <- lapply(in_demo[, dummy_vars, with=FALSE], as.numeric)

  # convert NA grades to 99 and NA school years to missing
  in_demo[is.na(student_grade)   , student_grade := 99]
  in_demo[is.na(demo_school_year), demo_school_year := "missing"]

  # make student_grade two characters
  in_demo[nchar(as.character(student_grade)) == 1 & !grepl("k", student_grade), student_grade := paste0(0, student_grade)]

################
# create lists #
################

  # list of years
  year_list <- unique(in_demo$demo_school_year)

  # list of grades
  grade_list <- unique(in_demo$student_grade)

  # sort grade list low to high
  grade_list <- sort(grade_list, decreasing=FALSE)

  # put k and pk on front end of list if they exists
  if ( ("k" %chin% grade_list) & ("pk" %chin% grade_list)) { 
    other_grades <- setdiff(grade_list, c("k", "pk"))
    other_grades <- sort(other_grades, decreasing=FALSE)
    grade_list   <- c("pk", "k", other_grades)}

  # put k on front end of list if they exists
  if ( ("k" %chin% grade_list) & !("pk" %chin% grade_list)) { 
    other_grades <- setdiff(grade_list, c("k"))
    other_grades <- sort(other_grades, decreasing=FALSE)
    grade_list   <- c("k", other_grades)}

  # put pk on front end of list if they exists
  if ( !("k" %chin% grade_list) & ("pk" %chin% grade_list)) { 
    other_grades <- setdiff(grade_list, c("pk"))
    other_grades <- sort(other_grades, decreasing=FALSE)
    grade_list   <- c("pk", other_grades)}


##########################
# TABLE 1: GENERAL TABLE #
##########################

  # .i. create general table 
  general_table <- in_demo[, list(total_n_obs       = .N,
                                  n_unique_students = length(unique(ea_student_id))),
                           by="demo_school_year"]

  # .ii. output n_exact duplicate rows
  exact_dups <- ea_out_dups(in_demo, opt_key_all = 1)

  # .ii. calcualte number of exact rows per year
  exact_dups_table <- exact_dups[, list(exact_dup_row=.N), by="demo_school_year"]

  # .ii. merge duplicate row info onto general table
  general_table <- ea_merge(general_table, exact_dups_table, "demo_school_year", "x")

  # .ii. create a % of duplicate rows
  general_table[, perc_exact_dup_row := round((exact_dup_row/total_n_obs)*100, 2)]

  # .iii. make all NA values = 0
  general_table[is.na(general_table)] <- 0

  # .iv. sort years from high to low
  general_table <- general_table[order(-(demo_school_year))]

  # .v. convert to datatable
  general_table_dt <- datatable(general_table, rownames = FALSE, extensions = "KeyTable")

  #######################################
  # flag large disparities across years #
  #######################################

     # set general_table_flag to zero
    general_table_flag <- 0

    # set list of vars to check in loop
    gen_vars <- c("total_n_obs", "n_unique_students")

  ##########################################################################
  # check 1.1 & 1.2 -> outliers of total_n_obs & n_unique_students by year #
  ##########################################################################

     # flag large disparities
    for (m_var in gen_vars){

      # set intervals
      lower_int <- general_table[demo_school_year != "missing", mean(get(m_var)) - mean(get(m_var))*(outlier_parm)]
      upper_int <- general_table[demo_school_year != "missing", mean(get(m_var)) + mean(get(m_var))*(outlier_parm)]

      # apply color scheme
      if (is.na(lower_int)==0) {general_table_dt <- formatStyle(general_table_dt, columns = paste0(m_var), background = styleInterval(c(lower_int, upper_int), c(c_problem, c_good, c_problem)))}

      # trigger internal flag if outliers exists
      if ( (is.na(lower_int)==0) & ( (general_table[, min(get(m_var))] < lower_int) | (general_table[, max(get(m_var))] > upper_int))) {general_table_flag <- 1}

    }

  #########################################################################
  # check 1.3 -> % of duplicate rows is higher than the missing threshold #
  #########################################################################

    # apply color scheme to % exact duplicate row
    general_table_dt <- formatStyle(general_table_dt, columns = "perc_exact_dup_row", background = styleInterval(c(0, dup_parm), c(c_good, c_warning, c_problem)))

    # internal flag for general table
    if (sum(general_table$perc_exact_dup_row, na.rm = TRUE)!=0) {

      if ( (min(general_table$perc_exact_dup_row, na.rm = TRUE) > dup_parm)) {general_table_flag <- 1}

    }

  ####################################
  # check 1.4 -> missing school year #
  ####################################

    # highlight missing school year
    general_table_dt <- formatStyle(general_table_dt, columns = "demo_school_year", backgroundColor = styleEqual(c("missing"), c(c_problem)))

    # internal flag for missing school year
    if ("missing" %chin% unique(general_table$demo_school_year)) {general_table_flag <- 1}


#########################################
# TABLE 1.b: GRADE DISTRIBUTION BY YEAR #
#########################################

  # remove exact duplicates
  sub_demo <- ea_no_dups(in_demo, opt_key_all = 1)

  # calc # students per grade
  grade_distribution <- ea_table(sub_demo, c("demo_school_year", "student_grade"))

  # cast wide
  grade_distribution <- dcast.data.table(grade_distribution, demo_school_year ~ student_grade, value.var = "count")

  # convert NA values to zero
  grade_distribution[is.na(grade_distribution)] <- 0  

   # make column name grade "missing" if it exists
  if ("99" %chin% grade_list) {setnames(grade_distribution, "99", "missing")}

  # create new grade_list
  grade_list_t1 <- copy(grade_list)

  # replace "99" with "missing" for new grade list
  if ("99" %chin% grade_list_t1) {grade_list_t1 <- gsub("99", "missing", grade_list_t1)}

  # set column order
  ea_colorder(grade_distribution, c("demo_school_year", grade_list_t1))

  # convert to datatable
  grade_dist_dt <- datatable(grade_distribution, rownames = FALSE, extensions = "KeyTable")

  # set flag_grade_dist to 0
  flag_grade_dist <- 0

  ###############################################################
  # check 1.b loop through grade_list_t1 and highlight outliers #
  ###############################################################

    # flag large disparities
    for (m_grade in grade_list_t1){

      # set intervals
      lower_int <- grade_distribution[demo_school_year != "missing", mean(get(m_grade)) - mean(get(m_grade))*(outlier_parm)]
      upper_int <- grade_distribution[demo_school_year != "missing", mean(get(m_grade)) + mean(get(m_grade))*(outlier_parm)]

      # apply color scheme
      if (is.na(lower_int)==0) {grade_dist_dt <- formatStyle(grade_dist_dt, columns = paste0(m_grade), background = styleInterval(c(lower_int, upper_int), c(c_problem, c_good, c_problem)))}

      # trigger internal flag if outliers exists
      if ((is.na(lower_int)==0) & ( (grade_distribution[, min(get(m_grade))] < lower_int) | (grade_distribution[, max(get(m_grade))] > upper_int))) {flag_grade_dist <- 1}

    }


####################################
# TABLE 2: MISSING VARIABLES TABLE #
####################################

  # create a copy of in_demo as a data.frame
  missing_vars <- as.data.frame(copy(in_demo))

  # subset test dataset to missing vars
  missing_vars <-  missing_vars[sapply(missing_vars, function(x) all(is.na(x)))]

  # create list of all missing vars
  miss_var_list <- colnames(missing_vars)

  # create data.table of remaining columns
  miss_var_table <- as.data.table(colnames(in_demo))

  # setnames
  setnames(miss_var_table, "V1", "variable")

  # create a missing flag
  miss_var_table[, missing := ifelse(variable %chin% miss_var_list, 1,0)]

  # datatable 
  miss_var_dt <- datatable(miss_var_table, rownames = FALSE, extensions = "Scroller", options=list(deferRender=TRUE, scrollY=400, scrollCollapse=TRUE))

  # apply color scheme: highlight missing vars
  miss_var_dt <- formatStyle(miss_var_dt, columns = "missing", background = styleEqual(c(0, 1), c(c_good, c_problem)))

  # check 2.1 -> internal flag for list of missing vars
  if (length(miss_var_list)>0) {
    missing_vars_flag <- 1
  } else { missing_vars_flag <- 0}


#################################
# TABLE 3: MISSING DATA % TABLE #
#################################

  # create table (from sub_demo, which is the raw dataset with exact dups removed)
  missing_table <- sub_demo[ , list(total_n_obs         = .N,
                                    missing_student_id  = sum(is.na(ea_student_id)),
                                    missing_grade       = sum(is.na(student_grade)),
                                    missing_gender      = sum(d_gender_missing == 1, na.rm = TRUE),
                                    missing_race        = sum(d_race_missing   == 1, na.rm = TRUE),
                                    missing_frl         = sum(d_frl_missing    == 1, na.rm = TRUE),
                                    missing_ell         = sum(d_ell_missing    == 1, na.rm = TRUE),
                                    missing_sped        = sum(d_sped_missing   == 1, na.rm = TRUE)),
                                by="demo_school_year"]

  # add %'s to table
  missing_table[, perc_miss_stud_id     := round( (missing_student_id/total_n_obs)*100,2)]
  missing_table[, perc_miss_grade       := round( (missing_grade/total_n_obs)*100,2)]
  missing_table[, perc_miss_gender      := round( (missing_gender/total_n_obs)*100,2)]
  missing_table[, perc_miss_race        := round( (missing_race/total_n_obs)*100,2)]
  missing_table[, perc_miss_frl         := round( (missing_frl/total_n_obs)*100,2)]
  missing_table[, perc_miss_ell         := round( (missing_ell/total_n_obs)*100,2)]
  missing_table[, perc_miss_sped        := round( (missing_sped/total_n_obs)*100,2)]

  # sort column order
  ea_colorder(missing_table, c("demo_school_year", "total_n_obs", "missing_student_id", "perc_miss_stud_id", "missing_grade", "perc_miss_grade",
                               "missing_gender", "perc_miss_gender", "missing_race", "perc_miss_race", "missing_frl", "perc_miss_frl", "missing_ell", "perc_miss_ell",
                               "missing_sped", "perc_miss_sped"))

  # sort years from high to low
  missing_table <- missing_table[order(-(demo_school_year))]

  # the following code pre-hides the "extra" columns and allows you to open more if you want more info
  missing_dt <- datatable(missing_table, rownames=FALSE, extensions = 'ColVis', options = list(columnDefs = list(list(targets = c(1,2,4,6,8,10,12,14), visible = FALSE)),
                                                                                               dom = 'C<"clear">lfrtip'))

  ##################################
  # flag large %'s of missing data #
  ##################################

    # .i. check 3.1 -> 3.7 apply color scheme to all percent missings from "missing_table"
    missing_dt <- formatStyle(missing_dt, columns = c("perc_miss_stud_id", "perc_miss_grade","perc_miss_gender", "perc_miss_race", "perc_miss_frl", "perc_miss_ell","perc_miss_sped"),
                              background = styleInterval(c(0, missing_parm), c(c_good, c_warning, c_problem)))
    # internal flag for missing table
    if ( (max(missing_table$perc_miss_stud_id) > missing_parm) | (max(missing_table$perc_miss_grade) > missing_parm) |  
         (max(missing_table$perc_miss_gender) > missing_parm) | (max(missing_table$perc_miss_race) > missing_parm) | 
         (max(missing_table$perc_miss_frl) > missing_parm) | (max(missing_table$perc_miss_ell) > missing_parm) | 
         (max(missing_table$perc_miss_sped) > missing_parm)) {

      missing_table_flag <- 1
    } else {
      missing_table_flag <- 0}  


###################################################
# Table 4: PRIMARY DUMMY FREQUENCIES ACROSS YEARS #
###################################################

  # note: made from dummy vars

  # calculate %'s by year (from sub_demo, which is the raw dataset with exact dups removed)
  prim_dum <- sub_demo[, list(
                             perc_male     = (sum(d_gender_m==1)/ (sum(d_gender_f==1) + sum(d_gender_m==1))),
                             perc_frl_y    = (sum(d_frl_y==1)/ (sum(d_frl_y==1) + sum(d_frl_n==1))),
                             perc_ell_y    = (sum(d_ell_y==1)/ (sum(d_ell_y==1) + sum(d_ell_n==1))),
                             perc_sped_y   = (sum(d_sped_y==1)/ (sum(d_sped_n==1) + sum(d_sped_y==1))),
                             perc_asian    = (sum(d_race_asian==1) / sum(is.na(ea_student_race)==0)),
                             perc_black    = (sum(d_race_black==1) / sum(is.na(ea_student_race)==0)),
                             perc_hispanic = (sum(d_race_hispanic==1) / sum(is.na(ea_student_race)==0)),
                             perc_islander = (sum(d_race_islander==1) / sum(is.na(ea_student_race)==0)),
                             perc_multi    = (sum(d_race_multi==1) / sum(is.na(ea_student_race)==0)),
                             perc_native   = (sum(d_race_native==1) / sum(is.na(ea_student_race)==0)),
                             perc_white    = (sum(d_race_white==1) / sum(is.na(ea_student_race)==0))),
                      by="demo_school_year"]

  # convert NA's to 0
  prim_dum[is.na(prim_dum)] <- 0

  # create list of colnames (not demo_school_year)
  prim_vars <- setdiff(colnames(prim_dum), "demo_school_year")

  # multiply all %'s by 100
  prim_dum[, prim_vars] <- lapply(prim_dum[, prim_vars, with=FALSE], function(x){ x <- round(x*100,2)})

  # sort years from high to low
  prim_dum <- prim_dum[order(-(demo_school_year))]

  # .v. convert to datatable
  prim_dum_dt <- datatable(prim_dum, rownames = FALSE, extensions = "KeyTable")

    ###########################################################################
    # check 4.1 -> 4.12 loop through primary variables and highlight outliers #
    ###########################################################################

    # flag large disparities
    for (m_var in prim_vars){

      # set intervals
      lower_int <- prim_dum[demo_school_year != "missing", mean(get(m_var)) - mean(get(m_var))*(outlier_parm)]
      upper_int <- prim_dum[demo_school_year != "missing", mean(get(m_var)) + mean(get(m_var))*(outlier_parm)]

      # if both intervals are 0, change so doesn't get highlighted
      if (lower_int == 0 & upper_int == 0) 
            {lower_int <- -1
             upper_int <-  1}

      # apply color scheme
      if (is.na(lower_int)==0) {prim_dum_dt <- formatStyle(prim_dum_dt, columns = paste0(m_var), background = styleInterval(c(lower_int, upper_int), c(c_problem, c_good, c_problem)))}

      # set internal flag to zero
      flag_prim_dum <- 0

      # trigger internal flag if outliers exists
      if ( (is.na(lower_int)==0) & ( (prim_dum[, min(get(m_var))] < lower_int) | (prim_dum[, max(get(m_var))] > upper_int))) {flag_prim_dum <- 1}

    }

#####################################################
# TABLE 5: PRIMARY VARIABLE CATEGORIES ACROSS YEARS #
#####################################################

  # list to loop over
  loop_list <- c("ea_student_race", "ea_student_frl_status", "ea_student_sped_status", "ea_student_ell_status")

  # start loop 
  for (m_loop in loop_list){

    # initially set internal category flag to zero
    m_flag <- 0

    # only proceed if variable is not missing
    if (! (m_loop %chin% miss_var_list)){

      # calculate crosstab table (from sub_demo, which is the raw dataset with exact dups removed)
      m_table <- ea_table(sub_demo, c("demo_school_year", m_loop))

      # cast table wide
      m_table <- dcast.data.table(m_table, demo_school_year ~ get(m_loop))

      # make all NA values = 0
      m_table[is.na(m_table)] <- 0

      # .iv. sort years from high to low
      m_table <- m_table[order(-(demo_school_year))]

      # .v. convert to datatable
      m_dt <- datatable(m_table, rownames = FALSE, extensions = "KeyTable")

      ###################################################################
      # check 5.1 -> 5.4 loop through categories and highlight outliers #
      ###################################################################

        # create list of races to loop over
        category_list <- in_demo[, unique(get(m_loop))]

        # flag large disparities
        for (m_cat in category_list){

          # set intervals
          lower_int <- m_table[demo_school_year != "missing", mean(get(m_cat)) - mean(get(m_cat))*(outlier_parm)]
          upper_int <- m_table[demo_school_year != "missing", mean(get(m_cat)) + mean(get(m_cat))*(outlier_parm)]

          # apply color scheme
          if (is.na(lower_int)==0) {m_dt <- formatStyle(m_dt, columns = paste0(m_cat), background = styleInterval(c(lower_int, upper_int), c(c_problem, c_good, c_problem)))}

          # trigger internal flag if outliers exists
          if ((is.na(lower_int)==0) & ( (m_table[, min(get(m_cat))] < lower_int) | (m_table[, max(get(m_cat))] > upper_int))) {m_flag <- 1}

        }

        # save flag and DT
        assign(paste0("flag_cat_", ea_scan(m_loop, 3, "_")), m_flag)
        assign(paste0(ea_scan(m_loop, 3, "_"), "_cat_dt"), m_dt)

    }
  }

##################
# to be added....#
##################

# across grades/years/terms Cohort info for students, flag potential issues  

# Missing data 
  # need to differentiate between mandatory variables and optional vars

# ID issues
  # notify when more than one student ID exists in raw data, which to use
  # if both state/local ids exist, analyze their uniqueness & overlap
  # Check for bad id's
    # non alpha-numeric numbers
    # leading zeroes
    # scientific notation

# By school info
  # grade spans per school & year
    # check for missing grades, weird spans, large changes

  # number of students per school & year
    # by school
    # by school and grade
    # flag any large changes in n-sizes at a school across years


  # an overall flag for whether or nor the headings "flagged/unflagged should appear aka...is anything flagged?" -->


Flagged Checks

Below are all the the quality control checks that have been flagged for possible problems


r if (general_table_flag==1) '### general table'

##########################
# table 1: general table #
##########################

if (general_table_flag==1){  general_table_dt}


r if (flag_grade_dist==1) '### grade distribution'

#################################
# table 1.b: grade distribution #
#################################

if (flag_grade_dist==1){grade_dist_dt}


r if (missing_vars_flag==1) '### missing variables'

##############################
# table 2: missing variables #
##############################

if (missing_vars_flag==1) {miss_var_dt}


r if (missing_table_flag==1) '### missing data percentages'

###############################
# table 3: missing data table #
###############################

if (missing_table_flag==1) {missing_dt}


r if (flag_prim_dum==1) '### primary dummy frequencies (by year)' r if (flag_prim_dum==1) '* percentages calculated from dummy variables'

###################################################
# table 4: primary dummy frequencies across years #
###################################################

if (flag_prim_dum==1) {prim_dum_dt}


r if (exists("flag_cat_race")) if(flag_cat_race==1) '### student race categories (by year) ' r if (exists("flag_cat_race")) if(flag_cat_race==1) '* numbers calculated from "ea_student_race" variable '

################################################
# table 5.a: student race categories (by year) #
################################################

if (exists("flag_cat_race")){

  if(flag_cat_race==1) {race_cat_dt}
}


r if (exists("flag_cat_frl")) if(flag_cat_frl==1) '### student frl categories (by year) ' r if (exists("flag_cat_frl")) if(flag_cat_frl==1) '* numbers calculated from "ea_student_frl_status" variable '

###############################################
# table 5.b: student frl categories (by year) #
###############################################

if (exists("flag_cat_frl")){

  if(flag_cat_frl==1) {frl_cat_dt}

}


r if (exists("flag_cat_sped")) if(flag_cat_sped==1) '### student sped categories (by year) ' r if (exists("flag_cat_sped")) if(flag_cat_sped==1) '* numbers calculated from "ea_student_sped_status" variable '

################################################
# table 5.c: student sped categories (by year) #
################################################

if (exists("flag_cat_sped")){

  if(flag_cat_sped==1) {sped_cat_dt}

}  


r if (exists("flag_cat_ell")) if(flag_cat_ell==1) '### student ell categories (by year) ' r if (exists("flag_cat_ell")) if(flag_cat_ell==1) '* numbers calculated from "ea_student_ell_status" variable '

###################################################
# table 5.d: ### student ell categories (by year) #
###################################################

if (exists("flag_cat_ell")){

  if(flag_cat_ell==1) {ell_cat_dt}
}



Unflagged Checks

Below are all the the qc_checks that were not flagged for possible problems


r if (general_table_flag==0) '### general table'

##########################
# table 1: general table #
##########################

if (general_table_flag==0){  general_table_dt}


r if (flag_grade_dist==0) '### grade distribution'

#################################
# table 1.b: grade distribution #
#################################

if (flag_grade_dist==0){grade_dist_dt}


r if (missing_vars_flag==0) '### missing variables'

##############################
# table 2: missing variables #
##############################

if (missing_vars_flag==0) {miss_var_dt}


r if (missing_table_flag==0) '### missing data percentages'

###############################
# table 3: missing data table #
###############################

if (missing_table_flag==0) {missing_dt}


r if (flag_prim_dum==0) '### primary dummy frequencies (by year)' r if (flag_prim_dum==0) '* percentages calculated from dummy variables'

###################################################
# table 4: primary dummy frequencies across years #
###################################################

if (flag_prim_dum==0) {prim_dum_dt}


r if (exists("flag_cat_race")) if(flag_cat_race==0) '### student race categories (by year) ' r if (exists("flag_cat_race")) if(flag_cat_race==0) '* numbers calculated from "ea_student_race" variable '

################################################
# table 5.a: student race categories (by year) #
################################################

if (exists("flag_cat_race")){

  if(flag_cat_race==0) {race_cat_dt}
}


r if (exists("flag_cat_frl")) if(flag_cat_frl==0) '### student frl categories (by year) ' r if (exists("flag_cat_frl")) if(flag_cat_frl==0) '* numbers calculated from "ea_student_frl_status" variable '

###############################################
# table 5.b: student frl categories (by year) #
###############################################

if (exists("flag_cat_frl")){
  if(flag_cat_frl==0) {frl_cat_dt}
}


r if (exists("flag_cat_sped")) if(flag_cat_sped==0) '### student sped categories (by year) ' r if (exists("flag_cat_sped")) if(flag_cat_sped==0) '* numbers calculated from "ea_student_sped_status" variable '

################################################
# table 5.c: student sped categories (by year) #
################################################

if (exists("flag_cat_sped")){

  if (flag_cat_sped==0) {sped_cat_dt}

}


r if (exists("flag_cat_ell")) if(flag_cat_ell==0) '### student ell categories (by year) ' r if (exists("flag_cat_ell")) if(flag_cat_ell==0) '* numbers calculated from "ea_student_ell_status" variable '

###################################################
# table 5.d: ### student ell categories (by year) #
###################################################

if (exists("flag_cat_ell")){

  if (flag_cat_ell==0) {ell_cat_dt}
}


zrussek/eatestpackage documentation built on May 5, 2019, 1:37 a.m.