# load packages library(data.table) library(DT) library(easimple) library(gridExtra) library(ggplot2) # create copy of dems data in_demo <- copy(params$file_name) ################### # save parameters # ################### # interval parameter outlier_parm <- params$outlier_parm # duplicate parameter dup_parm <- params$duplicate_parm # threshold for % of missing... missing_parm <- params$missing_parm # set colors c_good <- "white" c_warning <- "#FFEB9C" c_problem <- "#FFC7CE" ###################### # general formatting # ###################### # create list of all dummy vars dummy_vars <- grep("^(d_)", colnames(in_demo), value = TRUE) # convert dummy vars to numeric in_demo[, dummy_vars] <- lapply(in_demo[, dummy_vars, with=FALSE], as.numeric) # convert NA grades to 99 and NA school years to missing in_demo[is.na(student_grade) , student_grade := 99] in_demo[is.na(demo_school_year), demo_school_year := "missing"] # make student_grade two characters in_demo[nchar(as.character(student_grade)) == 1 & !grepl("k", student_grade), student_grade := paste0(0, student_grade)] ################ # create lists # ################ # list of years year_list <- unique(in_demo$demo_school_year) # list of grades grade_list <- unique(in_demo$student_grade) # sort grade list low to high grade_list <- sort(grade_list, decreasing=FALSE) # put k and pk on front end of list if they exists if ( ("k" %chin% grade_list) & ("pk" %chin% grade_list)) { other_grades <- setdiff(grade_list, c("k", "pk")) other_grades <- sort(other_grades, decreasing=FALSE) grade_list <- c("pk", "k", other_grades)} # put k on front end of list if they exists if ( ("k" %chin% grade_list) & !("pk" %chin% grade_list)) { other_grades <- setdiff(grade_list, c("k")) other_grades <- sort(other_grades, decreasing=FALSE) grade_list <- c("k", other_grades)} # put pk on front end of list if they exists if ( !("k" %chin% grade_list) & ("pk" %chin% grade_list)) { other_grades <- setdiff(grade_list, c("pk")) other_grades <- sort(other_grades, decreasing=FALSE) grade_list <- c("pk", other_grades)} ########################## # TABLE 1: GENERAL TABLE # ########################## # .i. create general table general_table <- in_demo[, list(total_n_obs = .N, n_unique_students = length(unique(ea_student_id))), by="demo_school_year"] # .ii. output n_exact duplicate rows exact_dups <- ea_out_dups(in_demo, opt_key_all = 1) # .ii. calcualte number of exact rows per year exact_dups_table <- exact_dups[, list(exact_dup_row=.N), by="demo_school_year"] # .ii. merge duplicate row info onto general table general_table <- ea_merge(general_table, exact_dups_table, "demo_school_year", "x") # .ii. create a % of duplicate rows general_table[, perc_exact_dup_row := round((exact_dup_row/total_n_obs)*100, 2)] # .iii. make all NA values = 0 general_table[is.na(general_table)] <- 0 # .iv. sort years from high to low general_table <- general_table[order(-(demo_school_year))] # .v. convert to datatable general_table_dt <- datatable(general_table, rownames = FALSE, extensions = "KeyTable") ####################################### # flag large disparities across years # ####################################### # set general_table_flag to zero general_table_flag <- 0 # set list of vars to check in loop gen_vars <- c("total_n_obs", "n_unique_students") ########################################################################## # check 1.1 & 1.2 -> outliers of total_n_obs & n_unique_students by year # ########################################################################## # flag large disparities for (m_var in gen_vars){ # set intervals lower_int <- general_table[demo_school_year != "missing", mean(get(m_var)) - mean(get(m_var))*(outlier_parm)] upper_int <- general_table[demo_school_year != "missing", mean(get(m_var)) + mean(get(m_var))*(outlier_parm)] # apply color scheme if (is.na(lower_int)==0) {general_table_dt <- formatStyle(general_table_dt, columns = paste0(m_var), background = styleInterval(c(lower_int, upper_int), c(c_problem, c_good, c_problem)))} # trigger internal flag if outliers exists if ( (is.na(lower_int)==0) & ( (general_table[, min(get(m_var))] < lower_int) | (general_table[, max(get(m_var))] > upper_int))) {general_table_flag <- 1} } ######################################################################### # check 1.3 -> % of duplicate rows is higher than the missing threshold # ######################################################################### # apply color scheme to % exact duplicate row general_table_dt <- formatStyle(general_table_dt, columns = "perc_exact_dup_row", background = styleInterval(c(0, dup_parm), c(c_good, c_warning, c_problem))) # internal flag for general table if (sum(general_table$perc_exact_dup_row, na.rm = TRUE)!=0) { if ( (min(general_table$perc_exact_dup_row, na.rm = TRUE) > dup_parm)) {general_table_flag <- 1} } #################################### # check 1.4 -> missing school year # #################################### # highlight missing school year general_table_dt <- formatStyle(general_table_dt, columns = "demo_school_year", backgroundColor = styleEqual(c("missing"), c(c_problem))) # internal flag for missing school year if ("missing" %chin% unique(general_table$demo_school_year)) {general_table_flag <- 1} ######################################### # TABLE 1.b: GRADE DISTRIBUTION BY YEAR # ######################################### # remove exact duplicates sub_demo <- ea_no_dups(in_demo, opt_key_all = 1) # calc # students per grade grade_distribution <- ea_table(sub_demo, c("demo_school_year", "student_grade")) # cast wide grade_distribution <- dcast.data.table(grade_distribution, demo_school_year ~ student_grade, value.var = "count") # convert NA values to zero grade_distribution[is.na(grade_distribution)] <- 0 # make column name grade "missing" if it exists if ("99" %chin% grade_list) {setnames(grade_distribution, "99", "missing")} # create new grade_list grade_list_t1 <- copy(grade_list) # replace "99" with "missing" for new grade list if ("99" %chin% grade_list_t1) {grade_list_t1 <- gsub("99", "missing", grade_list_t1)} # set column order ea_colorder(grade_distribution, c("demo_school_year", grade_list_t1)) # convert to datatable grade_dist_dt <- datatable(grade_distribution, rownames = FALSE, extensions = "KeyTable") # set flag_grade_dist to 0 flag_grade_dist <- 0 ############################################################### # check 1.b loop through grade_list_t1 and highlight outliers # ############################################################### # flag large disparities for (m_grade in grade_list_t1){ # set intervals lower_int <- grade_distribution[demo_school_year != "missing", mean(get(m_grade)) - mean(get(m_grade))*(outlier_parm)] upper_int <- grade_distribution[demo_school_year != "missing", mean(get(m_grade)) + mean(get(m_grade))*(outlier_parm)] # apply color scheme if (is.na(lower_int)==0) {grade_dist_dt <- formatStyle(grade_dist_dt, columns = paste0(m_grade), background = styleInterval(c(lower_int, upper_int), c(c_problem, c_good, c_problem)))} # trigger internal flag if outliers exists if ((is.na(lower_int)==0) & ( (grade_distribution[, min(get(m_grade))] < lower_int) | (grade_distribution[, max(get(m_grade))] > upper_int))) {flag_grade_dist <- 1} } #################################### # TABLE 2: MISSING VARIABLES TABLE # #################################### # create a copy of in_demo as a data.frame missing_vars <- as.data.frame(copy(in_demo)) # subset test dataset to missing vars missing_vars <- missing_vars[sapply(missing_vars, function(x) all(is.na(x)))] # create list of all missing vars miss_var_list <- colnames(missing_vars) # create data.table of remaining columns miss_var_table <- as.data.table(colnames(in_demo)) # setnames setnames(miss_var_table, "V1", "variable") # create a missing flag miss_var_table[, missing := ifelse(variable %chin% miss_var_list, 1,0)] # datatable miss_var_dt <- datatable(miss_var_table, rownames = FALSE, extensions = "Scroller", options=list(deferRender=TRUE, scrollY=400, scrollCollapse=TRUE)) # apply color scheme: highlight missing vars miss_var_dt <- formatStyle(miss_var_dt, columns = "missing", background = styleEqual(c(0, 1), c(c_good, c_problem))) # check 2.1 -> internal flag for list of missing vars if (length(miss_var_list)>0) { missing_vars_flag <- 1 } else { missing_vars_flag <- 0} ################################# # TABLE 3: MISSING DATA % TABLE # ################################# # create table (from sub_demo, which is the raw dataset with exact dups removed) missing_table <- sub_demo[ , list(total_n_obs = .N, missing_student_id = sum(is.na(ea_student_id)), missing_grade = sum(is.na(student_grade)), missing_gender = sum(d_gender_missing == 1, na.rm = TRUE), missing_race = sum(d_race_missing == 1, na.rm = TRUE), missing_frl = sum(d_frl_missing == 1, na.rm = TRUE), missing_ell = sum(d_ell_missing == 1, na.rm = TRUE), missing_sped = sum(d_sped_missing == 1, na.rm = TRUE)), by="demo_school_year"] # add %'s to table missing_table[, perc_miss_stud_id := round( (missing_student_id/total_n_obs)*100,2)] missing_table[, perc_miss_grade := round( (missing_grade/total_n_obs)*100,2)] missing_table[, perc_miss_gender := round( (missing_gender/total_n_obs)*100,2)] missing_table[, perc_miss_race := round( (missing_race/total_n_obs)*100,2)] missing_table[, perc_miss_frl := round( (missing_frl/total_n_obs)*100,2)] missing_table[, perc_miss_ell := round( (missing_ell/total_n_obs)*100,2)] missing_table[, perc_miss_sped := round( (missing_sped/total_n_obs)*100,2)] # sort column order ea_colorder(missing_table, c("demo_school_year", "total_n_obs", "missing_student_id", "perc_miss_stud_id", "missing_grade", "perc_miss_grade", "missing_gender", "perc_miss_gender", "missing_race", "perc_miss_race", "missing_frl", "perc_miss_frl", "missing_ell", "perc_miss_ell", "missing_sped", "perc_miss_sped")) # sort years from high to low missing_table <- missing_table[order(-(demo_school_year))] # the following code pre-hides the "extra" columns and allows you to open more if you want more info missing_dt <- datatable(missing_table, rownames=FALSE, extensions = 'ColVis', options = list(columnDefs = list(list(targets = c(1,2,4,6,8,10,12,14), visible = FALSE)), dom = 'C<"clear">lfrtip')) ################################## # flag large %'s of missing data # ################################## # .i. check 3.1 -> 3.7 apply color scheme to all percent missings from "missing_table" missing_dt <- formatStyle(missing_dt, columns = c("perc_miss_stud_id", "perc_miss_grade","perc_miss_gender", "perc_miss_race", "perc_miss_frl", "perc_miss_ell","perc_miss_sped"), background = styleInterval(c(0, missing_parm), c(c_good, c_warning, c_problem))) # internal flag for missing table if ( (max(missing_table$perc_miss_stud_id) > missing_parm) | (max(missing_table$perc_miss_grade) > missing_parm) | (max(missing_table$perc_miss_gender) > missing_parm) | (max(missing_table$perc_miss_race) > missing_parm) | (max(missing_table$perc_miss_frl) > missing_parm) | (max(missing_table$perc_miss_ell) > missing_parm) | (max(missing_table$perc_miss_sped) > missing_parm)) { missing_table_flag <- 1 } else { missing_table_flag <- 0} ################################################### # Table 4: PRIMARY DUMMY FREQUENCIES ACROSS YEARS # ################################################### # note: made from dummy vars # calculate %'s by year (from sub_demo, which is the raw dataset with exact dups removed) prim_dum <- sub_demo[, list( perc_male = (sum(d_gender_m==1)/ (sum(d_gender_f==1) + sum(d_gender_m==1))), perc_frl_y = (sum(d_frl_y==1)/ (sum(d_frl_y==1) + sum(d_frl_n==1))), perc_ell_y = (sum(d_ell_y==1)/ (sum(d_ell_y==1) + sum(d_ell_n==1))), perc_sped_y = (sum(d_sped_y==1)/ (sum(d_sped_n==1) + sum(d_sped_y==1))), perc_asian = (sum(d_race_asian==1) / sum(is.na(ea_student_race)==0)), perc_black = (sum(d_race_black==1) / sum(is.na(ea_student_race)==0)), perc_hispanic = (sum(d_race_hispanic==1) / sum(is.na(ea_student_race)==0)), perc_islander = (sum(d_race_islander==1) / sum(is.na(ea_student_race)==0)), perc_multi = (sum(d_race_multi==1) / sum(is.na(ea_student_race)==0)), perc_native = (sum(d_race_native==1) / sum(is.na(ea_student_race)==0)), perc_white = (sum(d_race_white==1) / sum(is.na(ea_student_race)==0))), by="demo_school_year"] # convert NA's to 0 prim_dum[is.na(prim_dum)] <- 0 # create list of colnames (not demo_school_year) prim_vars <- setdiff(colnames(prim_dum), "demo_school_year") # multiply all %'s by 100 prim_dum[, prim_vars] <- lapply(prim_dum[, prim_vars, with=FALSE], function(x){ x <- round(x*100,2)}) # sort years from high to low prim_dum <- prim_dum[order(-(demo_school_year))] # .v. convert to datatable prim_dum_dt <- datatable(prim_dum, rownames = FALSE, extensions = "KeyTable") ########################################################################### # check 4.1 -> 4.12 loop through primary variables and highlight outliers # ########################################################################### # flag large disparities for (m_var in prim_vars){ # set intervals lower_int <- prim_dum[demo_school_year != "missing", mean(get(m_var)) - mean(get(m_var))*(outlier_parm)] upper_int <- prim_dum[demo_school_year != "missing", mean(get(m_var)) + mean(get(m_var))*(outlier_parm)] # if both intervals are 0, change so doesn't get highlighted if (lower_int == 0 & upper_int == 0) {lower_int <- -1 upper_int <- 1} # apply color scheme if (is.na(lower_int)==0) {prim_dum_dt <- formatStyle(prim_dum_dt, columns = paste0(m_var), background = styleInterval(c(lower_int, upper_int), c(c_problem, c_good, c_problem)))} # set internal flag to zero flag_prim_dum <- 0 # trigger internal flag if outliers exists if ( (is.na(lower_int)==0) & ( (prim_dum[, min(get(m_var))] < lower_int) | (prim_dum[, max(get(m_var))] > upper_int))) {flag_prim_dum <- 1} } ##################################################### # TABLE 5: PRIMARY VARIABLE CATEGORIES ACROSS YEARS # ##################################################### # list to loop over loop_list <- c("ea_student_race", "ea_student_frl_status", "ea_student_sped_status", "ea_student_ell_status") # start loop for (m_loop in loop_list){ # initially set internal category flag to zero m_flag <- 0 # only proceed if variable is not missing if (! (m_loop %chin% miss_var_list)){ # calculate crosstab table (from sub_demo, which is the raw dataset with exact dups removed) m_table <- ea_table(sub_demo, c("demo_school_year", m_loop)) # cast table wide m_table <- dcast.data.table(m_table, demo_school_year ~ get(m_loop)) # make all NA values = 0 m_table[is.na(m_table)] <- 0 # .iv. sort years from high to low m_table <- m_table[order(-(demo_school_year))] # .v. convert to datatable m_dt <- datatable(m_table, rownames = FALSE, extensions = "KeyTable") ################################################################### # check 5.1 -> 5.4 loop through categories and highlight outliers # ################################################################### # create list of races to loop over category_list <- in_demo[, unique(get(m_loop))] # flag large disparities for (m_cat in category_list){ # set intervals lower_int <- m_table[demo_school_year != "missing", mean(get(m_cat)) - mean(get(m_cat))*(outlier_parm)] upper_int <- m_table[demo_school_year != "missing", mean(get(m_cat)) + mean(get(m_cat))*(outlier_parm)] # apply color scheme if (is.na(lower_int)==0) {m_dt <- formatStyle(m_dt, columns = paste0(m_cat), background = styleInterval(c(lower_int, upper_int), c(c_problem, c_good, c_problem)))} # trigger internal flag if outliers exists if ((is.na(lower_int)==0) & ( (m_table[, min(get(m_cat))] < lower_int) | (m_table[, max(get(m_cat))] > upper_int))) {m_flag <- 1} } # save flag and DT assign(paste0("flag_cat_", ea_scan(m_loop, 3, "_")), m_flag) assign(paste0(ea_scan(m_loop, 3, "_"), "_cat_dt"), m_dt) } } ################## # to be added....# ################## # across grades/years/terms Cohort info for students, flag potential issues # Missing data # need to differentiate between mandatory variables and optional vars # ID issues # notify when more than one student ID exists in raw data, which to use # if both state/local ids exist, analyze their uniqueness & overlap # Check for bad id's # non alpha-numeric numbers # leading zeroes # scientific notation # By school info # grade spans per school & year # check for missing grades, weird spans, large changes # number of students per school & year # by school # by school and grade # flag any large changes in n-sizes at a school across years # an overall flag for whether or nor the headings "flagged/unflagged should appear aka...is anything flagged?" -->
Below are all the the quality control checks that have been flagged for possible problems
r if (general_table_flag==1) '### general table'
########################## # table 1: general table # ########################## if (general_table_flag==1){ general_table_dt}
r if (flag_grade_dist==1) '### grade distribution'
################################# # table 1.b: grade distribution # ################################# if (flag_grade_dist==1){grade_dist_dt}
r if (missing_vars_flag==1) '### missing variables'
############################## # table 2: missing variables # ############################## if (missing_vars_flag==1) {miss_var_dt}
r if (missing_table_flag==1) '### missing data percentages'
############################### # table 3: missing data table # ############################### if (missing_table_flag==1) {missing_dt}
r if (flag_prim_dum==1) '### primary dummy frequencies (by year)'
r if (flag_prim_dum==1) '* percentages calculated from dummy variables'
################################################### # table 4: primary dummy frequencies across years # ################################################### if (flag_prim_dum==1) {prim_dum_dt}
r if (exists("flag_cat_race")) if(flag_cat_race==1) '### student race categories (by year) '
r if (exists("flag_cat_race")) if(flag_cat_race==1) '* numbers calculated from "ea_student_race" variable '
################################################ # table 5.a: student race categories (by year) # ################################################ if (exists("flag_cat_race")){ if(flag_cat_race==1) {race_cat_dt} }
r if (exists("flag_cat_frl")) if(flag_cat_frl==1) '### student frl categories (by year) '
r if (exists("flag_cat_frl")) if(flag_cat_frl==1) '* numbers calculated from "ea_student_frl_status" variable '
############################################### # table 5.b: student frl categories (by year) # ############################################### if (exists("flag_cat_frl")){ if(flag_cat_frl==1) {frl_cat_dt} }
r if (exists("flag_cat_sped")) if(flag_cat_sped==1) '### student sped categories (by year) '
r if (exists("flag_cat_sped")) if(flag_cat_sped==1) '* numbers calculated from "ea_student_sped_status" variable '
################################################ # table 5.c: student sped categories (by year) # ################################################ if (exists("flag_cat_sped")){ if(flag_cat_sped==1) {sped_cat_dt} }
r if (exists("flag_cat_ell")) if(flag_cat_ell==1) '### student ell categories (by year) '
r if (exists("flag_cat_ell")) if(flag_cat_ell==1) '* numbers calculated from "ea_student_ell_status" variable '
################################################### # table 5.d: ### student ell categories (by year) # ################################################### if (exists("flag_cat_ell")){ if(flag_cat_ell==1) {ell_cat_dt} }
Below are all the the qc_checks that were not flagged for possible problems
r if (general_table_flag==0) '### general table'
########################## # table 1: general table # ########################## if (general_table_flag==0){ general_table_dt}
r if (flag_grade_dist==0) '### grade distribution'
################################# # table 1.b: grade distribution # ################################# if (flag_grade_dist==0){grade_dist_dt}
r if (missing_vars_flag==0) '### missing variables'
############################## # table 2: missing variables # ############################## if (missing_vars_flag==0) {miss_var_dt}
r if (missing_table_flag==0) '### missing data percentages'
############################### # table 3: missing data table # ############################### if (missing_table_flag==0) {missing_dt}
r if (flag_prim_dum==0) '### primary dummy frequencies (by year)'
r if (flag_prim_dum==0) '* percentages calculated from dummy variables'
################################################### # table 4: primary dummy frequencies across years # ################################################### if (flag_prim_dum==0) {prim_dum_dt}
r if (exists("flag_cat_race")) if(flag_cat_race==0) '### student race categories (by year) '
r if (exists("flag_cat_race")) if(flag_cat_race==0) '* numbers calculated from "ea_student_race" variable '
################################################ # table 5.a: student race categories (by year) # ################################################ if (exists("flag_cat_race")){ if(flag_cat_race==0) {race_cat_dt} }
r if (exists("flag_cat_frl")) if(flag_cat_frl==0) '### student frl categories (by year) '
r if (exists("flag_cat_frl")) if(flag_cat_frl==0) '* numbers calculated from "ea_student_frl_status" variable '
############################################### # table 5.b: student frl categories (by year) # ############################################### if (exists("flag_cat_frl")){ if(flag_cat_frl==0) {frl_cat_dt} }
r if (exists("flag_cat_sped")) if(flag_cat_sped==0) '### student sped categories (by year) '
r if (exists("flag_cat_sped")) if(flag_cat_sped==0) '* numbers calculated from "ea_student_sped_status" variable '
################################################ # table 5.c: student sped categories (by year) # ################################################ if (exists("flag_cat_sped")){ if (flag_cat_sped==0) {sped_cat_dt} }
r if (exists("flag_cat_ell")) if(flag_cat_ell==0) '### student ell categories (by year) '
r if (exists("flag_cat_ell")) if(flag_cat_ell==0) '* numbers calculated from "ea_student_ell_status" variable '
################################################### # table 5.d: ### student ell categories (by year) # ################################################### if (exists("flag_cat_ell")){ if (flag_cat_ell==0) {ell_cat_dt} }
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.