ModelEWS: Wisconsin Dropout Early Warning System

Documented in buildTrainingPool cleanDV.DEWSList cleanTrainingPool findBinary findCohortYears findKeys gatherData gatherData.pred makeDEWSList preProcess.DEWSList preProcess.pred scaleCenter smote

#-------------------------------------------------------------------------------
# Functions to reshape, drop, center & scale, recode and aggregate test and 
# training data
# ------------------------------------------------------------------------------

#' Find the years available for training based on grade level
#' @param grade a numeric representing the grade level we are interested in 
#' finding training cohorts for
#' @return a vector of school years formatted YYYY-YYYY which represent the 
#' grade N cohorts for which graduation data is currently available. This is 
#' done by simple arithmetic and needs to be updated each time the data advances, 
#' e.g. annuallly
#' @export
findCohortYears <- function(grade){
  years <- c("2005-2006", "2006-2007", "2007-2008", "2008-2009", "2009-2010")
  if(grade >= 8){
    return(years)
  } else if(grade ==7){
    return(years[1:(length(years)-1)])
  } else if(grade == 6){
    return(years[1:(length(years)-2)])
  } else if(grade == 5){
    return(years[1:(length(years)-3)])
  }
}


#' Build a pool of training observations for a grade level
#' @param grade a numeric representing the grade level we are interested in 
#' finding training cohorts for
#' @param conn a connection string which contains links to databases necessary 
#' to extract the data
#' @details This function identifies the number of grade N cohorts which have 
#' valid graduation outcomes and then pulls all complete observations from those 
#' cohorts using the assembleCohort function. It then stacks all of these cohorts 
#' into a single data.frame suitable for transformation and analysis later. At 
#' the end it calls cleanTrainingPool to clean up the data and enforce business 
#' rules about variable encoding.
#' @return a data.frame with all colums generated by assembleCohort and the 
#' number of rows equal to the number of students per cohort X the number of 
#' cohorts available for a given grade
#' @export
buildTrainingPool <- function(grade, conn){
  pool <- data.frame(HOLD = NA)
  yrs <- findCohortYears(grade)
  grade2 <- as.character(paste0(0, grade))
  for(i in yrs){
    pooltmp <- assembleCohort(i, grade2, type = "train", conn = conn)
    pool <- plyr::rbind.fill(pooltmp, pool)
    rm(pooltmp)
  }
  pool <- cleanTrainingPool(pool)
  return(pool)
}

#' Cleans the pooled training cases
#' @param pool a data.frame 
#' @return A data.frame same number of rows as pool, with data recoded. This 
#' includes all of the recoding business rules and should be modified to match 
#' the specific problem at hand.
cleanTrainingPool <- function(pool){
  pool$HOLD <- NULL
  pool$ESL_LEV <- "Native"
  pool$ESL_LEV <- ifelse(pool$STUDENT_ESL_CLASSIFICATION < 3, "Low", pool$ESL_LEV)
  pool$ESL_LEV <- ifelse(pool$STUDENT_ESL_CLASSIFICATION >= 3 & 
                           pool$STUDENT_ESL_CLASSIFICATION < 5, "Medium", pool$ESL_LEV)
  pool$ESL_LEV <- ifelse(pool$STUDENT_ESL_CLASSIFICATION >= 5 & 
                           pool$STUDENT_ESL_CLASSIFICATION < 7, "High", pool$ESL_LEV)
  pool$STUDENT_ESL_CLASSIFICATION <- NULL
  pool$FRL <- "N"
  pool$FRL <- ifelse(pool$STUDENT_FOODSERVICE_ELIG_CODE == "N", "N", "Y")
  pool$STUDENT_FOODSERVICE_ELIG_CODE <- NULL
  pool$EVER_RETAINED <- "N"
  pool$EVER_RETAINED <- ifelse(pool$STUDENT_REPEATER_INDICATOR == "Yes", "Y", "N")
  pool$EVER_RETAINED <- ifelse(pool$STUDENT_REPEATER_INDICATOR == "@NR", "Unk", 
                               pool$EVER_RETAINED)
  pool$STUDENT_REPEATER_INDICATOR <- NULL
  
  pool$GENDER <- ifelse(pool$STUDENT_GENDER_CODE == "F", "F", "M")
  pool$GENDER <- ifelse(pool$STUDENT_GENDER_CODE == "F", "F", "M")
  pool$STUDENT_GENDER_CODE <- NULL
  pool$ATT_RATE <- (1 - pool$STUDENT_ANNUAL_ABSENCES / pool$STUDENT_ANNUAL_ENROLLMENT_DAYS)
  pool$DAYS_POSS <- pool$STUDENT_ANNUAL_ENROLLMENT_DAYS
  pool$STUDENT_ANNUAL_ENROLLMENT_DAYS <- NULL
  pool$STUDENT_ANNUAL_ABSENCES <- NULL
  pool$SCHOOL_KEY_WSAS <- NULL
  pool$DISTRICT_CODE_WSAS <- NULL
  pool$STUDENT_ANNUAL_ATTRIBS_KEY <- NULL
  pool$DISTRICT_KEY <- pool$DISTRICT_CODE
  pool$DISTRICT_CODE <- NULL
  pool$SWD <- ifelse(pool$STUDENT_SPECIAL_ED_INDICATOR == "Yes", "Yes", "No")
  pool$STUDENT_SPECIAL_ED_INDICATOR <- NULL
  pool$LEP <- ifelse(pool$STUDENT_LEP_INDICATOR == "Yes", "Yes", "No")
  pool$STUDENT_LEP_INDICATOR <- NULL
  pool$daysRemoved <- as.numeric(pool$daysRemoved)
  pool$incCount <- as.numeric(pool$incCount)
  # Drop duplicated indicator -- it's not available to score new cases
  pool$duplicated <- NULL
  pool$GRADE_KEY <- pool$STUDENT_ANNUAL_GRADE_CODE
  pool$STUDENT_ANNUAL_GRADE_CODE <- NULL
  pool$incMiss <- ifelse(is.na(pool$incType), 1, 0)
  pool$incType <- ifelse(is.na(pool$incType), "N", pool$incType)
  pool$Assault <- ifelse(grepl("Assault", pool$incType), 1, 0)
  pool$Drug <- ifelse(grepl("Drug", pool$incType), 1, 0)
  pool$Weapon <- ifelse(grepl("Weapon", pool$incType), 1, 0)
  pool$SchoolRule <- ifelse(grepl("SchoolRule", pool$incType), 1, 0)
  # Preserve NAs
  pool$Assault <- ifelse(pool$incMiss > 0, NA, pool$Assault)
  pool$Drug <- ifelse(pool$incMiss > 0, NA, pool$Drug)
  pool$Weapon <- ifelse(pool$incMiss > 0, NA, pool$Weapon)
  pool$SchoolRule <- ifelse(pool$incMiss > 0, NA, pool$SchoolRule)
  pool$incType <- NULL
  pool$incMiss <- NULL
  pool$daysRemoved <- log(pool$daysRemoved + .01)
  pool$incCount <- log(pool$incCount + .1)
  return(pool)
}


#' Find database key variables
#'
#' @param names a vector of variable names
#'
#' @return all variable names that have KEY in the name as a character vector
findKeys <- function(names){
  keyVars <- names[grep("KEY", names)]
  return(keyVars)
}

#' Convert dataset to a model matrix suitable for training in caret
#' @param grade a numeric representing the grade level to extract data for
#' @param conn a connection object suitable for extracting data
#' @param DV a character representing the variable name of the dependent variable
#' @param na.omit logical, should rows with an NA be omitted, default is TRUE
#' @details First the function extracts data from buildTrainingPool, then it builds 
#' a model matrix out of the formula DV ~ ., first excluding all "KEY" variables 
#' which represent year, school, district, and student identifiers. Then it returns 
#' the model matrix.
#' @note Using na.omit=FALSE may result in a non-conformable array due to a mismatch 
#' between the DV vector length and data vector length. This function has not been 
#' modified to allow this yet.
#' @export
gatherData <- function(grade, conn, DV = "grad_ind", na.omit = TRUE){
  pool <- buildTrainingPool(grade = grade, conn = conn)
  if(na.omit == TRUE){
    totalRows <- nrow(pool)
    complRows <- nrow(na.omit(pool))
    pool <- na.omit(pool)
    msg <- paste0("Of ", totalRows, " in pooled data, ", complRows, " are complete")
    message(msg)
  } else{
    stop("Setting na.omit to FALSE may lead to mismatch between DV and predictors")
  }
  CLASS <- ifelse(DV == "grad_ind", TRUE, FALSE)
  if(DV == "grad_ind"){
    expr <- lazyeval::interp(quote(x != y), x = as.name(DV), y = "exitWIpublicK12")
    recode <- function() ifelse(pool[, "grad_ind"]  == "on-time", 1, 0)
    DV2 <- "ontime_grad"
  } else if(DV != "grad_ind"){
    stop("Not written yet.")
  }  
  pool %<>% filter_(expr)
  pool[, DV2] <- recode()
  keyVars <- findKeys(names(pool))
  keyVars <- c(keyVars, "SCHOOL_YEAR", DV)
  myF <- as.formula(paste(DV2, " ~ ."))
  mat1 <- model.matrix(myF, 
                       data = pool[, names(pool)[!names(pool) %in% keyVars]])
  mat1 <- cbind(pool[, DV2], mat1[, -1])
  colnames(mat1)[1] <- DV2
  return(mat1)
}

#' Convert a dataset without a dependent variable
#' @param grade a numeric representing the grade level to extract data for
#' @param conn a connection object suitable for extracting data
#' @param year a character representing the school year to predict for in YYYY-YYYY format
#' @param na.omit logical, should rows with an NA be omitted, default is TRUE
#' @details First the function extracts data from buildTrainingPool, then it builds 
#' a model matrix suitable for passing to a transformation and then prediction step. 
#' The key difference between this and the original gatherData is that there are no 
#' valide dependent variable observations for data we want to make predictions on. 
#' This function is modified to account for that. 
#' @note na.omit must be true for now as no imputation routines have been integrated
#' @export
gatherData.pred <- function(grade, conn, year, na.omit = TRUE, ...){
  if(na.omit == FALSE){
    stop("Imputation routines not yet written.")
  }
  if(class(grade) == "numeric"){
    grade <- as.character(paste0(0, grade))
  }
  pool <- assembleCohort(cohortYear = year, grade = grade, 
                         conn = conn, ...)
  pool <- cleanTrainingPool(pool)
  if(na.omit == TRUE){
    totalRows <- nrow(pool)
    complRows <- nrow(na.omit(pool))
    pool <- na.omit(pool)
    msg <- paste0("Of ", totalRows, " in pooled data, ", complRows, " are complete")
    message(msg)
  }
  keyVars <- findKeys(names(pool))
  keyVars <- c(keyVars, "SCHOOL_YEAR")
  myF <- as.formula(paste(" ~ ."))
  mat1 <- model.matrix(myF, 
                       data = pool[, names(pool)[!names(pool) %in% c(keyVars)]])
  row.names(mat1) <- pool[, "STUDENT_KEY"]
  return(mat1[, -1])
}



#' Construct a DEWSlist from a model matrix
#' @param matrix a matrix produced by gatherData
#' @param DV a character representing the column name associated with the dependent 
#' variable
#' @param preProcess logical, should a preProcess routine be applied to the data
#' @param mode character, is this running in production or development mode, see details 
#' @param prelim logical, is this the preliminary or final model, see details
#' @param P numeric, 0 to 1, proportion of data to be used for training
#' @param PVALID, 0 to 1, proportion of the data to be used for the validation data
#' @details This function makes the data suitable for use in EWStools functions 
#' for testing and selecting models. 
#' @return a list of lists, the lists are named "traindata", "testdata" and 
#' "validdata", and each list has two elements, a predictor matrix called "preds" 
#' and an outcome vector called "class". A fourth element called "scale" is 
#' included if preProcess = TRUE which contains the preProcess object used
#' @importFrom EWStools assembleData
#' @export
makeDEWSList <- function(matrix, DV=NULL, preProcess = TRUE, 
                         MODE = "PROD", PRELIM = FALSE, P, PVALID) {
  #
  if(missing(P) | missing(PVALID)){
    if(nrow(matrix) <= 60000){
      P <- ifelse(MODE == "PROD", 0.45, 0.25)
      PVALID <- ifelse(MODE == "PROD", 0.25, 0.25)
    }else {
      P <- ifelse(MODE == "PROD", 0.25, 0.05)
      PVALID <- ifelse(MODE == "PROD", 0.5, 0.75)
    }
  }
  if(missing(DV)){
    warning("No dependent variable specified, using first column.")
    DV <- colnames(matrix)[1]
  }
  tmppred <- colnames(matrix)[colnames(matrix) != DV]
  CLASS <- ifelse(DV == "ontime_grad", TRUE, FALSE)
  full <- assembleData(matrix, class = DV, 
                       p = P, predvars = tmppred, pvalid = PVALID, 
                       classification = CLASS)
  rm(matrix)
  full <- cleanDV.DEWSList(full, DV = DV)
  if(preProcess == TRUE){
   full <- preProcess.DEWSList(full)
  } 
  return(full)
}

#' PreProcess DEWS list
#' @param DEWSList a list produced by the makeDEWSList function
#' @return a DEWSList with the additional list element "scale" which includes 
#' the preProcess object
#' @details The preProcess function is applied to all continuous variables in 
#' the training data. A separate scaling is done to binary variables as well. 
#' This training data pre-processing is then applied to test and validdata so that 
#' all data elements have been scaled relative to the training data. 
#' @importFrom caret preProcess
preProcess.DEWSList <- function(DEWSList){
  if("scale" %in% names(DEWSList)){
    stop("Data appears to already be preprocessed as scale is present in DEWSList")
  }
  for(i in names(DEWSList)){
    DEWSList[[i]]$preds <- as.data.frame(DEWSList[[i]]$preds)
  } 
  binVars <- findBinary(DEWSList$traindata$preds)
  allVars <- names(DEWSList$traindata$preds)
  conVars <- allVars[!allVars %in% binVars]
  modPreProc <- preProcess(DEWSList$testdata$preds[, conVars], 
                           method = c("center", "scale"))
  for(i in names(DEWSList)){
    DEWSList[[i]]$preds %<>% scaleCenter(preProc = modPreProc, binVars = binVars, 
                                          conVars = conVars)
  } 
  DEWSList$scale <- modPreProc
  return(DEWSList)
  
}

#' Preprocess prediction cohort
#' @param predCohort a data.frame produced by 
#' @param preProcess a preProcess object with variable names matching those in predCohort
#' @param varNames a vector of variable names to preProcess
#' @return a data.frame of of the same dimensions of predCohort that is rescaled 
#' according to the attributes in the preProcess object
#' @export
#' @importFrom caret preProcess
preProcess.pred <- function(predCohort, preProcess, varNames){
  stopifnot(class(predCohort) == "matrix")
  stopifnot(class(preProcess) == "preProcess")
  binVars <- findBinary(predCohort)
  allVars <- names(predCohort)
  conVars <- names(preProcess$mean)
  out <- scaleCenter(data = predCohort, preProc = preProcess, binVars = binVars, 
                     conVars = conVars)
  sampNames <- colnames(out)
  out <- out[, sampNames %in% varNames]
  
  missVar <- varNames[!varNames %in% sampNames]
  extraData <- matrix(rep(0, length(missVar) * nrow(out)), 
                      ncol = length(missVar), nrow = nrow(out), 
                      dimnames = list("rows" = NULL, "columns" = missVar))
  
  out <- cbind(out, extraData)
  out <- out[, varNames]
  return(out)
}

#' Clean up DV in a DEWSList for proper modeling
#' @param DEWSList a DEWSlist created by makeDEWSList
#' @param DV a character representing the name of the dependent variable
#' @return a DEWSList
#' @details Reassigns the DEWSList[[i]]$class object to be a factor with the 
#' larger group, graduates, as the first level of a factor, and non-graduates 
#' as the second level
#' @note Currently only works if DV == "ontime_grad"
cleanDV.DEWSList <- function(DEWSList, DV){
  if(DV == "ontime_grad"){
    tabT <- table(DEWSList$traindata$class)
    small <- names(tabT[tabT == min(tabT)])
    big <- names(tabT[tabT == max(tabT)])
    
    for(i in names(DEWSList)){
      DEWSList[[i]]$class %<>% as.character()
      DEWSList[[i]]$class %<>% factor(levels = c(big, small), 
                                       labels = c("Grad", "Non.Grad"))
    } 
  } else {
    stop("Dependent variable not supported.")
  }
  return(DEWSList)
}

#' Scale and center a dataframe given a preProcessing object
#' @param data a data.frame
#' @param preProc a preProcessing object
#' @param binVars a vector of column indices representing binary variables
#' @param conVars a vector of column indices representing continuous variables in 
#' the preProcessing object
#' @details Continuous variables are processed by preProcess in the caret package, 
#' binary variables are processed using the rescale function in arm which centers 
#' binary inputs
#' @importFrom caret preProcess
#' @importFrom caret predict.preProcess
#' @importFrom arm rescale
#' @export 
scaleCenter <- function(data, preProc, binVars, conVars){
  data[, conVars] <- predict(preProc, data[, conVars])
  data[, binVars] <- apply(data[, binVars], 2, arm::rescale, 
                           binary.inputs = 'center')
  return(data)
}

#' Find binary variables
#' @description Find binary variables in a data.frame and return their names.
#' @param data data.frame
#' @return a character vector of variable names corresponding to column names in 
#' data which have fewer than three unique values
#' @note This function will return constant variables with only one value as 
#' binary as well
#' @export
findBinary <- function(data){
  out <- apply(data, 2, function(x) length(table(x)) < 3)
  return(names(out[out==TRUE]))
}  

#' SMOTE the data for class balance?
#' @description Applies the synthetic minority over-sampling technique to a 
#' list in a DEWSList to make the data less unbalanced between classes.
#' @param dewsList a single element of a DEWSList, e.g. traindata, testdata
#' @return an element of a dewslist with objects preds and class
#' @importFrom DMwR SMOTE
#' @details See the documentation for SMOTE in the DMwR package for details on 
#' how the SMOTE function works.
#' @export
smote <- function(dewsList) {
  library(DMwR)
  x <- dewsList$preds
  y <- dewsList$class
  dat <- if(is.data.frame(x)) x else as.data.frame(x)
  dat$.y <- y
  dat <- SMOTE(.y ~ ., data = dat)
  list(preds = dat[, !grepl(".y", colnames(dat), fixed = TRUE)],
       class = dat$.y)
}

jknowles/ModelEWS documentation built on May 19, 2019, 11:42 a.m.

rdrr.io home R language documentation Run R code online

CRAN packages Bioconductor packages R-Forge packages GitHub packages

Note that we can't provide technical support on individual packages. You should contact the package authors for that.

jknowles/ModelEWS
Wisconsin Dropout Early Warning System

R/dataTransform.R
In jknowles/ModelEWS: Wisconsin Dropout Early Warning System

Defines functions findCohortYears buildTrainingPool cleanTrainingPool findKeys gatherData gatherData.pred makeDEWSList preProcess.DEWSList preProcess.pred cleanDV.DEWSList scaleCenter findBinary smote

Documented in buildTrainingPool cleanDV.DEWSList cleanTrainingPool findBinary findCohortYears findKeys gatherData gatherData.pred makeDEWSList preProcess.DEWSList preProcess.pred scaleCenter smote

R Package Documentation

Browse R Packages

We want your feedback!

jknowles/ModelEWS Wisconsin Dropout Early Warning System

R/dataTransform.R In jknowles/ModelEWS: Wisconsin Dropout Early Warning System

Defines functions findCohortYears buildTrainingPool cleanTrainingPool findKeys gatherData gatherData.pred makeDEWSList preProcess.DEWSList preProcess.pred cleanDV.DEWSList scaleCenter findBinary smote

Documented in buildTrainingPool cleanDV.DEWSList cleanTrainingPool findBinary findCohortYears findKeys gatherData gatherData.pred makeDEWSList preProcess.DEWSList preProcess.pred scaleCenter smote

R Package Documentation

Browse R Packages

We want your feedback!

jknowles/ModelEWS
Wisconsin Dropout Early Warning System

R/dataTransform.R
In jknowles/ModelEWS: Wisconsin Dropout Early Warning System