#-------------------------------------------------------------------------------
# Functions to reshape, drop, center & scale, recode and aggregate test and
# training data
# ------------------------------------------------------------------------------
#' Find the years available for training based on grade level
#' @param grade a numeric representing the grade level we are interested in
#' finding training cohorts for
#' @return a vector of school years formatted YYYY-YYYY which represent the
#' grade N cohorts for which graduation data is currently available. This is
#' done by simple arithmetic and needs to be updated each time the data advances,
#' e.g. annuallly
#' @export
findCohortYears <- function(grade){
years <- c("2005-2006", "2006-2007", "2007-2008", "2008-2009", "2009-2010")
if(grade >= 8){
return(years)
} else if(grade ==7){
return(years[1:(length(years)-1)])
} else if(grade == 6){
return(years[1:(length(years)-2)])
} else if(grade == 5){
return(years[1:(length(years)-3)])
}
}
#' Build a pool of training observations for a grade level
#' @param grade a numeric representing the grade level we are interested in
#' finding training cohorts for
#' @param conn a connection string which contains links to databases necessary
#' to extract the data
#' @details This function identifies the number of grade N cohorts which have
#' valid graduation outcomes and then pulls all complete observations from those
#' cohorts using the assembleCohort function. It then stacks all of these cohorts
#' into a single data.frame suitable for transformation and analysis later. At
#' the end it calls cleanTrainingPool to clean up the data and enforce business
#' rules about variable encoding.
#' @return a data.frame with all colums generated by assembleCohort and the
#' number of rows equal to the number of students per cohort X the number of
#' cohorts available for a given grade
#' @export
buildTrainingPool <- function(grade, conn){
pool <- data.frame(HOLD = NA)
yrs <- findCohortYears(grade)
grade2 <- as.character(paste0(0, grade))
for(i in yrs){
pooltmp <- assembleCohort(i, grade2, type = "train", conn = conn)
pool <- plyr::rbind.fill(pooltmp, pool)
rm(pooltmp)
}
pool <- cleanTrainingPool(pool)
return(pool)
}
#' Cleans the pooled training cases
#' @param pool a data.frame
#' @return A data.frame same number of rows as pool, with data recoded. This
#' includes all of the recoding business rules and should be modified to match
#' the specific problem at hand.
cleanTrainingPool <- function(pool){
pool$HOLD <- NULL
pool$ESL_LEV <- "Native"
pool$ESL_LEV <- ifelse(pool$STUDENT_ESL_CLASSIFICATION < 3, "Low", pool$ESL_LEV)
pool$ESL_LEV <- ifelse(pool$STUDENT_ESL_CLASSIFICATION >= 3 &
pool$STUDENT_ESL_CLASSIFICATION < 5, "Medium", pool$ESL_LEV)
pool$ESL_LEV <- ifelse(pool$STUDENT_ESL_CLASSIFICATION >= 5 &
pool$STUDENT_ESL_CLASSIFICATION < 7, "High", pool$ESL_LEV)
pool$STUDENT_ESL_CLASSIFICATION <- NULL
pool$FRL <- "N"
pool$FRL <- ifelse(pool$STUDENT_FOODSERVICE_ELIG_CODE == "N", "N", "Y")
pool$STUDENT_FOODSERVICE_ELIG_CODE <- NULL
pool$EVER_RETAINED <- "N"
pool$EVER_RETAINED <- ifelse(pool$STUDENT_REPEATER_INDICATOR == "Yes", "Y", "N")
pool$EVER_RETAINED <- ifelse(pool$STUDENT_REPEATER_INDICATOR == "@NR", "Unk",
pool$EVER_RETAINED)
pool$STUDENT_REPEATER_INDICATOR <- NULL
pool$GENDER <- ifelse(pool$STUDENT_GENDER_CODE == "F", "F", "M")
pool$GENDER <- ifelse(pool$STUDENT_GENDER_CODE == "F", "F", "M")
pool$STUDENT_GENDER_CODE <- NULL
pool$ATT_RATE <- (1 - pool$STUDENT_ANNUAL_ABSENCES / pool$STUDENT_ANNUAL_ENROLLMENT_DAYS)
pool$DAYS_POSS <- pool$STUDENT_ANNUAL_ENROLLMENT_DAYS
pool$STUDENT_ANNUAL_ENROLLMENT_DAYS <- NULL
pool$STUDENT_ANNUAL_ABSENCES <- NULL
pool$SCHOOL_KEY_WSAS <- NULL
pool$DISTRICT_CODE_WSAS <- NULL
pool$STUDENT_ANNUAL_ATTRIBS_KEY <- NULL
pool$DISTRICT_KEY <- pool$DISTRICT_CODE
pool$DISTRICT_CODE <- NULL
pool$SWD <- ifelse(pool$STUDENT_SPECIAL_ED_INDICATOR == "Yes", "Yes", "No")
pool$STUDENT_SPECIAL_ED_INDICATOR <- NULL
pool$LEP <- ifelse(pool$STUDENT_LEP_INDICATOR == "Yes", "Yes", "No")
pool$STUDENT_LEP_INDICATOR <- NULL
pool$daysRemoved <- as.numeric(pool$daysRemoved)
pool$incCount <- as.numeric(pool$incCount)
# Drop duplicated indicator -- it's not available to score new cases
pool$duplicated <- NULL
pool$GRADE_KEY <- pool$STUDENT_ANNUAL_GRADE_CODE
pool$STUDENT_ANNUAL_GRADE_CODE <- NULL
pool$incMiss <- ifelse(is.na(pool$incType), 1, 0)
pool$incType <- ifelse(is.na(pool$incType), "N", pool$incType)
pool$Assault <- ifelse(grepl("Assault", pool$incType), 1, 0)
pool$Drug <- ifelse(grepl("Drug", pool$incType), 1, 0)
pool$Weapon <- ifelse(grepl("Weapon", pool$incType), 1, 0)
pool$SchoolRule <- ifelse(grepl("SchoolRule", pool$incType), 1, 0)
# Preserve NAs
pool$Assault <- ifelse(pool$incMiss > 0, NA, pool$Assault)
pool$Drug <- ifelse(pool$incMiss > 0, NA, pool$Drug)
pool$Weapon <- ifelse(pool$incMiss > 0, NA, pool$Weapon)
pool$SchoolRule <- ifelse(pool$incMiss > 0, NA, pool$SchoolRule)
pool$incType <- NULL
pool$incMiss <- NULL
pool$daysRemoved <- log(pool$daysRemoved + .01)
pool$incCount <- log(pool$incCount + .1)
return(pool)
}
#' Find database key variables
#'
#' @param names a vector of variable names
#'
#' @return all variable names that have KEY in the name as a character vector
findKeys <- function(names){
keyVars <- names[grep("KEY", names)]
return(keyVars)
}
#' Convert dataset to a model matrix suitable for training in caret
#' @param grade a numeric representing the grade level to extract data for
#' @param conn a connection object suitable for extracting data
#' @param DV a character representing the variable name of the dependent variable
#' @param na.omit logical, should rows with an NA be omitted, default is TRUE
#' @details First the function extracts data from buildTrainingPool, then it builds
#' a model matrix out of the formula DV ~ ., first excluding all "KEY" variables
#' which represent year, school, district, and student identifiers. Then it returns
#' the model matrix.
#' @note Using na.omit=FALSE may result in a non-conformable array due to a mismatch
#' between the DV vector length and data vector length. This function has not been
#' modified to allow this yet.
#' @export
gatherData <- function(grade, conn, DV = "grad_ind", na.omit = TRUE){
pool <- buildTrainingPool(grade = grade, conn = conn)
if(na.omit == TRUE){
totalRows <- nrow(pool)
complRows <- nrow(na.omit(pool))
pool <- na.omit(pool)
msg <- paste0("Of ", totalRows, " in pooled data, ", complRows, " are complete")
message(msg)
} else{
stop("Setting na.omit to FALSE may lead to mismatch between DV and predictors")
}
CLASS <- ifelse(DV == "grad_ind", TRUE, FALSE)
if(DV == "grad_ind"){
expr <- lazyeval::interp(quote(x != y), x = as.name(DV), y = "exitWIpublicK12")
recode <- function() ifelse(pool[, "grad_ind"] == "on-time", 1, 0)
DV2 <- "ontime_grad"
} else if(DV != "grad_ind"){
stop("Not written yet.")
}
pool %<>% filter_(expr)
pool[, DV2] <- recode()
keyVars <- findKeys(names(pool))
keyVars <- c(keyVars, "SCHOOL_YEAR", DV)
myF <- as.formula(paste(DV2, " ~ ."))
mat1 <- model.matrix(myF,
data = pool[, names(pool)[!names(pool) %in% keyVars]])
mat1 <- cbind(pool[, DV2], mat1[, -1])
colnames(mat1)[1] <- DV2
return(mat1)
}
#' Convert a dataset without a dependent variable
#' @param grade a numeric representing the grade level to extract data for
#' @param conn a connection object suitable for extracting data
#' @param year a character representing the school year to predict for in YYYY-YYYY format
#' @param na.omit logical, should rows with an NA be omitted, default is TRUE
#' @details First the function extracts data from buildTrainingPool, then it builds
#' a model matrix suitable for passing to a transformation and then prediction step.
#' The key difference between this and the original gatherData is that there are no
#' valide dependent variable observations for data we want to make predictions on.
#' This function is modified to account for that.
#' @note na.omit must be true for now as no imputation routines have been integrated
#' @export
gatherData.pred <- function(grade, conn, year, na.omit = TRUE, ...){
if(na.omit == FALSE){
stop("Imputation routines not yet written.")
}
if(class(grade) == "numeric"){
grade <- as.character(paste0(0, grade))
}
pool <- assembleCohort(cohortYear = year, grade = grade,
conn = conn, ...)
pool <- cleanTrainingPool(pool)
if(na.omit == TRUE){
totalRows <- nrow(pool)
complRows <- nrow(na.omit(pool))
pool <- na.omit(pool)
msg <- paste0("Of ", totalRows, " in pooled data, ", complRows, " are complete")
message(msg)
}
keyVars <- findKeys(names(pool))
keyVars <- c(keyVars, "SCHOOL_YEAR")
myF <- as.formula(paste(" ~ ."))
mat1 <- model.matrix(myF,
data = pool[, names(pool)[!names(pool) %in% c(keyVars)]])
row.names(mat1) <- pool[, "STUDENT_KEY"]
return(mat1[, -1])
}
#' Construct a DEWSlist from a model matrix
#' @param matrix a matrix produced by gatherData
#' @param DV a character representing the column name associated with the dependent
#' variable
#' @param preProcess logical, should a preProcess routine be applied to the data
#' @param mode character, is this running in production or development mode, see details
#' @param prelim logical, is this the preliminary or final model, see details
#' @param P numeric, 0 to 1, proportion of data to be used for training
#' @param PVALID, 0 to 1, proportion of the data to be used for the validation data
#' @details This function makes the data suitable for use in EWStools functions
#' for testing and selecting models.
#' @return a list of lists, the lists are named "traindata", "testdata" and
#' "validdata", and each list has two elements, a predictor matrix called "preds"
#' and an outcome vector called "class". A fourth element called "scale" is
#' included if preProcess = TRUE which contains the preProcess object used
#' @importFrom EWStools assembleData
#' @export
makeDEWSList <- function(matrix, DV=NULL, preProcess = TRUE,
MODE = "PROD", PRELIM = FALSE, P, PVALID) {
#
if(missing(P) | missing(PVALID)){
if(nrow(matrix) <= 60000){
P <- ifelse(MODE == "PROD", 0.45, 0.25)
PVALID <- ifelse(MODE == "PROD", 0.25, 0.25)
}else {
P <- ifelse(MODE == "PROD", 0.25, 0.05)
PVALID <- ifelse(MODE == "PROD", 0.5, 0.75)
}
}
if(missing(DV)){
warning("No dependent variable specified, using first column.")
DV <- colnames(matrix)[1]
}
tmppred <- colnames(matrix)[colnames(matrix) != DV]
CLASS <- ifelse(DV == "ontime_grad", TRUE, FALSE)
full <- assembleData(matrix, class = DV,
p = P, predvars = tmppred, pvalid = PVALID,
classification = CLASS)
rm(matrix)
full <- cleanDV.DEWSList(full, DV = DV)
if(preProcess == TRUE){
full <- preProcess.DEWSList(full)
}
return(full)
}
#' PreProcess DEWS list
#' @param DEWSList a list produced by the makeDEWSList function
#' @return a DEWSList with the additional list element "scale" which includes
#' the preProcess object
#' @details The preProcess function is applied to all continuous variables in
#' the training data. A separate scaling is done to binary variables as well.
#' This training data pre-processing is then applied to test and validdata so that
#' all data elements have been scaled relative to the training data.
#' @importFrom caret preProcess
preProcess.DEWSList <- function(DEWSList){
if("scale" %in% names(DEWSList)){
stop("Data appears to already be preprocessed as scale is present in DEWSList")
}
for(i in names(DEWSList)){
DEWSList[[i]]$preds <- as.data.frame(DEWSList[[i]]$preds)
}
binVars <- findBinary(DEWSList$traindata$preds)
allVars <- names(DEWSList$traindata$preds)
conVars <- allVars[!allVars %in% binVars]
modPreProc <- preProcess(DEWSList$testdata$preds[, conVars],
method = c("center", "scale"))
for(i in names(DEWSList)){
DEWSList[[i]]$preds %<>% scaleCenter(preProc = modPreProc, binVars = binVars,
conVars = conVars)
}
DEWSList$scale <- modPreProc
return(DEWSList)
}
#' Preprocess prediction cohort
#' @param predCohort a data.frame produced by
#' @param preProcess a preProcess object with variable names matching those in predCohort
#' @param varNames a vector of variable names to preProcess
#' @return a data.frame of of the same dimensions of predCohort that is rescaled
#' according to the attributes in the preProcess object
#' @export
#' @importFrom caret preProcess
preProcess.pred <- function(predCohort, preProcess, varNames){
stopifnot(class(predCohort) == "matrix")
stopifnot(class(preProcess) == "preProcess")
binVars <- findBinary(predCohort)
allVars <- names(predCohort)
conVars <- names(preProcess$mean)
out <- scaleCenter(data = predCohort, preProc = preProcess, binVars = binVars,
conVars = conVars)
sampNames <- colnames(out)
out <- out[, sampNames %in% varNames]
missVar <- varNames[!varNames %in% sampNames]
extraData <- matrix(rep(0, length(missVar) * nrow(out)),
ncol = length(missVar), nrow = nrow(out),
dimnames = list("rows" = NULL, "columns" = missVar))
out <- cbind(out, extraData)
out <- out[, varNames]
return(out)
}
#' Clean up DV in a DEWSList for proper modeling
#' @param DEWSList a DEWSlist created by makeDEWSList
#' @param DV a character representing the name of the dependent variable
#' @return a DEWSList
#' @details Reassigns the DEWSList[[i]]$class object to be a factor with the
#' larger group, graduates, as the first level of a factor, and non-graduates
#' as the second level
#' @note Currently only works if DV == "ontime_grad"
cleanDV.DEWSList <- function(DEWSList, DV){
if(DV == "ontime_grad"){
tabT <- table(DEWSList$traindata$class)
small <- names(tabT[tabT == min(tabT)])
big <- names(tabT[tabT == max(tabT)])
for(i in names(DEWSList)){
DEWSList[[i]]$class %<>% as.character()
DEWSList[[i]]$class %<>% factor(levels = c(big, small),
labels = c("Grad", "Non.Grad"))
}
} else {
stop("Dependent variable not supported.")
}
return(DEWSList)
}
#' Scale and center a dataframe given a preProcessing object
#' @param data a data.frame
#' @param preProc a preProcessing object
#' @param binVars a vector of column indices representing binary variables
#' @param conVars a vector of column indices representing continuous variables in
#' the preProcessing object
#' @details Continuous variables are processed by preProcess in the caret package,
#' binary variables are processed using the rescale function in arm which centers
#' binary inputs
#' @importFrom caret preProcess
#' @importFrom caret predict.preProcess
#' @importFrom arm rescale
#' @export
scaleCenter <- function(data, preProc, binVars, conVars){
data[, conVars] <- predict(preProc, data[, conVars])
data[, binVars] <- apply(data[, binVars], 2, arm::rescale,
binary.inputs = 'center')
return(data)
}
#' Find binary variables
#' @description Find binary variables in a data.frame and return their names.
#' @param data data.frame
#' @return a character vector of variable names corresponding to column names in
#' data which have fewer than three unique values
#' @note This function will return constant variables with only one value as
#' binary as well
#' @export
findBinary <- function(data){
out <- apply(data, 2, function(x) length(table(x)) < 3)
return(names(out[out==TRUE]))
}
#' SMOTE the data for class balance?
#' @description Applies the synthetic minority over-sampling technique to a
#' list in a DEWSList to make the data less unbalanced between classes.
#' @param dewsList a single element of a DEWSList, e.g. traindata, testdata
#' @return an element of a dewslist with objects preds and class
#' @importFrom DMwR SMOTE
#' @details See the documentation for SMOTE in the DMwR package for details on
#' how the SMOTE function works.
#' @export
smote <- function(dewsList) {
library(DMwR)
x <- dewsList$preds
y <- dewsList$class
dat <- if(is.data.frame(x)) x else as.data.frame(x)
dat$.y <- y
dat <- SMOTE(.y ~ ., data = dat)
list(preds = dat[, !grepl(".y", colnames(dat), fixed = TRUE)],
class = dat$.y)
}
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.