R/dataTransform.R

Defines functions findCohortYears buildTrainingPool cleanTrainingPool findKeys gatherData gatherData.pred makeDEWSList preProcess.DEWSList preProcess.pred cleanDV.DEWSList scaleCenter findBinary smote

Documented in buildTrainingPool cleanDV.DEWSList cleanTrainingPool findBinary findCohortYears findKeys gatherData gatherData.pred makeDEWSList preProcess.DEWSList preProcess.pred scaleCenter smote

#-------------------------------------------------------------------------------
# Functions to reshape, drop, center & scale, recode and aggregate test and 
# training data
# ------------------------------------------------------------------------------

#' Find the years available for training based on grade level
#' @param grade a numeric representing the grade level we are interested in 
#' finding training cohorts for
#' @return a vector of school years formatted YYYY-YYYY which represent the 
#' grade N cohorts for which graduation data is currently available. This is 
#' done by simple arithmetic and needs to be updated each time the data advances, 
#' e.g. annuallly
#' @export
findCohortYears <- function(grade){
  years <- c("2005-2006", "2006-2007", "2007-2008", "2008-2009", "2009-2010")
  if(grade >= 8){
    return(years)
  } else if(grade ==7){
    return(years[1:(length(years)-1)])
  } else if(grade == 6){
    return(years[1:(length(years)-2)])
  } else if(grade == 5){
    return(years[1:(length(years)-3)])
  }
}


#' Build a pool of training observations for a grade level
#' @param grade a numeric representing the grade level we are interested in 
#' finding training cohorts for
#' @param conn a connection string which contains links to databases necessary 
#' to extract the data
#' @details This function identifies the number of grade N cohorts which have 
#' valid graduation outcomes and then pulls all complete observations from those 
#' cohorts using the assembleCohort function. It then stacks all of these cohorts 
#' into a single data.frame suitable for transformation and analysis later. At 
#' the end it calls cleanTrainingPool to clean up the data and enforce business 
#' rules about variable encoding.
#' @return a data.frame with all colums generated by assembleCohort and the 
#' number of rows equal to the number of students per cohort X the number of 
#' cohorts available for a given grade
#' @export
buildTrainingPool <- function(grade, conn){
  pool <- data.frame(HOLD = NA)
  yrs <- findCohortYears(grade)
  grade2 <- as.character(paste0(0, grade))
  for(i in yrs){
    pooltmp <- assembleCohort(i, grade2, type = "train", conn = conn)
    pool <- plyr::rbind.fill(pooltmp, pool)
    rm(pooltmp)
  }
  pool <- cleanTrainingPool(pool)
  return(pool)
}

#' Cleans the pooled training cases
#' @param pool a data.frame 
#' @return A data.frame same number of rows as pool, with data recoded. This 
#' includes all of the recoding business rules and should be modified to match 
#' the specific problem at hand.
cleanTrainingPool <- function(pool){
  pool$HOLD <- NULL
  pool$ESL_LEV <- "Native"
  pool$ESL_LEV <- ifelse(pool$STUDENT_ESL_CLASSIFICATION < 3, "Low", pool$ESL_LEV)
  pool$ESL_LEV <- ifelse(pool$STUDENT_ESL_CLASSIFICATION >= 3 & 
                           pool$STUDENT_ESL_CLASSIFICATION < 5, "Medium", pool$ESL_LEV)
  pool$ESL_LEV <- ifelse(pool$STUDENT_ESL_CLASSIFICATION >= 5 & 
                           pool$STUDENT_ESL_CLASSIFICATION < 7, "High", pool$ESL_LEV)
  pool$STUDENT_ESL_CLASSIFICATION <- NULL
  pool$FRL <- "N"
  pool$FRL <- ifelse(pool$STUDENT_FOODSERVICE_ELIG_CODE == "N", "N", "Y")
  pool$STUDENT_FOODSERVICE_ELIG_CODE <- NULL
  pool$EVER_RETAINED <- "N"
  pool$EVER_RETAINED <- ifelse(pool$STUDENT_REPEATER_INDICATOR == "Yes", "Y", "N")
  pool$EVER_RETAINED <- ifelse(pool$STUDENT_REPEATER_INDICATOR == "@NR", "Unk", 
                               pool$EVER_RETAINED)
  pool$STUDENT_REPEATER_INDICATOR <- NULL
  
  pool$GENDER <- ifelse(pool$STUDENT_GENDER_CODE == "F", "F", "M")
  pool$GENDER <- ifelse(pool$STUDENT_GENDER_CODE == "F", "F", "M")
  pool$STUDENT_GENDER_CODE <- NULL
  pool$ATT_RATE <- (1 - pool$STUDENT_ANNUAL_ABSENCES / pool$STUDENT_ANNUAL_ENROLLMENT_DAYS)
  pool$DAYS_POSS <- pool$STUDENT_ANNUAL_ENROLLMENT_DAYS
  pool$STUDENT_ANNUAL_ENROLLMENT_DAYS <- NULL
  pool$STUDENT_ANNUAL_ABSENCES <- NULL
  pool$SCHOOL_KEY_WSAS <- NULL
  pool$DISTRICT_CODE_WSAS <- NULL
  pool$STUDENT_ANNUAL_ATTRIBS_KEY <- NULL
  pool$DISTRICT_KEY <- pool$DISTRICT_CODE
  pool$DISTRICT_CODE <- NULL
  pool$SWD <- ifelse(pool$STUDENT_SPECIAL_ED_INDICATOR == "Yes", "Yes", "No")
  pool$STUDENT_SPECIAL_ED_INDICATOR <- NULL
  pool$LEP <- ifelse(pool$STUDENT_LEP_INDICATOR == "Yes", "Yes", "No")
  pool$STUDENT_LEP_INDICATOR <- NULL
  pool$daysRemoved <- as.numeric(pool$daysRemoved)
  pool$incCount <- as.numeric(pool$incCount)
  # Drop duplicated indicator -- it's not available to score new cases
  pool$duplicated <- NULL
  pool$GRADE_KEY <- pool$STUDENT_ANNUAL_GRADE_CODE
  pool$STUDENT_ANNUAL_GRADE_CODE <- NULL
  pool$incMiss <- ifelse(is.na(pool$incType), 1, 0)
  pool$incType <- ifelse(is.na(pool$incType), "N", pool$incType)
  pool$Assault <- ifelse(grepl("Assault", pool$incType), 1, 0)
  pool$Drug <- ifelse(grepl("Drug", pool$incType), 1, 0)
  pool$Weapon <- ifelse(grepl("Weapon", pool$incType), 1, 0)
  pool$SchoolRule <- ifelse(grepl("SchoolRule", pool$incType), 1, 0)
  # Preserve NAs
  pool$Assault <- ifelse(pool$incMiss > 0, NA, pool$Assault)
  pool$Drug <- ifelse(pool$incMiss > 0, NA, pool$Drug)
  pool$Weapon <- ifelse(pool$incMiss > 0, NA, pool$Weapon)
  pool$SchoolRule <- ifelse(pool$incMiss > 0, NA, pool$SchoolRule)
  pool$incType <- NULL
  pool$incMiss <- NULL
  pool$daysRemoved <- log(pool$daysRemoved + .01)
  pool$incCount <- log(pool$incCount + .1)
  return(pool)
}


#' Find database key variables
#'
#' @param names a vector of variable names
#'
#' @return all variable names that have KEY in the name as a character vector
findKeys <- function(names){
  keyVars <- names[grep("KEY", names)]
  return(keyVars)
}

#' Convert dataset to a model matrix suitable for training in caret
#' @param grade a numeric representing the grade level to extract data for
#' @param conn a connection object suitable for extracting data
#' @param DV a character representing the variable name of the dependent variable
#' @param na.omit logical, should rows with an NA be omitted, default is TRUE
#' @details First the function extracts data from buildTrainingPool, then it builds 
#' a model matrix out of the formula DV ~ ., first excluding all "KEY" variables 
#' which represent year, school, district, and student identifiers. Then it returns 
#' the model matrix.
#' @note Using na.omit=FALSE may result in a non-conformable array due to a mismatch 
#' between the DV vector length and data vector length. This function has not been 
#' modified to allow this yet.
#' @export
gatherData <- function(grade, conn, DV = "grad_ind", na.omit = TRUE){
  pool <- buildTrainingPool(grade = grade, conn = conn)
  if(na.omit == TRUE){
    totalRows <- nrow(pool)
    complRows <- nrow(na.omit(pool))
    pool <- na.omit(pool)
    msg <- paste0("Of ", totalRows, " in pooled data, ", complRows, " are complete")
    message(msg)
  } else{
    stop("Setting na.omit to FALSE may lead to mismatch between DV and predictors")
  }
  CLASS <- ifelse(DV == "grad_ind", TRUE, FALSE)
  if(DV == "grad_ind"){
    expr <- lazyeval::interp(quote(x != y), x = as.name(DV), y = "exitWIpublicK12")
    recode <- function() ifelse(pool[, "grad_ind"]  == "on-time", 1, 0)
    DV2 <- "ontime_grad"
  } else if(DV != "grad_ind"){
    stop("Not written yet.")
  }  
  pool %<>% filter_(expr)
  pool[, DV2] <- recode()
  keyVars <- findKeys(names(pool))
  keyVars <- c(keyVars, "SCHOOL_YEAR", DV)
  myF <- as.formula(paste(DV2, " ~ ."))
  mat1 <- model.matrix(myF, 
                       data = pool[, names(pool)[!names(pool) %in% keyVars]])
  mat1 <- cbind(pool[, DV2], mat1[, -1])
  colnames(mat1)[1] <- DV2
  return(mat1)
}

#' Convert a dataset without a dependent variable
#' @param grade a numeric representing the grade level to extract data for
#' @param conn a connection object suitable for extracting data
#' @param year a character representing the school year to predict for in YYYY-YYYY format
#' @param na.omit logical, should rows with an NA be omitted, default is TRUE
#' @details First the function extracts data from buildTrainingPool, then it builds 
#' a model matrix suitable for passing to a transformation and then prediction step. 
#' The key difference between this and the original gatherData is that there are no 
#' valide dependent variable observations for data we want to make predictions on. 
#' This function is modified to account for that. 
#' @note na.omit must be true for now as no imputation routines have been integrated
#' @export
gatherData.pred <- function(grade, conn, year, na.omit = TRUE, ...){
  if(na.omit == FALSE){
    stop("Imputation routines not yet written.")
  }
  if(class(grade) == "numeric"){
    grade <- as.character(paste0(0, grade))
  }
  pool <- assembleCohort(cohortYear = year, grade = grade, 
                         conn = conn, ...)
  pool <- cleanTrainingPool(pool)
  if(na.omit == TRUE){
    totalRows <- nrow(pool)
    complRows <- nrow(na.omit(pool))
    pool <- na.omit(pool)
    msg <- paste0("Of ", totalRows, " in pooled data, ", complRows, " are complete")
    message(msg)
  }
  keyVars <- findKeys(names(pool))
  keyVars <- c(keyVars, "SCHOOL_YEAR")
  myF <- as.formula(paste(" ~ ."))
  mat1 <- model.matrix(myF, 
                       data = pool[, names(pool)[!names(pool) %in% c(keyVars)]])
  row.names(mat1) <- pool[, "STUDENT_KEY"]
  return(mat1[, -1])
}



#' Construct a DEWSlist from a model matrix
#' @param matrix a matrix produced by gatherData
#' @param DV a character representing the column name associated with the dependent 
#' variable
#' @param preProcess logical, should a preProcess routine be applied to the data
#' @param mode character, is this running in production or development mode, see details 
#' @param prelim logical, is this the preliminary or final model, see details
#' @param P numeric, 0 to 1, proportion of data to be used for training
#' @param PVALID, 0 to 1, proportion of the data to be used for the validation data
#' @details This function makes the data suitable for use in EWStools functions 
#' for testing and selecting models. 
#' @return a list of lists, the lists are named "traindata", "testdata" and 
#' "validdata", and each list has two elements, a predictor matrix called "preds" 
#' and an outcome vector called "class". A fourth element called "scale" is 
#' included if preProcess = TRUE which contains the preProcess object used
#' @importFrom EWStools assembleData
#' @export
makeDEWSList <- function(matrix, DV=NULL, preProcess = TRUE, 
                         MODE = "PROD", PRELIM = FALSE, P, PVALID) {
  #
  if(missing(P) | missing(PVALID)){
    if(nrow(matrix) <= 60000){
      P <- ifelse(MODE == "PROD", 0.45, 0.25)
      PVALID <- ifelse(MODE == "PROD", 0.25, 0.25)
    }else {
      P <- ifelse(MODE == "PROD", 0.25, 0.05)
      PVALID <- ifelse(MODE == "PROD", 0.5, 0.75)
    }
  }
  if(missing(DV)){
    warning("No dependent variable specified, using first column.")
    DV <- colnames(matrix)[1]
  }
  tmppred <- colnames(matrix)[colnames(matrix) != DV]
  CLASS <- ifelse(DV == "ontime_grad", TRUE, FALSE)
  full <- assembleData(matrix, class = DV, 
                       p = P, predvars = tmppred, pvalid = PVALID, 
                       classification = CLASS)
  rm(matrix)
  full <- cleanDV.DEWSList(full, DV = DV)
  if(preProcess == TRUE){
   full <- preProcess.DEWSList(full)
  } 
  return(full)
}

#' PreProcess DEWS list
#' @param DEWSList a list produced by the makeDEWSList function
#' @return a DEWSList with the additional list element "scale" which includes 
#' the preProcess object
#' @details The preProcess function is applied to all continuous variables in 
#' the training data. A separate scaling is done to binary variables as well. 
#' This training data pre-processing is then applied to test and validdata so that 
#' all data elements have been scaled relative to the training data. 
#' @importFrom caret preProcess
preProcess.DEWSList <- function(DEWSList){
  if("scale" %in% names(DEWSList)){
    stop("Data appears to already be preprocessed as scale is present in DEWSList")
  }
  for(i in names(DEWSList)){
    DEWSList[[i]]$preds <- as.data.frame(DEWSList[[i]]$preds)
  } 
  binVars <- findBinary(DEWSList$traindata$preds)
  allVars <- names(DEWSList$traindata$preds)
  conVars <- allVars[!allVars %in% binVars]
  modPreProc <- preProcess(DEWSList$testdata$preds[, conVars], 
                           method = c("center", "scale"))
  for(i in names(DEWSList)){
    DEWSList[[i]]$preds %<>% scaleCenter(preProc = modPreProc, binVars = binVars, 
                                          conVars = conVars)
  } 
  DEWSList$scale <- modPreProc
  return(DEWSList)
  
}

#' Preprocess prediction cohort
#' @param predCohort a data.frame produced by 
#' @param preProcess a preProcess object with variable names matching those in predCohort
#' @param varNames a vector of variable names to preProcess
#' @return a data.frame of of the same dimensions of predCohort that is rescaled 
#' according to the attributes in the preProcess object
#' @export
#' @importFrom caret preProcess
preProcess.pred <- function(predCohort, preProcess, varNames){
  stopifnot(class(predCohort) == "matrix")
  stopifnot(class(preProcess) == "preProcess")
  binVars <- findBinary(predCohort)
  allVars <- names(predCohort)
  conVars <- names(preProcess$mean)
  out <- scaleCenter(data = predCohort, preProc = preProcess, binVars = binVars, 
                     conVars = conVars)
  sampNames <- colnames(out)
  out <- out[, sampNames %in% varNames]
  
  missVar <- varNames[!varNames %in% sampNames]
  extraData <- matrix(rep(0, length(missVar) * nrow(out)), 
                      ncol = length(missVar), nrow = nrow(out), 
                      dimnames = list("rows" = NULL, "columns" = missVar))
  
  out <- cbind(out, extraData)
  out <- out[, varNames]
  return(out)
}

#' Clean up DV in a DEWSList for proper modeling
#' @param DEWSList a DEWSlist created by makeDEWSList
#' @param DV a character representing the name of the dependent variable
#' @return a DEWSList
#' @details Reassigns the DEWSList[[i]]$class object to be a factor with the 
#' larger group, graduates, as the first level of a factor, and non-graduates 
#' as the second level
#' @note Currently only works if DV == "ontime_grad"
cleanDV.DEWSList <- function(DEWSList, DV){
  if(DV == "ontime_grad"){
    tabT <- table(DEWSList$traindata$class)
    small <- names(tabT[tabT == min(tabT)])
    big <- names(tabT[tabT == max(tabT)])
    
    for(i in names(DEWSList)){
      DEWSList[[i]]$class %<>% as.character()
      DEWSList[[i]]$class %<>% factor(levels = c(big, small), 
                                       labels = c("Grad", "Non.Grad"))
    } 
  } else {
    stop("Dependent variable not supported.")
  }
  return(DEWSList)
}

#' Scale and center a dataframe given a preProcessing object
#' @param data a data.frame
#' @param preProc a preProcessing object
#' @param binVars a vector of column indices representing binary variables
#' @param conVars a vector of column indices representing continuous variables in 
#' the preProcessing object
#' @details Continuous variables are processed by preProcess in the caret package, 
#' binary variables are processed using the rescale function in arm which centers 
#' binary inputs
#' @importFrom caret preProcess
#' @importFrom caret predict.preProcess
#' @importFrom arm rescale
#' @export 
scaleCenter <- function(data, preProc, binVars, conVars){
  data[, conVars] <- predict(preProc, data[, conVars])
  data[, binVars] <- apply(data[, binVars], 2, arm::rescale, 
                           binary.inputs = 'center')
  return(data)
}

#' Find binary variables
#' @description Find binary variables in a data.frame and return their names.
#' @param data data.frame
#' @return a character vector of variable names corresponding to column names in 
#' data which have fewer than three unique values
#' @note This function will return constant variables with only one value as 
#' binary as well
#' @export
findBinary <- function(data){
  out <- apply(data, 2, function(x) length(table(x)) < 3)
  return(names(out[out==TRUE]))
}  

#' SMOTE the data for class balance?
#' @description Applies the synthetic minority over-sampling technique to a 
#' list in a DEWSList to make the data less unbalanced between classes.
#' @param dewsList a single element of a DEWSList, e.g. traindata, testdata
#' @return an element of a dewslist with objects preds and class
#' @importFrom DMwR SMOTE
#' @details See the documentation for SMOTE in the DMwR package for details on 
#' how the SMOTE function works.
#' @export
smote <- function(dewsList) {
  library(DMwR)
  x <- dewsList$preds
  y <- dewsList$class
  dat <- if(is.data.frame(x)) x else as.data.frame(x)
  dat$.y <- y
  dat <- SMOTE(.y ~ ., data = dat)
  list(preds = dat[, !grepl(".y", colnames(dat), fixed = TRUE)],
       class = dat$.y)
}
jknowles/ModelEWS documentation built on May 19, 2019, 11:42 a.m.