R/growth_df_prep.R

Defines functions determine_growth_status calc_rit_growth_metrics growth_norm_lookup growth_testid_lookup build_growth_scaffolds scores_by_testid student_scaffold generate_growth_dfs

Documented in build_growth_scaffolds calc_rit_growth_metrics determine_growth_status generate_growth_dfs growth_norm_lookup growth_testid_lookup scores_by_testid student_scaffold

#' @title generate_growth_df
#'
#' @description
#' \code{generate_growth_df} takes a CDF and given two seasons (start and end) saturates
#' all possible growth calculations for a student and returns a long data frame with the
#' results.
#'
#' @details
#' A workflow wrapper that calls a variety of growth_df prep functions.
#' Given a mapvizieR processed cdf, this function will return a a growth
#' data frame, with one row per student per test per valid 'growth_window',
#' eg 'Fall to Spring'.
#'
#' @param processed_cdf a conforming processed_cdf data frame
#' @param norm_df_long defaults to student_growth_norms_2015  if you have a conforming norms object,
#' you can use generate_growth_df to produce a growth data frame for those norms.
#' example usage: calculate college ready growth norms, and use generate_growth_df to see
#' if students met them.
#' @param include_unsanctioned_windows if TRUE, generate_growth_df will
#' return some additional growth windows like 'Spring to Winter', which aren't in the
#' official norms (but might be useful for progress monitoring).
#'
#' @return a data frame with all rows where the student had at least ONE matching
#' test event (start or end)
#'
#' @export

generate_growth_dfs <- function(
  processed_cdf,
  norm_df_long = norms_students_wide_to_long(student_growth_norms_2015),
  include_unsanctioned_windows = FALSE
){
  #input validation
  assertthat::assert_that(
    is.data.frame(processed_cdf),
    is.data.frame(norm_df_long),
    is.logical(include_unsanctioned_windows),
    check_processed_cdf(processed_cdf)$boolean
  )

  #generate a scaffold of students/terms/windows
  scaffold <- build_growth_scaffolds(processed_cdf)

  #match start/end testids to cdf data
  with_scores <- growth_testid_lookup(scaffold, processed_cdf)

  #look up norms and add growth metrics
  with_norms <- growth_norm_lookup(
    with_scores, processed_cdf, norm_df_long, include_unsanctioned_windows
  ) %>%
  calc_rit_growth_metrics()

  #add cohort year
  with_norms <- with_norms %>% 
    dplyr::mutate(cohort_year = end_map_year_academic + 1 + 12 - end_grade) 
  
  #group the growth_df (for use by the summary method)
  with_norms <- with_norms %>%
  dplyr::group_by(
    end_map_year_academic, cohort_year, growth_window, end_schoolname,
    start_grade, end_grade,
    start_fallwinterspring, end_fallwinterspring,
    measurementscale
  )
  
  class(with_norms) <- c("mapvizieR_growth", class(with_norms))
  
  return(with_norms)
}



#' @title student_scaffold
#'
#' @description which student/test/season rows have valid data?
#'
#' @param processed_cdf a conforming processed_cdf data frame
#' @param start_season the start of the growth window ("Fall", "Winter", or "Spring")
#' @param end_season the end of the growth window ("Fall", "Winter", or "Spring")
#' @param year_offset start_year + ? = end_year.  if same academic_year (eg fall to spring)
#' this is 0  if spring to spring, this is 1
#'
#' @return a data frame to pass back generate_growth_df that has kids, and the relevant
#' student/test/seasons to calculate growth records on

student_scaffold <- function(
  processed_cdf,
  start_season,
  end_season,
  year_offset
) {
  #input validation
  assertthat::assert_that(
    is.data.frame(processed_cdf),
    start_season %in% c("Fall", "Spring", "Winter"),
    end_season %in% c("Fall", "Spring", "Winter"),
    check_processed_cdf(processed_cdf)$boolean
  )

  #make a simplified df
  cols <- c("studentid", "measurementscale", "testid",
    "map_year_academic", "fallwinterspring", "grade", "grade_level_season", "schoolname"
  )
  simple <- processed_cdf[ ,cols]
  simple$hash <- with(simple,
    paste(studentid, measurementscale, fallwinterspring, map_year_academic, sep='_')
  )

  #we want kids with EITHER start OR end
  start <- simple[simple$fallwinterspring==start_season, ]
  end <- simple[simple$fallwinterspring==end_season, ]

  #define target columns now, in case we need to step out
  
  target_cols_list = list(
    studentid = character(), 
    measurementscale = character(), 
    end_schoolname = character(), 
    end_grade_level_season = double(),
    end_grade = integer(), 
    growth_window = character(), 
    complete_obsv = logical(), 
    match_status = character(),
    start_testid = integer(), 
    start_map_year_academic=integer(), 
    start_fallwinterspring = character(),
    start_grade = integer(), 
    start_grade_level_season = double(), 
    start_schoolname = character(),
    end_testid = integer(), 
    end_map_year_academic = integer(), 
    end_fallwinterspring = character()
  )
  
  # Character vector of target column names
  target_cols <- names(target_cols_list)
  
  # empty data.frame
  empty <- dplyr::as_data_frame(target_cols_list) 

  #if there's no data, don't worry about matching; just return a zero row df
  if (nrow(start) == 0) {
    return(empty)
  }

  #namespace stuff
    #normally I avoid reference by index number since, uh, inputs change, but
    #we are hard coding the columns above, so it is OK.
  start_prefixes <- c(rep("", 2), rep("start_", 7))
  end_prefixes <- c(rep("", 2), rep("end_", 7))

  names(start) <- paste0(start_prefixes, names(start))
  names(end) <- paste0(end_prefixes, names(end))

  #a valid observation has BOTH start AND end AND the years match
    #first build a match hash to find out what to match on
  start$matching_end_hash <- with(start,
      paste(studentid, measurementscale, end_season,
        #this is the magic here - build the term you are looking for IN end ON start.
        #then use in the inner_join below
        start_map_year_academic + year_offset, sep='_'
      )
  )

  #a dplyr inner join will return the *matching* rows
  matched_rows <- dplyr::inner_join(
    x=start, y=end[, c(3:9)], by=c("matching_end_hash" = "end_hash")
  )

  #dplyr returns a barfy order. reorder.
  col_order <- c("studentid", "measurementscale",       #constants
    "start_testid", "start_map_year_academic",          #start cols
    "start_fallwinterspring", "start_grade",
    "start_grade_level_season", "start_schoolname",
    "end_testid", "end_map_year_academic",              #end cols
    "end_fallwinterspring", "end_grade",
    "end_grade_level_season", "end_schoolname",
    "start_hash", "matching_end_hash"                   #hashes
  )
  matched_df <- matched_rows[ ,col_order]
  matched_df$match_status <- rep('start and end', nrow(matched_df))
  matched_df$complete_obsv <- rep(TRUE, nrow(matched_df))

  #what rows are ONLY found in the start df?
  only_start <- dplyr::anti_join(
    x=start, y=matched_df, by=c("start_hash" = "start_hash")
  )
  if (nrow(only_start) > 0) {
    only_start$match_status <- rep('only start', nrow(only_start))
    only_start$complete_obsv <- rep(FALSE, nrow(only_start))
    #ensure that we always return FWS and year, even if unmatched on end.
    only_start$end_fallwinterspring <- end_season
    only_start$end_map_year_academic <- only_start$start_map_year_academic + year_offset
  }

  #what rows are ONLY found in the end df?
  only_end <- dplyr::anti_join(
    x=end, y=matched_df, by=c("end_hash"="matching_end_hash")
  )
  if (nrow(only_end) > 0) {
    only_end$match_status <- rep('only end', nrow(only_end))
    only_end$complete_obsv <- rep(FALSE, nrow(only_end))
    #ensure that we always return FWS and year, even if unmatched on start
    only_end$start_fallwinterspring <- start_season
    only_end$start_map_year_academic <- only_end$end_map_year_academic - year_offset
  }

  #build the df to return
  final <- dplyr::bind_rows(list(matched_df, only_start, only_end))

  #discard some helpers
  final <- final[ ,!names(final) %in% c('start_hash', 'end_hash', 'matching_end_hash')]

  final$growth_window <- paste0(start_season, ' to ', end_season)

  #reorder
  final <- final[ , target_cols]

  return(final)
}



#' @title scores_by_testid
#'
#' @description helper function for \code{generate_growth_df}. given a test id,
#' returns df with all the scores.
#'
#' @param testid a vector of testids
#' @param processed_cdf a conforming processed_cdf data frame
#' @param start_or_end either c('start', 'end')
#'
#' @return one row data frame.

scores_by_testid <- function(testid, processed_cdf, start_or_end) {
  #input validation
  assertthat::assert_that(
    #testids can't be null
    is.data.frame(processed_cdf),
    check_processed_cdf(processed_cdf)$boolean,
    start_or_end %in% c('start', 'end')
  )

  matched <- dplyr::left_join(
    as.data.frame(testid), processed_cdf, by="testid"
  )

  target_cols <- c("growthmeasureyn", "testtype", "testname", "teststartdate",
    "testdurationminutes", "testritscore", "teststandarderror", "testpercentile",
    "consistent_percentile", "testquartile", "rittoreadingscore", "rittoreadingmin",
    "rittoreadingmax", "teststarttime", "percentcorrect", "projectedproficiencylevel1")

  matching_slim <- matched[ , target_cols]
  #prefix it
  names(matching_slim) <- paste0(start_or_end, '_', names(matching_slim))

  return(matching_slim)
}



#' @title build_growth_scaffolds
#'
#' @description a helper function for \code{generate_growth_df}.
#' finds all the student/season growth windows.
#'
#' @param processed_cdf conforming mapvizieR processed cdf

build_growth_scaffolds <- function(processed_cdf){
  f2w <- student_scaffold(processed_cdf, 'Fall', 'Winter', 0)
  f2s <- student_scaffold(processed_cdf, 'Fall', 'Spring', 0)
  f2f <- student_scaffold(processed_cdf, 'Fall', 'Fall', 1)
  s2s <- student_scaffold(processed_cdf, 'Spring', 'Spring', 1)
  w2s <- student_scaffold(processed_cdf, 'Winter', 'Spring', 0)
  w2w <- student_scaffold(processed_cdf, 'Winter', 'Winter', 1)
  
  scaffolds <- rbind(f2w, f2s,f2f, s2s, w2s, w2w)

  return(scaffolds)
}



#' @title growth_testid_lookup
#'
#' @description a helper function for \code{generate_growth_df}
#' given a scaffold of student/season growth windows, finds the
#' matching test data in the cdf
#'
#' @param scaffold output of \code{build_growth_scaffolds}
#' @param processed_cdf conforming mapvizieR processed cdf

growth_testid_lookup <- function(scaffold, processed_cdf) {
  start_data <- scores_by_testid(scaffold$start_testid, processed_cdf, 'start')
  end_data <- scores_by_testid(scaffold$end_testid, processed_cdf, 'end')
  with_scores <- cbind(scaffold, start_data, end_data)

  return(with_scores)
}



#' @title growth_norm_lookup
#'
#' @description called by \code{generate_growth_df} to return
#' growth norms for growth data frames in process
#'
#' @param incomplete_growth_df a growth df in process.  needs to have growth
#' windows, start_grade, and start_testritscore.
#' @param processed_cdf conforming mapvizieR processed cdf.  needed for
#' unsanctioned windows.
#' @param norm_df_long a data frame of normative expectations
#' @param include_unsanctioned_windows check generate_growth_df for a
#' description.
#' @param ... currently not used.

growth_norm_lookup <- function(
  incomplete_growth_df,
  processed_cdf,
  norm_df_long,
  include_unsanctioned_windows,
  ...
) {

  if (include_unsanctioned_windows) {
    #spring to winter
    scaffold <- student_scaffold(processed_cdf, 'Spring', 'Winter', 1)
    s2w <- growth_testid_lookup(scaffold, processed_cdf)

    #half of spring to spring for the norms
    temp_norms <- norm_df_long[norm_df_long$growth_window == 'Spring to Spring', ]
    temp_norms$growth_window <- 'Spring to Winter'
    temp_norms$typical_growth <- temp_norms$typical_growth / 2
    temp_norms$reported_growth <- temp_norms$reported_growth / 2
    #we don't know what the standard deviation is.
    temp_norms$std_dev_of_expectation <- NA

    #put back on dfs
    incomplete_growth_df <- rbind(incomplete_growth_df, s2w)
    norm_df_long <- rbind(norm_df_long, temp_norms)

    #todo: spring to fall (?)
  }

  with_matched_norms <- dplyr::left_join(
    x=incomplete_growth_df, y=norm_df_long,
    by=c("measurementscale" = "measurementscale",
      "growth_window" = "growth_window",
      "start_grade" = "startgrade",
      "start_testritscore" = "startrit")
  )

  return(with_matched_norms)
}


#' @title Calculate growth metrics (RIT growth, meeting indicators, conditional
#' change in test percentile, growth index, student growth percentile).
#'
#' @description a helper function  for \code{generate_growth_df}
#' which adds columns to a growth CDF for variuos growth statistics.
#'
#' @param normed_df a data frame that has matched growth windows for
#' each student/subject/season triplet.
#' @return a data frame the same as growth_df, with additional calcs

calc_rit_growth_metrics <- function(normed_df){

  out <- normed_df %>%
    dplyr::mutate(
      rit_growth = end_testritscore - start_testritscore,
      met_typical_growth = rit_growth >= reported_growth,
      change_testpercentile = end_testpercentile - start_testpercentile,
      cgi = (rit_growth-reported_growth) / std_dev_of_expectation,
      sgp = pnorm(cgi)
    ) 

  return(out)
}



#' @title determine growth status
#'
#' @description last thing that runs after accelerated growth is calculated
#'
#' @param df output of add accelerated growth

determine_growth_status <- function(df) {

  #growth status
  df$growth_status <- ifelse(df$met_accel_growth, 'College Ready', NA)

  df$growth_status <- ifelse(
    is.na(df$growth_status) & !df$met_accel_growth & df$met_typical_growth,
    'Typical', df$growth_status
  )
  df$growth_status <- ifelse(
    is.na(df$growth_status) & df$rit_growth <= 0,
    'Negative', df$growth_status
  )
  df$growth_status <- ifelse(
    is.na(df$growth_status) & df$rit_growth > 0 & !df$met_typical_growth,
    'Positive', df$growth_status
  )

  return(df)
}
almartin82/mapvizieR documentation built on May 10, 2018, 11:59 p.m.