# inst/data-raw/students.R

#################################### Setup #####################################

library(tidyverse)
library(stringr)
library(irc)

# Directory that holds the raw survey files; the sections below read from one
# sub-folder per survey year. The path is relative, so this script has to be
# run from the directory that contains `inst/`.
# Alternative that looks the folder up through the installed package instead:
# data_dir <- system.file("data-raw", package = "irc")
data_dir <- file.path("inst", "data-raw")

#################################### 2013 ######################################

# Paths to the 2013 survey responses and to the question key describing them

survey_file_2013 <- file.path(data_dir, "2013", "student_survey_2013.csv.gz")

questions_file_2013 <- file.path(
  data_dir, "2013", "student_questions_2013.csv.gz"
)

# Question key: its `idx` column supplies the survey column names and, together
# with `text`, is handed to parse_column_types() to derive the column types
student_questions_2013 <- read_csv(questions_file_2013)

# The first line of the survey file is skipped (presumably its own header row)
# because the column names are taken from the question key instead
raw_students_2013 <- read_csv(
  survey_file_2013,
  col_names = student_questions_2013[["idx"]],
  col_types = parse_column_types(
    question_idx = student_questions_2013[["idx"]],
    question_text = student_questions_2013[["text"]]
  ),
  skip = 1
)

# Tag the survey year, run impute_student_data() with the 2013 question key,
# and keep only respondents with a gender, a school, and a grade level
student_survey_2013 <- raw_students_2013 %>%
  mutate(year = 2013) %>%
  impute_student_data(question_key = student_questions_2013) %>%
  filter(
    !is.na(gender),
    !is.na(school),
    !is.na(grade_level)
  )

#################################### 2014 ######################################

# Paths to the 2014 survey responses and to the question key describing them

survey_file_2014 <- file.path(data_dir, "2014", "student_survey_2014.csv.gz")

questions_file_2014 <- file.path(data_dir, "2014", "student_questions_2014.csv")

# Question key: its `idx` column supplies the survey column names and, together
# with `text`, is handed to parse_column_types() to derive the column types
student_questions_2014 <- read_csv(questions_file_2014)

# The first line of the survey file is skipped (presumably its own header row)
# because the column names are taken from the question key instead
raw_students_2014 <- read_csv(
  survey_file_2014,
  col_names = student_questions_2014[["idx"]],
  col_types = parse_column_types(
    question_idx = student_questions_2014[["idx"]],
    question_text = student_questions_2014[["text"]]
  ),
  skip = 1
)

# Tag the survey year, run impute_student_data() with the 2014 question key,
# and keep only respondents with a gender, a school, and a grade level
student_survey_2014 <- raw_students_2014 %>%
  mutate(year = 2014) %>%
  impute_student_data(question_key = student_questions_2014) %>%
  filter(
    !is.na(gender),
    !is.na(school),
    !is.na(grade_level)
  )

#################################### 2015 ######################################

# Paths to the 2015 survey responses and to the question key describing them

survey_file_2015 <- file.path(
  data_dir, "2015", "student_survey_2015.csv.gz"
)

questions_file_2015 <- file.path(
  data_dir, "2015", "student_questions_2015.csv"
)

# Question key: its `idx` column supplies the survey column names and, together
# with `text`, is handed to parse_column_types() to derive the column types
student_questions_2015 <- read_csv(questions_file_2015)

# The first line of the survey file is skipped (presumably its own header row)
# because the column names are taken from the question key instead
raw_students_2015 <- read_csv(
  survey_file_2015,
  col_names = student_questions_2015[["idx"]],
  col_types = parse_column_types(
    question_idx = student_questions_2015[["idx"]],
    question_text = student_questions_2015[["text"]]
  ),
  skip = 1
)

# Tag the survey year, run impute_student_data() with the 2015 question key,
# then drop respondents who did not provide gender, school, or grade level
student_survey_2015 <- raw_students_2015 %>%
  mutate(year = 2015) %>%
  impute_student_data(question_key = student_questions_2015) %>%
  filter(
    !is.na(gender),
    !is.na(school),
    !is.na(grade_level)
  )

#################################### 2016 ######################################

# Paths to the 2016 question key and survey responses

questions_file_2016 <- file.path(data_dir, "2016", "student_questions_2016.csv")

survey_file_2016 <- file.path(
  data_dir, "2016", "student_survey_2016.csv.gz"
)

# Question key: its `idx` column supplies the survey column names and, together
# with `text`, is handed to parse_column_types() to derive the column types
student_questions_2016 <- read_csv(questions_file_2016)

# The first line of the survey file is skipped (presumably its own header row)
# because the column names are taken from the question key instead
raw_students_2016 <- read_csv(
  survey_file_2016,
  col_names = student_questions_2016[["idx"]],
  col_types = parse_column_types(
    question_idx = student_questions_2016[["idx"]],
    question_text = student_questions_2016[["text"]]
  ),
  skip = 1
)

#### TODO: Bring in Avenues Data ####

# Clean 2016 data

# Schools whose responses are removed entirely from the 2016 data
excluded_schools_2016 <- c(
  "Graded School at Sao Paulo",
  "Singapore American School",
  "United Nations School of NY"
)

student_survey_2016 <- raw_students_2016 %>%
  mutate(year = 2016) %>%
  impute_student_data(question_key = student_questions_2016) %>%
  # Respondents must have provided gender, school, and grade level
  filter(
    !is.na(gender),
    !is.na(school),
    !is.na(grade_level)
  ) %>%
  # Completion-time rule ("completed the survey too quickly").
  # NOTE(review): `year` was just set to 2016, so `year <= 2016` holds for every
  # row and this condition removes nothing here -- confirm that is intended.
  filter(year <= 2016 | total_time > 60) %>%
  # Note that HMS was not kept until 2016
  filter(
    !(school_level == "Elementary" & school == "Cary Academy"),
    !(grade_level == 12 & school_short == "AIS Budapest"),
    !(school %in% excluded_schools_2016)
  )

#################################### 2017 ######################################

# Most schools 2017

questions_file_2017 <- file.path(
  data_dir, "2017", "student_questions_2017.csv"
)
survey_file_2017 <- file.path(
  data_dir, "2017", "student_survey_2017.csv.gz"
)

# Question key, with both of its columns read as character
student_questions_2017 <- read_csv(questions_file_2017, col_types = "cc")

# The first line of the survey file is skipped (presumably its own header row)
# because the column names are taken from the question key instead
raw_students_2017 <- read_csv(
  survey_file_2017,
  col_names = student_questions_2017[["idx"]],
  col_types = parse_column_types(
    question_idx = student_questions_2017[["idx"]],
    question_text = student_questions_2017[["text"]]
  ),
  skip = 1
)

## HELP 2017 ------------

# The HELP responses live in their own file with their own question key

questions_file_help_2017 <- file.path(
  data_dir,
  "2017",
  "student_questions_help_2017.csv"
)

survey_file_help_2017 <- file.path(
  data_dir,
  "2017",
  "student_survey_help_2017.csv.gz"
)

student_questions_help_2017 <- read_csv(questions_file_help_2017)

# After reading, the HELP file's own Q30 is discarded and its Q64 takes over
# the name Q30 (presumably so the column lines up with Q30 in the main 2017
# file when the two are stacked -- verify against the question keys)
raw_students_help_2017 <- read_csv(
  survey_file_help_2017,
  col_names = student_questions_help_2017[["idx"]],
  col_types = parse_column_types(
    question_idx = student_questions_help_2017[["idx"]],
    question_text = student_questions_help_2017[["text"]]
  ),
  skip = 1
) %>%
  select(-Q30) %>%
  rename(Q30 = Q64)

# Bind most and HELP

raw_students_2017 <- bind_rows(raw_students_2017, raw_students_help_2017)

# Schools whose responses are removed entirely from the 2017 data
excluded_schools_2017 <- c(
  "American International School of Budapest",
  "Singapore American School",
  "United Nations School of NY"
)

student_survey_2017 <- raw_students_2017 %>%
  mutate(year = 2017) %>%
  impute_student_data(question_key = student_questions_2017) %>%
  # Respondents must have provided gender, school, and grade level
  filter(
    !is.na(gender),
    !is.na(school),
    !is.na(grade_level)
  ) %>%
  # Keep only rows whose total_time is above 60
  filter(total_time > 60) %>%
  # School- and grade-specific exclusions
  filter(
    !(school %in% excluded_schools_2017),
    !(school == "Frankfurt International School" & grade_level %in% c(4, 13)),
    !(school_level == "Elementary" & school == "Cary Academy"),
    !(grade_level == 12 & school_short == "AIS Budapest")
  ) %>%
  # Drop rows whose free-text answers match any of these patterns
  filter(
    !grepl(
      "AUSTRA|bogan|fat dong|Bob Kater|Pig Latin|gupty|ahh",
      Q35_8_TEXT
    ),
    !grepl("Potato|chicken", Q37_7_TEXT)
  ) %>%
  # Knox Grammar School: low N for grades 4 and 12; "Girl" rows are dropped too
  filter(
    !(school == "Knox Grammar School" & grade_level %in% c(4, 12)),
    !(school == "Knox Grammar School" & gender == "Girl")
  )

#################################### 2018 ######################################

# Path to the 2018 survey responses

survey_file_2018 <- file.path(data_dir, "2018", "student_survey_2018.csv")

# Question key: its `idx` column supplies the survey column names and, together
# with `text`, is handed to parse_column_types() to derive the column types
questions_file_path_2018 <- file.path(
  data_dir, "2018", "student_questions_2018.csv"
)
student_questions_2018 <- read_csv(questions_file_path_2018)

# Two leading lines are skipped for this file.
# NOTE(review): presumably the 2018 export carries two header rows -- confirm.
raw_students_2018 <- read_csv(
  survey_file_2018,
  col_names = student_questions_2018[["idx"]],
  col_types = parse_column_types(
    question_idx = student_questions_2018[["idx"]],
    question_text = student_questions_2018[["text"]]
  ),
  skip = 2
)

# Tag the survey year, run impute_student_data() with the 2018 question key,
# then drop respondents who did not provide gender, school, or grade level and
# those who completed the survey too quickly (total_time of 60 or less)
student_survey_2018 <- raw_students_2018 %>%
  mutate(year = 2018) %>%
  impute_student_data(question_key = student_questions_2018) %>%
  filter(
    !is.na(gender),
    !is.na(school),
    !is.na(grade_level),
    total_time > 60
  )

############################### Bind and Save ##################################

# Stack the cleaned yearly surveys into a single table.
# NOTE(review): student_survey_2018 is built earlier in this script but is not
# part of this bind -- confirm whether leaving 2018 out is intentional.
student_survey_master <- bind_rows(
  student_survey_2013,
  student_survey_2014,
  student_survey_2015,
  student_survey_2016,
  student_survey_2017
)

# Master question key, read from the top level of the raw-data directory
student_questions_master <- read_csv(
  file.path(data_dir, "student_questions_master.csv.gz")
)

# use_data() saves each object under the name of the variable passed to it,
# so the master tables are copied to the names the package data should carry
students <- student_survey_master
student_questions <- student_questions_master

# `TRUE` is spelled out because `T` is an ordinary variable that can be
# reassigned
devtools::use_data(student_questions, overwrite = TRUE)
devtools::use_data(students, overwrite = TRUE)
# ircollaborative/report documentation built on July 28, 2018, 7:33 p.m.