# Chunk defaults for the whole document: echo the code, but keep warnings and
# messages out of the rendered output.
knitr::opts_chunk$set(
  echo = TRUE,
  warning = FALSE,
  message = FALSE
)

Setup of Stack Overflow 2016 Developer Survey Answers

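This document sets up the data in the stacksurveyr package, based on the following two files: schema.csv (the column schema) and "2016 Stack Overflow Survey Responses.csv" (the raw survey responses).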

You can run this code yourself with the knitr package: knitr::knit("README.Rmd").


The schema is read in fairly straightforwardly from schema.csv:


# Read the column schema and store it as a package dataset, replacing any
# previously saved copy.
# NOTE(review): use_data() has moved to the usethis package in newer devtools
# releases -- confirm which one is available before re-running.
stack_schema <- read_csv(file = "schema.csv")
devtools::use_data(stack_schema, overwrite = TRUE)


We read in the survey from the raw CSV file:


# Raw survey responses, exactly as read from the CSV; cleaned up further below.
survey_raw <- read_csv(file = "2016 Stack Overflow Survey Responses.csv")

There are two column types that need to be converted to ordered factors: scales ("On a scale of 1-5, how much do you agree...") and ranges ("How much experience do you have...").

# Convert a vector of range labels into an ordered factor.
#
# The levels are the distinct non-missing labels of `x`, ordered by:
#   1. the first run of digits/commas found in the label, read as a number
#      (labels without any digits get NA and are sorted last by arrange());
#   2. labels containing "Less" ahead of other labels with the same number;
#   3. the label text itself, descending, as a final tie-breaker.
#
# x:       vector of range labels; may contain NA.
# returns: an ordered factor the same length as `x`; NA entries stay NA.
convert_range_factor <- function(x) {
  d <- data_frame(name = unique(x)) %>%
    # NA must not become a level of the factor
    filter(!is.na(name)) %>%
    mutate(num = extract_numeric(str_extract(name, "[\\d\\,]+"))) %>%
    # str_detect() takes (string, pattern): the label is the string being
    # searched and "Less" is the pattern, not the other way round.
    arrange(num, !str_detect(name, "Less"), desc(name))

  factor(x, levels = d$name, ordered = TRUE)
}

# Answer options for the agreement questions, from strongest disagreement to
# strongest agreement. Used as the level order of the ordered factors.
agree_levels <- c(
  "Disagree completely",
  "Disagree somewhat",
  "Neutral",
  "Agree somewhat",
  "Agree completely"
)

# Answer options for the importance questions, from least to most important.
important_levels <- c(
  "I don't care about this",
  "This is somewhat important",
  "This is very important"
)

# Build the cleaned survey table from the raw responses:
#   * every column whose name contains "_range" goes through
#     convert_range_factor();
#   * every "important_" column becomes an ordered factor over important_levels;
#   * every "agree_" column becomes an ordered factor over agree_levels.
# NOTE(review): the column selector of the last step was missing from this
# file; starts_with("agree_") is inferred by analogy with the "important_"
# step -- confirm against the actual column names.
stack_survey <- survey_raw %>%
  mutate_each(funs(convert_range_factor), contains("_range")) %>%
  mutate_each(funs(factor(., levels = important_levels, ordered = TRUE)),
              starts_with("important_")) %>%
  mutate_each(funs(factor(., levels = agree_levels, ordered = TRUE)),
              starts_with("agree_"))

# Replace the level labels of job_satisfaction with the clean labels below.
# NOTE(review): `levels<-` relabels positionally, so this presumes that
# job_satisfaction is already a factor whose seven levels are in exactly this
# order. Nothing visible in this file establishes that -- confirm before
# relying on the result.
levels(stack_survey$job_satisfaction) <- c(
  "I hate my job",
  "I'm somewhat dissatisfied with my job",
  "I'm neither satisfied nor dissatisfied",
  "I'm somewhat satisfied with my job",
  "I love my job",
  "I don't have a job",
  "Other (please specify)"
)

# also one transcription issue
# Rewrites text in the why_stack_overflow column with str_replace_all().
# NOTE(review): str_replace_all() takes (string, pattern, replacement), but only
# two arguments are passed here. The string below reads like the intended
# replacement, so the pattern (the mis-transcribed text) appears to be missing
# -- restore it from the original source before running this step.
stack_survey <- stack_survey %>%
  mutate(why_stack_overflow = str_replace_all(why_stack_overflow,
                                              "To maintain an online presence"))

# Last step: give the first column an explicit name, then store the cleaned
# table as a package dataset, replacing any previously saved copy.
names(stack_survey)[1] <- "respondent_id"

devtools::use_data(stack_survey, overwrite = TRUE)

dgrtwo/stacksurveyr documentation built on May 15, 2019, 8:20 a.m.