# load packages ----------------------------------------------------------------

library(learnr)
library(gradethis)
library(tidyverse)
library(tidymodels)
library(dsbox)

# set options for exercises and checking ---------------------------------------

gradethis_setup()

# hide non-exercise code chunks ------------------------------------------------

# echo = FALSE: chunk source is not shown unless a chunk overrides the option.
knitr::opts_chunk$set(echo = FALSE)
# Setup: gss16 with `email`, the weekly email time in minutes
# (hours converted to minutes plus the extra minutes).
gss16 <- mutate(dsbox::gss16, email = emailhr * 60 + emailmin)

# Setup: gss16 with `snap_insta`, "Yes" when either Snapchat or Instagram
# use is reported as "Yes", otherwise "No".
gss16 <- mutate(
  dsbox::gss16,
  snap_insta = if_else(snapchat == "Yes" | instagrm == "Yes", "Yes", "No")
)

# Setup: gss16 with both derived columns.
gss16 <- mutate(
  dsbox::gss16,
  email = emailhr * 60 + emailmin,
  snap_insta = if_else(snapchat == "Yes" | instagrm == "Yes", "Yes", "No")
)
# Setup: derived columns plus the fitted linear model.
gss16 <- mutate(
  dsbox::gss16,
  email = emailhr * 60 + emailmin,
  snap_insta = if_else(snapchat == "Yes" | instagrm == "Yes", "Yes", "No")
)
email_fit <- fit(
  set_engine(linear_reg(), "lm"),
  email ~ educ + wrkstat + snap_insta,
  data = gss16
)

# Setup: same as above, plus the augmented data built from the underlying
# model object stored in `email_fit$fit`.
gss16 <- mutate(
  dsbox::gss16,
  email = emailhr * 60 + emailmin,
  snap_insta = if_else(snapchat == "Yes" | instagrm == "Yes", "Yes", "No")
)
email_fit <- fit(
  set_engine(linear_reg(), "lm"),
  email ~ educ + wrkstat + snap_insta,
  data = gss16
)
email_aug <- augment(email_fit$fit)
# Setup: gss16 with the recoded columns `advfront2` and `polviews2`.
gss16 <- mutate(
  dsbox::gss16,
  # Collapse agreement levels to "Yes"/"No"; other values pass through.
  advfront2 = case_when(
    advfront %in% c("Strongly agree", "Agree") ~ "Yes",
    advfront %in% c("Disagree", "Strongly disagree") ~ "No",
    TRUE ~ advfront
  ),
  # Collapse the seven-point scale and order the levels
  # Conservative, Moderate, Liberal. The level spellings follow the data.
  polviews2 = fct_relevel(
    case_when(
      polviews %in% c("Extremely liberal", "Liberal", "Slightly liberal") ~ "Liberal",
      polviews %in% c("Extrmly conservative", "Conservative", "Slghtly conservative") ~ "Conservative",
      TRUE ~ polviews
    ),
    "Conservative", "Moderate", "Liberal"
  )
)
# Embed the image file in the rendered document.
knitr::include_graphics(
  path = "images/mauro-mora-31-pOduwZGE-unsplash.jpg"
)
The General Social Survey (GSS) gathers data on contemporary American society in order to monitor and explain trends and constants in attitudes, behaviors, and attributes. Hundreds of trends have been tracked since 1972. In addition, since the GSS adopted questions from earlier surveys, trends can be followed for up to 70 years.
The GSS contains a standard core of demographic, behavioral, and attitudinal questions, plus topics of special interest. Among the topics covered are civil liberties, crime and violence, intergroup tolerance, morality, national spending priorities, psychological well-being, social mobility, and stress and traumatic events.
In this assignment we analyze data from the 2016 GSS, using it to estimate values of population parameters of interest about US adults.
In this assignment we will work with the following packages. You can load them with the following code block:
# NOTE(review): appears to be the exercise's starter code; confirm against the
# chunk labels in the original .Rmd.
library(tidyverse)
library(tidymodels)
library(dsbox)

# NOTE(review): appears to be the matching solution chunk (same three calls).
library(tidyverse)
library(tidymodels)
library(dsbox)
# Code-based check; the message below is shown on a correct submission.
grade_this_code(
  correct = "The packages we'll be working with are now loaded."
)
In this tutorial, we will work with the 2016 GSS data.
The data are available as part of the dsbox package we just loaded; the data frame is called gss16
.
| Variable name | Description/Question in GSS
|:--------|:-------------------------------------------------------------
| harass5
| "Over the past five years, have you been harassed by your superiors or co-workers at your job, for example, have you experienced any bullying, physical or psychological abuse?"
| emailmin
| Number of minutes spent on email weekly, in addition to the hours in emailhr (e.g. emailmin = 30 for 2.5 hours on email).
| emailhr
| Number of hours spent on email weekly.
| educ
| Number of years in education.
| polviews
| Political views. Possible answers are Extremely liberal, Liberal, Slightly liberal, Moderate, Slghtly conservative, Conservative, Extrmly conservative.
| advfront
| "Even if it brings no immediate benefits, scientific research that advances the frontiers of knowledge is necessary and should be supported by the federal government."
| snapchat
| Whether respondent uses Snapchat or not.
| instagrm
| Whether respondent uses Instagram or not.
| wrkstat
| Work status.
In 2016, the GSS added a new question on harassment at work. The question is phrased as follows.
Over the past five years, have you been harassed by your superiors or co-workers at your job, for example, have you experienced any bullying, physical or psychological abuse?
Answers to this question are stored in the harass5
variable in our dataset.
Use the following code block to count the number of possible values for harass5
that can be found in the dataset:
# Multiple-choice question; the answers are shown in random order.
question(
  text = "What are the possible responses to the question?",
  answer("Yes, No"),
  answer("Yes, No, Does not apply"),
  answer("Yes, No, NA, Unsure"),
  answer("Yes, No, NA, Does not apply", correct = TRUE),
  random_answer_order = TRUE
)
Next, use this code block to determine how many respondents answered "Yes" to the question:
# Template: replace each blank (___) with code.
gss16 |>
  filter(___) |>
  ___

# Template with the filter condition filled in: keep rows where harass5 is "Yes".
gss16 |>
  filter(harass5 == "Yes") |>
  ___
# Keep only the respondents who answered "Yes" to harass5, then count them.
# (The filter must not be negated: the accompanying check expects the number
# of "Yes" responses, not the number of other responses.)
gss16 |>
  filter(harass5 == "Yes") |>
  nrow()
# Grade the submitted count. Each value is accepted as either a double or an
# integer, since the count may be produced in different ways.
grade_this({
  if (identical(.result, 237) || identical(.result, 237L)) {
    pass("237 people answered 'Yes' to the question.")
  }
  if (identical(.result, 1136) || identical(.result, 1136L)) {
    fail("Did you accidentally count the number of 'No' responses?")
  }
  if (identical(.result, 1232) || identical(.result, 1232L)) {
    fail("Did you accidentally count the number of responses that aren't 'Yes'?")
  }
  # Anything else falls through to the generic failure message.
  fail("Not quite. Try looking at the hints now if you need help.")
})
The 2016 GSS also asked respondents how many hours and minutes they spend on email weekly.
The responses to these questions are recorded in the emailhr
and emailmin
variables.
For example, if the response is 2.5 hrs, this would be recorded as emailhr = 2
and emailmin = 30
.
email
Using the following code block, create a new variable called email
that combines these two variables to report the number of minutes the respondents spend on email weekly.
# Template: replace the blank (___) with code.
gss16 <- gss16 |>
  ___

# Next step: the new variable is created inside mutate().
gss16 <- gss16 |>
  mutate(___)

# Next step: the new variable is named email.
gss16 <- gss16 |>
  mutate(email = ___)

# Complete code: minutes plus hours converted to minutes.
gss16 <- gss16 |>
  mutate(email = emailmin + (emailhr * 60))
# Grade the new `email` column by checking its value in the first row.
# The column is looked up by name rather than by position, so the check does
# not depend on column order and requires the variable to be called `email`.
grade_this({
  has_email <- is.data.frame(.result) && "email" %in% names(.result)
  if (has_email && identical(as.character(.result$email[1]), "720")) {
    pass("Well done!")
  }
  fail("Not quite. Try looking at the hints for the previous exercise.")
})
First, create a visualisation of the distribution of this new variable.
Ensure you make a 'tidy' plot by adding axis labels and choosing a suitable geom
.
Now, find the mean and median number of minutes respondents spend on email weekly.
Remember - you'll need to remove the `NA` values.
# Template: replace the blank (___) with the arguments to mean().
gss16 |>
  summarise(
    mean = mean(___)
  )

# Next step: na.rm controls whether NA values are removed; the second blank
# is for the remaining summary.
gss16 |>
  summarise(
    mean = mean(email, na.rm = ___),
    ___
  )
What is the mean number of minutes respondents spend on email weekly? Enter the answer into the following block to the nearest minute:
# Grade the submitted mean (expected: 417 after rounding to the nearest minute).
grade_this({
  generic_msg <- "Not quite. Try looking at the hints for the previous exercise."

  # Coerce defensively: anything that is not a single number gets the generic
  # failure message instead of raising an error inside the grading code.
  answer <- tryCatch(
    suppressWarnings(as.numeric(.result)),
    error = function(e) NA_real_
  )
  if (length(answer) != 1 || is.na(answer)) {
    fail(generic_msg)
  }

  if (answer == 417) {
    pass("The mean number of minutes spent on email per week is 417 mins.")
  }
  # Compare against the unrounded mean with a tolerance; an exact
  # floating-point match against a 4-decimal literal would never succeed.
  if (abs(answer - 416.8423) < 1e-4) {
    fail("Good, but please round this to the nearest whole minute.")
  }
  if (answer > 416 && answer < 417) {
    fail("Did you forget to round your result to the nearest whole minute?")
  }
  if (answer == 416) {
    fail("Good, but check your rounding.")
  }
  fail(generic_msg)
})
What is the median number of minutes spent on email weekly? Enter the answer into the following block to the nearest minute:
# Grade the submitted median; 120 is accepted as a double or an integer.
grade_this({
  is_expected <- identical(.result, 120) || identical(.result, 120L)
  if (!is_expected) {
    fail("Not quite. Try looking at the hints for the previous exercise.")
  }
  pass("The median number of minutes spent on email per week is 120 mins.")
})
Based on the shape of the distribution of the email
variable, which of the mean and median is a better measure of the typical amount of time Americans spend on email weekly?
# Multiple-choice question; each answer carries its own feedback message.
question(
  text = "Is the mean or the median more suitable?",
  answer(
    "Mean",
    message = "Since the distribution is skewed, we need a measure that is resistant to outliers - i.e. the median."
  ),
  answer(
    "Median",
    correct = TRUE,
    message = "Fab! The distribution is skewed and so we use the median as it is resistant to outliers."
  )
)
Create another new variable, snap_insta
that is coded as “Yes” if the respondent reported using either of Snapchat (snapchat
) or Instagram (instagrm
), and “No” if not.
If the recorded value was NA
for both of these questions, the value in your new variable should also be NA
.
# Template: replace the blank (___) with code.
gss16 <- gss16 |>
  ___

# Next step: the new variable is created inside mutate().
gss16 <- gss16 |>
  mutate(___)

# Next step: if_else() supplies the value of snap_insta.
gss16 <- gss16 |>
  mutate(snap_insta = if_else(___))

# Next step: the first blank joins the two conditions; the other two are the
# values for a true and a false condition.
gss16 <- gss16 |>
  mutate(snap_insta = if_else(snapchat == "Yes" ___ instagrm == "Yes", ___, ___))

# Complete code: "Yes" if either variable is "Yes", otherwise "No".
gss16 <- gss16 |>
  mutate(snap_insta = if_else(snapchat == "Yes" | instagrm == "Yes", "Yes", "No"))
# Spot-check individual rows of the new snap_insta column in the result.
grade_result_strict(
  pass_if(~ .result$snap_insta[2] == "No", "Good going creating that new variable."),
  pass_if(~ .result$snap_insta[19] == "Yes", "Good going creating that new variable."),
  pass_if(~ .result$snap_insta[5] == "Yes", "Good going creating that new variable."),
  pass_if(~ .result$snap_insta[6] == "Yes", "Good going creating that new variable."),
  # Row 1 is expected to be NA.
  pass_if(~ is.na(.result$snap_insta[1]))
)
Now, count how many respondents fall into each category of possible values for the snap_insta
variable (which were "Yes", "No", and NA
).
# Template: replace the blank (___) with the variable to count.
gss16 |>
  count(___)
# Grade the count table via the value in its first row, second column.
grade_this({
  first_count <- as.numeric(.result[1, 2])
  if (!identical(first_count, 858)) {
    fail("Not quite. Try taking a look at the hint.")
  }
  pass("You have counted the responses correctly.")
})
What are the possible responses to the question
Last week were you working full time, part time, going to school, keeping house, or what?
and how many respondents chose each of these answers?
Note that this information is stored in the wrkstat
variable.
# Template: replace the blank (___) with the variable to count.
gss16 |>
  count(___)
# Grade the count table via the value in its first row, second column.
grade_this({
  first_count <- as.numeric(.result[1, 2])
  if (identical(first_count, 284)) {
    pass("You have counted the responses correctly.")
  }
  # Both of these first-row values get the same "no need to sort" feedback.
  if (identical(first_count, 1321) || identical(first_count, 3)) {
    fail("Good, but there is no need to sort the data.")
  }
  fail("Not quite. Try taking a look at the hint.")
})
Fit a model predicting email (number of minutes per week spent on email) from educ
(number of years of education), wrkstat
, and snap_insta
.
Having created the model, we'll use the tidy()
function from the tidymodels
package to view the attributes of the model in a more, well, tidy way.
# Template: replace each blank (___) with the engine, the formula and the data.
email_fit <- linear_reg() |>
  set_engine("___") |>
  fit(___ ~ ___, data = ___)
tidy(email_fit)

# Next step: the engine is "lm".
email_fit <- linear_reg() |>
  set_engine("lm") |>
  fit(___ ~ ___, data = ___)
tidy(email_fit)

# Next step: email is the outcome, on the left of the formula.
email_fit <- linear_reg() |>
  set_engine("lm") |>
  fit(email ~ ___, data = ___)
tidy(email_fit)

# Next step: the three predictors are joined with +.
email_fit <- linear_reg() |>
  set_engine("lm") |>
  fit(email ~ educ + wrkstat + snap_insta, data = ___)
tidy(email_fit)

# Complete code: fit on gss16, then show the tidy() summary of the fit.
email_fit <- linear_reg() |>
  set_engine("lm") |>
  fit(email ~ educ + wrkstat + snap_insta, data = gss16)
tidy(email_fit)
# Grade the tidy() output using two cells of its first row.
# NOTE(review): columns 2 and 5 are presumably `estimate` and `p.value`;
# confirm against the tidy() output.
grade_this({
  floored_cell <- as.numeric(floor(.result[1, 2]))
  rounded_cell <- as.numeric(round(.result[1, 5], digits = 3))
  if (identical(floored_cell, -230) && identical(rounded_cell, 0.126)) {
    pass("You have created the desired model correctly.")
  }
  fail("Not quite. Try taking a look at the hints for constructing the model.")
})
# Multiple-choice question; retries are allowed and answers are shuffled.
question(
  text = "Which is the correct interpretation for the `estimate` of the `educ` parameter?",
  answer(
    "For each additional year spent in education, the weekly time spent on email is expected to be longer, on average, by 29.6 minutes.",
    correct = TRUE
  ),
  answer(
    "For each additional year spent in education, the weekly time spent on email is longer for all respondents by 29.6 minutes."
  ),
  answer(
    "An additional year spent in education causes you to spend 29.6 minutes more per week on email."
  ),
  answer(
    "For each additional 29.6 minutes spent on email, the time spent in education is expected to be greater, on average, by one year."
  ),
  allow_retry = TRUE,
  random_answer_order = TRUE
)
To test the suitability of a linear model, we create a residual plot showing the residuals against fitted values for the model and data.
To help with this, we use the augment()
function, also from the tidymodels
package.
First, create the augmented dataset:
# Template: replace the blank (___) with the call that builds the augmented data.
email_aug <- ___

# Complete code: augment() is applied to the underlying model object stored in
# email_fit$fit.
email_aug <- augment(email_fit$fit)
# Code-based check; the message below is shown on a correct submission.
grade_this_code(
  correct = "You've successfully created the augmented data set - now on to the residual plot!"
)
Now, use the email_aug
tibble as the dataset for creating the desired residual plot.
# Template: replace each blank (___) with code.
ggplot(___) +
  ___

# Next step: email_aug is the data.
ggplot(data = email_aug, ___) +
  ___

# Next step: aes() maps variables to the x and y axes.
ggplot(data = email_aug, aes(x = ___, y = ___)) +
  ___

# Next step: fitted values on x, residuals on y.
ggplot(data = email_aug, aes(x = .fitted, y = .resid)) +
  ___

# Complete code: a scatter plot of residuals against fitted values.
ggplot(data = email_aug, aes(x = .fitted, y = .resid)) +
  geom_point()
# Code-based check; the message below is shown on a correct submission.
grade_this_code(
  correct = "Good job creating the residual plot."
)
The 2016 GSS also asked respondents whether they think of themselves as liberal or conservative (polviews
) and whether they think science research is necessary and should be supported by the federal government (advfront
).
The question on science research is worded as follows:
Even if it brings no immediate benefits, scientific research that advances the frontiers of knowledge is necessary and should be supported by the federal government.
And possible responses to this question are Strongly agree, Agree, Disagree, Strongly disagree, Dont know, No answer, Not applicable.
The question on political views is worded as follows:
We hear a lot of talk these days about liberals and conservatives. I'm going to show you a seven-point scale on which the political views that people might hold are arranged from extremely liberal--point 1--to extremely conservative--point 7. Where would you place yourself on this scale?
And possible responses to this question are Extremely liberal, Liberal, Slightly liberal, Moderate, Slghtly conservative, Conservative, Extrmly conservative.
Responses that were originally Don't know, No answer and Not applicable are already mapped to NA
s upon data import.
Note that the levels of this variable are spelled inconsistently: "Extremely liberal"
vs. "Extrmly conservative"
.
Since this is the spelling that shows up in the data, you need to make sure this is how you spell the levels in your code.
In a new variable - call it advfront2
- recode advfront
such that Strongly agree and Agree are mapped to "Yes"
, and Disagree and Strongly disagree are mapped to "No"
.
The remaining levels can be left as is.
# Template: replace the blank (___) with code.
gss16 <- gss16 |>
  mutate(___)

# Next step: case_when() supplies the value of advfront2.
gss16 <- gss16 |>
  mutate(
    advfront2 = case_when(___)
  )

# Next step: the first case maps a set of advfront values to "Yes".
gss16 <- gss16 |>
  mutate(
    advfront2 = case_when(
      advfront %in% c(___) ~ "Yes",
      ___
    )
  )

# Next step: the two agreement levels map to "Yes".
gss16 <- gss16 |>
  mutate(
    advfront2 = case_when(
      advfront %in% c("Strongly agree", "Agree") ~ "Yes",
      ___
    )
  )

# Complete code: the final case (TRUE ~ advfront) keeps every other value as is.
gss16 <- gss16 |>
  mutate(
    advfront2 = case_when(
      advfront %in% c("Strongly agree", "Agree") ~ "Yes",
      advfront %in% c("Disagree", "Strongly disagree") ~ "No",
      TRUE ~ advfront
    )
  )
# Spot-check individual rows of the new advfront2 column in the result.
grade_result_strict(
  pass_if(~ .result$advfront2[1] == "Yes", "Good going creating that new variable."),
  pass_if(~ .result$advfront2[9] == "Yes"),
  pass_if(~ .result$advfront2[2] == "No"),
  pass_if(~ .result$advfront2[98] == "No"),
  # A level that is not recoded must be left unchanged.
  pass_if(~ .result$advfront2[12] == "Dont know"),
  # Row 3 is expected to be NA.
  pass_if(~ is.na(.result$advfront2[3]))
)
In a new variable, recode polviews
such that Extremely liberal, Liberal, and Slightly liberal, are mapped to "Liberal"
, and Slghtly conservative, Conservative, and Extrmly conservative are mapped to "Conservative"
.
The remaining levels can be left as is.
Arrange the levels so they are in the order "Conservative", "Moderate", "Liberal".
Don’t overwrite the existing polviews, instead, let's call the new variable polviews2
.
# Template: replace the blank (___) with code.
gss16 <- gss16 |>
  mutate(___)

# Next step: polviews2 is set twice, first by case_when(), then by a second
# expression.
gss16 <- gss16 |>
  mutate(
    polviews2 = case_when(___),
    polviews2 = ___
  )

# Next step: the first case maps a set of polviews values to "Liberal".
gss16 <- gss16 |>
  mutate(
    polviews2 = case_when(
      polviews %in% c(___) ~ "Liberal",
      ___
    ),
    polviews2 = ___
  )

# Next step: the three liberal levels, spelled as in the data.
gss16 <- gss16 |>
  mutate(
    polviews2 = case_when(
      polviews %in% c("Extremely liberal", "Liberal", "Slightly liberal") ~ "Liberal",
      ___
    ),
    polviews2 = ___
  )

# Complete code: other values are kept as is (TRUE ~ polviews), then
# fct_relevel() orders the levels Conservative, Moderate, Liberal.
gss16 <- gss16 |>
  mutate(
    polviews2 = case_when(
      polviews %in% c("Extremely liberal", "Liberal", "Slightly liberal") ~ "Liberal",
      polviews %in% c("Extrmly conservative", "Conservative", "Slghtly conservative") ~ "Conservative",
      TRUE ~ polviews
    ),
    polviews2 = fct_relevel(polviews2, "Conservative", "Moderate", "Liberal")
  )
# Spot-check individual rows of the new polviews2 column, then its type and
# the order of its levels.
grade_result_strict(
  pass_if(~ .result$polviews2[1] == "Moderate", "Good going creating that new variable."),
  pass_if(~ .result$polviews2[2] == "Liberal", "Good going creating that new variable."),
  pass_if(~ .result$polviews2[3] == "Conservative", "Good going creating that new variable."),
  pass_if(~ .result$polviews2[5] == "Liberal", "Good going creating that new variable."),
  pass_if(~ .result$polviews2[46] == "Liberal", "Good going creating that new variable."),
  pass_if(~ .result$polviews2[8] == "Conservative", "Good going creating that new variable."),
  pass_if(~ .result$polviews2[24] == "Conservative", "Good going creating that new variable."),
  # Row 9 is expected to be NA.
  pass_if(~ is.na(.result$polviews2[9])),
  # is.factor() always yields a single TRUE/FALSE, whereas comparing class()
  # with != returns one value per class (e.g. two for an ordered factor).
  fail_if(~ !is.factor(.result$polviews2), "Be sure to make polviews2 a factor with the specified levels."),
  pass_if(~ levels(.result$polviews2)[1] == "Conservative"),
  pass_if(~ levels(.result$polviews2)[2] == "Moderate"),
  pass_if(~ levels(.result$polviews2)[3] == "Liberal")
)
For the final exercise in this tutorial, create a visualization that displays the relationship between these two new variables.
You'll need to remove all NA
values from both variables, and the particular visualisation we're going to create is a filled bar plot.
# Template: replace each blank (___) with code.
gss16 |>
  filter(___) |>
  ggplot(mapping = aes(___)) +
  geom___(___)

# Next step: !is.na() keeps the rows where a variable is not missing.
gss16 |>
  filter(!is.na(___), !is.na(___)) |>
  ggplot(mapping = aes(___)) +
  geom___(___)

# Next step: filter on both new variables; the plot maps x and fill.
gss16 |>
  filter(!is.na(polviews2), !is.na(advfront2)) |>
  ggplot(mapping = aes(x = ___, fill = ___)) +
  geom___(___)

# Next step: polviews2 on the x axis, advfront2 as the fill.
gss16 |>
  filter(!is.na(polviews2), !is.na(advfront2)) |>
  ggplot(aes(x = polviews2, fill = advfront2)) +
  geom___(___)

# Complete code: geom_bar() with position = "fill".
gss16 |>
  filter(!is.na(polviews2), !is.na(advfront2)) |>
  ggplot(aes(x = polviews2, fill = advfront2)) +
  geom_bar(position = "fill")
# Code-based check; the message below is shown on a correct submission.
# (Spelling of "successfully" corrected in the user-facing message.)
grade_this_code(
  "You've successfully created the required visualisation."
)
In the following code block, copy your code from above for the basic plot, and see what you can do to make it look a little nicer. Some suggestions could be adding labels and titles, changing colours, etc.
Great work! We hope you've enjoyed this chance to practice your data handling and modelling skills some more.
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.