Learning objectives:
Learn the key steps in data preparation, processing, and wrangling using the tidyverse (a set of packages).
r knitr::include_url("https://docs.google.com/presentation/d/1sJgXd8weNnuCDTYFdb3iWErls0cVIefXbCegVBI3zTw/edit")
Walkthrough 1 Script
# Useful shortcuts ----
# Run current line/selection: cmd + return (Mac), ctrl + Enter (Windows)
# Assignment sign (<-): Option + - (Mac), Alt + - (Windows)
# Pipe sign (%>%): cmd + shift + M (Mac), ctrl + shift + M (Windows)

# Install packages ----
# Only packages that are missing get installed, so re-running the script does
# not reinstall everything each time.
# summarytools was not in the book but is useful to get descriptives;
# remotes is needed for the GitHub install of dataedu below.
cran_packages <- c(
  "tidyverse", "apaTables", "sjPlot", "summarytools", "ggpubr", "remotes"
)
missing_packages <- setdiff(cran_packages, rownames(installed.packages()))
if (length(missing_packages) > 0) {
  install.packages(missing_packages)
}

# dataedu wasn't available for R ver 3.6.3 so I installed dev version of dataedu
if (!requireNamespace("dataedu", quietly = TRUE)) {
  remotes::install_github("data-edu/dataedu")
}

# Load packages ----
library(tidyverse)
library(apaTables)
library(sjPlot)
library(readxl)
library(dataedu)
library(summarytools)
library(ggpubr)

############################
# import data from dataedu #
############################
# Pre-survey for the F15 and S16 semesters
pre_survey <- dataedu::pre_survey

# Gradebook and log-trace data for F15 and S16 semesters
course_data <- dataedu::course_data

# Log-trace data for F15 and S16 semesters - this is for time spent
course_minutes <- dataedu::course_minutes

#############
# view data #
#############
pre_survey
head(pre_survey) # first six rows
View(pre_survey) # a full view in a separate tab
glimpse(pre_survey) # list of variables, values for the first couple of cases

# get a quick look at each variable in df
# (namespaced because tibble also exports a view() function)
summarytools::view(dfSummary(pre_survey))

## take a look at course_data, course_minutes; what do you notice?
#########################
# 1. Process pre_survey #
#########################
pre_survey <- pre_survey %>%
  # Rename the questions to something easier to work with, because R is case
  # sensitive and working with variable names in mixed case is prone to error
  rename(
    q1 = Q1MaincellgroupRow1,
    q2 = Q1MaincellgroupRow2,
    q3 = Q1MaincellgroupRow3,
    q4 = Q1MaincellgroupRow4,
    q5 = Q1MaincellgroupRow5,
    q6 = Q1MaincellgroupRow6,
    q7 = Q1MaincellgroupRow7,
    q8 = Q1MaincellgroupRow8,
    q9 = Q1MaincellgroupRow9,
    q10 = Q1MaincellgroupRow10
  ) %>%
  # Convert all question responses to numeric.
  # mutate(across(...)) is the current replacement for the superseded
  # mutate_at(vars(...), list(~ ...)) form; the columns are overwritten in place.
  mutate(across(q1:q10, as.numeric))

# q1-q10 are already numeric, so this doesn't seem necessary
# you could add a suffix to the var names to indicate the three dimensions
# e.g. q1.int, q2.uv, q3.pc where int = interest, uv = utility value,
# pc = perceived competence

##############################################
# 1a. practice mutate = making a new variable #
##############################################
# create a df (dataframe) in tibble format with two columns/vars: male & female
df <- tibble(
  male = 5,
  female = 5
)

# Use mutate to create a new column called "total_students"
# populate that column with the sum of the "male" and "female" variables
# (printed only; df itself is unchanged by this line)
df %>%
  mutate(total_students = male + female)

# let's keep this new column in df
df <- df %>%
  mutate(total_students = male + female)

######################################################
# 1b.
# reverse_scale function with mutate & case_when
######################################################

# This part of the code is where we write the function.

#' Reverse-code a 5-point survey item
#'
#' @param question Vector of survey responses.
#' @return A numeric vector the same length as `question` with 1 <-> 5 and
#'   2 <-> 4 swapped and 3 kept as 3. Any other input (including NA) gives NA.
reverse_scale <- function(question) {
  # 3 maps onto itself but is still spelled out: every expected input gets its
  # own branch, and the closing TRUE branch turns anything unexpected into NA.
  case_when(
    question == 5 ~ 1,
    question == 4 ~ 2,
    question == 3 ~ 3,
    question == 2 ~ 4,
    question == 1 ~ 5,
    TRUE ~ NA_real_
  )
}

# let's see how it works
reverse_scale(pre_survey$q4)
pre_survey$q4 # compare with original

# And here's where we use that function to reverse the scales,
# with the pipe operator %>%.
# Questions 4 and 7 are the reverse-worded items; across() applies the function
# to both columns and keeps the original names, so they are overwritten.
pre_survey <- pre_survey %>%
  mutate(across(c(q4, q7), reverse_scale))

# Note: psych package has reverse.code() function so you don't have to write
# your own

#####################################################
# 1c.
# pivot_longer to make pre_survey into long form
#####################################################

# Reshape the survey from wide (one column per question) to long (one row per
# question per respondent) and tag every question with the motivation
# dimension it belongs to. The result is stored as measure_mean.
measure_mean <- pre_survey %>%
  pivot_longer(
    cols = q1:q10,
    names_to = "question", # new column holding the question label
    values_to = "response" # new column holding the response value
  ) %>%
  # "measure" denotes the 3 dimensions of motivation
  mutate(
    measure = case_when(
      question %in% c("q1", "q4", "q5", "q8", "q10") ~ "int",
      question %in% c("q2", "q6", "q9") ~ "uv",
      question %in% c("q3", "q7") ~ "pc",
      TRUE ~ NA_character_
    )
  )

###################################################
# 1d. Get mean scores for each motivation measure
# Across ~912 students who responded the pre-survey
# using group_by() and summarise()
###################################################
measure_mean <- measure_mean %>%
  # one group per motivation measure
  group_by(measure) %>%
  summarise(
    # mean response within the measure, ignoring missing responses
    mean_response = mean(response, na.rm = TRUE),
    # share of responses within the measure that are NA
    percent_NA = mean(is.na(response))
  )

measure_mean

##############################
# 2. Process the course data #
##############################
View(course_data)

# Split the course section ID into its components so that subject, semester,
# and section each get their own column
course_data <- separate(
  course_data,
  col = CourseSectionOrigID,
  into = c("subject", "semester", "section"),
  sep = "-",
  remove = FALSE # keep the original variable as well
)

#############################################
# 3.
# Join/merge course_data with pre_survey
#############################################

# Give the pre_survey id variables the names used for joining
# (rename takes new_var_name = old_var_name)
pre_survey <- rename(
  pre_survey,
  student_id = opdata_username,
  course_id = opdata_CourseID
)

pre_survey

#####################################################
# 3a. extract the 5 digits between _ _ in student_id #
#####################################################
# trying str_sub just with one string value
str_sub("_99888_1", start = 2)
str_sub("_99888_1", start = -3)
str_sub("_99888_1", start = 2, end = -3)

# Re-create "student_id" without the extraneous characters, then store it as
# numeric so that R no longer thinks it is text
pre_survey <- pre_survey %>%
  mutate(
    student_id = student_id %>%
      str_sub(start = 2, end = -3) %>%
      as.numeric()
  )

############################################################
# 3b. rename id vars in course_data and join with pre_survey
############################################################
course_data <- rename(
  course_data,
  student_id = Bb_UserPK,
  course_id = CourseSectionOrigID
)

# new df merges course_data with pre_survey; every course_data row is kept
dat <- course_data %>%
  left_join(pre_survey, by = c("student_id", "course_id"))

dat

############################################
# 4.
# Process course_minutes & join with dat
############################################
course_minutes <- course_minutes %>%
  rename(
    student_id = Bb_UserPK,
    course_id = CourseSectionOrigID
  )

course_minutes <- course_minutes %>%
  # Change the data type for student_id in course_minutes so we can match to
  # student_id in dat
  mutate(student_id = as.integer(student_id))

dat <- dat %>%
  left_join(course_minutes, by = c("student_id", "course_id"))

# dat has many gradebook_items per student per course
# we want just one row per student & course combo
# using distinct(); .keep_all = TRUE keeps the other columns of the first row
dat <- distinct(dat, course_id, student_id, .keep_all = TRUE)

# rename final grade var
dat <- rename(dat, final_grade = FinalGradeCEMS)

###########################################
# 5. Analysis
###########################################

#######################################################################
# 5a. Scatter plot to examine relationship between final grade & time spent
#######################################################################
# (namespaced because tibble also exports a view() function)
summarytools::view(dfSummary(dat))

# scatter plot to see relationship between time spent & final grade
p1 <- dat %>%
  # aes() tells ggplot2 what variables to map to what feature of a plot
  # Here we map variables to the x- and y-axis
  ggplot(aes(x = TimeSpent, y = final_grade)) +
  # Creates a point with x- and y-axis coordinates specified above
  geom_point(color = dataedu_colors("green")) +
  theme_dataedu() +
  labs(x = "Time Spent", y = "Final Grade")

# add a line of best fit
p1 + geom_smooth(method = "lm")

# with ggpubr, you can add correlation to the graph
# library() stops with an error if the package is missing, whereas require()
# only returns FALSE and lets the script fail later with a less clear message
library(ggpubr)

p2 <- ggscatter(
  dat,
  x = "TimeSpent",
  y = "final_grade",
  color = "springgreen4",
  add = "reg.line", # Add regression line
  add.params = list(color = "blue", fill = "lightgray"), # Customize reg. line
  conf.int = TRUE # Add confidence interval
)

# Add correlation coefficient
p2 + stat_cor(
  method = "pearson",
  label.x = 3900,
  label.y = 130,
  p.accuracy = 0.001,
  r.accuracy = 0.01
)

###################################################
# 5b. Linear regression with time spent as predictor
###################################################
m_linear <- lm(final_grade ~ TimeSpent, data = dat)

summary(m_linear)

# get publication ready table with tab_model function
library(sjPlot)
tab_model(m_linear, title = "Table 7.1") # you can copy and paste it into Word!

# or save it with apa.reg.table function
apa.reg.table(m_linear, filename = "regression-table-output.doc")

###################################################
# 5c. Correlations among the 3 motivation variables
###################################################
# pivot survey_responses to long form
survey_responses <- pre_survey %>%
  # Gather questions and responses
  pivot_longer(
    cols = q1:q10,
    names_to = "question",
    values_to = "response"
  ) %>%
  mutate(
    # Here's where we make the column of question categories
    measure = case_when(
      question %in% c("q1", "q4", "q5", "q8", "q10") ~ "int",
      question %in% c("q2", "q6", "q9") ~ "uv",
      question %in% c("q3", "q7") ~ "pc",
      TRUE ~ NA_character_
    )
  )

# create mean_response for each student for each measure
survey_responses <- survey_responses %>%
  group_by(student_id, measure) %>%
  # Here's where we compute the mean of the responses for each student &
  # measure combo
  summarize(
    # Mean response for each measure
    mean_response = mean(response, na.rm = TRUE)
  ) %>%
  # summarize() only peels off the last grouping variable, so the result would
  # otherwise stay grouped by student_id for every later step
  ungroup()

# Filter NA (missing) responses and pivot to wide form
survey_responses <- survey_responses %>%
  filter(!is.na(mean_response)) %>%
  pivot_wider(
    names_from = measure,
    values_from = mean_response
  )

survey_responses

# get correlation table
survey_responses %>%
  apa.cor.table(filename = "corr-table-output.doc")
# note the correlation table includes student_id.
# probably want to delete it in Word

#############################################################################
# 5d. Linear regression with hours spent (rather than minutes) as predictor
#############################################################################
# creating a new variable for the amount of time spent in hours
dat <- dat %>%
  mutate(TimeSpent_hours = TimeSpent / 60)

# the same linear model as above, but with the TimeSpent variable in hours
m_linear_1 <- lm(final_grade ~ TimeSpent_hours, data = dat)

# viewing the output of the linear model
tab_model(m_linear_1, title = "Table 7.2")

##################################################################
# 5e. Linear regression with standardized time spent as predictor
##################################################################
# this is to standardize the TimeSpent variable to have a mean of 0 and a
# standard deviation of 1
# this makes intercept more interpretable
# scale() returns a one-column matrix (with centering/scaling attributes);
# as.numeric() stores it as a plain numeric column instead of a matrix column
dat <- dat %>%
  mutate(TimeSpent_std = as.numeric(scale(TimeSpent)))

# the same linear model as above, but with the TimeSpent variable standardized
m_linear_2 <- lm(final_grade ~ TimeSpent_std, data = dat)

# viewing the output of the linear model
tab_model(m_linear_2, title = "Table 7.3")

#####################################################################
# 6.
# Multiple regression model with time spent & subject as predictors
#####################################################################
# A linear model with the subject added.
# Predictors such as TimeSpent_std and subject are simply joined with a plus
# sign on the right-hand side of the formula.
m_linear_3 <- lm(
  final_grade ~ TimeSpent_std + subject,
  data = dat
)

# note: subject is a categorical variable and AnPhA seems to be set as the
# reference category (levels are assigned in alphabetical order)
# NOTE(review): the original note glosses AnPhA as "animal physiology" --
# confirm the course name against the dataedu documentation
tab_model(m_linear_3, title = "Table 7.4")

# All four models side by side, reporting standard errors instead of CIs
tab_model(
  m_linear, m_linear_1, m_linear_2, m_linear_3,
  show.se = TRUE,
  show.ci = FALSE
)

#####################################################################
# 7. What other analyses can you think of?
#####################################################################
# Add total scores of pre-course motivation as a predictor?
# --> use mutate to create sum_motiv variable

# Does the effect of time spent vary by subjects/courses?
# --> add time x subject interaction term

# Maybe color dots in scatter plot by subject to see if there is a pattern
dat %>%
  ggplot(aes(x = TimeSpent, y = final_grade, color = subject)) +
  geom_point() +
  theme_dataedu() +
  labs(x = "Time Spent", y = "Final Grade")
r knitr::include_url("https://www.youtube.com/embed/lXWAfm0fh7Q")
Meeting chat log
00:04:35 Ryan Woodbury: https://rfordatascience.slack.com/files/UQ4DR12BY/F01QUFD8V5H/dsieur_ch7_slides 00:04:50 Ryan Woodbury: https://rfordatascience.slack.com/files/UQ4DR12BY/F01RJ4ENF4Y/desieur_ch7_scripts.r 00:04:55 Ryan Woodbury: Slides, then script 00:10:33 Ryan Woodbury: Is it like skimr? 00:14:57 Isabella Velásquez: super clear! love the color coding 00:16:26 Edgar Zamora: https://www.garrickadenbuie.com/project/tidyexplain/ I use these GIFs to help me visualize the different kind of joins. Get confused 00:17:31 Rob Lucas: Thanks for sharing that Edgar! The semi and anti joins were new to me. I think this will help me visualize them. 00:26:26 Mark LaVenia: you are doing great! Thanks! 00:29:29 Ryan Woodbury: The `mutate_*()` functions are being superseded by using the `across()` function within `mutate()`. 00:30:03 Isabella Velásquez: here's some documentation on the mutate_* functions. https://dplyr.tidyverse.org/reference/mutate_all.html but like Ryan said, they've been superseded. I am still learning across()! 00:30:05 Ryan Woodbury: Line 69 would be: mutate(across(q1:q10, as.numeric)) in the "new" format 00:30:37 Alyssa Ibarra: Thanks, Ryan! I didn't know it was changing 00:30:49 Ryan Woodbury: I was just getting used to the mutate_*() functions too! 00:37:57 Alyssa Ibarra: when would you use mutate versus transmute? 00:38:35 Ryan Woodbury: The issue with the psych::reverse.code() function that Yukie is talking about is that it is not tidyverse friendly, *but* is a function that is already made and does a great job. (I love the psych package, BTW.) 00:45:41 Mark LaVenia: I love that 01:00:27 Ryan Woodbury: Great advice on imagining the joined datasets. 01:01:19 Isabella Velásquez: I've got to hop off at 5. Thank you SO much Yukie! That was fantastic ! 01:01:39 Ryan Woodbury: Thank you! Great work. 01:02:58 Mark LaVenia: Great job Yukie! 01:03:00 Alyssa Ibarra: Thank you so much! It was so great! 01:03:09 Edgar Zamora: Great job! Thank you!
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.