# In francojc/tadr: Text as Data Resources

library(tidyverse)

# Curate data ----

# Rate My Professor sample dataset

# Read the raw Rate My Professor sample and keep only the ratings whose
# `comments` field differs from the literal placeholder text "No Comments".
rmp <-
  "data/original/rate_my_professor/RateMyProfessor_Sample data.csv" %>%
  read_csv(file = .) %>%
  filter(comments != "No Comments")

# Exploratory labelling, step 1: standardize `student_star` within each
# `num_student` group so ratings are expressed relative to that group.
rmp_z_scores <-
  rmp %>%
  group_by(num_student) %>%
  mutate(z_score = scale(student_star)) %>%
  ungroup()

# Step 2: bin the standardized scores into two intervals ("low"/"high"),
# number the documents, and drop the intermediate z-score column.
rmp_z_scores <-
  rmp_z_scores %>%
  mutate(
    labels = cut(z_score, breaks = 2, labels = c("low", "high")),
    doc_id = row_number()
  ) %>%
  select(-z_score)

# Inspect the structure of the result
glimpse(rmp_z_scores)

# Print only the columns relevant to the labelling
select(rmp_z_scores, doc_id, num_student, student_star, labels, comments)

# Build the curated dataset from the filtered ratings
rmp_curated <-
  rmp %>%
  mutate(
    # sequential document id
    doc_id = row_number(),
    # binary rating: "high" when above 3.5 stars, "low" when 3.5 or below
    course_rating = if_else(student_star > 3.5, "high", "low"),
    # recode the 0/1 indicator as a factor with levels "FALSE" and "TRUE"
    online = factor(
      IsCourseOnline,
      levels = c(0, 1),
      labels = c("FALSE", "TRUE")
    )
  ) %>%
  select(
    doc_id,
    student_id = num_student, # rename while selecting
    student_star,
    course_rating,
    online,
    comments
  )

# Inspect the structure of the curated dataset
glimpse(rmp_curated)

# Make sure the output directory exists, then write the curated data to disk
fs::dir_create(path = "data/derived/rate_my_professor_sample/")
write_csv(
  x = rmp_curated,
  file = "data/derived/rate_my_professor_sample/rmp_curated.csv"
)

data_dic_starter <- function(data, file_path, force = FALSE) {
  # Function:
  # Creates a .csv data dictionary template for a curated dataset: one row
  # per variable in `data`, with blank `name` and `description` columns
  # left to be filled in.
  #
  # Arguments:
  # data      - object whose names() become the `variable_name` column
  # file_path - path of the .csv file to create
  # force     - if TRUE, overwrite an existing file at `file_path`;
  #             the default (FALSE) leaves an existing file untouched
  #
  # Returns (invisibly):
  # the template tibble when the file is written, NULL when writing is skipped

  # The blank columns are meant to be completed after creation, so re-running
  # the script must not silently replace a dictionary that already exists.
  if (!isTRUE(force) && file.exists(file_path)) {
    message(
      "Data dictionary already exists at '", file_path,
      "'; set `force = TRUE` to overwrite it."
    )
    return(invisible(NULL))
  }

  data_dictionary <-
    tibble(
      variable_name = names(data), # column with existing variable names
      name = "", # column for human-readable names
      description = "" # column for prose description
    )

  write_csv(data_dictionary, file = file_path) # write to disk

  invisible(data_dictionary)
}

# Write a blank data dictionary template next to the curated dataset
data_dic_starter(
  data = rmp_curated,
  file_path = "data/derived/rate_my_professor_sample/rmp_curated_data_dictionary.csv"
)