crsra: Tidying and Analyzing 'Coursera' Research Export Data

Documented in crsra_import_course crsra_table_names

#' Imports all the .csv files into one list consisting of all
#' the tables within the course.
#'
#' @param workdir A character string vector indicating the
#' directory where the unzipped course is stored.
#' @param add_course_name Should a column of the course name
#' be added to all the \code{data.frame}s
#' @param change_pid_column Should the \code{partner_user_id}
#' column be changed to simply say \code{"partner_user_id"}?
#' @param check_problems Should problems with reading in the
#' data be checked?
#' @param include vector of tables to import, they are the lowercase
#' names of the files without any `.csv`.  See \code{\link{crsra_table_names}}.
#'
#' @examples
#' zip_file = system.file("extdata", "fake_course_7051862327916.zip",
#' package = "crsra")
#' bn = basename(zip_file)
#' bn = sub("[.]zip$", "", bn)
#' res = unzip(zip_file, exdir = tempdir(), overwrite = TRUE)
#' workdir = file.path(tempdir(), bn)
#' course_tables = crsra_import_course(workdir,
#' check_problems = FALSE)
#' @export
#' @importFrom purrr map_lgl
#' @importFrom dplyr select_
#' @importFrom readr problems read_csv cols
crsra_import_course <- function(
    workdir = ".",
    add_course_name = FALSE,
    change_pid_column = FALSE,
    check_problems = TRUE,
    include = NULL) {

    dircheck <- file.path(workdir, "course_branch_grades.csv")
    if (!file.exists(dircheck)) {
        dircheck <- file.path(workdir, "course_branch_grades.csv.gz")
    }
    if (!file.exists(dircheck)) {
        msg = paste0("Did not find course_branch_grades.csv. ",
                     "Please make sure you have set your ",
                     "working directory to where the Coursera ",
                     "data dump is located.")
        stop(msg)
    }

    ########################################################################
    files = list.files(
        pattern = ".(csv|CSV)(|.gz|.GZ)$", path = workdir,
        full.names = TRUE)
    stubs <- sub(pattern = "[.]csv(|[.]gz|[.]GZ)$", replacement = "",
                 basename(files))
    if (!is.null(include)) {
        include = tolower(include)
        req_include = c("users", "courses")
        sdiff = setdiff(req_include, include)
        if (length(sdiff) > 0) {
            message(
                paste0("Adding in ",
                paste(sdiff, collapse = ", "),
                " as they are required")
            )
            include = c(include, sdiff)
        }
        keep = tolower(stubs) %in% c(include,
                                     sub("[.]csv(|[.]gz)$", "", include))
        if (!any(keep)) {
            stop("include vector does not map to any files in workdir")
        }
        files = files[keep]
        stubs = stubs[keep]
    }
    suppressWarnings({
        suppressMessages({
            dfs = purrr::map(
                files,
                readr::read_csv,
                progress = FALSE,
                col_types = readr::cols())
        })
    })
    names(dfs) = stubs
    if (check_problems) {
        any_probs = purrr::map(dfs, readr::problems)
        any_probs = purrr::map_lgl(any_probs, function(x) {
            nrow(x) > 0
        })

        if (any(any_probs)) {
            suppressWarnings({
                suppressMessages({
                    dfs[ any_probs] = lapply(
                        files[any_probs],
                        readr::read_csv, guess_max = Inf,
                        progress = FALSE, col_types = readr::cols())
                })
            })
        }
        any_probs = purrr::map(dfs, problems)
        any_probs = purrr::map_lgl(any_probs, function(x) {
            nrow(x) > 0
        })
        if (any(any_probs)) {
            ind = stubs[which(any_probs)]
            warning(paste0("Data set ", paste(ind, collapse = ", "),
                           " may still have some data errors"))
        }
    }

    if ("peer_review_part_free_responses" %in% names(dfs)) {
        dfs$peer_review_part_free_responses =
            dfs$peer_review_part_free_responses %>%
            dplyr::select_("peer_assignment_id",
                           "peer_assignment_review_schema_part_id",
                           "peer_review_id",
                           "peer_review_part_free_response_text")
    }
    course_name = dfs$courses$course_name
    if (add_course_name) {
        dfs = lapply(dfs, function(x) {
            if (nrow(x) == 0) {
                return(x)
            }
            x$course_name = course_name
            x
        })
    }
    cn = colnames(dfs$users)
    ind = grep("user_id", cn)
    if (length(ind) > 1) {
        warning("More than 1 column in users has a user_id name!")
        ind = ind[1]
    }
    partner_user_id = cn[ind]
    if (change_pid_column) {
        dfs = lapply(dfs, function(x) {
            cn = colnames(x)
            if (partner_user_id %in% cn) {
                cn[cn %in% partner_user_id] = "partner_user_id"
            }
            colnames(x) = cn
            x
        })
    }

    attr(dfs, "course_name") = course_name
    attr(dfs, "partner_user_id") = partner_user_id
    class(dfs) = "coursera_course_import"

    return(dfs)
}

#' @rdname crsra_import_course
#' @export
crsra_table_names = function(workdir = ".") {
    files = list.files(
        pattern = ".(csv|CSV)(|.gz|.GZ)$", path = workdir,
        full.names = TRUE)
    stubs <- sub(pattern = "[.]csv(|[.]gz|[.]GZ)$", replacement = "",
                 basename(files))
    stubs = tolower(stubs)
    stubs
}