# R/fetch_data.R

# The magrittr pipe is re-exported from this file because fetch_data is usually
# the first function that is called.
#' @importFrom magrittr %>%
#' @export
magrittr::`%>%`

#' Fetch data from the original source
#'
#' Builds the analysis dataset from `PROMISE::PROMISE_data`: keeps visits
#' 1, 3, and 6, drops rows with a non-missing `Canoe` value, keeps the
#' variables relevant to the analysis, derives baseline and log-transformed
#' variables, adds each baseline `tg` column as a percentage of `TotalTG`,
#' recodes visit number and ethnicity, and removes rows with missing
#' `TotalTG`. The result is saved as `project_data`, and its column names
#' are saved as the internal object `vars`.
#'
#' @return Called for its side effects: saves the wrangled data into the
#'   data/ folder and prints the dimensions of the dataset before and after
#'   wrangling. The value is whatever the final `devtools::use_data()` call
#'   returns.
#' @export
#'
#' @examples
#' \dontrun{
#' fetch_data()
#' }
#'
fetch_data <- function() {
    # Start from the master dataset, restricted to visits 1, 3, and 6 and to
    # rows whose Canoe value is missing (i.e. kick out the Canoers).
    master <- dplyr::tbl_df(
        dplyr::filter(
            PROMISE::PROMISE_data,
            VN %in% c(1, 3, 6),
            is.na(Canoe)
        )
    )

    print(paste0(
        'Original dataset rows are ', nrow(master),
        ' and columns are ', ncol(master)
    ))

    # Keep only the variables used in the analysis.
    analysis_vars <- master %>%
        dplyr::select(
            SID, VN, BMI, Waist, HOMA, ISI, IGIIR, ISSI2, TAG, LDL, HDL, Chol,
            ALT, CRP, Age, Sex, Ethnicity, dplyr::matches('^Total'), MET,
            dplyr::matches('^(tg|ne|pl|ce)\\d+'), Glucose0, Glucose120
        )

    # Derived variables. The order is significant: lBaseTAG is computed from
    # BaseTAG and linvHOMA from invHOMA, both created earlier in this call.
    # BaseTotalNE/BaseTotalTG are plain copies taken before the fill below.
    with_derived <- analysis_vars %>%
        dplyr::mutate(
            BaseTotalNE = TotalNE,
            BaseTotalTG = TotalTG,
            BaseTAG = ifelse(VN == 1, TAG, NA),
            lBaseTAG = log(BaseTAG),
            BaseAge = ifelse(VN == 1, Age, NA),
            invHOMA = 1 / HOMA,
            linvHOMA = log(invHOMA),
            lISI = log(ISI),
            lIGIIR = log(IGIIR),
            lISSI2 = log(ISSI2),
            lALT = log(ALT),
            lTAG = log(TAG)
        )

    # Within each SID, ordered by visit, fill missing values of the listed
    # columns from the preceding rows (tidyr::fill's default direction).
    filled <- with_derived %>%
        dplyr::arrange(SID, VN) %>%
        dplyr::group_by(SID) %>%
        tidyr::fill(
            TotalTG, dplyr::matches('^tg\\d+'), BaseTAG, lBaseTAG, BaseAge,
            TotalNE
        ) %>%
        dplyr::ungroup()

    # Visit-1 tg columns expressed as a percentage of TotalTG, one row set
    # keyed by SID with every tg column prefixed by 'pct_'.
    baseline_pct <- filled %>%
        dplyr::filter(VN == 1) %>%
        dplyr::mutate_each(
            dplyr::funs((. / TotalTG) * 100),
            dplyr::matches('^tg\\d+')
        ) %>%
        dplyr::select(SID, dplyr::matches('^tg\\d+'))
    baseline_pct <- stats::setNames(
        baseline_pct,
        paste0('pct_', names(baseline_pct))
    )
    # The prefix was applied to every column, so restore the join key's name.
    baseline_pct <- dplyr::rename(baseline_pct, SID = pct_SID)

    joined <- dplyr::full_join(filled, baseline_pct, by = 'SID')

    # Lookup vectors for the ethnicity recodes (old value -> new value).
    ethnicity_from <- c(
        'African',
        'European',
        'First Nations',
        'Latino/a',
        'Other',
        'South Asian'
    )
    ethnicity_to <- c(
        'Other', 'European', 'Other', 'Latino/a',
        'Other', 'South Asian'
    )
    bi_ethnicity_from <- c('Other', 'European', 'Latino/a', 'South Asian')
    bi_ethnicity_to <- c(
        'Non-European', 'European', 'Non-European', 'Non-European'
    )

    # Recode visit numbers 1/3/6 to 0/1/2 (f.VN is built from the recoded VN),
    # collapse Ethnicity, derive the two-level BiEthnicity from the collapsed
    # Ethnicity, then sort and drop rows without TotalTG.
    project_data <- joined %>%
        dplyr::mutate(
            VN = plyr::mapvalues(VN, c(1, 3, 6), c(0, 1, 2)),
            f.VN = factor(VN, c(0, 1, 2), c('yr0', 'yr3', 'yr6')),
            Ethnicity = plyr::mapvalues(
                Ethnicity, ethnicity_from, ethnicity_to
            ),
            BiEthnicity = plyr::mapvalues(
                Ethnicity, bi_ethnicity_from, bi_ethnicity_to
            )
        ) %>%
        dplyr::arrange(SID, VN) %>%
        dplyr::filter(!is.na(TotalTG))

    print(paste0(
        'Working dataset rows are ', nrow(project_data),
        ' and columns are ', ncol(project_data)
    ))

    # Save the final dataset to the data/ folder. The object name
    # `project_data` is passed as a bare symbol, so it must stay as is.
    devtools::use_data(project_data, overwrite = TRUE)
    # Save the variable names as an internal dataset.
    vars <- names(project_data)
    devtools::use_data(vars, internal = TRUE, overwrite = TRUE)
}
# lwjohnst86/seminar2016 documentation built on May 21, 2019, 9:15 a.m.