R/data.R

Defines functions combine_nsu_in_dir find_market_file save_nsu_data label_variables extract_labels fix_misc_issues rename_variables correct_col_type check_cols ingest_dta_file get_matching_files

Documented in check_cols combine_nsu_in_dir correct_col_type extract_labels find_market_file fix_misc_issues get_matching_files ingest_dta_file label_variables rename_variables save_nsu_data

#' Get vector of file names matching a pattern
#' 
#' Returns a character vector of file names that match a regex pattern in the target directory
#' 
#' @param dir Character vector. File path to directory
#' @param pattern Character vector. Regular expression describing files to return
#' 
#' @return Character vector. File names with `.dta` extension
#' 
#' @importFrom fs dir_ls path_file
#' @importFrom stringr str_subset
get_matching_files <- function(
    dir, 
    pattern
) {

    pattern_mod <- dplyr::if_else(
        pattern == "releve",
        "^releve_",
        pattern
    )

    file_names <- fs::dir_ls(
            path = dir,
            recurse = FALSE,
            regexp = "\\.dta"
        ) %>%
        fs::path_file() %>%
        stringr::str_subset(pattern = pattern_mod)
  
    return(file_names)

}

#' Read Stata file into memory
#' 
#' First, ingest data file. 
#' Then, write it to an object in the global environment, where the object name is the file name minus the `.dta` extension.
#' These names follow the same pattern as the input files in order to facilitate merging.
#' 
#' @param dir Character vector. File path to directory
#' @param file_name Character vector. File name with `.dta` extension
#' 
#' @return Data frame
#' 
#' @importFrom stringr str_replace
#' @importFrom haven read_dta
ingest_dta_file <- function(
    dir,
    file_name
) {

    df_name <- stringr::str_replace(
            string = file_name,
            pattern = "\\.dta",
            replacement = ""
        )

    haven::read_dta(file = paste0(dir, file_name))

}

#' Check whether expected columns are present
#' 
#' @param df Data frame. Data frame to check.
#' @param df_name Character. Name of data frame as a character.
#' @param df_type Character. For consumption data, one of : "unitesFixes", "releve", "unitesautre1", "autre1releve", "unitesautre2", "autre2releve"
#' For production data, one of: "unitesFixes", "unitesAutre1", "unitesAutre2"
#' 
#' @importFrom dplyr case_when
#' @importFrom stringr str_detect
#' @importFrom assertthat assert_that
#' @importFrom glue glue glue_collapse
check_cols <- function(
    df,
    df_name,
    df_type
) {

# df_name <- deparse(substitute(df))
cols_found <- names(df)

    # unitesFixes
    if (df_type == "unitesFixes") {

        fixe_type <- case_when(
            "q109a" %in% names(df) ~ "consumption",
            "q110" %in% names(df) ~ "production",
            TRUE ~ "unknown"
        )

        if (fixe_type == "consumption") {

            cols_expected <- c(
                "interview__id", "interview__key", "unitesFixes__id", "produit_nom",
                "q104", "q105", "q105autre", "q108", "q109a", "q109"
            )

            cols_missing <- cols_expected[!cols_expected %in% cols_found]
            cols_missing_ms <- dplyr::case_when(
                !any(stringr::str_detect(cols_found, "q106__[0-9]+")) ~ "q106__*",
                !any(stringr::str_detect(cols_found, "q107__[0-9]+")) ~ "q107__*",
                !any(stringr::str_detect(cols_found, "q106__[0-9]+")) & !any(stringr::str_detect(cols_found, "q107__[0-9]+")) ~ c("q106__*", "q107__*"),
                TRUE ~ NA_character_
            )
            
        } else if (fixe_type == "production") {

            cols_expected <- c(
                "interview__key", "interview__id", "unitesFixes__id", "produit_nom", 
                "q103", "q104", "q105", "q105autre", "q107", "q108", "q109", "q110"
            )

            cols_missing <- cols_expected[!cols_expected %in% cols_found]
            cols_missing_ms <- dplyr::case_when(
                !any(stringr::str_detect(cols_found, "q106__[0-9]+")) ~ "q106__*",
                TRUE ~ NA_character_
            )

        } else if (fixe_type == "unknown") {

            stop("Impossible d'identifier le type de base unitesFixes")

        }

        cols_missing <- c(cols_missing, cols_missing_ms)
        cols_missing <- cols_missing[!is.na(cols_missing)]

        assertthat::assert_that(
            length(cols_missing) == 0,
            msg = glue::glue(
                "Expected column(s) missing in {df_name}",
                "Columns missing: {glue::glue_collapse(cols_missing, sep = ', ')}",
                .sep = "\n"
            )
        )

    }

    # releve
    if (df_type == "releve") {

        cols_expected <- c(
            "interview__id", "interview__key", "unitesFixes__id", "releve__id", "produit_nom", 
            "q107autre", "q110b", "q110a"
        )

        cols_missing <- cols_expected[!cols_expected %in% cols_found]

        assertthat::assert_that(
            length(cols_missing) == 0,
            msg = glue::glue(
                "Expected column(s) missing in {df_name}",
                "Columns missing: {glue::glue_collapse(cols_missing, sep = ', ')}",
                .sep = "\n"
            )
        )

    }

    # unitesautre1
    if (df_type == "unitesautre1") {

        cols_expected <- c(
            "interview__id", "interview__key", "unitesautre1__id", "produit_nom",
            "q113_autre1", "q114a_autre1", "q114_autre1"
            # sometimes q114a_autre1
        )

        cols_missing <- cols_expected[!cols_expected %in% cols_found]
        cols_missing_ms <- dplyr::case_when(
            !any(stringr::str_detect(cols_found, "q112_autre1__[0-9]+")) ~ "q112_autre1__*",
            TRUE ~ NA_character_
        )
        cols_missing <- c(cols_missing, cols_missing_ms)
        cols_missing <- cols_missing[!is.na(cols_missing)]

        assertthat::assert_that(
            length(cols_missing) == 0,
            msg = glue::glue(
                "Expected column(s) missing in {df_name}",
                "Columns missing: {glue::glue_collapse(cols_missing, sep = ', ')}",
                .sep = "\n"
            )
        )

    }

    # unitesAutre1
    if (df_type == "unitesAutre1") {

        cols_expected <- c(
            "interview__id", "interview__key", "unitesAutre1__id", "produit_nom",
            "q103_autre1", "q104_autre1", "q105_autre1", "q107_autre1", 
            "q108_autre1", "q109_autre1", "q110_autre1"
        )

        cols_missing <- cols_expected[!cols_expected %in% cols_found]
        cols_missing_ms <- dplyr::case_when(
            !any(stringr::str_detect(cols_found, "q106_autre1__[0-9]+")) ~ "q106_autre1__*",
            TRUE ~ NA_character_
        )
        cols_missing <- c(cols_missing, cols_missing_ms)
        cols_missing <- cols_missing[!is.na(cols_missing)]

        assertthat::assert_that(
            length(cols_missing) == 0,
            msg = glue::glue(
                "Expected column(s) missing in {df_name}",
                "Columns missing: {glue::glue_collapse(cols_missing, sep = ', ')}",
                .sep = "\n"
            )
        )

    }

    # autre1releve_
    if (df_type == "autre1releve") {

        cols_expected <- c(
            "interview__id", "interview__key", "unitesautre1__id", "autre1releve__id", "produit_nom",
            "q112autre_autre1", "q115b_autre1", "q115a_autre1"
        )

        cols_missing <- cols_expected[!cols_expected %in% cols_found]

        assertthat::assert_that(
            length(cols_missing) == 0,
            msg = glue::glue(
                "Expected column(s) missing in {df_name}",
                "Columns missing: {glue::glue_collapse(cols_missing, sep = ', ')}",
                .sep = "\n"
            )
        )

    }

    # unitesautre2
    if (df_type == "unitesautre2") {

        cols_expected <- c(
            "interview__id", "interview__key", "unitesautre2__id", "produit_nom",
            "q116", "q118_autre2", "q119_autre2"
        )

        cols_missing <- cols_expected[!cols_expected %in% cols_found]
        cols_missing_ms <- dplyr::case_when(
            !any(stringr::str_detect(cols_found, "q117_autre2__[0-9]+")) ~ "q117_autre2__*",
            TRUE ~ NA_character_
        )
        cols_missing <- c(cols_missing, cols_missing_ms)
        cols_missing <- cols_missing[!is.na(cols_missing)]

        assertthat::assert_that(
            length(cols_missing) == 0,
            msg = glue::glue(
                "Expected column(s) missing in {df_name}",
                "Columns missing: {glue::glue_collapse(cols_missing, sep = ', ')}",
                .sep = "\n"
            )
        )

    }

    # unitesAutre2
    if (df_type == "unitesAutre2") {

        cols_expected <- c(
            "interview__id", "interview__key", "unitesAutre2__id", "produit_nom",
            "q102", "q103_autre2", "q104_autre2", "q105_autre2", "q107_autre2", 
            "q108_autre2", "q109_autre2", "q110_autre2"
        )

        cols_missing <- cols_expected[!cols_expected %in% cols_found]
        cols_missing_ms <- dplyr::case_when(
            !any(stringr::str_detect(cols_found, "q106_autre2__[0-9]+")) ~ "q106_autre2__*",
            TRUE ~ NA_character_
        )
        cols_missing <- c(cols_missing, cols_missing_ms)
        cols_missing <- cols_missing[!is.na(cols_missing)]

        assertthat::assert_that(
            length(cols_missing) == 0,
            msg = glue::glue(
                "Expected column(s) missing in {df_name}",
                "Columns missing: {glue::glue_collapse(cols_missing, sep = ', ')}",
                .sep = "\n"
            )
        )

    }

    # autre2releve_
    if (df_type == "autre2releve") {

        cols_expected <- c(
            "interview__id", "interview__key", 
            "autre2releve__id", "unitesautre2__id", "produit_nom",
            "q117autre_autre2", "q120b_autre2", "q120a_autre2"
        )        

        cols_missing <- cols_expected[!cols_expected %in% cols_found]

        assertthat::assert_that(
            length(cols_missing) == 0,
            msg = glue::glue(
                "Expected column(s) missing in {df_name}",
                "Columns missing: {glue::glue_collapse(cols_missing, sep = ', ')}",
                .sep = "\n"
            )
        )

    }

}

#' Change column to expected type
#' 
#' First, check whether the column is of the expected type
#' Then, if not, change the column's type
#' 
#' @param df Data frame whose columns types to check and correct.
#' @param df_type Character. For consumption data, one of : "unitesFixes", "releve", "unitesautre1", "autre1releve", "unitesautre2", "autre2releve"
#' For production data, one of: "unitesFixes", "unitesAutre1", "unitesAutre2"
#' 
#' @return Data frame with column types corrected where applicable.
#' 
#' @importFrom dplyr `%>%` mutate across starts_with
#' @importFrom rlang .data
correct_col_type <- function(
    df, 
    df_type
) {

    df_fixed <- df %>%
        {
            # unitesFixes
            if (df_type == "unitesFixes") {

                fixe_type <- case_when(
                    "q109a" %in% names(df) ~ "consumption",
                    "q110" %in% names(df) ~ "production",
                    TRUE ~ "unknown"
                )

                if (fixe_type == "consumption") {

                    dplyr::mutate(.,
                        # character cols
                        dplyr::across(
                            .cols = c(
                                .data$interview__id, .data$interview__key, 
                                .data$produit_nom, .data$q105autre, .data$q109
                            ),
                            .fns = as.character
                        ),
                        # double cols
                        dplyr::across(
                            .cols = c(
                                .data$unitesFixes__id, .data$q104, .data$q105, 
                                .data$q108, .data$q109a, 
                                starts_with("q106__"), starts_with("q107__")
                            ),
                            .fns = as.double
                        )
                    )

                } else if (fixe_type == "production") {

                    dplyr::mutate(.,
                        # character cols
                        dplyr::across(
                            .cols = c(
                                .data$interview__id, .data$interview__key, 
                                .data$produit_nom, .data$q105autre
                            ),
                            .fns = as.character
                        ),
                        # double cols
                        dplyr::across(
                            .cols = c(
                                .data$unitesFixes__id, .data$q103, .data$q104, 
                                .data$q105, .data$q107, .data$q109,
                                .data$q108, .data$q110, starts_with("q106__")
                            ),
                            .fns = as.double
                        )
                    )

                } else if (fixe_type == "unknown"){
                    stop("Impossible de reconnaitre le type de base fixe")
                }


            # releve
            } else if (df_type == "releve") {

                dplyr::mutate(.,
                    # character cols
                    dplyr::across(
                        .cols = c(
                            .data$interview__id, .data$interview__key, 
                            .data$produit_nom, .data$q107autre
                        ),
                        .fns = as.character
                    ),
                    # double cols
                    dplyr::across(
                        .cols = c(
                            .data$unitesFixes__id, .data$releve__id, 
                            .data$q110b, .data$q110a
                        ),
                        .fns = as.double
                    )
                )

            # unitesautre1
            } else if (df_type == "unitesautre1") {

                dplyr::mutate(., 
                    # character cols
                    dplyr::across(
                        .cols = c(
                            .data$interview__id, .data$interview__key, 
                            .data$produit_nom, .data$q114_autre1
                        ),
                        .fns = as.character
                    ),
                    # double cols
                    dplyr::across(
                        .cols = c(
                            .data$unitesautre1__id, .data$q113_autre1, 
                            starts_with("q112_autre1__")
                        ),
                        .fns = as.double
                    )
                )

            # unitesAutre1
            } else if (df_type == "unitesAutre1") {

                dplyr::mutate(.,
                    # character cols
                    dplyr::across(
                        .cols = c(
                            .data$interview__id, .data$interview__key,
                            .data$produit_nom
                        ),
                        .fns = as.character
                    ),
                    # double cols
                    dplyr::across(
                        .cols = c(
                            .data$unitesAutre1__id, .data$q103_autre1, .data$q104_autre1, 
                            .data$q105_autre1, .data$q107_autre1, .data$q109_autre1,
                            .data$q108_autre1, .data$q110_autre1, starts_with("q106__")
                        ),
                        .fns = as.double
                    )
                )

            # autre1releve
            } else if (df_type == "autre1releve") {

                dplyr::mutate(., 
                    # character cols
                    dplyr::across(
                        .cols = c(
                            .data$interview__id, .data$interview__key, 
                            .data$produit_nom
                        ),
                        .fns = as.character
                    ),
                    # double cols
                    dplyr::across(
                        .cols = c(
                            .data$unitesautre1__id, .data$autre1releve__id, 
                            .data$q112autre_autre1, .data$q115b_autre1, 
                            .data$q115a_autre1
                        ),
                        .fns = as.double
                    )
                )

            # unitesautre2
            } else if (df_type == "unitesautre2") {

                dplyr::mutate(., 
                    # character cols
                    dplyr::across(
                        .cols = c(
                            .data$interview__id, .data$interview__key, 
                            .data$produit_nom, .data$q116, .data$q119_autre2
                        ),
                        .fns = as.character
                    ),
                    # double cols
                    dplyr::across(
                        .cols = c(
                            .data$unitesautre2__id, 
                            starts_with("q117_autre2_"), 
                            .data$q118_autre2
                        ),
                        .fns = as.double
                    )
                )

            # unitesAutre2
            } else if (df_type == "unitesAutre2") {

                dplyr::mutate(.,
                    # character cols
                    dplyr::across(
                        .cols = c(
                            .data$interview__id, .data$interview__key,
                            .data$produit_nom
                        ),
                        .fns = as.character
                    ),
                    # double cols
                    dplyr::across(
                        .cols = c(
                            .data$unitesAutre2__id, .data$q103_autre2, 
                            .data$q104_autre2, .data$q105_autre2, .data$q107_autre2, 
                            .data$q109_autre2, .data$q108_autre2, .data$q110_autre2, 
                            starts_with("q106_autre2__")
                        ),
                        .fns = as.double
                    )
                )

            # autre2releve
            } else if (df_type == "autre2releve") {

                dplyr::mutate(., 
                    # character cols
                    dplyr::across(
                        .cols = c(
                            .data$interview__id, .data$interview__key, 
                            .data$produit_nom, .data$q117autre_autre2
                        ),
                        .fns = as.character
                    ),
                    # double cols
                    dplyr::across(
                        .cols = c(
                            .data$autre2releve__id, .data$unitesautre2__id, 
                            .data$q120b_autre2, .data$q120a_autre2
                        ), 
                        .fns = as.double
                    )

                )

            }

        }

}

#' Rename variables to harmonized names
#' 
#' First, changes variable names by removing the product-specific component, thereby leaving harmonized names.
#' Then, overwrite the input df in the global environment with an updated df.
#' 
#' @param df Data frame whose columns to rename.
#' @param df_type Character. For consumption data, one of : "unitesFixes", "releve", "unitesautre1", "autre1releve", "unitesautre2", "autre2releve"
#' For production data, one of: "unitesFixes", "unitesAutre1", "unitesAutre2"
#' 
#' @return Data frame with whose columns have harmonized names.
#' 
#' @importFrom stringr str_subset str_extract str_replace
#' @importFrom dplyr `%>%` case_when mutate rename_with everything starts_with
rename_variables <- function(
    df,
    df_type
) {
    
    get_product_name <- function(
        df,
        pattern
    ) {
        
        column_name <- stringr::str_subset(names(df), pattern = pattern)
        
        product_name <- stringr::str_extract(column_name, pattern = pattern)
        
        return(product_name)
        
    }
    
    # convert `df_type` input into the type of regex needed for tasks below
    pattern <- dplyr::case_when(
        df_type == "unitesFixes" ~ "(?<=unitesFixes_)[A-Za-z0-9]+(?=__id)",
        df_type == "releve" ~ "(?<=releve_)[A-Za-z0-9]+(?=__id)",
        df_type == "unitesautre1" ~ "(?<=unitesautre1)[A-Za-z0-9]+(?=__id)",
        df_type == "unitesAutre1" ~ "(?<=unitesAutre1_)[A-Za-z0-9]+(?=__id)",
        df_type == "autre1releve" ~ "(?<=autre1releve_)[A-Za-z0-9]+(?=__id)",
        df_type == "unitesautre2" ~ "(?<=unitesautre2)[A-Za-z0-9]+(?=__id)",
        df_type == "unitesAutre2" ~ "(?<=unitesAutre2_)[A-Za-z0-9]+(?=__id)",
        df_type == "autre2releve" ~ "(?<=autre2releve_)[A-Za-z0-9]+(?=__id)"
    )
    
    produit_nom <- get_product_name(df = df, pattern = pattern)
    
    df_renamed <- df %>%
        dplyr::mutate(produit_nom = produit_nom) %>%
        dplyr::rename_with(
            .cols = dplyr::everything(),
            .fn = ~ stringr::str_replace(
                string = .x,
                pattern = paste0("_", produit_nom),
                replacement = ""
            )
        ) %>%          
        { 
            if (df_type == "unitesautre1") {
                # roster ID
                rename_with(
                    .data = .,
                    .cols = starts_with("unitesautre1"),
                    .fn = ~ stringr::str_replace(
                        string = .x,
                        pattern = "(?<=unitesautre1)[a-z0-9]+(?=__id)",
                        replacement = ""
                    )
                )             
            } else if (df_type == "unitesAutre1") {
                # roster ID
                rename_with(
                    .data = .,
                    .cols = starts_with("unitesAutre1"),
                    .fn = ~ stringr::str_replace(
                        string = .x,
                        pattern = "(?<=unitesAutre1_)[a-z0-9]+(?=__id)",
                        replacement = ""
                    )
                )
            } else if (df_type == "autre1releve") {
                # ID from parent roster
                rename_with(
                    .data = .,
                    cols = starts_with("unitesautre1"),
                    .fn = ~ stringr::str_replace(
                        string = .x,
                        pattern = "(?<=unitesautre1)[a-z0-9]+(?=__id)",
                        replacement = ""
                    )
                )
            } else if (df_type == "unitesautre2") {
                # roster ID
                rename_with(
                    .data = .,
                    .cols = starts_with("unitesautre2"),
                    .fn = ~ stringr::str_replace(
                        string = .x,
                        pattern = "(?<=unitesautre2)[a-z0-9]+(?=__id)",
                        replacement = ""
                    )
                )
            } else if (df_type == "unitesAutre2") {
                # roster ID
                rename_with(
                    .data = .,
                    .cols = starts_with("unitesAutre2"),
                    .fn = ~ stringr::str_replace(
                        string = .x,
                        pattern = "(?<=unitesAutre2_)[a-z0-9]+(?=__id)",
                        replacement = ""
                    )
                )
            } else if (df_type == "autre2releve") {
                # ID from parent roster
                rename_with(
                    .data = .,
                    cols = starts_with("unitesautre2"),
                    .fn = ~ stringr::str_replace(
                        string = .x,
                        pattern = "(?<=unitesautre2)[a-z0-9]+(?=__id)",
                        replacement = ""
                    )
                )
            } else {
                .
            }
                       
        }
    
    return(df_renamed)
    
}

#' Fix miscellaneous issues with input data
#' 
#' Master fix-it function that applies solutions to known data issues
#' 
#' Can easily be extended to new issues. Fixes to issues are organized as follows:
#' 
#' - At the top level, group by file type
#' - Within file type, group by particular files by checking for file-specific column names
#' 
#' @param df Data frame whose disparate issues to fix.
#' @param df_type Character. For consumption data, one of : "unitesFixes", "releve", "unitesautre1", "autre1releve", "unitesautre2", "autre2releve"
#' For production data, one of: "unitesFixes", "unitesAutre1", "unitesAutre2"
#' 
#' @return Data frame with corrections, where applicable.
#' 
#' @importFrom dplyr `%>%` mutate rename rename_with matches
#' @importFrom labelled add_value_labels
#' @importFrom stringr str_replace
#' @importFrom rlang .data
fix_misc_issues <- function(
    df,
    df_type
) {

    if (df_type == "unitesautre1") {

        df_fixed <- df %>%
            # add q114a_autre column if missing
            {
                if (!"q114a_autre1" %in% names(df)) {
                    dplyr::mutate(
                        .data = .,
                        q114a_autre1 = NA_real_
                    ) %>%
                    labelled::add_value_labels(
                        .data = ., 
                        q114a_autre1 = c(Oui = 1, Non = 2)
                    )
                }
                else {
                    .
                }

            } %>%
            # misnamed variable in tomato concentrate df
            {
                if ("unitesautre1tmteCctr__id" %in% names(df)) {
                    dplyr::rename(
                        .data = .,
                        unitesautre1__id = .data$unitesautre1tmteCctr__id
                    )
                } else {
                    .
                }
            } %>%
            {
                if (any(stringr::str_detect(names(df), "q112_courgete_autre1__"))) {
                    dplyr::rename_with(
                        .data = .,
                        .cols = dplyr::matches("courgete"),
                        .fn = ~ stringr::str_replace(
                            string = .x,
                            pattern = "_courgete",
                            replacement = ""
                        )
                    )
                } else if (any(stringr::str_detect(names(df), "q112farinloc[12]_autre1__0"))) {
                    dplyr::rename_with(
                        .data = .,
                        .cols = dplyr::matches("farinloc[12]"),
                        .fn = ~ stringr::str_replace(
                            string = .x,
                            pattern = "farinloc[12]",
                            replacement = ""
                        )
                    )
                } else {
                    .
                }
            }

    } else if (df_type == "autre1releve") {

        df_fixed <- df %>%
            # misnamed variable in tomato concentrate df
            {
                if ("unitesautre1tmteCctr__id" %in% names(df)) {
                    dplyr::rename(
                        .data = .,
                        unitesautre1__id = .data$unitesautre1tmteCctr__id
                    )
                } else {
                    .
                }
            }

    } else if (df_type %in% c("unitesAutre1", "unitesAutre2")) {

        df_fixed <- df %>%
            {
                if (any(stringr::str_detect(names(df), "q106_pasthe_autre[12]__1"))) {
                    dplyr::rename_with(
                        .data = .,
                        .cols = matches("pasthe"),
                        .fn = ~ str_replace(
                            string = .x,
                            pattern = "_pasthe",
                            replacement = ""
                        )
                    )
                }  else if (any(stringr::str_detect(names(df), "q106_autreag_autre[12]__1"))) {
                    rename_with(
                        .data = .,
                        .cols = matches("autreag"),
                        .fn = ~ str_replace(
                            string = .x,
                            pattern = "_autreag",
                            replacement = ""
                        )
                    )                    
                } else if (any(stringr::str_detect(names(df), "q106_better_autre[12]__1"))) {
                    rename_with(
                        .data = .,
                        .cols = matches("better"),
                        .fn = ~ str_replace(
                            string = .x,
                            pattern = "_better",
                            replacement = ""
                        )
                    )                        
                } else if ("q106_calebas_autre1__1" %in% names(df)) {
                    rename_with(
                        .data = .,
                        .cols = matches("calebas"),
                        .fn = ~ str_replace(
                            string = .x,
                            pattern = "_calebas",
                            replacement = ""
                        )
                    )
                } else if ("q106_caleba_autre2__1" %in% names(df)) {
                    rename_with(
                        .data = .,
                        .cols = matches("caleba"),
                        .fn = ~ str_replace(
                            string = .x,
                            pattern = "_caleba",
                            replacement = ""
                        )
                    )
                } else if (any(stringr::str_detect(names(df), "q106_gingem_autre[12]__1"))) {
                    rename_with(
                        .data = .,
                        .cols = matches("gingem"),
                        .fn = ~ str_replace(
                            string = .x,
                            pattern = "_gingem",
                            replacement = ""
                        )
                    )
                } else if (any(stringr::str_detect(names(df), "q106_haricotv_autre[12]__1"))) {
                    rename_with(
                        .data = .,
                        .cols = matches("haricotv"),
                        .fn = ~ str_replace(
                            string = .x,
                            pattern = "_haricotv",
                            replacement = ""
                        )
                    )
                } else if (any(stringr::str_detect(names(df), "q106_petitp_autre[12]__1"))) {
                    rename_with(
                        .data = .,
                        .cols = matches("petitp"),
                        .fn = ~ str_replace(
                            string = .x,
                            pattern = "_petitp",
                            replacement = ""
                        )
                    )
                } else if (any(stringr::str_detect(names(df), "q106_arachid_autre[12]__"))) {
                    rename_with(
                        .data = .,
                        .cols = matches("arachid"),
                        .fn = ~ str_replace(
                            string = .x,
                            pattern = "_arachid",
                            replacement = ""
                        )
                    )
                } else {
                    .
                }
            }

    } else if (df_type == "unitesautre2") {

        df_fixed <- df %>%
            # misnamed variable in tomato concentrate df
            {
                if ("unitesautre2tmteCctr__id" %in% names(df)) {
                    dplyr::rename(
                        .data = .,
                        unitesautre2__id = .data$unitesautre2tmteCctr__id
                    )
                } else {
                    .
                }
            } %>%
            # misnamed variables in sucre en morceaux df
            {
                if ("q116_sucrem" %in% names(df)) {
                    dplyr::rename_with(
                        .data = .,
                        .cols = matches("_sucrem"),
                        .fn = ~ stringr::str_replace(
                            string = .x, 
                            pattern = "_sucrem",
                            replacement = ""
                        )
                    )                    
                } else {
                    .
                }
            } %>%
            {
                if (any(stringr::str_detect(names(df), "q117_courgete_autre2__"))) {
                    dplyr::rename_with(
                        .data = .,
                        .cols = dplyr::matches("courgete"),
                        .fn = ~ stringr::str_replace(
                            string = .x,
                            pattern = "_courgete",
                            replacement = ""
                        )
                    )
                } else if (any(stringr::str_detect(names(df), "q117farinloc[12]_autre2__0"))) {
                    dplyr::rename_with(
                        .data = .,
                        .cols = dplyr::matches("farinloc[12]"),
                        .fn = ~ stringr::str_replace(
                            string = .x,
                            pattern = "farinloc[12]",
                            replacement = ""
                        )
                    )
                } else {
                    .
                }                
            }

    } else if (df_type == "autre2releve") {

        df_fixed <- df %>%
            # misnamed variable in tomato concentrate df
            {
                if ("unitesautre2tmteCctr__id" %in% names(df)) {
                    dplyr::rename(
                        .data = .,
                        unitesautre2__id = .data$unitesautre2tmteCctr__id
                    )
                } else {
                    .
                }
            }

    } else {

        df_fixed <- df

    }

    return(df_fixed)

}


#' Extract labels from all labelled variables
#' 
#' First, determines which variables are labelled. 
#' Then, extracts the labels
#' Next, creates a named list: names are variable names; values are associated labels. 
#' Then, creates a label object in the global environment whose name is the `{df}_lbls`.
#' 
#' @param df Data frame whose value labels to extract
#' 
#' @return List of value labels. Each element is a named numeric vector. Each vector consists of a named values. The names correspond to character labels and the values to the numerical values.
#' 
#' @importFrom purrr map_lgl map
#' @importFrom haven is.labelled
#' 
#' @return Side-effect: create list object in global environment containing labels for all variables
extract_labels <- function(df) {

    # get list of all variable names
    vars_in_df <- names(df)

    # determine--TRUE/FALSE--which have labels
    which_have_labels_lgl <- purrr::map_lgl(
        .x = vars_in_df, 
        .f = ~ haven::is.labelled(df[[.x]]))

    # return the names of those variables with labels
    which_have_labels_names <- names(df)[which_have_labels_lgl]

    # extract labels for each variable into a list
    all_labels <- purrr::map(
        .x = which_have_labels_names,
        .f = ~ attr(x = df[[.x]], which = "labels")
    )

    # create a named list: name is variable name; value is corresponding labels
    labels_named <- stats::setNames(
        nm = which_have_labels_names,
        object = all_labels
    )

}

#' Label variables
#' 
#' Apply labels to all variables in the data frame that need them. To do this:
#' 
#' First, determine the variables that have labels for them in the list of labels.
#' Then, apply those labels to each column
#' 
#' @param df Data frame
#' @param labels_list List that contains the consolidated labels
#' 
#' @return Data frame with labelled columns
#' 
#' @importFrom dplyr `%>%` mutate across all_of cur_column
#' @importFrom haven labelled
label_variables <- function(
    df,
    labels_list
) {

    # extract variable names from label list
    vars_to_label <- names(labels_list)

    # apply labels to columns 
    df <- df %>%
        dplyr::mutate(
            dplyr::across(
                .cols = dplyr::all_of(vars_to_label),
                .fns = ~ haven::labelled(
                    .x, 
                    labels = labels_list[[dplyr::cur_column()]]
                )
            )
        )

    return(df)

}

#' Save data to disk
#' 
#' Save data to disk in Stata format
#' 
#' @param df Data frame to save to disk.
#' @param df_type Character. For consumption data, one of : "unitesFixes", "releve", "unitesautre1", "autre1releve", "unitesautre2", "autre2releve"
#' For production data, one of: "unitesFixes", "unitesAutre1", "unitesAutre2"
#' @param dir Character. Directory where to save data.
#' 
#' @importFrom assertthat assert_that
#' @importFrom haven write_dta
save_nsu_data <- function(
    df,
    df_type,
    dir
) {

    expected_types <- c(
        # unites fixes
        "unitesFixes",
        "releve",

        # unites prévues ailleurs
        "unitesautre1",
        "autre1releve",

        # unites imprévues
        "unitesautre2",
        "autre2releve"
    )

    assertthat::assert_that(
        df_type %in% expected_types,
        msg = "Unexpected `type` provided."
    )

    assertthat::assert_that(
        dir.exists(dir),
        msg = "Directory provided in `dir` does not exist."
    )

    # construct files name
    file_name <- paste0(df_type, ".dta")

    # save file
    haven::write_dta(
        data = df,
        path = paste0(dir, file_name)
    )

}

#' Find the market file for the food group
#' 
#' First, filter files: 
#' - those with a .dta extension
#' - those that do not start with prefixes associated with other files (e.g., "interview__", "assignment__", "unitesFixes", etc.)
#' 
#' Then, retain as the market file the only file remaining.
#' 
#' @param dir Character. Directory whose files to search for the market file.
#' @param pattern_to_exclude Character. Regular expression of files that are not the market file.
#' 
#' @return Character. Name of the market file, with extension
#' 
#' @importFrom fs dir_ls path_file
#' @importFrom stringr str_subset
#' @importFrom glue glue
find_market_file <- function(
    dir,
    pattern_to_exclude = "^interview__|^assignment__|^autre2releve_|^unites[Aa]utre2|^autre1releve_|^unites[Aa]utre1|^releve_|^unitesFixes_"
) {

    all_files_paths <- fs::dir_ls(dir, type = "file")
    all_files <- fs::path_file(all_files_paths)
    dta_files <- stringr::str_subset(string = all_files, pattern = "\\.dta$")
    market_file <- stringr::str_subset(
        string = dta_files,
        pattern = pattern_to_exclude,
        negate = TRUE
    )

    n_market_files <- length(market_file)

    if (n_market_files == 1) {
        
        return(market_file)

    } else if (n_market_files == 0) {

        stop(glue::glue(
            "No market file found for the current food group",
            "Directory: {dir}",
            "Please check the name of the market file and change `pattern_to_exclude` so that this file is not excluded.",
            .sep = "\n"
        ))

    } else if (n_market_files > 1) {

        stop(glue::glue(
            "More than one potential market file found.",
            "These files are {market_file}",
            .sep = "\n"
        ))

    }

}

#' Combine NSU all data of same type in the same directory
#'  
#' @param dir_in Character. Directory path whose files should be combined.
#' @param data_type Character. One of: "consumption", "production". 
#' For data from markets, "consumption"; for data from farmers, "production".
#' @param df_type Character. For consumption data, one of : "unitesFixes", "releve", "unitesautre1", "autre1releve", "unitesautre2", "autre2releve"
#' For production data, one of: "unitesFixes", "unitesAutre1", "unitesAutre2"
#' 
#' @importFrom dplyr if_else bind_rows `%>%` slice starts_with everything left_join mutate select
#' @importFrom stringr str_ends
#' @importFrom purrr map walk2 pmap modify
#' @importFrom fs path_ext_remove path_file
#' @importFrom haven zap_labels read_dta
#' @importFrom tidyr pivot_longer
combine_nsu_in_dir <- function(
    dir_in,
    data_type,
    df_type
) {

    # add "/" to folder so that file paths later are correctly constructed
    dir_in <- dplyr::if_else(
        stringr::str_ends(dir_in, "/"),
        dir_in,
        paste0(dir_in, "/")
    )

    # get list of matching files
    files <- get_matching_files(
        dir = dir_in, 
        pattern = df_type
    )

    # ingest files
    data_list <- purrr::map(
        .x = files,
        .f = ingest_dta_file,
        dir = dir_in
    )

    # rename variables to harmonized names
    renamed_data_list <- purrr::map(
        .x = data_list,
        .f = ~ rename_variables(
            df = .x,
            df_type = df_type
        )
    )

    # grab names of files
    file_names <- purrr::map(
        .x = files,
        .f = ~ fs::path_ext_remove(fs::path_file(.x)) 
    )

    # apply those names to data frames in list
    named_data <- stats::setNames(
        object = renamed_data_list,
        nm = file_names
    )

    # fix errors in files
    fixed_data <- purrr::map(
        .x = named_data,
        .f = ~ fix_misc_issues(
            df = .x,
            df_type = df_type
        )
    )

    # check for required columns
    purrr::walk2(
        .x = fixed_data,
        .y = file_names,
        .f = ~ check_cols(
            df = .x,
            df_name = .y,
            df_type = df_type
        )
    )

    # remove labels from source data frames
    data_wo_labels <- purrr::map(
        .x = fixed_data,
        .f = ~ haven::zap_labels(.x)
    )

    # correct column types
    corrected_data <- purrr::map(
        .x = data_wo_labels,
        .f = ~ correct_col_type(
            df = .x,
            df_type = df_type
        )
    )

    # extract labels
    labels <- purrr::map(
        .x = fixed_data,
        .f = ~ extract_labels(df = .x)
    )    

    # combine labels into a single list of named labels
    combined_labels <- purrr::pmap(
        .l = unname(labels), 
        .f = c
    )

    # add an empty entry for q114 for fruits
    # this ensures that the loop is the same length for each named label--that is, that q114 is not missing for fruits
    # if (df_type == "unitesautre1" & (!"q114a_autre1" %in% names(labels))) {
    #     combined_labels = purrr::list_merge(
    #         .x = combined_labels, 
    #         q114a_autre1 = double()
    #     )
    # }

    # remove duplicate labels by modify each vector by removing duplicates
    consolidated_labels <- purrr::modify(
        .x = combined_labels, 
        .f = ~ .x[!duplicated(.x)]
    )

    # combine files
    nsu_combined <- dplyr::bind_rows(corrected_data)

    # apply value labels
    # nsu_combined <- label_variables(
    #     df = nsu_combined,
    #     labels_list = consolidated_labels
    # )
    
    # add product ID code and labels
    # drawing from market file where that information exists
    # first, identify the market file
    market_file <- find_market_file(dir = dir_in)
    # then, determine if the market file has any observations
    market_df <- haven::read_dta(file = paste0(dir_in, market_file))
    # if so, construct a product name-ID mapping
    if (nrow(market_df) > 0) {

        col_prefix <- dplyr::case_when(
            data_type == "consumption" ~ "codeProduit_",
            data_type == "production" ~ "codeCulture_"
        )

        product_name_id_map <- market_df %>%
            dplyr::slice(1) %>% # take first obs, since all observations contain the same information
            select(starts_with(col_prefix)) %>% # keep variables whose names contain the product name and whose values contains the code
            tidyr::pivot_longer(
                cols = everything(), 
                names_to = "produit_nom", 
                names_prefix = col_prefix,
                values_to = "produit_id"
            )
        # next, construct values labels for product IDs 
        product_id_labels <- stats::setNames(
            nm = product_name_id_map$produit_nom, 
            object = product_name_id_map$produit_id
        )
        product_id_labels <- list(product_id_labels)
        product_id_labels <- stats::setNames(
            nm = "produit_id",
            object = product_id_labels
        )

        labels <- c(consolidated_labels, product_id_labels)

        # then, add product IDs to data set and label codes
        nsu_combined <- nsu_combined %>%
            dplyr::left_join(product_name_id_map, by = "produit_nom")
    
        # add market ID variables
        market_id_vars <- market_df %>%
            # add numeroReleve column if absent, as in BFA data
            { 
                if ((!"numeroReleve" %in% names(.)) & ("s0q19" %in% names(.))) {
                    dplyr::mutate(.data = ., numeroReleve = .data$s0q19)
                } else {
                    .
                }
            } %>%
            dplyr::select(
                .data$interview__id, .data$interview__key, 
                dplyr::matches("^s00q0[0-8]"), .data$numeroReleve
            )
        nsu_combined <- nsu_combined %>%
            dplyr::left_join(market_id_vars, by = c("interview__id", "interview__key"))

    # if not, create empty entries:
    # - produit_id in the df
    # - produit_id entry in the labels
    # in this way, both data and labels have same length for cases where there are some observations
    } else {
        nsu_combined <- nsu_combined %>%
            dplyr::mutate(produit_id = NA_integer_)
        product_id_labels <- list(NA)
        product_id_labels <- stats::setNames(
            nm = "produit_id",
            object = product_id_labels
        )        
        labels <- c(consolidated_labels, product_id_labels)
    }
    
    data_and_labels <- list(
        data = nsu_combined,
        labels = labels
    )

}

#' Combine all NSU data of the same type across food group directories
#' 
#' Apply `combine_nsu_in_dir()` iteratively to each food group folders. Combine the data. Combine value labels. Save the output to combined data to disk in Stata format.
#' 
#' @param dir_in Character. Root directory where food group sub-directories are located.
#' @param dir_regexp Character. Regular expression to identify folders over which to iterate.
#' @param data_type Character. One of: "consumption", "production". 
#' For data from markets, "consumption"; for data from farmers, "production".
#' @param df_type Character. For consumption data, one of : "unitesFixes", "releve", "unitesautre1", "autre1releve", "unitesautre2", "autre2releve"
#' For production data, one of: "unitesFixes", "unitesAutre1", "unitesAutre2"
#' @param dir_out Character. Directory where combined files should be saved.
#' 
#' @importFrom fs dir_ls
#' @importFrom purrr map pluck pmap modify
#' @importFrom dplyr `%>%` bind_rows
combine_nsu_across_dirs <- function(
    dir_in,
    dir_regexp = "_STATA_",
    data_type,
    df_type,
    dir_out
) {

    # identify data folders to iterate over
    folders <- fs::dir_ls(
        dir_in, 
        type = "directory",
        regexp = dir_regexp,
        recurse = FALSE
    )

    # combine files in each folder, yielding
    # data and labels for each folder
    outputs <- purrr::map(
        .x = folders,
        .f = ~ combine_nsu_in_dir(
            dir_in = .x,
            df_type = df_type,
            data_type = data_type
        )
    )

    # bind together df from the "data" elements of the output
    combined_nsu_df <- seq_along(1:length(outputs)) %>%
        purrr::map(
            .f = ~ purrr::pluck(
                outputs, 
                .x, 
                "data"
            )
        ) %>% 
        dplyr::bind_rows()

    # combine value labels across all data so that all values are labelled
    combined_nsu_labels <- seq_along(1:length(outputs)) %>%
        # extract the "labels" element from each element of the output
        purrr::map(
            .f = ~ purrr::pluck(
                outputs, 
                .x, 
                "labels"
            )
        ) %>% 
        # combine labels into a single list of named labels
        purrr::pmap(.f = c) %>%
        # remove duplicate labels by modify each vector by removing duplicates
        purrr::modify(.f = ~ .x[!duplicated(.x)]) %>%
        # remove NA entries created if no entries in the market file
        purrr::modify(.f = ~ .x[!is.na(.x)])

    # apply combined labels to combined data frame
    nsu_combined <- label_variables(
        df = combined_nsu_df,
        labels_list = combined_nsu_labels
    )

    return(nsu_combined)

}

#' Fusionner l'ensemble des bases NSU
#' 
#' Créer les bases suivantes sous format Stata:
#' 
#' - unitesFixes
#' - releve
#' - unitesautre1
#' - autre1releve
#' - unitesautre2
#' - autre2releve
#' 
#' Ces bases seront créées en:
#' 
#' - Parcourant les répertoires qui correspondent à la chaîne regex `dir_regexp`.
#' - Fusionnant les bases cibles
#' - "Fusionnant" les étiquettes de valeurs pour les variables étiquettées
#' - Sauvegardant les bases résultantes dans le répertoire `dir_out`
#' 
#' @param dir_in Character. Root directory where food group sub-directories are located.
#' @param dir_regexp Character. Regular expression to identify folders over which to iterate.
#' @param data_type Character. One of: "consumption", "production". 
#' For data from markets, "consumption"; for data from farmers, "production".
#' @param dir_out Character. Directory where combined files should be saved.
#' 
#' @importFrom purrr map walk2
#' @importFrom haven write_dta
#' 
#' @export 
combine_nsu_data <- function(
    dir_in,
    dir_regexp = "_STATA_",
    data_type, 
    dir_out
) {

    if (data_type == "consumption") {

        file_types <- c(
            "unitesFixes", "releve", 
            "unitesautre1", "autre1releve", 
            "unitesautre2", "autre2releve"
        )
    } else if (data_type == "production") {

        file_types <- c("unitesFixes", "unitesAutre1", "unitesAutre2")

    }


    # create combined data files
    nsu_combined <- purrr::map(
        .x = file_types,
        .f = ~ combine_nsu_across_dirs(
            dir_in = dir_in,
            data_type = data_type,
            dir_regexp = dir_regexp,
            df_type = .x,
            dir_out = dir_out
        )

    )

    # write data to disk
    purrr::walk2(
        .x = nsu_combined,
        .y = file_types,
        .f = ~ haven::write_dta(
            data = .x,
            path = paste0(dir_out, .y, ".dta")
        )
    )

}

#' Fusionner les données relatives aux marchés
#' 
#' Créer un fichier `marches.dta` dans le répertoire désigné dans `dir_out`.
#' 
#' @param dir_in Character. Root directory where food group sub-directories are located.
#' @param dir_regexp Character. Regular expression to identify folders over which to iterate.
#' @param data_type Character. One of: "consumption", "production". 
#' For data from markets, "consumption"; for data from farmers, "production".
#' @param dir_out Character. Directory where combined files should be saved.
#' 
#' @importFrom fs dir_ls
#' @importFrom purrr map_chr map map2
#' @importFrom haven read_dta write_dta
#' @importFrom dplyr mutate select bind_rows
#' 
#' @export 
combine_market_data <- function(
    dir_in,
    dir_regexp = "_STATA_",
    data_type,
    dir_out
) {

    # compile list folders with data inside
    dirs <- fs::dir_ls(
        path = dir_in, 
        type = "directory", 
        regexp = dir_regexp, 
        recurse = FALSE
    )

    # find market file in folder
    market_files <- purrr::map_chr(
        .x = dirs,
        .f = ~ find_market_file(dir = .x)
    )

    file_paths <- paste0(names(market_files), "/", market_files)

    # ingest market files
    market_dfs <- purrr::map(
        .x = file_paths,
        .f = ~ haven::read_dta(file = .x)
    ) 

    # add column to indicate source file name
    # this will allow counting obs per food group
    market_dfs_w_source <- purrr::map2(
        .x = market_dfs,
        .y = market_files,
        .f = ~ mutate(
            .data = .x,
            fichier_source = .y
        )
    )

    # retain relevant columns only
    market_dfs_edited <- purrr::map(
        .x = market_dfs_w_source,
        .f = ~ dplyr::select(.data = .x,
            # SuSo identifiers
            interview__id, interview__key, assignment__id, fichier_source,
            # geographical IDs and other process data
            starts_with("s00q"), observation,
            # information on status
            has__errors, interview__status
        )
    )

    # combine dfs into one df
    market_df <- dplyr::bind_rows(market_dfs_edited)

    # save df to disk as Stata file
    file_name <- dplyr::case_when(
        data_type == "consumption" ~ "marches.dta",
        data_type == "production" ~ "fermes.dta"
    )
    haven::write_dta(data = market_df, path = paste0(dir_out, "/", file_name))

}
arthur-shaw/nsuoutils documentation built on Dec. 19, 2021, 4:41 a.m.