R/fct_admin.R

Defines functions chkDupIDs

#' Check for duplicate records
#'
#' @description Function to be applied to each database table to report duplicate records and duplicate primary keys
#'
#' @param db The URL to the database in Google sheets
#'
#' @param sheet The sheet in the DB to check for suplicates
#'
#' @return A list of tibbles showing duplicates.
#'
#' @noRd
    chkDupIDs <- function(db = dbURL, sheet){
        if (!sheet %in% c('LU_INGREDIENTS','LU_MEAL','XREF_INGREDIENT')){
            stop('Invalid sheet name. Must be one of LU_INGREDIENTS,LU_MEAL,XREF_INGREDIENT')}
        #Get ID column based on the sheet
        idCol <- switch(sheet,
                        'LU_INGREDIENTS' = 'INGREDIENT_ID',
                        'LU_MEAL' = 'MEAL_ID',
                        'XREF_INGREDIENT' = c('MEAL_ID','INGREDIENT_ID')
        )#end switch

        #Get the sheet DF
        chkSheet <- googlesheets4::read_sheet(db,sheet = sheet)

        #Case for LU_MEAL and LU_INGREDIENTS
        if (sheet %in% c('LU_INGREDIENTS','LU_MEAL')){
            #Check whole table
            tblDups <- chkSheet %>%
                mutate(
                    FWD = duplicated(.),
                    BCK = duplicated(., fromLast = TRUE),
                    DUPS = ifelse(FWD == TRUE | BCK == TRUE, 'TRUE','FALSE')
                ) %>%
                filter(DUPS == TRUE) %>%
                select(-c(FWD,BCK,DUPS))

            #Check just duplicated IDs
            idDups <- chkSheet %>%
                select(all_of(idCol)) %>%
                mutate(
                    FWD = duplicated(.),
                    BCK = duplicated(., fromLast = TRUE),
                    DUPS = ifelse(FWD == TRUE | BCK == TRUE, 'TRUE','FALSE')
                ) %>%
                filter(DUPS == TRUE) %>%
                select(all_of(idCol))

        } else

            if(sheet == 'XREF_INGREDIENT'){
                #Check whole table
                tblDups <- chkSheet %>%
                    mutate(
                        FWD = duplicated(.),
                        BCK = duplicated(., fromLast = TRUE),
                        DUPS = ifelse(FWD == TRUE | BCK == TRUE, 'TRUE','FALSE')
                    ) %>%
                    filter(DUPS == TRUE) %>%
                    select(-c(FWD,BCK,DUPS))

                #Check just duplicated IDs
                idDups <- chkSheet %>%
                    select(all_of(idCol),INGREDIENT,MEAL_NAME) %>%
                    mutate(
                        FWD = duplicated(.),
                        BCK = duplicated(., fromLast = TRUE),
                        DUPS = ifelse(FWD == TRUE | BCK == TRUE, 'TRUE','FALSE')
                    ) %>%
                    filter(DUPS == TRUE) %>%
                    select(all_of(idCol),INGREDIENT,MEAL_NAME)

            }#end if else

        return(
            list(
                dupRecords = tblDups,
                dupIDs = idDups
            )
        )

    }#end chkDupIDs

#' Fuzzy match top 20 similar meal and ingredient names
#'
#' @description Function to get and list top 20 meall or ingredient names that are similar
#'
#' @param db The URL to the database in Google sheets
#'
#' @param sheet The sheet in the DB to check for suplicates
#'
#' @return A list of tibbles showing close matches to inspect visually and fix if needed.
#'
#' @noRd
    chkNames <- function(db = dbURL, sheet){
        #Fuzzy string matching by Jaro-Winkler distance method
        #Report top 20 close matches and inspect manually
        if (!sheet %in% c('LU_INGREDIENTS','LU_MEAL','XREF_INGREDIENT')){
            stop('Invalid sheet name. Must be one of LU_INGREDIENTS,LU_MEAL,XREF_INGREDIENT')}

        #Get NAME column based on the sheet
        nameCol <- switch(sheet,
                          'LU_INGREDIENTS' = 'INGREDIENT',
                          'LU_MEAL' = 'MEAL_NAME'
        )#end switch

        #Get the sheet DF
        chkSheet <- googlesheets4::read_sheet(db,sheet = sheet)
        a <- chkSheet %>% select(all_of(nameCol)) %>% pull(.) %>% tolower(.)
        d <- stringdistmatrix(a,a, method = 'jw')
        diag(d) <- NA
        dists <- apply(d, 1, FUN = min, na.rm = T)
        rnk <- rank(dists, ties.method = 'min')
        rows <- which(rnk < 21)
        rnk <- rnk[rows]
        out <- chkSheet[rows,] %>% mutate(RANK = rnk) %>% arrange(RANK)

        if(sheet == 'LU_MEAL'){out <- out %>% select(MEAL_ID,MEAL_NAME, MEAL_TYPE, MEAL_DESCRIPTION)}
        if(sheet == 'LU_INGREDIENTS'){out <- out %>% select(INGREDIENT_ID, INGREDIENT,
                                                            INGREDIENT_DESCRIPTION, SERVING_SIZE_DESCRIPTION)}


        return(out)

    }#end chkNames
peernisse/riverMenu documentation built on Aug. 31, 2022, 7:39 p.m.