#' Check for duplicate records
#'
#' @description Function to be applied to each database table to report duplicate records and duplicate primary keys
#'
#' @param db The URL to the database in Google sheets
#'
#' @param sheet The sheet in the DB to check for suplicates
#'
#' @return A list of tibbles showing duplicates.
#'
#' @noRd
chkDupIDs <- function(db = dbURL, sheet){
if (!sheet %in% c('LU_INGREDIENTS','LU_MEAL','XREF_INGREDIENT')){
stop('Invalid sheet name. Must be one of LU_INGREDIENTS,LU_MEAL,XREF_INGREDIENT')}
#Get ID column based on the sheet
idCol <- switch(sheet,
'LU_INGREDIENTS' = 'INGREDIENT_ID',
'LU_MEAL' = 'MEAL_ID',
'XREF_INGREDIENT' = c('MEAL_ID','INGREDIENT_ID')
)#end switch
#Get the sheet DF
chkSheet <- googlesheets4::read_sheet(db,sheet = sheet)
#Case for LU_MEAL and LU_INGREDIENTS
if (sheet %in% c('LU_INGREDIENTS','LU_MEAL')){
#Check whole table
tblDups <- chkSheet %>%
mutate(
FWD = duplicated(.),
BCK = duplicated(., fromLast = TRUE),
DUPS = ifelse(FWD == TRUE | BCK == TRUE, 'TRUE','FALSE')
) %>%
filter(DUPS == TRUE) %>%
select(-c(FWD,BCK,DUPS))
#Check just duplicated IDs
idDups <- chkSheet %>%
select(all_of(idCol)) %>%
mutate(
FWD = duplicated(.),
BCK = duplicated(., fromLast = TRUE),
DUPS = ifelse(FWD == TRUE | BCK == TRUE, 'TRUE','FALSE')
) %>%
filter(DUPS == TRUE) %>%
select(all_of(idCol))
} else
if(sheet == 'XREF_INGREDIENT'){
#Check whole table
tblDups <- chkSheet %>%
mutate(
FWD = duplicated(.),
BCK = duplicated(., fromLast = TRUE),
DUPS = ifelse(FWD == TRUE | BCK == TRUE, 'TRUE','FALSE')
) %>%
filter(DUPS == TRUE) %>%
select(-c(FWD,BCK,DUPS))
#Check just duplicated IDs
idDups <- chkSheet %>%
select(all_of(idCol),INGREDIENT,MEAL_NAME) %>%
mutate(
FWD = duplicated(.),
BCK = duplicated(., fromLast = TRUE),
DUPS = ifelse(FWD == TRUE | BCK == TRUE, 'TRUE','FALSE')
) %>%
filter(DUPS == TRUE) %>%
select(all_of(idCol),INGREDIENT,MEAL_NAME)
}#end if else
return(
list(
dupRecords = tblDups,
dupIDs = idDups
)
)
}#end chkDupIDs
#' Fuzzy match top 20 similar meal and ingredient names
#'
#' @description Function to get and list top 20 meall or ingredient names that are similar
#'
#' @param db The URL to the database in Google sheets
#'
#' @param sheet The sheet in the DB to check for suplicates
#'
#' @return A list of tibbles showing close matches to inspect visually and fix if needed.
#'
#' @noRd
chkNames <- function(db = dbURL, sheet){
#Fuzzy string matching by Jaro-Winkler distance method
#Report top 20 close matches and inspect manually
if (!sheet %in% c('LU_INGREDIENTS','LU_MEAL','XREF_INGREDIENT')){
stop('Invalid sheet name. Must be one of LU_INGREDIENTS,LU_MEAL,XREF_INGREDIENT')}
#Get NAME column based on the sheet
nameCol <- switch(sheet,
'LU_INGREDIENTS' = 'INGREDIENT',
'LU_MEAL' = 'MEAL_NAME'
)#end switch
#Get the sheet DF
chkSheet <- googlesheets4::read_sheet(db,sheet = sheet)
a <- chkSheet %>% select(all_of(nameCol)) %>% pull(.) %>% tolower(.)
d <- stringdistmatrix(a,a, method = 'jw')
diag(d) <- NA
dists <- apply(d, 1, FUN = min, na.rm = T)
rnk <- rank(dists, ties.method = 'min')
rows <- which(rnk < 21)
rnk <- rnk[rows]
out <- chkSheet[rows,] %>% mutate(RANK = rnk) %>% arrange(RANK)
if(sheet == 'LU_MEAL'){out <- out %>% select(MEAL_ID,MEAL_NAME, MEAL_TYPE, MEAL_DESCRIPTION)}
if(sheet == 'LU_INGREDIENTS'){out <- out %>% select(INGREDIENT_ID, INGREDIENT,
INGREDIENT_DESCRIPTION, SERVING_SIZE_DESCRIPTION)}
return(out)
}#end chkNames
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.