### data expectation functions -----
#' Check if a data frame has duplicates in a given column. If a vector is given, check for duplicates in the vector.
#'
#' @param df data frame to check
#' @param group_by_column character vector name of column expecting no duplicates
#' @param stop_if_fail T/F for whether to consider failure an error
#' @param report_duplicates T/F for whether to return a partial list of the top duplicates if failure
#' @param return_df T/F whether to end function with dataframe input (as in if a check in part of a pipe)
#'
#' @return several options depending on whether it fails or succeeeds
#' @export
#' @importFrom dplyr group_by_
#' @importFrom dplyr count
#' @importFrom dplyr filter
#'
#' @examples
#' expect_no_duplicates(mtcars, "cyl")
#' # [1] "top duplicates..."
#' # A tibble: 3 x 2
#' # Groups: cyl [3]
#' #cyl n
#' #<dbl> <int>
#' #1 4 11
#' #2 6 7
#' #3 8 14
#' # Error in ifelse(stop_if_fail, stop(paste0("Duplicates detected in column: ", :
#' Duplicates detected in column: cyl
#'
#' expect_no_duplicates(rownames(mtcars))
#' # [1] "no vector duplicates...OK"
expect_no_duplicates <- function(df, group_by_column = NA, stop_if_fail = TRUE, report_duplicates = TRUE, return_df = TRUE){
if(return_df){
df_copy_for_later <- df
}
if (!("data.frame" %in% class(df)) & is.vector(df)){
df <- data.frame("vector" = df)
group_by_column <- "vector"
}
# no test column given, assume testing all for duplicates
if (is.na(group_by_column)){
group_by_column <- names(df)
}
# if number given, resave as field name with that number
if (is.numeric(group_by_column)){
group_by_column <- names(df)[group_by_column]
}
if (sum(colnames(df) == as.name(group_by_column)) == 0) {
stop(paste0("No column named: ", group_by_column))
}
if (sum(colnames(df) == as.name(group_by_column)) > 1) {
stop(paste0("Expected only one, but multiple columns named: ", group_by_column))
}
for (column in group_by_column){
dft <- group_by_(df, as.name(column))
dft <- count(dft)
dft <- filter(dft, n > 1)
if(nrow(dft) == 0){
print(paste0("no ", column, " duplicates...OK"))
} else if(nrow(dft) > 0){
if(report_duplicates){
print("top duplicates...")
dft <- arrange(dft, desc(n))
print(dft)
}
ifelse(stop_if_fail,
stop(paste0("Duplicates detected in column: ", column)),
warning(paste0("Duplicates detected in column: ", column)))
}
}
if(return_df){
df_copy_for_later
}
}
#' Check if a dataframe has the same number of rows as another, or else 0 rows. If vectors are given the lengths of the vectors are compared.
#'
#' @param df1 dataframe or vector to check (required)
#' @param df2 optional second dataframe or vector to compare (if not given, defaults to zero row data frame)
#' @param stop_if_fail T/F for whether to consider failure an error
#' @param report_rowcount T/F for whether to return the number of rows
#' @param return_df T/F whether to end function with dataframe 1 input (as in if a check in part of a pipe)
#'
#' @return several options depending on whether it fails or succeeeds
#' @export
#'
#' @examples
#' expect_same_number_of_rows(mtcars, mtcars)
#' # [1] "Same number of rows...OK"
#'
#' expect_same_number_of_rows(mtcars, iris)
#' # Error in ifelse(stop_if_fail, stop(paste0("Different number of rows: ", :
#' # Different number of rows: 32 vs: 150
#'
#' expect_same_number_of_rows(mtcars)
#' # Error in ifelse(stop_if_fail, stop(paste0("Different number of rows: ", :
#' # Different number of rows: 32 vs: 0
expect_same_number_of_rows <- function(df1, df2 = data.frame(), stop_if_fail = TRUE, report_rowcount = FALSE, return_df = TRUE){
# df2 = data.frame() default means if df2 is not specified, it checks if df1 has zero rows
if(return_df){
df_copy_for_later <- df1
}
# if df is a vector not a df, make it into a df
if (!("data.frame" %in% class(df1)) & is.vector(df1)){
df1 <- data.frame(df1)
}
if (!("data.frame" %in% class(df2)) & is.vector(df2)){
df2 <- data.frame(df2)
}
if((!("data.frame" %in% class(df1)) & !is.vector(df1)) |
(!("data.frame" %in% class(df2)) & !is.vector(df2))){
print(class(df1))
print(typeof(df1))
print(class(df2))
print(typeof(df2))
stop("One of the inputs is neither a data frame nor vector")
}
if (nrow(df1) == nrow(df2)){
if(nrow(df2) == 0){
print(paste0("No rows found as expected...OK"))
} else{
print(paste0("Same number of rows", ifelse(report_rowcount, paste0(": ", nrow(df1)), ""), "...OK"))
}
} else{
ifelse(stop_if_fail,
stop(paste0("Different number of rows: ", nrow(df1), " vs: ", nrow(df2))),
warning(paste0("Different number of rows: ", nrow(df1), " vs: ", nrow(df2))))
}
if(return_df){
df_copy_for_later
}
}
#' Check if the column names you expect to be in the df, are indeed in there
#'
#' @param df
#' @param colums_expected a character vector
#' @param return_df T/F whether to end function with dataframe input (as in if a check in part of a pipe)
#'
#' @return
#' @export
#'
#' @examples
#' expect_column_names_somewhere_in_data_frame(mtcars, c("mpg", "cyl"))
#' # [1] "all columns found...OK"
#' expect_column_names_somewhere_in_data_frame(mtcars, c("mpg", "cyl", "car_name"))
#' # Error in expect_column_names_somewhere_in_data_frame(mtcars, c("mpg", :
#' # car_name column not found
expect_column_names_somewhere_in_data_frame <- function(df, colums_expected, return_df = TRUE){
if(return_df){
df_copy_for_later <- df
}
if(sum(names(df) %in% colums_expected) == length(colums_expected)){
print("all columns found...OK")
} else{
cols_not_found <- colums_expected[!(colums_expected %in% names(df))]
stop(paste0(paste0(cols_not_found, collapse = ", "), " column",
ifelse(length(cols_not_found) > 1, "s", ""),
" not found"))
}
if(return_df){
df_copy_for_later
}
}
#' Check that values in a discrete or categorical vector are within a set of acceptable values
#'
#' @param test_vector vector to test
#' @param correct_vector vector of all acceptable values
#'
#' @return
#' @export
#'
#' @examples
#' expect_values_only_in(mtcars$cyl, c(2, 4, 6))
#' # Error in expect_values_only_in(mtcars$cyl, c(2, 4, 6)) :
#' # 8 value not found in list given
#' expect_values_only_in(mtcars$cyl, c(2, 4, 6, 8))
#' # [1] "all values expected...OK"
expect_values_only_in <- function(test_vector, correct_vector){
if(typeof(test_vector) != typeof(correct_vector) |
class(test_vector) != class(correct_vector)){
stop(paste0("typeof() or class() of test_vector does not match correct_vector"))
}
if(sum(!(unique(test_vector) %in% correct_vector)) == 0 ){
print("all values expected...OK")
} else{
vals_not_found <- unique(test_vector)[!(unique(test_vector) %in% correct_vector)]
stop(paste0(paste0(vals_not_found, collapse = ", "), " value",
ifelse(length(vals_not_found) > 1, "s", ""),
" not found in list given"))
}
}
#' Check if there are any NA values in a data frame, or specified column, within a tolerance
#'
#' @param df
#' @param test_column character string for column to test - optional
#' @param na_tolerance number of NA allowed before failure, default is zero
#' @param return_df T/F whether to end function with dataframe input (as in if a check in part of a pipe)
#'
#' @return
#' @export
#' @importFrom dplyr select_
#'
#' @examples
#' expect_no_nas(mtcars, "cyl")
#' # [1] "Detected 0 NAs...OK"
#' expect_no_nas(mtcars)
#' # [1] "Detected 0 NAs...OK"
#' expect_no_nas(c(0, 3, 4, 5))
#' # [1] "Detected 0 NAs...OK"
#' expect_no_nas(c(0, 3, NA, 5))
#' # Error in expect_no_nas(c(0, 3, NA, 5)) : Detected 1 NAs
expect_no_nas <- function(df, test_column = NA, na_tolerance = 0, return_df = TRUE){
if(return_df){
df_copy_for_later <- df
}
if(!is.na(test_column)){
# -i to handle multiple columns, need to iterate, maybe create or find a function that will make mult names
df <- select_(df, as.name(test_column))
}
na_sum <- sum(is.na(df))
if(na_sum > na_tolerance){
stop(paste0("Detected ", na_sum, " NAs"))
} else{
print(paste0("Detected ", na_sum, " NAs...OK"))
}
if(return_df){
df_copy_for_later
}
}
#' Check if a set of columns are dates. Useful for debugging PowerBI R scripts with dates.
#'
#' @param df
#' @param cols character vector of columns names to check if dates
#' @param stop_if_fail T/F for whether to consider failure an error
#' @param return_df T/F whether to end function with dataframe input (as in if a check in part of a pipe)
#'
#' @return
#' @export
#'
#' @examples
expect_date <- function(df, cols, stop_if_fail = TRUE, return_df = TRUE){
if(return_df){
df_copy_for_later <- df
}
if (!("data.frame" %in% class(df)) & is.vector(df)){
df <- data.frame("vector" = df)
cols <- "vector"
}
# if any not found, then return error
for(co in cols){
if (sum(colnames(df) %in% co) == 0) {
# -i: would like to return all the columns not found, not just the first
stop(paste0("No column named: ", co))
}
}
dfdates <- sapply(df[cols], function(x) is.Date(x))
if(sum(dfdates) == length(cols)){
print(paste0("all columns are dates...OK"))
} else if(sum(dfdates) < length(cols)){
ifelse(stop_if_fail,
stop(paste0("Column is not a date: ", names(which(!dfdates)))),
warning(paste0("Column is not a date: ", names(which(!dfdates)))))
}
if(return_df){
df_copy_for_later
}
}
#' Write a data frame to a temporary .csv and open in Excel
#'
#' @param df
#' @param n suffix for temporary csv file name, defaults to Sys.time()
#' @param open T/F for whether to auto-open in Excel
#' @param del T/F whether to deleted the created file
#' @param del_wait how many seconds to wait to delete the file
#'
#' @return
#' @export
#'
#' @examples
#' wc(mtcars)
#' mtcars %>% filter(cyl == 4) %>% wc()
wc <- function(df, n = NA, open = TRUE, del = TRUE, del_wait = 10){
# by default, add a timestamp to current temp.csv file
if(is.na(n)){
n <- gsub(":| ", "_", Sys.time())
} else if(n == ""){
n <- ""
} else if(n == "+1" | n == -1 | n == 0){
temp_files <- list.files()[grepl(pattern = "temp[0-9]*\\.csv", list.files())]
nm1 <- max(as.numeric(
gsub(".csv", "", gsub("temp", "", temp_files))),
na.rm = T)
if(nm1 == -Inf){
n <- 1
} else {
n <- nm1 + 1
}
}
readr::write_csv(df, paste0("temp", n, ".csv"))
if(open){
system(paste0("open ",(paste0("temp", n, ".csv"))))
}
if(del){
Sys.sleep(del_wait)
file.remove(paste0("temp", n, ".csv"))
}
}
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.