R/cln_functions.R

Defines functions dup_diagnostic na_diagnostic

#devtools::document()
#' @export
#' @importFrom purrr is_empty
#' @importFrom na.tools any_na
#' @importFrom na.tools which_na
#' @importFrom na.tools n_na
#' @importFrom rlog log_info
na_diagnostic <- function(data) {
  df_name <- paste(substitute(data)) %>% as.character()
  
  df <- if(!is.data.frame(data)) {
    data_frame <- as.data.frame(data)
    return(data_frame)
  } else {
    data
  }
  
  #Any NA Values in the columns?
  na_col_check <- lapply(df, na.tools::any_na)[lapply(df, na.tools::any_na) == TRUE] %>% names()
  
  #If so, where are they located in each column?
  na_col_pos <- lapply(df[names(df) %in% na_col_check], na.tools::which_na)
  #Count number of NA values
  na_count_vals <- lapply(df[names(df) %in% na_col_check], na.tools::n_na)
  
  na_msg <- if(purrr::is_empty(na_col_check)) {
    rlog::log_info(paste("There are no NA values in any columns" 
                         ))
  } else {
    rlog::log_info(paste("There are", na_count_vals,
                         "NA values in columns", na_col_check
                         ))
    return(list(
      na_col_names = na_col_check,
      na_row_index = na_col_pos,
      na_sum = na_count_vals
    ))
  }
  
  
  
}



#' @export
#' @importFrom rlog log_info
dup_diagnostic <- function(data) {
  #need to filter nms todo
  df <- if(!is.data.frame(data)) {
    data_frame <- as.data.frame(data)
  } else {
    data
  }
  
  dup_col_logical <- lapply(df[, names(df)], duplicated)
  dup_col_names <- dup_col_logical %>% names()
  
  df_fct <- lapply(df[names(df) %in% dup_col_names], as.factor)
  df_fct_tbl <- lapply(df_fct, table) %>% lapply(as.data.frame) %>%
    lapply(function(x) dplyr::filter(x, x[,2] > 1)) %>%
    lapply(function(x) x[order(x[,2], decreasing = TRUE),])
  
  check <- lapply(df_fct_tbl, nrow) %>% unlist %>% as.data.frame()
  #anydups <- nrow(check)
  
  if(sum(check$.) == 0)  {
    rlog::log_info("There are no duplicate values in any column")
    
  } else {
    
    rlog::log_info(paste(
      "There are duplicate values in columns", dup_col_names
    ))
    
    return(list(
      duplicated_col_names = dup_col_names,
      duplicated_col_count = df_fct_tbl,
      duplicated_col_summary = check
    ))
  }
  
  
}


#' @export
round_ts_midnight <- function(ts_col, origin) {
  collection_col_ts_norm <- format(ts_col, "%Y-%m-%d")
  collection_col_ts_norm <- paste(collection_col_ts_norm, "23:59:59")
  collection_col_ts_norm <- as.POSIXct(collection_col_ts_norm,  tz = "UTC",
                                       format = "%Y-%m-%d %H:%M%OS",
                                       origin = origin)
  
  return(collection_col_ts_norm)
}

#' @export
parse_fct <- function(data, col_names, choose_class) {
  my_data <- data
  my_data[, c(col_names)] <- lapply(my_data[,c(col_names)], choose_class)
  return(my_data)
}

#' @export
env_warn <- function(env_nm) {
  if(!exists(env_nm)) {
    env_nm <- new.env()
    rlog::log_info("new env created")
    return(env_nm)
  } else {
    rlog::log_info("env already exists. no env overwrite made.")
    get(env_nm, envir = .GlobalEnv)
  }
}

#' @export
#This function compares two date columns to determine if they are sequential.
#If not then FALSE, if equiv.
#If dates are same then I classify as TRUE.
#key_column helps join the resultant df.
date_logic_seq <- function(key_column, prior_date_col, latter_date_col) {
  col_nm_testkit_key <- substitute(key_column) %>% deparse()
  col_nm_prior <- substitute(prior_date_col) %>% deparse()
  col_nm_latter <- substitute(latter_date_col) %>% deparse()
  
  date_df <- dplyr::tibble(key_column, prior_date_col, latter_date_col)
  
  #For some reason I need a seperate statement for equality,
  date_df$date_ck <- ifelse(prior_date_col < latter_date_col, "Time Order", "Time Unorder")
  date_df$date_ck <- ifelse(prior_date_col == latter_date_col, "Time Order", date_df$date_ck)
  
  date_df_time_unorder <- dplyr::filter(date_df, date_ck == "Time Unorder")
  date_df_time_unorder <- date_df_time_unorder[,c(1:3)]
  colnames(date_df_time_unorder) <- c(col_nm_testkit_key, col_nm_prior, col_nm_latter)
  
  readout <- ifelse(nrow(date_df_time_unorder) == 0, paste("No datetime comparison errors.",
                                                           deparse(substitute(prior_date_col)),
                                                           "occurs before",
                                                           deparse(substitute(latter_date_col)),
                                                           "in all rows"),
                    paste(nrow(date_df_time_unorder), "instances where",
                          deparse(substitute(prior_date_col)), "occurs before",
                          deparse(substitute(latter_date_col))))
  
  return(list(datetime_error_exists = readout,
              datetime_error_rows = date_df_time_unorder))
  
}


#' @export
#' @importFrom magrittr %>%
rs_connect <- function(rs_host,
                       rs_port,
                       rs_user,
                       rs_password,
                       rs_dbname) {
  rlog::log_info("Connecting to Redshift...")
  connect_rs <- DBI::dbConnect(RPostgres::Postgres(),
                               host = rs_host,
                               port = rs_port,
                               user = rs_user,
                               password = rs_password,
                               dbname = rs_dbname)
  if(DBI::dbIsValid(connect_rs)) {
    rlog::log_info("Successful Redshift Connection")
  } else {
    rlog::log_info("DB Failed To Connect")
  }
  return(connect_rs)
}

#' @export
nrow_loaded_info <- function(data, df_name) {
  df_nrow <- nrow(data)
  statement <- paste(df_nrow,"rows loaded from", df_name)
  rlog::log_info(statement)
  return(statement)
}



#' @export
#' @importFrom magrittr %>%
#This converts Rmd to R. R format is necessary to use source() in shiny apps
#Having issues? Check your wd()
#For consistent performance, use setwd(here::here()) in the code chunk this is run
#and have an R project open.
purl_mdown <- function(file_name,
                       new_file_name,
                       file_path_origin,
                       file_path_destination) {
  
  if(length(file_name) <= 1) {
    paste0(file_path_origin, "/", file_name,".Rmd") %>% knitr::purl()
    file.rename(from = paste0(file_path_origin, "/", file_name,".R"),
                to = paste0(file_path_destination, "/", new_file_name))
  } 
}

#' @export
#' @importFrom purrr map2
purl_ordered <- function(fnames) {
  
  file_nm_order <- function(fnames) {
    fname_len <- length(fnames)
    fname_ordered <- paste0("0", 1:length(fnames), "_", fnames, ".R")
    return(fname_ordered)
  }
  
  file_order_names <- function(fnames, fnames_ordered) {
    fnames_ordered[grep(fnames, fnames_ordered)]
  }
  
  fnames_ordered <- file_nm_order(fnames)
  fname_vars <- purrr::map2(fnames, fnames_ordered, file_order_names)
  
  return(fname_vars)
}

# -------------------------------------------------------------------------
virusgeeks/vgtools documentation built on April 25, 2022, 12:38 p.m.