R/data_validation.R

Defines functions validate_ts

Documented in validate_ts

#' Validate Time Series Data
#' 
#' This functions fill gaps in time series and provide revelant statistics
#'
#' @param .data Tibble
#' @param na_values list: which values to consider and replace, by default: 
#' na_values = list(y_var = 0, reg_value = 0, reg_name = "")
#'
#' @return Tibble
#' @export
#'
validate_ts <- function(.data, na_values = list(y_var = 0, reg_value = 0, reg_name = "")){
  
  key <- attributes(.data)[["key"]]
  
  fill_ts <- function(.data){
    seq_complete <- seq.Date(from = as.Date(min(.data[["date_var"]]))
                             , to = as.Date(max(.data[["date_var"]]))
                             , by = as.character(.log_init$prescription$freq_name))
    n_missing_dates <- (length(seq_complete) - length(unique(.data[["date_var"]])))
    missing_dates <- base::setdiff(as.character(seq_complete), as.character(.data[["date_var"]]))
    dates_with_reg <- .data %>% 
      filter(is.na(reg_name)==F | reg_name != "", reg_value !=0) %>% 
      pull(date_var) %>% 
      unique() %>% 
      as.character()
    
    duplicated_dates <- .data[["date_var"]][duplicated(.data[["date_var"]])]
    
    if(length(duplicated_dates)>0){
      suspicious_cases <- .data %>% 
        filter(date_var %in% duplicated_dates)
      
      if(n_distinct(suspicious_cases$reg_name) > length(duplicated_dates)){
        warning_log <- "Multiple regressors have been found for a same date"
        if(sum(suspicious_cases$reg_value == 0) == length(duplicated_dates)){
          solution_log <- "Zeros have been found, they will be filtered out"
          vec_excluded <- (.data$reg_value == 0 & .data$date_var %in% duplicated_dates)
          .data <- .data[!vec_excluded, ]
        } else {
          solution_log <- "No solution has been applied"
        }
        
        log_update(module = "dates_check"
                   , key = attributes(.data)[["key"]]
                   , new_log = list(n_dates = length(seq_complete)
                                    , duplicated_dates = duplicated_dates
                                    , missing_dates = missing_dates
                                    , dates_with_reg = dates_with_reg
                                    , warning = warning_log
                                    , solution = solution_log))
        }
    }
    
    log_update(module = "dates_check"
               , key = attributes(.data)[["key"]]
               , new_log = list(n_dates = length(seq_complete)
                                , missing_dates = missing_dates
                                , duplicated_dates = duplicated_dates
                                , dates_with_reg = dates_with_reg
                                , warning = "No duplicated dates have been found."))
    
    # Output ------------------------------------------------------------------
    
    tmp <- seq_complete %>% 
      enframe(value = "date_var") %>% 
      left_join(.data, by = "date_var") %>%
      do({
        tmp_1 <- tibble(.)
        if(all(is.na(tmp_1$reg_name))){
          tmp_1 %>% 
            replace_na(replace = list(y_var = na_values[["y_var"]]
                                      , reg_value = na_values[["reg_value"]]
                                      , reg_name = na_values[["reg_name"]]))
        } else {
          tmp_1 %>% 
            #fill("reg_name", .direction = "down") %>% 
            replace_na(replace = list(reg_value = 0
                                      , y_var = na_values[["y_var"]]))
        }
      }) %>% 
      dplyr::select(-name) %>% 
      structure("key" = attributes(.data)[["key"]])
    
    return(tmp)
  }
  
  .data %>% 
    fill_ts()
  
  #list(.data = fill_ts(.data), logger)
}
opoyc/autoforecast documentation built on May 18, 2021, 1:29 a.m.