R/get_granularity.R

Defines functions get_granularity

Documented in get_granularity

#' Get granularity of values for each Key in df
#'
#' Labels every `Key` in `df` as `"daily"`, `"weekly"` or `"monthly"`:
#' \itemize{
#'   \item weekly: every gap between consecutive dates of the key (after
#'     sorting by date) is a multiple of 7 days, from 7 up to 700 days;
#'   \item monthly: either the key's `Value` has zero variance within every
#'     calendar month for which a variance can be computed, or the key has
#'     exactly one row in every calendar month it appears in;
#'   \item daily: every remaining key.
#' }
#' A key that qualifies as both weekly and monthly is labelled `"weekly"`.
#'
#' Keys detected as monthly through the zero-variance rule are collapsed
#' before labelling: `Value` is multiplied by the largest per-key row count
#' observed for that month among those keys, only rows dated on the first day
#' of a month are kept, and only the columns `Key`, `Date` and `Value` are
#' carried over for them.
#'
#' @param df data frame containing at least Columns Key, Date and Value
#' @return `df` (with the zero-variance monthly keys collapsed as described
#'   above, so the number of rows may be smaller than `nrow(df)`) plus a
#'   character column `granularity`.
#' @export
get_granularity <- function(df) {
  assertthat::assert_that(is.data.frame(df), msg = "df must be a data.frame")
  assertthat::assert_that(
    all(c("Key", "Date", "Value") %in% colnames(df)),
    msg = "df must include columns 'Key', 'Date' and 'Value'"
  )

  # Bind the pipe locally instead of attaching packages with library(), so
  # calling this function does not alter the caller's search path.
  `%>%` <- dplyr::`%>%`

  # Weekly keys: all gaps are 7, 14, ..., 700 days. Rows are sorted per key
  # first because diff() is only meaningful on ordered dates, and gaps are
  # measured in whole calendar days. The first row of each key gets a gap
  # of 7 so that it never disqualifies the key.
  keys_weekly <- df %>%
    dplyr::group_by(Key) %>%
    dplyr::arrange(as.Date(Date), .by_group = TRUE) %>%
    dplyr::mutate(
      gap_days = c(7, as.numeric(diff(as.Date(Date)), units = "days"))
    ) %>%
    dplyr::summarise(all_gaps_weekly = all(gap_days %in% ((1:100) * 7))) %>%
    dplyr::filter(all_gaps_weekly) %>%
    dplyr::pull("Key") %>%
    unique() %>%
    as.character()

  # Monthly keys (rule 1): Value is constant within each calendar month.
  # Months with an undefined variance (e.g. fewer than two usable values)
  # are dropped by na.omit() and therefore ignored.
  # NOTE(review): `Variance == 0` is an exact floating point comparison;
  # confirm constant series always produce a variance of exactly 0.
  keys_monthly_constant <- df %>%
    dplyr::mutate(Month = format(as.Date(Date), "%Y-%m")) %>%
    dplyr::group_by(Key, Month) %>%
    dplyr::summarise(Variance = stats::var(Value, na.rm = TRUE)) %>%
    stats::na.omit() %>%
    dplyr::group_by(Key) %>%
    dplyr::summarise(
      no_monthly_variance = all(Variance == 0, na.rm = TRUE)
    ) %>%
    dplyr::filter(no_monthly_variance) %>%
    dplyr::pull("Key") %>%
    unique() %>%
    as.character()

  # Collapse those keys to one row per month: scale Value by the largest
  # per-key row count seen for that month and keep the first-of-month row.
  monthly_collapsed <- df %>%
    dplyr::filter(Key %in% keys_monthly_constant) %>%
    dplyr::mutate(
      Month = lubridate::month(Date),
      Year = lubridate::year(Date)
    ) %>%
    dplyr::group_by(Key, Month, Year) %>%
    dplyr::mutate(NumberOfDaysThatMonth = dplyr::n()) %>%
    dplyr::group_by(Month, Year) %>%
    dplyr::mutate(NumberOfDaysThatMonth = max(NumberOfDaysThatMonth)) %>%
    dplyr::ungroup() %>%
    dplyr::mutate(Value = Value * NumberOfDaysThatMonth) %>%
    dplyr::filter(lubridate::day(Date) == 1) %>%
    dplyr::select(Key, Date, Value)

  df <- df %>%
    dplyr::filter(!Key %in% keys_monthly_constant) %>%
    dplyr::bind_rows(monthly_collapsed)

  # Monthly keys (rule 2): exactly one row in every calendar month.
  keys_monthly_sparse <- df %>%
    dplyr::mutate(Month = format(as.Date(Date), "%Y-%m")) %>%
    dplyr::group_by(Key, Month) %>%
    dplyr::mutate(Values_in_Month = dplyr::n()) %>%
    dplyr::group_by(Key) %>%
    dplyr::summarise(
      only_one_Value_in_months = all(Values_in_Month == 1)
    ) %>%
    dplyr::filter(only_one_Value_in_months) %>%
    dplyr::pull("Key") %>%
    as.character()

  # Key vectors are kept as character so that combining them with c() is
  # safe even when `Key` is a factor.
  keys_monthly <- unique(c(keys_monthly_constant, keys_monthly_sparse))
  keys_daily <- setdiff(
    as.character(unique(df$Key)),
    c(keys_weekly, keys_monthly)
  )

  # Order matters: a key that is both weekly and monthly is reported weekly.
  df %>%
    dplyr::mutate(
      granularity = dplyr::case_when(
        Key %in% keys_daily ~ "daily",
        Key %in% keys_weekly ~ "weekly",
        Key %in% keys_monthly ~ "monthly"
      )
    )
}
td-berlin/anomalizer documentation built on Feb. 21, 2020, 2:03 a.m.