R/decomposition.R

################################################################################
# Time serie component decomposition
################################################################################

#------------------------------------------------------------------------------#
# Interface

#' The default component for time-series decomposition.
#' 
#' @description A character vector of the component expected for a time serie
#' decomposition. Note that this is temporary only. Later only 'residual' will
#' be enforced and any decomposition will be allowed.
default_decomp <- c('trend', 'seasonal', 'daily', 'ar', 'residual')

#' Time-serie component decomposition
#' 
#' @description 
#' 
#' @param cols Character vector of the columns that should be decomposed. If the
#' decomposition is missing for one of several column then that column is
#' decomposed with a null decomposition (all components equals to 0 and a
#' residual component equal to the full time-serie) and a warning is issued. But
#' no error is thrown.
#' 
#' @param strategy A function that takes as parameter one tse, one column name
#' and that outputs a data.frame with 1 column per component. Right now the
#' default components are: trend, seasonal, daily, ar. They should
#' match the output of the strategy function column names.
#' 
#' @return A tse object. It is the modified version of the input tse object. For
#' each column in cols, 4 new columns are created or overwritten in the tse: 
#' decomp.col.trend, decomp.col.seasonal, decomp.col.daily and decomp.col.ar
#' 
#' @family has_correction, apply_correct
tse_decomp <- function(tse, cols, strategy) {
  return(tse)
}

#' Time-serie component composition and recomposition.
#' 
#' @description apply_correct apply the correction according to the components 
#' selected and put the results in a column called "decomp.cola.corrected".
#' 
#' @param cols A character vector of the columns on which should the correction
#' be applied
#' 
#' @param comps A character vector which indicates the components to include in
#'  the recomposition
#' 
#' @return The initial tse with the decomp.*.corrected columns updated
#' 
#' @usage apply_correct(g17, "load", "trend")
#' 
#' @family tse_decomp, has_correction
#' 
#' @export
apply_correct <- function(tse, cols, comps) {
  return(tse)
}

#' Check if correction stack if available for specified column.
#' 
#' @description The correction stack a.k.a. time-serie decomposition is
#' implemented as special columns in the tse object: columns starting by
#' "decomp." and having a suffix like ".trend" or something else. When the
#' time-serie decomposition is created those column are created. This function
#' checks whether those columns are present or missing. If missing they can be
#' created using a call to tse_decomp.
#' It would be really good if the function could also check that the sum of the
#' componenent + the residual sums up to the original column, but currently that
#' is not implemented.
#' 
#' @param col A character vector of length 1, which has to match one of the
#' column names of tse
#' 
#' @return A boolean. True if the decomposition is available false otherwise.
#' 
#' @family tse_decomp, apply_correct
#' 
#' @export
has_correction <- function(tse, col) {
  # input checks
  stopifnot(is.tse(tse))
  # col is a character vector of length 1
  stopifnot(is.character(col) & identical(length(col), 1L))
  stopifnot(col %in% get_value_cols(tse))
  
  # get the decomposition columns
  comps_cols <- get_decomp_cols(tse, col)
  # check that the default decomposition columns are present
  present <- decomp_col(col, default_decomp) %in% comps_cols
  # if not all the value columns are present then it is not a valid
  # decomposition and we return FALSE
  if(!all(present))
    return(FALSE)
  # if all the required decomposition columns are present
  # we check that they sum to the original column:
  all.equal()
}

#------------------------------------------------------------------------------#
# Decomposition utils

#' [Internal] Get the column name of a time-serie component.
#' 
#' @param col Character vector of column names.
#' @param comp Character vector of component names.
#' @return Character vector. Column names of the components
decomp_col <- function(col, comp) {
  paste('decomp', 
        interaction(col, comp, lex.order = TRUE) %>% levels,
        sep = '.')
}

#' Return all the decomposition columns names of a given value column.
#' @param col Column name
#' @return Vector of column names
get_decomp_cols <- function(tse, col) {
  pattern <- paste0('^decomp.', col)
  get_value_cols(tse) %>% grep(pattern = pattern, value = TRUE)
}

#------------------------------------------------------------------------------#
# Decomposition strategies

#' Default strategy for decomposing double seasonal time-series.
#' 
#' @description compute the trend, seasonality, daily profiles, and 
#' auto-regressive part of a time-serie according to the given model parameters
#' 
#' @return a data.frame with 4 columns: trend, seasonality, daily, ar
#' 
#' @export
compute_correction_stack  <- function(data, use_trend, 
                                      use_seasonal, n_harmonics, 
                                      use_daily, n_clusters,
                                      use_AR, value_var) {
  stopifnot(typeof(value_var) == 'character')
  
  trend <- paste(value_var, 'trend', sep = '_')
  seasonal <- paste(value_var, 'seasonal', sep = '_')
  daily_profiles <- paste(value_var, 'daily_profiles', sep = '_')
  ar <- paste(value_var, 'ar', sep = '_')
  residual <- paste(value_var, 'residual', sep = '_')
  fitted <- paste(value_var, 'fitted', sep = '_')
  
  # initialize all the components to 0
  data[,trend] <- 0
  data[,seasonal] <- 0
  data[,daily_profiles] <- 0
  data[,ar] <- 0
  
  # apply the corrections that have been selected
  if(use_trend) data[,trend] <- compute_trend(data, value_var)$component
  if(use_seasonal) {
    temp <- data
    temp[, value_var] <- data[, value_var] - data[, trend]
    data[, seasonal] <- compute_seasonal(temp, 1/seq_len(n_harmonics), value_var)$component
  }
  if(use_daily) {
    temp <- data
    temp[, value_var] <- data[, value_var] - data[, trend] - data[, seasonal]
    data[, daily_profiles] <- compute_daily(temp, n_clusters, value_var)$component
  }
  if(use_AR) {
    temp <- data
    temp[, value_var] <- data[, value_var] - data[, trend] - data[, seasonal] - data[, daily_profiles]
    data[, ar] <- compute_AR(temp, value_var)$component
  }
  
  # compute the residual component
  data[, residual] <- data[, value_var] - data[, trend] - data[, seasonal] - 
    data[, daily_profiles] - data[, ar]
  
  # compute the global fit by summing the component from each model
  data[, fitted] <- data[, trend] + data[, seasonal] + data[, daily_profiles] + data[, ar]
  
  # return the dataset with the correction
  data
}


# computes the trend component of the data
# input: d dataframe with local_time, x1
# output: lm object, trend (fitted values)
compute_trend <- function(d, value_var) {
  # compute the linear regression
  m <- paste(value_var, 'local_time', sep = ' ~ ') %>% # make the formula
    as.formula %>%
    lm(data = d)
  
  # return the results
  list(model = m, component = fitted(m))
}

# computes the seasonal component of the data
# input: - d dataframe with local_time, x1, year
#        - harmonics, vector of periods in years for the regression
# output: lm object, seasonal (fitted values)
compute_seasonal <- function(d, harmonics, value_var) {
  n_harmonics <- length(harmonics)
  
  # compute the basis functions for the regression
  t <- as.numeric(d$local_time)
  t <- 2*pi/3600/24/365 * matrix(rep(t, n_harmonics), ncol = n_harmonics) %*% diag(1/harmonics)
  d <- data.frame(x = d[, value_var], cos(t), sin(t))
  
  # compute the linear regression
  m <- lm(x ~ ., data = d)
  
  # return the results
  list(model = m, component = fitted(m))
}

# computes the daily component of the data
# input: - d dataframe with local_time, x1, day, period
#               Should have no incomplete days
#        - n_clusters: number of clusters to be used
# output: kmeans object, daily (fitted values), clusters (per point)
compute_daily <- function(d, n_clusters, value_var) {
  # we need numeric day for kmeans to work?
  d$day <- as.numeric(d$day)
  
  # cast the data into daily profiles
  daily_profiles <- 
    melt(d, id.vars = c('local_time', 'day', 'period'), measure.vars = value_var) %>%
    dcast(day ~ period)
  
  # sort daily profiles by day
  daily_profiles[order(daily_profiles$day),]
  
  # apply the kmeans clustering algorithm
  m <- kmeans(x = select(daily_profiles, -day), centers = n_clusters)
  
  # merge the clustering information back into the dataset
  d <- 
    data.frame(day = daily_profiles$day, clusters = m$cluster) %>%
    merge(d, all.y = TRUE)
  
  # flatten the daily component, daily_profiles is sorted by days so we can just string together all the rows
  component <- c(t(fitted(m, method = "centers")))
  
  list(model = m, component = component, clusters = d$clusters)
}

# computes the AR component of the data
# input: d dataframe with local_time, x1
# output: AR object, AR (fitted values)
compute_AR <- function(truc, value_var) {
  
  load_ts <- ts(data = truc[, value_var], start = 1, end = dim(truc)[1], frequency = 1)
  m <- ar(load_ts)
  
  # the first values of the fit are NA
  component <- load_ts - resid(m)
  component[seq_len(m$order)] <- load_ts[seq_len(m$order)]
  
  
  #component %>% length %>% print
  
  list(model = m, component = component)
}
EBlonkowski/timeseries documentation built on May 6, 2019, 2:57 p.m.