R/time-serie.R

#==============================================================================#
# merge
# print
# is.tse
# sum, diff
# D
# tse
# update_available_ (private)

#------------------------------------------------------------------------------#
# tse creation

tse <- function(data, period = 'hour') {
  
  # input check
  
  stopifnot(period %in% c('hour', 'period', 'minute'))
  
  # create time columns
  
  data$time.period <- hour(data$time.local)*2 + minute(data$time.local)/30
  data$time.hour <- hour(data$time.local)
  data$time.day <- as.POSIXct(trunc(data$time.local, 'day'))
  data$time.wday <- wday(data$time.local)
  data$time.month <- month(data$time.local)
  data$time.year <- year(data$time.local)
  if(period == 'minute') {
    data$time.minute <- minute(data$time.local)
    data$time.min_in_day <- hour(data$time.local)*60 + minute(data$time.local)
  }
  
  # order
  
  data <- data[order(data$time.local), ] # order by time
  
  class(data) <- c('tse', class(data)) # the data is now a piece of time-series
  attributes(data)$period <- paste0('time.', period)
  
  # merge public holiday info
  
  data <- data %>%
    merge(PublicHolidayData, by.x = "time.day", by.y = "date") %>%
    dplyr::rename(time.holiday = public_holiday_name)
  # Change the holiday level to include Normal days (no holiday)
  levels(data$time.holiday) <- c(levels(data$time.holiday), 'Normal')
  data$time.holiday[is.na(data$time.holiday)] <- 'Normal'
  
  # set availability
  
  data %>% order_columns %>% update_available
  
}

order_columns <- function(tse) {
  
  atb <- attributes(tse)
  
  value_col_i <- colnames(tse) %>% 
    grep('^time\\.', ., invert = TRUE) # find all the variables starting by "time."
    
  value_col <- colnames(tse)[value_col_i] %>% # get the column names
    setdiff('available') # remove the availability column
  
  tse[, c(time_cols(tse), 'available', value_col)] %>% restore_attrib_(atb)
}

# read from a time indexed data frame, timezone is assumed 2 b SG
# time is still as string
parse_time <- function(data, time_i, time_format = '%d/%m/%Y %H:%M') {
  
  # put the local time variable first
  
  col_i <- seq_along(colnames(data))
  col_i[1] <- time_i
  col_i[time_i] <- 1
  data <- data[, col_i] 
  colnames(data)[1] <- 'time.local'
  
  # parse the time information
  
  data$time.local <- as.POSIXct(strptime(data[, 1], format = time_format, tz = 'Asia/Singapore'))
  
  data
}

#' a time series cannot have gaps in the time indexation the following 
#' method fills the missing times
#' 
#' @param data A time indexed data.frame
#' @param period One of the following: time.hour, time.period, time.minute
fill_time <- function(data, time_col, period) {
  switch(period, 
    time.hour = {
      # compute the number of indices needed
      n_ind <- interval(start = data[1, time_col], end = tail(data[, time_col])) %/% hours(1)
      # create all the indices
      data2 <- data.frame(new_time_i = data[1, time_col] + dhours(0:n_ind))
    },
    time.period = {
      # compute the number of indices needed
      n_ind <- interval(start = data[1, time_col], end = tail(data[, time_col])) %/% minutes(30)
      # create all the indices
      data2 <- data.frame(new_time_i = data[1, time_col] + 30*dminutes(0:n_ind))
    },
    time.minute = {
      # compute the number of indices needed
      n_ind <- interval(start = data[1, time_col], end = tail(data[, time_col])) %/% minutes(1)
      # create all the indices
      data2 <- data.frame(new_time_i = data[1, time_col] + dminutes(0:n_ind))
    }
    )
  
  # merge the full time_index into the one with gaps
  merge.data.frame(data2, data, by.x = 'new_time_i', by.y = time_col, all.x = TRUE) %>%
    rename_(.dots = structure(list('new_time_i'), names = time_col))
}

#' Internal method to update the availability column.
update_available <- function(data) {
  data$available <- TRUE
  for (col in data) {
    data$available <- data$available & !is.na(col)
  }
  data
}


#------------------------------------------------------------------------------#
# tse indexation and subsetting

#' Subset columns.
subcol <- function(tse, j) {
  
  atb <- attributes(tse)
  
  # add the time info columns
  cn <- j
  if(!is.character(j)) cn <- colnames(tse)[j]
  cn <- union(c(time_cols(tse), 'available'), cn)

  # subset, order columns and update availability
  tse <- `[.data.frame`(tse, T, cn) %>%
    restore_attrib_(atb) %>%
    order_columns %>%
    update_available

  # restore the attributes
  # lol
}

#' Subset assignment operator.
`[<-.tse` <- function(tse, i, j, value) {
  # currently we don't allow row subsetting as it
  # would probably kill the constant time difference
  # between rows
  stopifnot(missing(i))
  
  # save attributes
  atb <- attributes(tse)
  
  # subset
  tse <- `[<-.data.frame`(tse, i, j, value)
  
  tse %>% restore_attrib_(atb) %>% update_available # update availability and restore
}

#' Subset operator.
#' @description Currently not implemented
# `[.tse` <- function(tse, i, j) {
#   stop('[`[.tse`] Sorry I\'m not implemented')
# }

#' Filter rows.
#' 
#' @description Filter returns a data.frame
#' 
#' @return A data.frame
#' @family filter_
filter.tse <- function(.data, ...) {
   filter_(.data, .dots = lazyeval::lazy_dots(...))
}


filter_.tse <- function(.data, ..., .dots) {
  atb <- attributes(.data) # save attributes
  
  .data <- filter_(as.data.frame(.data), ..., .dots = .dots) # filter the data

  # make sure the result is still a time serie -> but we don't want a ts
  .data %>%
    arrange(time.local) %>%
    restore_attrib_(atb) %>%
    order_columns
}

mutate.tse <- function(.data, ..., .dots) {
  stop('[mutate] Sorry I\'m not implemented')
}

mutate_.tse <- function(.data, ..., .dots) {
  stop('[mutate_] Sorry I\'m not implemented')
}

#' Rename column of a tse object.
#' 
#' @description Does not allow modification of the time index column.
#' @return Modified tse object.
rename.tse <- function(.data, ..., .dots) {
  rename_(.data, .dots = lazyeval::lazy_dots(...))
}

rename_.tse <- function(.data, ..., .dots) {
  # if we are reasssigning the index column it should throw an error
  if('time.local' %in% names(list(...)))
    stop('Cannot modify the time index of a tse')
  if('time.local' %in% names(.dots))
    stop('Cannot modify the time index of a tse')
  
  atb <- attributes(.data) # save attributes
  rename_(as.data.frame(.data), ..., .dots = .dots) %>% # rename the columns
    restore_attrib_(atb) %>%
    order_columns
}
# 
# select.tse <- function(.data, ..., .dots) {
#   filter_(.data, .dots = lazyeval::lazy_dots(...))
# }
# 
# select_.tse <- function(.data, ..., .dots) {
#   stop('[select_] Sorry I\'m not implemented')
# }

#' Remove useless rows from a time-serie
#' 
#' @description Some time-series start or end with pure NAs rows in the value
#' columns. This rows bring no information and can be safely removed. This is 
#' the purpose of this function
#' 
#' @return The same tse without the useless pure NAs rows at the stard and end
#' of the time-serie
clamp_tse <- function(tse) {
  # initialize vector who locates empty rows
  any_available <- rep(TRUE, dim(tse)[1])
  # compute empty row vectors
  for(col in get_value_cols(tse))
    any_available <- any_available & !is.na(tse[, col])
  # run-length encoding of empty rows
  r <- rle(any_available)
  
  # if starts with empty rows
  if(!r$values[1])
    tse <- tail(tse, -r$lengths[1])
  # if ends with empty rows
  if(!tail(r$values, 1))
    tse <- head(tse, -tail(r$lengths, 1))
  
  tse
}

#------------------------------------------------------------------------------#
# Operations on multiple tse's: merging, cbind, compatibility

#' Merging of time series.
#' 
#' @description This function merges a time serie with another time-serie or
#' data.frame.
#' 
#' @param tse A time-serie object.
#' @param y A time-serie object or a data.frame object.
#' @return a tse object
#' @family cbind.tse, rbind.tse, are_compatible
merge.tse <- function(tse, y, by = intersect(names(tse), names(y)),
                      by.x = by, by.y = by){
  atb <- attributes(tse) # serve attributes
  
  tse2 <- merge.data.frame(tse, y, by, by.x, by.y, all.x = TRUE, sort = FALSE)
  
  tse2 <- tse2[order(tse2$time.local), ] # order by time the new columns
  
  tse[, setdiff(names(tse2), names(tse))] <- tse2[, setdiff(names(tse2), names(tse))]
  
  tse %>% restore_attrib_(atb) %>% update_available
}

#' Checks whether several time-series share a common time-indexation.
#' 
#' @description check whether 2 time-series are compatible (same frequency, 
#' same start and end) = same time indexation
#' 
#' @return Boolean value
are_compatible <- function(tse1, tse2) {
  stopifnot(is.tse(tse1))
  stopifnot(is.tse(tse2))
  
  t1 <- identical(period_tse(tse1), period_tse(tse2)) # equality of periods
  t2 <- identical(tse1$time.local[1], tse2$time.local[1]) # equality of starting date
  t3 <- identical(tail(tse1$time.local, 1), tail(tse2$time.local, 1)) # equality of end date
  
  t1 & t2 & t3
}

#' Binds columns of compatible time-series.
#' @return tse object
#' @family merge.tse, rbind.tse, are_compatible
cbind.tse <- function(tse1, tse2, deparse.level = 1) {
  # input check
  stopifnot(are_compatible(tse1, tse2))
  
  atb <- attributes(tse1)
  cbind.data.frame(tse1, tse2[, get_value_cols(tse2)]) %>%
    restore_attrib_(atb)
}

#' Concatenate time-series.
#' @return tse object
#' @family cbind.tse, merge.tse, are_compatible
rbind.tse <- function(tse1, tse2, deparse.level = 1) {
  stop('[rbind.tse] Sorry not implemented :(')
}

#------------------------------------------------------------------------------#
# tse utilities

# get the period
period_tse <- function(tse) { attr(tse, 'period') }

get_period_in_day <- function(tse) {
  switch(period_tse(tse),
    'time.hour' = 'time.hour',
    'time.period' = 'time.period',
    'time.minute' = 'time.min_in_day')
}

is.tse <- function(data) {
  
  atb <- attributes(data)
  
  # check class attribute and columns
  t <- inherits(data, 'tse') &
    all(c(time_cols(data), 'available') %in% colnames(data))
  
  # chack that there are no gaps, and that it is ordered
  
  if (identical(atb$period, 'hour')) t <- t & all(diff(data$time.local) == dhours(1))
  if (identical(atb$period, 'period')) t <- t & all(diff(data$time.local) == dminutes(30))
  
  # check that the availability flag is correctly set
  #lol
  
  # return
  t
  
}

# print method, by default doesn't show the time variables
print.tse <- function(tse, n = 6L, show_all = F) {
  
  # find the value columns
  
  value_col_i <- colnames(tse) %>% 
    grep('^time\\.', ., invert = TRUE) # find all the variables starting by "time."
  
  value_col <- colnames(tse)[value_col_i] %>% # get the column names
    setdiff('available') # remove the availability column
  
  # print some meta-info
  
  period_help <- c(time.hour = 'hourly', time.period = 'half-houly', time.minute = 'every minute')
  c('Date range: ', tse$time.local[1] %>% format, ' -> ', 
    tail(tse$time.local, 1) %>% format, '\n') %>% cat # time range
  c('Local time in:', attr(tse$time.local[1], 'tzone'), 
    '    time inveral:', period_help[attributes(tse)$period], '\n') %>% cat # print the timezone and period
  c('#rows:', dim(tse)[1], 
    '   #columns:', length(time_cols(tse)) + 1, '+', length(value_col),
    '   size:', object_size(tse) %>% format, '\n\n') %>% cat
  
  #paste(c(length(value_col), 'value columns:', value_col, '\n\n'), collapse = ' ') %>% cat # print value column names

  # select the columns to show
  
  display_cols <- c('time.local', 'available', value_col)
  if(show_all) display_cols <- colnames(tse)
  
  #
  tse[, display_cols] %>% head(n) %>% print.data.frame # print the data
}

as.data.frame.tse <- function(tse) {
  # alter attributes
  class(tse) <- 'data.frame'
  attributes(tse)$period <- NULL
  
  tse
}

restore_attrib_ <- function(tse, atb) { 
  structure(tse, class = c('tse', 'data.frame'), 
            period = atb$period) 
}

# get all the time indexation columns
time_cols <- function(tse) {
  per <- period_tse(tse)
  # period is hour
  if(per == 'time.hour')
    return(c('time.local', 'time.year', 'time.month', 'time.day', 'time.wday', 
             'time.holiday', 'time.hour', 'time.period'))
  # period is half-hour
  if(per == 'time.period')
    return(c('time.local', 'time.year', 'time.month', 'time.day', 'time.wday', 
             'time.holiday', 'time.hour', 'time.period'))
  # period is minute
  if(per == 'time.minute')
    # time.min_in_day is the the index of the minute in the day
    # time.min_in_day is between 0 and 60*24-1 = 1439
    return(c('time.local', 'time.year', 'time.month', 'time.day', 'time.wday', 
             'time.holiday', 'time.hour', 'time.minute', 'time.min_in_day'))
  
}

# get all the base columns: time index + availability
base_cols <- function(tse) c(time_cols(tse), 'available')

# get all the columns that are not for internal use
get_value_cols <- function(tse) {
  setdiff(colnames(tse), base_cols(tse))
}

# get the columns name who have the specified type
get_cols_type <- function(tse, sel_type) {
  tse %>% sapply(typeof) %>% 
    grep(pattern = sel_type, value = T) %>% names
}

# get the column names that are factors ()
get_cols_class <- function(tse, sel_class = 'factor') {
  r <- tse %>% sapply(class) %>% grepl(pattern = sel_class)
  colnames(tse)[r]
}

#-------------------------------------------------------------------------------
# time-series operators

split_tse <- function(data, ratio, var_out) {
  l <- dim(data)[1]
  
  data[, var_out] <- seq_len(l) <= l*ratio
  
  data
}

split_tse2 <- function(data, var_out, ...) {
  l <- dim(data)[1]
  ratios <- list(...) %>% c(recursive = TRUE)
  stopifnot(all(ratios>=0))
  stopifnot(all(ratios<=1))
  stopifnot(identical(sum(ratios), 1))
  
  r_old <- 0
  index <- seq_len(l)
  col <- rep('', l) # result column
  for(n in names(ratios)) {
    this_span <-  index >= l*r_old & index <= l*(r_old + ratios[[n]])
    col[this_span] <- n
    r_old <- r_old + ratios[[n]]
  }
  data[, var_out] <- factor(col)
  data
}

D <- function (data, col, n = 1L) {
  stopifnot(col %in% colnames(data))
  
  colnamestart <- paste(col, 'd', sep = '.')
  for (k in n) {
    data[, paste(colnamestart, k, sep = '')] <- lag(data[, col], k)
  }
  
  data
}

# Lags several columns at once, columns to be lagged and their
# orders can be given as parameters:
# - D_multi(tse, load = c(1,3), drybulb = 1:4)
# Or as a list:
# - D_multi(tse, list = list(load=c(1,3), drybulb = 1:4))
D_multi <- function(tse, ..., orders_list) {
  # list parameter is missing
  if(missing(orders_list) || is.null(orders_list))
    orders <- list(...) # use the dots
  else orders <- orders_list
  
  # input checks
  stopifnot(names(orders) %in% colnames(tse)) # all the orders must match some column in the tse
  stopifnot(all(c(list(...), recursive = TRUE) >= 0)) # all orders must be non-negative
  for(col in names(orders))
    tse <- D(tse, col, orders[[col]]) 
  tse
}


diff.tse <- function(data, col1, col2 = paste(col1, 'd1', sep = '.'), out_col) {
  stopifnot(c(col1, col2) %in% colnames(data))
  
  data[, out_col] <- data[, col1] - data[, col2]
  
  data
  
}

sum.tse <- function(tse, col1, col2, out = paste(col1, col2, sep = '+')) {
  tse[, out] <- tse[, col1] + tse[, col2]
}
EBlonkowski/timeseries documentation built on May 6, 2019, 2:57 p.m.