# R/forecasting_engine.R

library(dtw)
library(robustbase)
library(forecast)

# Smoothing and deseasonalisation functions
#' Seasonality test
#'
#' Tests whether the autocorrelation at the seasonal lag exceeds its 90%
#' confidence limit, to decide if the series should be seasonally adjusted.
#'
#' @param input observation time series
#' @param ppy frequency of the series (periods per year)
#'
#' @return logical; \code{TRUE} if seasonal adjustment should be applied
#' @references Assimakopoulos & Nikolopoulos (2000)
#' @export
#'
#' @examples
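#' # Illustrative only (not in the original source): AirPassengers is a
#' # built-in monthly series (ppy = 12) with strong seasonality, so the test
#' # should return TRUE
#' SeasonalityTest(AirPassengers, ppy = 12)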
SeasonalityTest <- function(input, ppy){
  if (length(input) < 3*ppy){
    # Too few observations to assess seasonality reliably
    test_seasonal <- FALSE
  }else{
    xacf <- acf(input, plot = FALSE)$acf[-1, 1, 1]
    # 90% confidence limit for the autocorrelation at each lag
    clim <- 1.645/sqrt(length(input)) * sqrt(cumsum(c(1, 2 * xacf^2)))
    test_seasonal <- abs(xacf[ppy]) > clim[ppy]
    if (is.na(test_seasonal)){ test_seasonal <- FALSE }
  }
  return(test_seasonal)
}
Smoothing_ts2 <- function(x, spanw, fh){
  ppy <- frequency(x) ; ST <- FALSE ; trend <- 1:length(x)
  if (ppy > 1){ ST <- SeasonalityTest(x, ppy) }
  if (ST){
    # Box-Cox transform, remove the seasonal component estimated by STL, then
    # back-transform; "+x-x" keeps the ts attributes of the original series
    lambda <- BoxCox.lambda(x, lower = 0, upper = 1)
    bc.x <- as.numeric(BoxCox(x, lambda))
    seasonal <- stl(ts(bc.x, frequency = ppy), "per")$time.series[, 1]
    bc.x <- bc.x - as.numeric(seasonal)
    x <- as.numeric(InvBoxCox(bc.x, lambda))+x-x
    # Smooth the deseasonalised series with a first-degree loess
    suppressWarnings(x.loess <- loess(x ~ trend, span = spanw/length(x), degree = 1))
    x <- as.numeric(x.loess$fitted)+x-x
    # Seasonal indices for the in-sample period and the forecast horizon
    SIin <- seasonal
    SIout <- head(rep(seasonal[(length(seasonal)-ppy+1):length(seasonal)], fh), fh)
  }else{
    # No significant seasonality: only smooth the series
    suppressWarnings(x.loess <- loess(x ~ trend, span = spanw/length(x), degree = 1))
    x <- as.numeric(x.loess$fitted)+x-x
    SIin <- rep(0, length(x))
    SIout <- rep(0, fh)
    lambda <- 1
  }
  output <- list(series=x, seasonalIn=SIin, seasonal=SIout, lambda=lambda)
  return(output)
}
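# Illustrative sketch (not in the original source): pre-process a monthly
# series before matching; spanw = 18 mirrors the value used for monthly data
# in Forecasting() below, and AirPassengers is a built-in monthly ts.
# out <- Smoothing_ts2(AirPassengers, spanw = 18, fh = 12)
# str(out)  # $series (smoothed, deseasonalised), $seasonalIn, $seasonal, $lambda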

# Distance measure functions
# Each measure aligns the reference series with the tail of the testing series
# and divides by the number of compared observations; NA is returned when the
# reference is longer than the testing series.
dtw_dist <- function(reference, testing){
  reference <- as.numeric(reference)
  testing <- as.numeric(testing)
  if (length(reference)>length(testing)){
    DTW <- NA
  }else if (length(reference)<length(testing)){
    extract <- tail(testing,length(reference))
    DTW<-dtw(x=reference, y=extract)$distance/length(extract) 
  }else{
    DTW <- (dtw(x=reference, y=testing)$distance)/length(testing)
  }
  return(DTW)
}
Euclidean_dist <- function(reference, testing){
  reference <- as.numeric(reference)
  testing <- as.numeric(testing)
  if (length(reference)>length(testing)){
    Euclidean <- NA
  }else if (length(reference)<length(testing)){
    extract <- tail(testing,length(reference))
    Euclidean <- (sum((reference-extract)^2))/length(extract)
  }else{
    Euclidean <- (sum((reference-testing)^2))/length(testing)
  }
  return(Euclidean)
}
AD_dist <- function(reference, testing){
  reference <- as.numeric(reference)
  testing <- as.numeric(testing)
  if (length(reference)>length(testing)){
    AD <- NA
  }else if (length(reference)<length(testing)){
    extract <- tail(testing,length(reference))
    AD<- (sum(abs(reference-extract)))/length(extract)
  }else{
    AD <- (sum(abs(reference-testing)))/length(testing)
  }
  return(AD)
}
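# Illustrative only (not in the original source): with a shorter reference,
# the comparison is made against the tail of the testing series.
# Euclidean_dist(c(1, 2, 3), c(0, 3, 2, 5))  # mean squared difference, ~2.67
# AD_dist(c(1, 2, 3), c(0, 3, 2, 5))         # mean absolute difference, ~1.33
# dtw_dist(c(1, 2, 3), c(0, 3, 2, 5))        # DTW distance per compared point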

# Forecast function
#' Forecasting with similarity
#'
#' @param data_input input time series; its frequency determines which
#' reference set is loaded when \code{LoadData} is \code{TRUE}
#' @param fh forecasting horizon
#' @param nts number of most similar reference series used to form the forecast
#' @param Preprocessing if set to 1, remove seasonality and smooth the series;
#' the target and the reference series are then scaled to the same magnitude
#' @param Dist_type distance measure: 1 = DTW, 2 = Euclidean, 3 = mean absolute difference
#' @param path name (without extension) of the reference file under "reference/"
#' to load when \code{LoadData} is \code{FALSE}
#' @param clustering number of clusters for k-means on the distances;
#' 0 disables clustering and keeps \code{nts} fixed
#' @param LoadData if \code{TRUE}, load the reference set matching the frequency
#' of \code{data_input}; otherwise load it from \code{path}
#' @param FullOutput if \code{TRUE}, also return in-sample fits, the selected
#' reference forecasts and their distances
#'
#' @return A list generated by forecasting with similarity
#' @export
#'
#' @examples
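#' \dontrun{
#' # Hypothetical call (not from the original source): assumes a monthly
#' # reference set has been prepared as "reference/ref12.RData" and
#' # "reference/ref12_1.RData"
#' Forecasting(AirPassengers, fh = 12, nts = 100, Preprocessing = 1,
#'             Dist_type = 3, path = NULL, LoadData = TRUE)
#' }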
Forecasting <- function(data_input, fh, nts, Preprocessing, Dist_type, path, clustering=0, LoadData=TRUE, FullOutput=FALSE){
  
  # Load the reference set (ref_info and list_ref), either by the frequency of
  # the input series or from a user-supplied file name
  if (LoadData){
    load(paste0("reference/ref",frequency(data_input),".RData"))
    load(paste0("reference/ref",frequency(data_input),"_",Preprocessing,".RData"))
  } else {
    load(paste0("reference/", path,".RData"))
    load(paste0("reference/", path,"_",Preprocessing,".RData"))
  }
  
  # loess smoothing span (in observations) for yearly, quarterly and monthly data
  if (frequency(data_input)==1){
    spanw <- 6
  } else if (frequency(data_input)==4){
    spanw <- 8
  } else if (frequency(data_input)==12){
    spanw <- 18
  }
  
  # Never keep the target longer than the shortest reference series
  limit <- min(ref_info$lengths)
  if (length(data_input)>limit){
    data_input <- tail(data_input, limit)
  }
  
  # Keep only reference series with enough observations
  cd <- length(data_input) + fh
  keep <- ref_info[ref_info$lengths>=cd,]$ordnum
  smoothed_ref <- lapply(keep, function(x) as.numeric(tail(list_ref[[x]], cd)))
  
  # Smooth target series
  if (Preprocessing==1){
    out_req <- Smoothing_ts2(data_input, spanw, fh)
    smoothed_tar <- out_req$series
    SeasIndIn <- out_req$seasonalIn
    SeasInd <- out_req$seasonal
    lambda <- out_req$lambda
  }else{
    smoothed_tar <- data_input
    SeasIndIn <- rep(0, length(data_input))
    SeasInd <- rep(0, fh)
  }
  
  # Split reference set and target series into insample and outsample
  insample_ref <- lapply(c(1:length(smoothed_ref)), function(x) head(smoothed_ref[[x]], length(smoothed_ref[[x]])-fh))
  outsample_ref <- lapply(c(1:length(smoothed_ref)), function(x) tail(smoothed_ref[[x]], fh))
  insample_tar <- smoothed_tar
  
  # Normalize reference set and target series
  Scale_ref <- lapply(c(1:length(insample_ref)), function(x) as.numeric(tail(insample_ref[[x]],1)) )
  Scale_tar <- as.numeric(tail(insample_tar,1))
  
  insample_ref_scale <- lapply(c(1:length(insample_ref)), function(x) insample_ref[[x]]/Scale_ref[[x]]) 
  outsample_ref_scale <- lapply(c(1:length(outsample_ref)), function(x) outsample_ref[[x]]/Scale_ref[[x]]) 
  insample_tar_scale <- insample_tar/Scale_tar
  
  # Compute distances
  if (Dist_type==1){
    Dist <- unlist(lapply(c(1:length(insample_ref_scale)), function(x) dtw_dist(insample_tar_scale,insample_ref_scale[[x]])))
  }else if (Dist_type==2){
    Dist <- unlist(lapply(c(1:length(insample_ref_scale)), function(x) Euclidean_dist(insample_tar_scale,insample_ref_scale[[x]])))
  }else{
    Dist <- unlist(lapply(c(1:length(insample_ref_scale)), function(x) AD_dist(insample_tar_scale,insample_ref_scale[[x]])))
  }
  
  # Create forecasts: rank the reference series by their distance to the target
  temp_dist <- Dist
  temp_order <- rank(temp_dist)
  temp_id <- c(1:length(temp_dist))
  temp <- data.frame(temp_id,temp_dist,temp_order)
  temp <- temp[order(temp_order),] ; row.names(temp) <- NULL
  temp <- na.omit(temp)
  
  # Optionally let k-means on the distances choose nts: the number of series in
  # the cluster with the smallest mean distance
  if (clustering >= 2){
    fit <- kmeans(Dist, clustering)
    nts <- length(which(fit$cluster == which.min(aggregate(Dist, by=list(fit$cluster), FUN=mean)[,2])))
  }
  
  temp_c_in <- array(NA, c(nts,length(insample_tar_scale)))
  for (i in 1:nts){
    temp_c_in[i,] <- as.numeric(insample_ref_scale[[temp$temp_id[i]]])
  }
  if (is.null(nrow(temp_c_in))){
    temp_fcs_in <- temp_c_in
  }else{
    temp_fcs_in <- colMedians(temp_c_in)
  }
  temp_fcs_in <- temp_fcs_in*Scale_tar
  temp_c_in <- temp_c_in*Scale_tar
  # Re-apply the seasonal component (in Box-Cox space) if the series was deseasonalised
  if (!all(SeasIndIn==0)){
    temp_fcs_in <- InvBoxCox( (BoxCox(temp_fcs_in, lambda) + SeasIndIn) , lambda)
    for (i in 1:nts){
      temp_c_in[i,] <- InvBoxCox( (BoxCox(temp_c_in[i,], lambda) + SeasIndIn) , lambda)
    }
  }
  
  if (fh>1){
    temp_c <- array(NA, c(nts,fh))
    for (i in 1:nts){
      temp_c[i,] <- as.numeric(outsample_ref_scale[[temp$temp_id[i]]])
    }
    if (is.null(nrow(temp_c))){
      temp_fcs <- temp_c
    }else{
      temp_fcs <- colMedians(temp_c)
    }
    temp_fcs <- temp_fcs*Scale_tar
    temp_c <- temp_c*Scale_tar
    if (!all(SeasInd==0)){
      temp_fcs <- InvBoxCox( (BoxCox(temp_fcs, lambda) + SeasInd) , lambda)
      for (i in 1:nts){
        temp_c[i,] <- InvBoxCox( (BoxCox(temp_c[i,], lambda) + SeasInd) , lambda)
      }
    }
  } else {
    temp_c <- array(NA, nts)
    for (i in 1:nts){
      temp_c[i] <- as.numeric(outsample_ref_scale[[temp$temp_id[i]]])
    }
    # temp_c is a vector here; the median over the similar series gives the
    # forecast (with a single similar series, the median is that series itself)
    temp_fcs <- median(temp_c)
    temp_fcs <- temp_fcs*Scale_tar
    temp_c <- temp_c*Scale_tar
    if (SeasInd!=0){
      temp_fcs <- InvBoxCox( (BoxCox(temp_fcs, lambda) + SeasInd) , lambda)
      for (i in 1:nts){
        temp_c[i] <- InvBoxCox( (BoxCox(temp_c[i], lambda) + SeasInd) , lambda)
      }
    }
  }
  
  if (FullOutput){
    return(list(fcs_in = temp_fcs_in, similarfcs_in = temp_c_in, fcs = temp_fcs, similarfcs = temp_c, similarDistances=temp$temp_dist[1:nts], k=nts))
  } else {
    return(list(fcs = temp_fcs, similarfcs = temp_c))
  }
  
}

#' Similarity
#'
#' Forecasting with similarity: point forecasts are formed from the most similar
#' reference series, and prediction intervals are calibrated by widening the
#' empirical quantiles with the delta that minimises the MSIS on a hold-out of
#' the last \code{fh} observations.
#'
#' @param y the observation series (vector or ts)
#' @param fh forecasting horizon
#' @param LoadData if \code{TRUE}, load the reference set matching the frequency
#' of \code{y}; otherwise load it from \code{path}
#' @param path name (without extension) of the reference file under "reference/"
#' to load when \code{LoadData} is \code{FALSE}
#' @param nts number of most similar reference series
#' @param Preprocessing if set to 1, remove seasonality and smooth the series;
#' the target and the reference series are then scaled to the same magnitude
#' @param Dist_type distance measure: 1 = DTW, 2 = Euclidean, 3 = mean absolute difference
#' @param gamma significance level of the prediction intervals
#' @param scaling scaling method passed to \code{calculate_PI_MSIS}
#' @param clustering number of clusters for k-means on the distances; 0 disables clustering
#' @param FullOutput if \code{TRUE}, also return in-sample fits, the selected
#' reference forecasts and their distances
#'
#' @return A list with the similarity forecasts and prediction intervals
#' @export
#'
#' @examples
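#' \dontrun{
#' # Hypothetical call (not from the original source): assumes a monthly
#' # reference set under "reference/" and the package's calculate_PI_MSIS()
#' res <- Similarity(AirPassengers, fh = 12, LoadData = TRUE, path = NULL)
#' res$fcs            # point forecasts
#' res$PIL; res$PIU   # calibrated prediction interval bounds
#' }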
Similarity <- function(y, fh, LoadData, path, nts=100, Preprocessing=1, Dist_type=3, gamma = 0.05, scaling = c('sdiff', 'snaive'), 
                         clustering=0, FullOutput=FALSE){
  
  # Calibrate the interval widening factor delta on a hold-out of the last fh
  # observations, minimising the MSIS computed by calculate_PI_MSIS()
  deltas = seq(0, 1, 0.01)
  all_MSIS = rep(0, length(deltas))
  y.train = head(y, length(y) - fh)
  y.test = as.vector(tail(y, fh))
  ytrainresults = Forecasting(y.train, fh=fh, nts=nts, Preprocessing=Preprocessing, Dist_type=Dist_type, path=path, 
                          clustering=clustering, LoadData=LoadData, FullOutput=FullOutput)
  if (fh>1){
    PIs_ytrain = apply(ytrainresults$similarfcs, 2, quantile, probs=c(gamma/2, 1-gamma/2))
    for (i in seq_along(deltas)){
      delta = deltas[i]
      PIL = PIs_ytrain[1,] * (1 - delta)
      PIU = PIs_ytrain[2,] * (1 + delta)
      all_MSIS[i]  = calculate_PI_MSIS(PIL, PIU, y.train, y.test, gamma = gamma, scaling = scaling)
    }
  } else {
    PIs_ytrain = quantile(ytrainresults$similarfcs, probs=c(gamma/2, 1-gamma/2))
    for (i in seq_along(deltas)){
      delta = deltas[i]
      PIL = PIs_ytrain[1] * (1 - delta)
      PIU = PIs_ytrain[2] * (1 + delta)
      all_MSIS[i]  = calculate_PI_MSIS(PIL, PIU, y.train, y.test, gamma = gamma, scaling = scaling)
    }
  }
  
  # Re-run on the full series and widen the empirical quantiles by the optimal delta
  opt.delta = deltas[which.min(all_MSIS)]
  yresults = Forecasting(y, fh=fh, nts=nts, Preprocessing=Preprocessing, Dist_type=Dist_type, path=path,
                      clustering=clustering, LoadData=LoadData, FullOutput=FullOutput)
  if (fh>1){
    PIs = apply(yresults$similarfcs, 2, quantile, probs=c(gamma/2, 1-gamma/2))
    PIL_Similarity = PIs[1,] * (1 - opt.delta)
    PIU_Similarity = PIs[2,] * (1 + opt.delta)
  } else {
    PIs = quantile(yresults$similarfcs, probs=c(gamma/2, 1-gamma/2))
    PIL_Similarity = PIs[1] * (1 - opt.delta)
    PIU_Similarity = PIs[2] * (1 + opt.delta)
  }

  if (FullOutput){
    return(list(fcs = yresults$fcs,
                similarfcs = yresults$similarfcs,
                PIL = PIL_Similarity, 
                PIU = PIU_Similarity, 
                opt.delta = opt.delta,
                fcs_in = yresults$fcs_in,
                similarfcs_in = yresults$similarfcs_in,
                similarDistances = yresults$similarDistances))
  } else {
    return(list(fcs = yresults$fcs,
                similarfcs = yresults$similarfcs,
                PIL = PIL_Similarity, 
                PIU = PIU_Similarity))
  }
}