R/prep_xgboost.R

Defines functions ARMA_creator MA_creator AR_creator split_data

#' @export
#' @rdname xgboost_prep

split_data <- function(sample,spliti){
  names(sample)[1] <- "date"
  sample <- sample %>%
    dplyr::mutate(.,
                  months = lubridate::month(date),
                  years = lubridate::year(date),
                  weeks = lubridate::week(date),
                  days = lubridate::day(date),
                  quarter = lubridate::quarter(date,fiscal_start = 1),
                  semester = lubridate::semester(date))

  n_sample <- round(nrow(sample)*spliti)
  split.at = sample[n_sample,"date"]
  split = which(sample$date<split.at)
  out <- NULL
  out$df.train <-  sample[split,] # Predictor training data set
  out$y.train <- sample[split,c(2)] # Outcome for training data set
  out$date.train <- sample[split,c(1)] # Date, not a predictor but useful for plotting

  out$df.test <- sample[-split,] # Predictors for testing/evaluation data set
  out$y.test <- sample[-split,c(2)] # Outcome for testing data set
  out$date.test <- sample[-split,c(1)] # date for test data set

  return(out)
}
#' @export
#' @rdname xgboost_prep
AR_creator <- function(df,variable,lag){
  names(df)[1] <- "date"
  df$date <- as.Date(df$date,origin = "1970-01-01")

  if(lag > 0){
  df <- df[,c("date",variable)]
  df <- df %>%  timetk::tk_augment_lags(contains(variable), .lags = 1:lag)
  df <- df[,c(-1,-2)]
  return(df)
  }else{
    return(df)
  }
}

#' @export
#' @rdname xgboost_prep
MA_creator <- function(df,variable,avg_len){

  if(avg_len > 0){
  avg_len <- as.numeric(avg_len)
  x <- zoo::zoo(df[,variable])

  df <- as.data.frame(zoo::rollmean(x, k = avg_len, fill = NA))
  names(df)[1] <- paste("MA_",variable,sep = "")
  return(df)
  }else{
  return(df)
  }

}
#' @export
#' @rdname xgboost_prep
# input variable for "Close"
ARMA_creator <- function(res,variable){

  names(res)[1] <- "date"

  help_df <- res %>% dplyr::select(-date)

  help_df <- help_df %>% select_if(~ !any(is.na(.)))

  list_var <- names(help_df)

  optlags <- NULL


  for(i in list_var[-1]){
    lag <- VARselect(help_df[,c(variable,i)],lag.max = 10, type = "const")$selection[["AIC(n)"]]
    optlags <- c(optlags,lag)
  }
  # insert here
  lag <- VARselect(help_df[,variable],lag.max = 10, type = "const")$selection[["AIC(n)"]]

  optlags <- c(optlags,lag)

  list_ma <- rep(5,each=ncol(help_df))

  bb <- mapply(c,optlags, list_var, SIMPLIFY = T)
  cc <- mapply(c,list_ma, list_var, SIMPLIFY = T)



  for(i in 1:length(list_var)){
    cols_ar <- AR_creator(res,bb[2,i],bb[1,i])
    res <- cbind(res,cols_ar)
  }
  for(i in 1:length(list_var)){
    cols_ma <- MA_creator(res,cc[2,i],cc[1,i])
    res <- cbind(res,cols_ma)
  }

  return(res)

}
#' @export
#' @rdname xgboost_prep

ARMA_creator_for <- function(res,res_pre){

  res_pre <- res_pre[1:nrow(res),]
  list_col <- names(res_pre)[!(names(res_pre) %in% names(res))]

  for(i in list_col){
    res[,i] <- rep(NA,nrow(res))
  }

  res[, list_col] <- lapply(list_col, function(x) as.numeric(res[[x]]))

  return(res)
}



#' @export
#' @rdname xgboost_prep
lag_cols <- function(res,variable){    #input variable for "Close"
  #insert here
  b <- res %>% dplyr::select(-Dates,-variable) %>%  lag(1)
  #insert here
  res <- res %>% dplyr::select(Dates,variable) %>%  cbind(b)

  res <- res[-1,]

  return(res)
}

#' @export
#' @rdname xgboost_prep
make_ts_stationary <- function(res){

  for(i in 2:ncol(res)){
    # optlags <- VARselect(res[,i],lag.max = 10,
    #                      type = "const")$selection[["AIC(n)"]]
    #
    if(adf.test(res[,i],k=2)$p.value > 0.1){
      res[,i] <- c(diff(res[,i],1),NA)

    }

  }
  res <- drop_na(res)

  return(res)
}
#' @export
#' @rdname xgboost_prep
split_data_for <- function(sample,n_ahead,ftype,variable){
  names(sample)[1] <- "date"
  sample <- sample %>%
    dplyr::mutate(.,
                  months = lubridate::month(date),
                  years = lubridate::year(date),
                  weeks = lubridate::week(date),
                  days = lubridate::day(date),
                  quarter = lubridate::quarter(date,fiscal_start = 1),
                  semester = lubridate::semester(date))

  n <- dim(sample)[1]
  out <- NULL
  out$df_train <- sample[1:(n-n_ahead),]
  out$df_forecast <- sample[(n-n_ahead)+1:n,c("date","days","weeks","months","years","quarter","semester")]
  out$y_forecast <- sample[(n-n_ahead)+1:n,variable] #insert here
  out$f_dates <- sample[(n-n_ahead)+1:n,"date"]

  out$df_forecast <- drop_na(out$df_forecast)
  out$y_forecast <- out$y_forecast[1:n_ahead]
  out$f_dates <-  out$f_dates[1:n_ahead]

  n_train <- dim(out$df_train)[1]
  past_features <- out$df_train[(n_train-n_ahead)+1:n_train,]
  past_features <- drop_na(past_features)

  #out$y_forecast <- past_features %>% dplyr::select(Close)

  if(ftype == "past_features"){

    # append past X's to test data
    past_features <- past_features %>% dplyr::select(-date,-days,-weeks,-months,-years,-quarter,-semester)

    out$df_forecast <- cbind(out$df_forecast,past_features)
    #out$date_forecast <- out$df_forecast %>% dplyr::select(date)


  }else if(ftype == "forecasted_features"){
    covariates <- out$df_train %>% dplyr::select(-date,-days,-weeks,-months,-years,-quarter,-semester) %>%
      names()

    for(i in covariates){

      Lambda <- forecast::BoxCox.lambda(out$df_train[,i])
      arima_fit <-  forecast::auto.arima(out$df_train[,i],D=1,approximation = F,allowdrift = T,
                                         allowmean = T,seasonal = T,lambda = Lambda)
      preds_cov <- forecast::forecast(arima_fit,h = n_ahead)

      out$df_forecast <- cbind(out$df_forecast,fcast = preds_cov$mean)
      names(out$df_forecast)[ncol(out$df_forecast)] <- paste0(i)

      # out$date_forecast <- out$df_forecast %>% dplyr::select(date)

    }
  }else{
    help_df <- data.frame(matrix(ncol = ncol(sample)-ncol(out$df_forecast)
                                 , nrow = n_ahead))
    x_names <- out$df_train %>% dplyr::select(-date,-days,-weeks,-months,-years,-quarter,-semester) %>% names()
    colnames(help_df) <- x_names

    out$df_forecast <- cbind(help_df,out$df_forecast)
    #move date column to the first position
    out$df_forecast <- out$df_forecast[,c(which(colnames(out$df_forecast)=="date"),which(colnames(out$df_forecast)!="date"))]

    #numeric

    out$df_forecast[, x_names] <- lapply(x_names, function(x) as.numeric(out$df_forecast[[x]]))

  }
  return(out)
}

#' @export
#' @rdname xgboost_prep

split_data_for_ahead <- function(sample,n_ahead2,ftype2){
  names(sample)[1] <- "date"
  sample <- sample %>%
    dplyr::mutate(.,
                  months = lubridate::month(date),
                  years = lubridate::year(date),
                  weeks = lubridate::week(date),
                  days = lubridate::day(date),
                  quarter = lubridate::quarter(date,fiscal_start = 1),
                  semester = lubridate::semester(date))

  n <- dim(sample)[1]
  out <- NULL
  out$df_train <- sample

  dats_seq <- seq(from = as.Date(max(sample$date),origin = "1970-01-01")+1,
                  to = as.Date(max(sample$date),origin = "1970-01-01") + n_ahead2, by = "day")

  out$df_forecast <- data.frame(matrix(ncol = 1 , nrow = n_ahead2 ))
  x <- c("date")
  colnames(out$df_forecast) <- x

  out$df_forecast$date <- dats_seq

  out$df_forecast <- out$df_forecast %>%
    dplyr::mutate(.,
                  months = lubridate::month(date),
                  years = lubridate::year(date),
                  weeks = lubridate::week(date),
                  days = lubridate::day(date),
                  quarter = lubridate::quarter(date,fiscal_start = 1),
                  semester = lubridate::semester(date))

  n_train <- dim(out$df_train)[1]
  past_features <- out$df_train[(n_train-n_ahead2)+1:n_train,]
  past_features <- drop_na(past_features)



  if(ftype2 == "past_features"){

    # append past X's to test data
    past_features <- past_features %>% dplyr::select(-date,-days,-weeks,-months,-years,-quarter,-semester)

    out$df_forecast <- cbind(out$df_forecast,past_features)
    #out$date_forecast <- out$df_forecast %>% dplyr::select(date)


  }else if(ftype2 == "forecasted_features"){
    covariates <- out$df_train %>% dplyr::select(-date,-days,-weeks,-months,-years,-quarter,-semester) %>%
      names()

    for(i in covariates){

      Lambda <- forecast::BoxCox.lambda(out$df_train[,i])
      arima_fit <-  forecast::auto.arima(out$df_train[,i],D=1,approximation = F,allowdrift = T,
                                         allowmean = T,seasonal = T,lambda = Lambda)
      preds_cov <- forecast::forecast(arima_fit,h = n_ahead2)

      out$df_forecast <- cbind(out$df_forecast,fcast = preds_cov$mean)
      names(out$df_forecast)[ncol(out$df_forecast)] <- paste0(i)

      # out$date_forecast <- out$df_forecast %>% dplyr::select(date)

    }
  }else{
    help_df <- data.frame(matrix(ncol = ncol(sample)-ncol(out$df_forecast)
                                 , nrow = n_ahead2))
    x_names <- out$df_train %>% dplyr::select(-date,-days,-weeks,-months,-years,-quarter,-semester) %>% names()
    colnames(help_df) <- x_names

    out$df_forecast <- cbind(help_df,out$df_forecast)
    #move date column to the first position
    out$df_forecast <- out$df_forecast[,c(which(colnames(out$df_forecast)=="date"),which(colnames(out$df_forecast)!="date"))]
    #numeric

    out$df_forecast[, x_names] <- lapply(x_names, function(x) as.numeric(out$df_forecast[[x]]))

  }
  return(out)
}

#' @export
#' @rdname xgboost_prep
split_data_eval <- function(sample,n_ahead3){
  names(sample)[1] <- "date"

  sample <- sample %>%
    dplyr::mutate(.,
                  months = lubridate::month(date),
                  years = lubridate::year(date),
                  weeks = lubridate::week(date),
                  days = lubridate::day(date),
                  quarter = lubridate::quarter(date,fiscal_start = 1),
                  semester = lubridate::semester(date))

  n <- dim(sample)[1]
  out <- NULL
  out$df_train <- sample[1:(n-n_ahead3),]
  out$df_forecast <- sample[(n-n_ahead3)+1:n,]
  out$df_forecast <- drop_na(out$df_forecast)
  out$y_forecast <- sample[(n-n_ahead3)+1:n,"Close"] #insert here
  out$f_dates <- sample[(n-n_ahead3)+1:n,"date"]

  return(out)
}
lubrunn/DSP_App_Abgabe documentation built on Dec. 21, 2021, 11:51 a.m.