

# Ticker universe (intended: S&P 500; reduced to two names while testing)
symbols <- c("AAPL", "T")

# Market data: download daily bars, keep the ticker table only, drop every row
# that contains an NA, and rename the key columns to `symbol` / `date`.
prices <- as.data.table(
  na.omit(
    BatchGetSymbols(symbols, first.date = as.Date("1990-01-01"))$df.tickers
  )
)
setnames(prices, old = c("ticker", "ref.date"), new = c("symbol", "date"))

# alfred data
# For one ALFRED series id, keep only the rows whose realtime_period equals the
# maximum realtime_period, and return the first and third columns.
latest_vintage <- function(series_id) {
  vintages <- as.data.table(get_alfred_series(series_id))
  vintages[realtime_period == max(realtime_period), c(1, 3)]
}
permits <- latest_vintage("PERMIT")
tcu <- latest_vintage("TCU")
unrate <- latest_vintage("UNRATE")

# macro data
# Left joins on `date`: the rows of `permits` define the calendar.
macro_data <- Reduce(
  function(left, right) {
    merge(left, right, by = "date", all.x = TRUE, all.y = FALSE)
  },
  list(permits, tcu, unrate)
)

# Columns 2 and 3 are turned into one-period growth rates; the remaining
# column is left in levels. Rows with any NA are dropped afterwards.
macro_vars_to_growth_rates <- names(macro_data)[2:3]
macro_data[
  ,
  (macro_vars_to_growth_rates) := lapply(.SD, function(level) level / shift(level) - 1),
  .SDcols = macro_vars_to_growth_rates
]
macro_data <- na.omit(macro_data)

# dependent variable
# Forward return: adjusted price 23 rows ahead relative to the current row.
# The lead is taken within each symbol; without `by = symbol` the last 23 rows
# of one ticker would be divided into the first prices of the next ticker.
prices[, future_return := shift(price.adjusted, 23, type = "lead") / price.adjusted - 1,
       by = symbol]
# Calendar month end of every observation.
prices[, date_end_month := ceiling_date(date, "month") - days(1)]
# Keep the last available observation of each symbol in each month.
dependent_variable <- prices[, .SD[which.max(date)], by = .(symbol, date_end_month)]
dependent_variable <- dependent_variable[, .(symbol, future_return, date_end_month)]

# realtime covariates
realtime_x <- prices[, .(symbol, date, ret.adjusted.prices, volume)]

# Estimation window
est.start <- as.Date("2005-01-01")
est.end <- as.Date("2016-01-01")

# Parameters (these are meant to become function arguments later).
# Only one firm ("AAPL") is used for now.
y_data <- dependent_variable[symbol == "AAPL", .(date_end_month, future_return)]
x_macro_data <- as.data.frame(macro_data)
x_real_time <- as.data.frame(
  realtime_x[symbol == "AAPL", .(date, ret.adjusted.prices, volume)]
)
horizon <- 1L
macro_delay <- 0 # always use realtime data!
est_end <- est.end
est_start <- est.start
x_lag <- 5 # same lags for all covariates for simplicity
legendre_degree <- 3L
disp_flag <- FALSE
x.quarterly_group <- NULL
group_ar_lags <- FALSE
real_time_predictions <- FALSE

# Number of covariates in each dataset (one column of each is the date)
dim_macro <- ncol(x_macro_data) - 1
dim_real_time <- ncol(x_real_time) - 1

# Empty containers that the covariate loops append to
x_str <- NULL
x_str_out <- NULL
x_unstr <- NULL
x_unstr_out <- NULL
x_average <- NULL
x_average_out <- NULL
# Group indices start with a placeholder 0 that is dropped after the loops
group_index <- 0
group_index_un <- 0
group_index_av <- 0
ref_in <- NULL
ref_out <- NULL

# # macro covariates
# x_macro_date <- x_macro_data$date
# x_macro_data <- x_macro_data[,-1]
# data.refdate <- dependent_variable$date_end_month
# data.refdate <- data.refdate %m-% months(macro_delay)
# est.start_m <- est_start
# est.end_m <- est_end
# est.start_m <- est.start_m %m-% months(macro_delay)
# est.end_m <- est.end_m %m-% months(macro_delay)
# data.refdate <- data.refdate  %m-% months(horizon)
# est.end_m <- est.end_m  %m-% months(horizon)
# for (j_macro in seq(dim_macro)){
#   j_data <- scale(x_macro_data[,j_macro], center = TRUE, scale = TRUE)
#   # get MIDAS structure:
#   tmp <- mixed_freq_data_single(data.refdate = data.refdate, data.x = j_data, data.xdate = x_macro_date,
#                                 x.lag = x_lag, horizon = 0, est_start, est.end_m, disp.flag = disp_flag)
#   if(j_macro==1){
#     ref_in <- tmp$est.refdate
#     ref_out <- tmp$out.refdate
#     ref_in <- ref_in %m+% months(macro_delay) %m+% months(horizon)
#     if(!is.null(ref_out))
#       ref_out <- ref_out %m+% months(macro_delay) %m+% months(horizon)
#   }
#   # get Legendre weights:
#   tmp_w <- lb(legendre_degree,a=0,b=1,jmax=x_lag)
#   # store aggregated case:
#   x_str <- cbind(x_str, tmp$est.x%*%tmp_w)
#   x_str_out <- cbind(x_str_out, tmp$out.x%*%tmp_w)
#   # store unrestricted case:
#   x_unstr <- cbind(x_unstr, tmp$est.x)
#   x_unstr_out <- cbind(x_unstr_out, tmp$out.x)
#   # store averages:
#   x_average <- cbind(x_average, rowMeans(tmp$est.x))
#   x_average_out <- cbind(x_average_out, rowMeans(tmp$out.x))
#   # store group indices:
#   group_index <- c(group_index, rep(max(group_index)+1,times=legendre_degree+1))
#   group_index_un <- c(group_index_un, rep(max(group_index_un)+1,times=x_lag))
#   group_index_av <- c(group_index_av, max(group_index_av)+1)
# }

# realtime covariates
x_realtime_date <- x_real_time$date
x_realtime_data <- x_real_time[, -1]
# Reference dates are taken from the single-firm target `y_data`. Taking them
# from `dependent_variable` (all symbols) repeats every month end once per
# symbol, so the covariate matrices would get more rows than `y_in` / `y_out`.
data.refdate <- y_data$date_end_month
# Shift the sample end and the reference dates back by the forecast horizon;
# `horizon = 0` is then passed to mixed_freq_data_single() below.
est_end <- est_end %m-% months(horizon)
data.refdate <- data.refdate %m-% months(horizon)

# Legendre weights depend only on the degree and the lag count, which are the
# same for every covariate, so they are computed once.
tmp_w <- lb(legendre_degree, a = 0, b = 1, jmax = x_lag)

for (j_real_time in seq_len(dim_real_time)) {
  j_data <- scale(x_realtime_data[, j_real_time], center = TRUE, scale = TRUE)
  # get MIDAS structure:
  tmp <- mixed_freq_data_single(data.refdate = data.refdate, data.x = j_data, data.xdate = x_realtime_date,
                                x.lag = x_lag, horizon = 0, est_start, est_end, disp.flag = disp_flag)

  # The reference dates are identical for every covariate: set them once.
  if (j_real_time == 1L) {
    # Undo the horizon shift. The targets are dated at month ends, and
    # `%m-%` truncates e.g. Mar 31 to Feb 28, so after moving forward the date
    # is snapped back to the end of its month to match `y_data$date_end_month`.
    # NOTE(review): assumes tmp$est.refdate / tmp$out.refdate are date vectors
    # taken from `data.refdate` - confirm against mixed_freq_data_single().
    ref_in <- ceiling_date(tmp$est.refdate %m+% months(horizon), "month") - days(1)
    if (!is.null(tmp$out.refdate)) {
      ref_out <- ceiling_date(tmp$out.refdate %m+% months(horizon), "month") - days(1)
    }
  }
  # store aggregated case:
  x_str <- cbind(x_str, tmp$est.x %*% tmp_w)
  x_str_out <- cbind(x_str_out, tmp$out.x %*% tmp_w)
  # store unrestricted case:
  x_unstr <- cbind(x_unstr, tmp$est.x)
  x_unstr_out <- cbind(x_unstr_out, tmp$out.x)
  # store averages:
  x_average <- cbind(x_average, rowMeans(tmp$est.x))
  x_average_out <- cbind(x_average_out, rowMeans(tmp$out.x))
  # store group indices:
  group_index <- c(group_index, rep(max(group_index) + 1, times = legendre_degree + 1))
  group_index_un <- c(group_index_un, rep(max(group_index_un) + 1, times = x_lag))
  group_index_av <- c(group_index_av, max(group_index_av) + 1)
}

if (is.null(ref_in) || is.null(ref_out)) {
  message("ref dates were not computed. likely that both macro and real time datasets where not inputed. at least one dataset must be inputed.")
}

# drop initializing zero:
group_index <- group_index[-1]
group_index_un <- group_index_un[-1]
group_index_av <- group_index_av[-1]

# split the target into in-sample and out-of-sample rows by reference date
y.data_in <- y_data[as.Date(y_data$date_end_month) %in% ref_in, ]
y.data_out <- y_data[as.Date(y_data$date_end_month) %in% ref_out, ]

y_in <- y.data_in$future_return
y_in_dates <- y.data_in$date_end_month
y_out <- y.data_out$future_return
y_out_dates <- y.data_out$date_end_month

output <- list(y_in = y_in, y_in_dates = y_in_dates, y_out = y_out, y_out_dates = y_out_dates,
               x_str = x_str, x_str_out = x_str_out, x_unstr = x_unstr, x_unstr_out = x_unstr_out, x_average = x_average, x_average_out = x_average_out,
               group_index = group_index, group_index_un = group_index_un, group_index_av = group_index_av)

# FUNDAMENTAL DATA --------------------------------------------------------

# set api token
# The token is read from the APIKEY environment variable (e.g. set in
# ~/.Renviron) so that the credential is not stored in the source file.
# NOTE(review): the token that used to be hard-coded here has been committed
# and should be treated as exposed - rotate it with the provider.
APIKEY <- Sys.getenv("APIKEY", unset = "")
if (!nzchar(APIKEY)) {
  warning("APIKEY environment variable is not set.", call. = FALSE)
}

# utils

# Window lengths (in rows of daily data) used by the rolling features below
trading_days_year <- 256
trading_days_halfyear <- trading_days_year * 0.5
trading_days_q <- trading_days_year * 0.25

# Market capitalization for the selected symbols
market_cap <- import_daily(path = "D:/fundamental_data/market_cap", extension = "csv")[symbol %in% symbols]

# import daily data
# prices <- import_daily(path = 'D:/market_data/equity/usa/day/trades', extension = 'csv')
# Attach market cap to every price row. `roll = -Inf` is a rolling join that
# carries the next available market-cap observation backwards when there is
# no exact match on `date`.
prices <- market_cap[prices, on = c("symbol", "date"), roll = -Inf]
prices[, c("year", "month") := list(year(date), month(date))]

# Rolling window lengths, longest first (12, 6 and 3 months of daily rows)
roll_windows <- c(trading_days_year, trading_days_halfyear, trading_days_q)

# Average volumes: one frollmean() call with a vector of window sizes returns
# one result per window, assigned to the three columns in the same order.
prices[
  ,
  c("Advt_12M_Usd", "Advt_6M_Usd", "Advt_3M_Usd") := frollmean(volume, roll_windows, na.rm = TRUE),
  by = symbol
]

# Rolling volatility
# NOTE(review): these use the columns `adjusted` and `close`, while the
# download step in this script creates `price.adjusted`; confirm which price
# table this section is meant to run on.
prices[
  ,
  `:=`(
    Vol1Y_Usd = roll_sd(adjusted, trading_days_year),
    Vol3Y_Usd = roll_sd(adjusted, trading_days_year * 3)
  ),
  by = symbol
]
tail(prices[, .(date, close, Vol1Y_Usd, Vol3Y_Usd)])

# Average market capitalization over the same three windows
prices[
  ,
  c("Mkt_Cap_12M_Usd", "Mkt_Cap_6M_Usd", "Mkt_Cap_3M_Usd") := frollmean(as.numeric(marketCap), roll_windows, na.rm = TRUE),
  by = symbol
]
tail(prices[, .(date, marketCap, Mkt_Cap_12M_Usd, Mkt_Cap_6M_Usd, Mkt_Cap_3M_Usd)], 10)

# Keep the last observation of each symbol in each calendar month and re-date
# it to the end of that month.
prices <- prices[, .SD[which.max(date)], by = .(symbol, year, month)]
prices[, date := ceiling_date(date, "month") - days(1)]

# Momentum (computed on the monthly rows, within each symbol)
# NOTE(review): both ratios put the older close in the numerator
# (lag 12 / lag 1 and lag 5 / lag 1) - confirm this is the intended sign.
prices[
  ,
  `:=`(
    Mom_11M_Usd = data.table::shift(close, 12) / data.table::shift(close, 1) - 1,
    Mom_5M_Usd = data.table::shift(close, 5) / data.table::shift(close, 1) - 1
  ),
  by = symbol
]
head(prices[, .(date, close, Mom_11M_Usd, Mom_5M_Usd)], 15)

# Regression labels: future_return() applied to `adjusted` for horizons of
# 1, 3, 6 and 12 (monthly rows), separately for each symbol.
prices[
  ,
  c("R1M_Usd", "R3M_Usd", "R6M_Usd", "R12M_Usd") := lapply(
    c(1, 3, 6, 12),
    function(h) future_return(adjusted, h)
  ),
  by = symbol
]
# NOTE(review): the preview below reads `adjClose`, the labels read `adjusted`.
head(prices[, .(date, adjClose, R1M_Usd, R3M_Usd, R6M_Usd, R12M_Usd)])

# Classification labels: TRUE when the return is above the cross-sectional
# median of the same date.
prices[
  ,
  c("R1M_Usd_C", "R12M_Usd_C") := list(
    as.factor(R1M_Usd > median(R1M_Usd, na.rm = TRUE)),
    as.factor(R12M_Usd > median(R12M_Usd, na.rm = TRUE))
  ),
  by = date
]

# Balance sheet data for the selected symbols
balance <- as.data.table(
  import_daily(path = "D:/fundamental_data/balance_sheet", extension = "csv")
)
balance <- balance[symbol %in% symbols]

# Ratios for the selected symbols, ordered by symbol and date so that the
# rolling sum below runs in time order within each symbol
ratios <- as.data.table(
  import_daily(path = "D:/fundamental_data/ratios", extension = "csv")
)
ratios <- ratios[symbol %in% symbols][order(symbol, date)]

# Derived ratios
# price   : price-to-book times book value per share
# eps_ttm : sum of the last 4 rows of netIncomePerShare per symbol
#           (presumably quarterly rows, i.e. trailing twelve months - confirm)
# pe_ttm  : price over eps_ttm
ratios[, price := pbRatio * bookValuePerShare]
ratios[, eps_ttm := frollsum(netIncomePerShare, 4), by = symbol]
ratios[, pe_ttm := price / eps_ttm]

# Attach balance sheet columns to every ratios row (join on symbol and date)
ratios <- balance[ratios, on = c("symbol", "date")]

# Align fundamentals with the monthly price rows: each ratios row is keyed by
# the month end of its `fillingDate`, then left-joined onto `prices`.
ratios[, date_end_month := ceiling_date(fillingDate, "month") - days(1)]
data <- merge(
  prices, ratios,
  by.x = c("symbol", "date"),
  by.y = c("symbol", "date_end_month"),
  all.x = TRUE, all.y = FALSE
)

# define features and labels
labels_reg <- c("R1M_Usd", "R3M_Usd", "R6M_Usd", "R12M_Usd")
labels_clf <- c("R1M_Usd_C", "R12M_Usd_C")
# The vector is now closed properly: the original ended with a trailing comma
# and no closing parenthesis, which is a syntax error.
# NOTE(review): a "non ml factor features" group was announced here but no
# entries for it are present - add them before the closing parenthesis.
features <- c(
  "Advt_3M_Usd", "Advt_6M_Usd", "Advt_12M_Usd",
  "cashPerShare", "dividendYield", "eps_ttm",
  "Mkt_Cap_3M_Usd", "Mkt_Cap_6M_Usd", "Mkt_Cap_12M_Usd",
  "Vol1Y_Usd", "Vol3Y_Usd",
  "Mom_5M_Usd", "Mom_11M_Usd",
  # 'netDebtToEBITDA',
  "pbRatio", "pe_ttm"
)

# Final table: identifiers, features and labels only
cols <- c("symbol", "date", features, labels_reg, labels_clf)
DT <- data[, ..cols]
# Every column after `symbol` and `date` is forward-filled within each symbol;
# leading NAs are kept (na.rm = FALSE).
# NOTE(review): this also forward-fills the label columns - confirm that
# carrying past labels into rows without a realised return is intended.
variables <- names(DT)[-(1:2)]
DT[, (variables) := lapply(.SD, na.locf, na.rm = FALSE), by = symbol, .SDcols = variables]
tail(DT, 20)

# TEST --------------------------------------------------------------------

# Reference run of qtarget.sort_midasml() on the macro_midasml example data
est.start <- as.Date("1990-12-01")
est.end <- as.Date("2017-03-01")
# Target series restricted to observations dated up to 2017-06-01
rgdp.data <- macro_midasml$rgdp.data[macro_midasml$rgdp.data$DATE <= as.Date("2017-06-01"), ]
qtarget.sort_midasml(
  y.data = rgdp.data,
  x.macro.data = macro_midasml$md.data,
  x.real.time = macro_midasml$text.data,
  x.quarterly_group = macro_midasml$survey.data,
  x.lag = 12,
  legendre_degree = 3,
  horizon = 1,
  macro_delay = 1,
  est.start,
  est.end,
  standardize = TRUE,
  group_ar_lags = FALSE,
  disp.flag = FALSE
)

# params
y.data <- rgdp.data
x.macro.data <- macro_midasml$md.data
x.real.time <- macro_midasml$text.data
x.quarterly_group <- macro_midasml$survey.data
x.lag <- 12
legendre_degree <- 3
horizon <- 1
macro_delay <- 1
standardize <- TRUE
group_ar_lags <- FALSE
disp.flag <- FALSE

# checks and init
# NOTE(review): the guards below were missing (the stop()/message() calls ran
# unconditionally and one brace was never closed). Each condition is
# reconstructed from the text of the message it protects - please confirm.

# At least one monthly dataset is needed.
if (is.null(x.macro.data) && is.null(x.real.time)) {
  stop("both macro and real time data were not inputed. program stops as you need monthly data to compute nowcasts. please input either x.macro.data or x.real.time on the next run.")
}
dim_macro <- dim_real.time <- dim_quarterly <- 0
if (!is.null(x.macro.data)) {
  dim_macro <- dim(x.macro.data)[2]-1 # dimension of macro data; -1 because one column is date
}
if (!is.null(x.real.time)) {
  dim_real.time <- dim(x.real.time)[2]-1
}

dim_x <- dim_macro + dim_real.time + 1
if (is.null(x.lag)) {
  stop("x.lag variable, which defines the lag structure of each covariate, must be specified.")
}
# x.lag may be a single value or one value per high-frequency covariate
if (length(x.lag) != 1 && length(x.lag) != (dim_macro + dim_real.time)) {
  stop(paste0("x.lag variable length must be either of size 1 (the same lag structure for x.macro.data and/or x.real.time) or must be of the length size equal to the total number of high-frequency covariates: ",dim_macro + dim_real.time,"."))
}
if (length(x.lag) == 1) {
  x.lag <- rep(x.lag, times = (dim_macro + dim_real.time))
}

# legendre_degree may be a single value or one value per covariate; any other
# length falls back to its first entry
if (length(legendre_degree) != 1 && length(legendre_degree) != dim_x) {
  message(paste0("Legendre polynomial degree must be specified the same for all covariates (one number) or a seperate value for each (the length size equals to the total number of covariates). the length of legendre_degree: ", length(legendre_degree), ", number of covaraites that are inputed: ", dim_x,". Legendre degree is set to (for all covariates): ",  legendre_degree[1], " - the first entry in legendre_degree"))
  legendre_degree <- legendre_degree[1]
}
if (length(legendre_degree) == 1) {
  legendre_degree <- rep(legendre_degree, times = dim_x)
}

if (macro_delay != 1) {
  message("typically macro series is published with 1 month lag. please check if your inputed macro data has different publication lag. the program does not stop, you need to re-run by re-setting macro_delay input.")
}

# storage
x_str_out <- x_str <- NULL
x_unstr_out <- x_unstr <- NULL
x_average_out <- x_average <- NULL
# group indices start with a placeholder 0
group_index <- group_index_un <- group_index_av <- 0
ref_in <- ref_out <- NULL

# computing the macro data if inputed
# NOTE(review): the enclosing guard, the loop's closing brace and the inner
# guards were missing, and a dangling `} else {` made the section unparseable.
# The structure below follows the indentation of the original lines and the
# commented-out macro block earlier in this file - please confirm.
if (!is.null(x.macro.data)) {
  x.macro_date <- x.macro.data$DATE
  x.macro_data <- x.macro.data[, -1]
  data.refdate <- y.data$DATE
  # shift reference dates and sample bounds back by the publication delay ...
  lubridate::month(data.refdate) <- lubridate::month(data.refdate) - macro_delay
  est.start_m <- est.start # NOTE(review): computed but est.start is what is passed below
  est.end_m <- est.end
  lubridate::month(est.start_m) <- lubridate::month(est.start_m) - macro_delay
  lubridate::month(est.end_m) <- lubridate::month(est.end_m) - macro_delay
  # ... and by the forecast horizon
  lubridate::month(data.refdate) <- lubridate::month(data.refdate) - horizon
  lubridate::month(est.end_m) <- lubridate::month(est.end_m) - horizon
  # append one more reference date, 3 months after the last one
  # NOTE(review): the original indentation suggests these three lines sat
  # inside a condition that is no longer present; they run unconditionally here.
  tmp_date <- data.refdate[length(data.refdate)]
  lubridate::month(tmp_date) <- lubridate::month(tmp_date) + 3
  data.refdate <- c(data.refdate, tmp_date)

  for (j_macro in seq_len(dim_macro)) {
    # Both branches of the original standardize if/else were identical, so the
    # series is always centred and scaled.
    j_data <- scale(x.macro_data[, j_macro], center = TRUE, scale = TRUE)
    # get MIDAS structure:
    tmp <- mixed_freq_data_single(data.refdate = data.refdate, data.x = j_data, data.xdate = x.macro_date,
                                  x.lag[j_macro], horizon = 0, est.start, est.end_m, disp.flag = disp.flag) #horizon is taken into account by shifting est.end_m back
    # reference dates are the same for every series: set them once and undo
    # the delay/horizon shift
    if (j_macro == 1) {
      ref_in <- tmp$est.refdate
      ref_out <- tmp$out.refdate
      lubridate::month(ref_in) <- lubridate::month(ref_in) + macro_delay + horizon
      if (!is.null(ref_out)) {
        lubridate::month(ref_out) <- lubridate::month(ref_out) + macro_delay + horizon
      }
    }
    # get Legendre weights:
    tmp_w <- lb(legendre_degree[j_macro], a = 0, b = 1, jmax = x.lag[j_macro])
    # aggregate in-sample and out-of-sample lags with the Legendre weights:
    x_str <- cbind(x_str, tmp$est.x %*% tmp_w)
    x_str_out <- cbind(x_str_out, tmp$out.x %*% tmp_w)
    # store unrestricted case:
    x_unstr <- cbind(x_unstr, tmp$est.x)
    x_unstr_out <- cbind(x_unstr_out, tmp$out.x)
    # store averages:
    x_average <- cbind(x_average, rowMeans(tmp$est.x))
    x_average_out <- cbind(x_average_out, rowMeans(tmp$out.x))
    # get group indices
    group_index <- c(group_index, rep(max(group_index) + 1, times = legendre_degree[j_macro] + 1))
    group_index_un <- c(group_index_un, rep(max(group_index_un) + 1, times = x.lag[j_macro]))
    group_index_av <- c(group_index_av, max(group_index_av) + 1)
  }
}

# library(httr)
# data <- content(GET(
#   "https://oss.uredjenazemlja.hr/rest/katHr/lrInstitutions/position?id=2432593&status=1332094865186&x=undefined&y=undefined",
#   add_headers(origin = "https://www.katastar.hr")
# ), as = "parsed", type = "application/json")
# print(data)
# data <- content(GET(
#   "https://oss.uredjenazemlja.hr/rest/katHr/parcelDetails?id=9048388&status=47204326483802",
#   add_headers(origin = "https://www.katastar.hr")
# ), as = "parsed", type = "application/json")
# print(data)
# number = 5447310
# 13314810894620
# rand = 9048388
# randStr = as.character(rand)
# n = 0
# library(V8)
# ct <- v8()
# x <- ct$eval("(function() {return Math.floor(1e7 * Math.random())})()")
# ct$eval(paste0("(function(e,t){for(var n=0,i=0;i<e.length;i++)n=(n<<5)-n+e.charAt(i).charCodeAt(0),n&=n;return null==t&&(t=e),Math.abs(n).toString().substring(0,6)+(Number(t)<<1)})(",
#                x,
#                ", null)"))
