R/time_series.R

Defines functions Check_forTimeGaps BuildTimeDf Example CreateTimeGrid TestCreateTimeGrid TimestampDf_toCounts EventTimestamp_toAggrDt TestEventTimestamp_toAggrDt BuildTimeDf2 BuildDatetime CreateTemporalGrid Example Build_predictorTemporalNH Example CalcTimestampProgress Example Build_eventTimestamp Example MovingAvg Example Build_eventPredictor Example Build_eventExplanList Example FindSeriesChunks Example PlotMultiCurves Example MergeDfList Example Example CreateLagMat2 CreateLagMat Example CreateLongtMat2 CreateLongtMat Example CreateLagIndFcn Example CreateFsFcn Example CreateFsIndFcn Example CreateLagFsFcn Example GaussLinRegSol Example BetaGiven_volatParams LlikGaussReg Example LlikGaussRegVarVolat Example MaxGaussLlikGivenBeta Example FitGaussLin_varyingVol FitGaussVolatMod predict.gaussVolatMod print.gaussVolatMod Example BetaVarCov BetaPrec Example ModSelSubsetGen VarOnMean CoefOfVar CreateLagData GenBoundsDfGg GenBoundsDf Example GenBoundsFcn Example LongtAvgPlt

#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#    https://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

# authors: Alireza Hosseini, Reza Hosseini

## Time series manipulation and forecasting functions

## testing to check if the timestamps are consecutive with no gaps
Check_forTimeGaps = function(df, timeCol) {

  x = NULL
  Func = function(i) {
    x = difftime(df[i + 1, timeCol], df[i, timeCol])
    return(x)
  }

  x = lapply(1:(nrow(obsDf)-1), FUN=Func)
  x = unlist(x)
  return(summary(x))
}

## build a time data frame with useful
# temporal attributes for plotting and modeling
BuildTimeDf = function(
  ts,
  format="%Y-%m-%d %H:%M:%S",
  tz="GMT",
  origin_forTimeVars=2000) {

  ts = as.POSIXct(ts, format=format, tz=tz)
  ## function to decide leap year, we delibrately avoid is_leap in lubridate
  IsLeap = function(year) {
    out = (year %% 400 == 0) + (year %% 100 != 0)*(year %% 4 == 0)
    return(out > 0)
  }

  second = lubridate::second(ts)
  minute = lubridate::minute(ts)
  hour = lubridate::hour(ts)
  doy = lubridate::yday(ts)
  #as.numeric(strftime(ts, format = "%j"))
  # time of day (ranges in 0-24)
  tod = hour + minute / 60 + second / 3600
  dom = chron::days(ts)
  week = lubridate::week(ts)
  weekday = weekdays(ts)
  isWeekend = weekday %in% c("Saturday", "Sunday")
  # day of week
  dow = with(chron::month.day.year(ts), chron::day.of.week(month, day, year))
  # time of week (ranges from 0-7)
  tow = dow + tod / 24
  month =  lubridate::month(ts)
  year =  lubridate::year(ts)
  yearLength = 365 + IsLeap(year)
  # time of year
  toy = (doy - 1) + tod / 24
  # continuous time scale (unit is in years)
  contiTime = year + toy / yearLength
  ct1 = contiTime - origin_forTimeVars
  ct2 = (ct1^2) * sign(ct1)
  ctSqrt = sqrt(abs(ct1)) * sign(ct1)
  ctCbrt = abs(ct1)^(1/3) * sign(ct1)
  date = as.Date(ts)
  timeDf = data.frame(
    date=date,
    contiTime=contiTime,
    ct1=ct1,
    ct2=ct2,
    ctSqrt=ctSqrt,
    ctCbrt=ctCbrt,
    year=year,
    month=month,
    dom=dom,
    doy=doy,
    weekday=weekday,
    isWeekend=isWeekend,
    week=week,
    dow=dow,
    tow=tow,
    tod=tod,
    toy=toy,
    hour=hour,
    minute=minute,
    second=second)

  timeDf[ , "ywoy"] = paste(timeDf[ , "year"], timeDf[ , "week"], sep=":")
  timeDf[ , "ydoy"] = paste(timeDf[ , "year"], timeDf[ , "doy"], sep=":")
  timeDf[ , 'tod_wknd'] =timeDf[ , 'tod'] * timeDf[ , 'isWeekend']

  timeDf[ , "dow_hr"] = paste(timeDf[ , "dow"], timeDf[ , "hour"], sep="_")

  ind = minute > 30
  half = rep("1sthalf", nrow(timeDf))
  half[ind] = "2ndhalf"

  timeDf[ , "dow_30min"] = paste(timeDf[ , "dow_hr"], half, sep="_")


  return(timeDf)
}

Example = function() {

  ts = strptime("01/01/2011 11:43", "%d/%m/%Y %H:%M", tz = "Europe/London")
  ts = chron::as.chron(ts)
  BuildTimeDf(ts)

}

## build a time data frame given the two end points and delta
# wrote this for uber originally
CreateTimeGrid = function(t1, t2, delta, origin_forTimeVars=2000) {

  tsGrid = seq.POSIXt(
    as.POSIXct(t1, tz="GMT"),
    as.POSIXct(t2, tz="GMT"),
    by=delta)
  tsGrid = data.frame(ts=tsGrid)
  timeDf = BuildTimeDf(
    tsGrid[ , "ts"], origin_forTimeVars=origin_forTimeVars)
  df = cbind(tsGrid, timeDf)
  return(df)
}

TestCreateTimeGrid = function() {
  t1 = chron::as.chron(strptime(
    "01/01/2011 23:53", "%d/%m/%Y %H:%M", tz = "Europe/London"))
  t2 = chron::as.chron(strptime(
    "02/01/2011 01:03", "%d/%m/%Y %H:%M", tz = "Europe/London"))
  timeDf = CreateTimeGrid(t1, t2, delta=1/(24*60))
}

## aggregates timestamp data and return counts per time interval of length delta
# it also fills in the gaps where there are no timestamps with zero count
# was originally written for uber homework
TimestampDf_toCounts = function(
  ts,
  delta,
  timeColName='ts') {
  tsDelta = trunc(ts, delta)
  tab = table(as.character(tsDelta))
  df = as.data.frame(tab)
  ## change back the factor class to chron
  df[ , 1] = as.POSIXct(
    as.character(df[ , 1]),
    format="%Y-%m-%d %H:%M:%S", tz="GMT")

  names(df) = c(timeColName, 'count')

  ## for some time intervals we may have zero logins.
  # Therefore we need to fill in the gaps
  tsDelta = df[ , 1]
  tmin = min(tsDelta)
  tmax = max(tsDelta)
  tsGrid = seq.POSIXt(
    as.POSIXlt(tmin, tz="GMT"),
    as.POSIXlt(tmax, tz="GMT"), by=delta)
  ## we may see a mismatch in size:
  # that means some time intervals are missing in our df
  ## that indicates we have some intervals with zero login
  ## in order to fill in the gaps we merge the tsGrid and df
  tsGridDf = as.data.frame(tsGrid)
  names(tsGridDf) = timeColName
  tsGridDf[ , 1] = as.character(tsGridDf[ , 1])
  df[ , 1] = as.character(df[ , 1])
  dfGapFilled = merge(tsGridDf, df, by=timeColName, all.x=TRUE, from='left')
  dfGapFilled[ , 1] = as.POSIXct(
    dfGapFilled[ , 1], format="%Y-%m-%d %H:%M:%S",
    tz="GMT")
  ## lets find out which timestamps has no logins (NA)
  indNA = which(is.na(dfGapFilled[ , 2]))
  tsNA = dfGapFilled[indNA , 1]
  ## setting NA to zero
  dfGapFilled[indNA, 2] = 0

  # adjust for the fact that the last interval count is under-estimated
  # coef below is between 0 and 1.
  # coef ~ 1;  is when we have a login close to the end of the last interval
  # coef ~ 0; is when the last login happens to spill over to the last interval
  coef = as.numeric(difftime(max(ts), tmax, unit=delta))
  # if coef < 1/2 the the data for the last interval is not reliable
  # because the last obs login is half way through the interval
  # and we delete the last row of the df
  if (coef < 1/2) {
    df = dfGapFilled[-dim(df)[1] , , drop=FALSE]
  }
  #print(coef)

  # if coef > 1/2 we have a login which is more than half way through
  # the last interval
  # therefore we proportionaly adjust (bump the count up)
  # we round down because there is an upward bias with bumpig up with coef
  if (coef < 1 & coef > 1/2) {
    #print(dfGapFilled[dim(dfGapFilled)[1], 'count'])
    x = dfGapFilled[dim(dfGapFilled)[1], 'count'] * (coef^(-1))
    newx = round(x)
    dfGapFilled[dim(dfGapFilled)[1], 'count'] = newx
    #print(dfGapFilled[dim(dfGapFilled)[1], 'count'])
  }

  outList = list(df=dfGapFilled, tsNA=tsNA)
}

## aggregates timestamp data with an associated value
# it creates a gridded data table
# and returns an aggregate eg counts (AggFunc = sum) for each time grid
# it also fills in the gaps where there are no timestamps with "naFiller"
# the naFiller is zero if the value count
# ts is a series
# timeRounding is the time rounding
# since the last timestamp is not necessarily and end point to the obsetrvation
# window, for example it could be the last time an event has occurred
# it tried to weight the last time grids value
# if the last time observed timestamp is close to the max grid point
# it does nothing
# if its more than half grid off it adjust the last value
EventTimestamp_toAggrDt = function(
    dt,
    timeCol,
    valueCol=NULL,
    timeRounding,
    newTimeCol="ts",
    timeFormat="(%Y-%m-%d %H:%M:%S)",
    AggFunc=function(x)sum(x, na.rm=TRUE),
    naFiller=0,
    roundLastValue=FALSE) {

  #tsDelta = trunc(ts, timeRounding)

  if (is.null(valueCol)) {
    valueCol = "cnt"
    dt[ , "cnt"] = 1
  }

  if (is.null(newTimeCol)) {
    newTimeCol = "ts"
  }

  ts = dt[ , get(newTimeCol)]

  dt[ , newTimeCol] = round_date(Col(dt=dt , col=timeCol), timeRounding)

  dt = DtSimpleAgg(
      dt=dt,
      gbCols=c(newTimeCol),
      valueCols=valueCol,
      AggFunc=AggFunc)

  ## for some time intervals we may have zero logins.
  # Therefore we need to fill in the gaps
  tsDelta = dt[ , get(newTimeCol)]
  tmin = min(tsDelta)
  tmax = max(tsDelta)

  delta = as.numeric(lubridate::duration(timeRounding))

  tsGrid = seq(tmin, tmax, by=delta)
  ## we may see a mismatch in size: that means some time intervals are missing
  # in our df
  # that indicates we have some intervals with zero login
  # in order to fill in the gaps we merge the tsGrid and df
  tsGridDt = data.table(tsGrid)
  names(tsGridDt) = newTimeCol

  #tsGridDt[ , 1] = as.character(tsGridDt[ , 1])
  #dt[ , 1] = as.character(dt[ , 1])

  gridDt = merge(
      tsGridDt,
      dt,
      by=newTimeCol,
      all.x=TRUE,
      from='left')
  #gridDt[ , 1] = as.chron(gridDt[ , 1], format=timeFormat)

  ## lets find out which timestamps has no logins (NA)
  indNA = which(is.na(gridDt[ , 2]))
  tsNA = gridDt[indNA , 1]
  ## setting NA to zero
  gridDt[indNA, 2] = naFiller

  # adjust for the fact that the last interval count is under-estimated
  # lastTs_gapCoef below is between 0 and 1.
  # lastTs_gapCoef ~ 1;  is when we have a login close to the end of the last interval
  # lastTs_gapCoef ~ 0; is when the last login happens to spill over to the last interval
  lastTs_gapCoef = as.numeric(max(ts) - tmax) / delta
  # if lastTs_gapCoef < 1/2 the the data for the last interval is not reliable
  # because the last obs login is half way through the interval
  # and we delete the last row of the df
  if (lastTs_gapCoef < 1/2) {
    df = gridDt[-nrow(gridDt) , , drop=FALSE]
  }


  # if lastTs_gapCoef > 1/2 we have a login which is more than half way through
  # the last interval
  # therefore we proportionaly adjust (bump the count up)
  # we round down because there is an upward bias with bumpig up with lastTs_gapCoef
  if (lastTs_gapCoef < 1 & lastTs_gapCoef > 1/2) {
    x = gridDt[nrow(gridDt), get(valueCol)] * (lastTs_gapCoef^(-1))
    if (roundLastValue) {
      x = round(x)
    }
    gridDt[dim(gridDt)[1], get(valueCol)] = x
  }

  return(list(
      "dt"=gridDt,
      "tsNA"=tsNA,
      "lastTs_gapCoef"=lastTs_gapCoef))
}

TestEventTimestamp_toAggrDt = function() {

  ts = seq(
    from=as.POSIXlt("2011-01-01 1:53"),
    to=as.POSIXlt("2011-01-01 05:53"),
    by=60)

  n = length(ts)

  df = data.frame(
      "ts"=ts,
      "event_cnt"=sample(1:10, n, replace=TRUE))

  m = round(n / 3)
  ind = sort(sample(1:n, m, replace=FALSE))

  df = df[ind, ]

  ts = df[ , "ts"]

  dt = data.table(df)

  res = EventTimestamp_toAggrDt(
    dt=dt,
    timeCol="ts",
    valueCol=NULL,
    timeRounding="5 min",
    newTimeCol=NULL,
    timeFormat="(%Y-%m-%d %H:%M:%S)")

}


## this function build time data frame
BuildTimeDf2 = function(dt) {

  library("lubridate")
  n = length(dt)
  hourVec = rep(NA, n)
  mdayVec = rep(NA, n)
  monthVec = rep(NA, n)
  quarterVec = rep(NA, n)
  wdayVec = rep(NA, n)
  weekVec = rep(NA, n)
  doyVec = rep(NA, n)
  yearVec = rep(NA, n)
  minuteVec = rep(NA, n)
  secondVec = rep(NA, n)
  todVec = rep(NA, n)
  towVec = rep(NA, n)
  toyVec = rep(NA, n)
  contiTimeVec = rep(NA, n)
  ## year, week of yea
  ywoyVec = rep(NA, n)
  ydoyVec = rep(NA, n)

  for (i in 1:n) {
    dt1 = dt[i]
    hourVec[i] = hour(dt1)
    mdayVec[i] = mday(dt1)
    monthVec[i] = month(dt1)
    quarterVec[i] = quarter(dt1)
    wdayVec[i] = wday(dt1)
    weekVec[i] = week(dt1)
    doyVec[i] = yday(dt1)
    yearVec[i] = year(dt1)
    minuteVec[i] = minute(dt1)
    secondVec[i] = second(dt1)
    # tod (time of day) is between 0 and 24
    todVec[i] = hourVec[i] + minuteVec[i]/60 + secondVec[i]/3600
    # tow (time of week) between 0 to 7
    towVec[i] = (wdayVec[i]-1) + todVec[i]/24
    # toy (time of year)
    toyVec[i] = (doyVec[i]-1) +  todVec[i]/24
    contiTimeVec[i] = yearVec[i] + toyVec[i]/365
    ywoyVec[i] = paste(yearVec[i], weekVec[i], sep=":")
    ydoyVec[i] = paste(yearVec[i], doyVec[i], sep=":")
  }
  timeDf = data.frame(
      hour=hourVec, mday=mdayVec, month=monthVec,
      quarter=quarterVec, wday=wdayVec, week=weekVec, doy=doyVec, year=yearVec,
      minute=minuteVec, second=secondVec, tod=todVec, tow=towVec, toy=toyVec,
      contiTime=contiTimeVec, ywoy=ywoyVec, ydoy=ydoyVec)
  return(timeDf)
}

## build timestamp (datetime) from date, hr, minute, sec
BuildDatetime = function(
    df,
    dateCol=NULL,
    yearCol=NULL,
    monthCol=NULL,
    dayCol=NULL,
    hrCol=NULL,
    minCol=NULL,
    secCol=NULL,
    dateFormat="%Y-%m-%d") {

  asch = as.character

  Func = function(
      date=NULL, day=NULL, month=NULL, year=NULL,
      hr="00", min="00", sec="00") {
    if (is.null(date)) {
      date=paste(year, month, day, sep="-")
      dateFormat = "%Y-%m-%d"
    }

    dt = date
    if (is.null(hr)) {hr="00"}
    if (is.null(min)) {min="00"}
    if (is.null(sec)) {sec="00"}
    dt = paste(dt, hr, sep=" ")
    dt = paste(dt, min, sep=":")
    dt = paste(dt, sec, sep=":")

    #dtFormat = paste(dateFormat,"%H:%M:%S",sep=" ")
    #out =  strptime(dt, dtFormat)
    out = dt
    return(out)
  }

  n = dim(df)[1]
  out = rep(NA, n)
  for (i in 1:n) {
    date=NULL
    year=NULL
    month=NULL
    day=NULL
    hr=NULL
    min=NULL
    sec=NULL
    if (!is.null(dateCol)) {date = df[i, dateCol]}
    if (!is.null(yearCol)) {year = df[i, yearCol]}
    if (!is.null(monthCol)) {month = df[i, monthCol]}
    if (!is.null(dayCol)) {day = df[i, dayCol]}
    if (!is.null(hrCol)) {hr=df[i, hrCol]}
    if (!is.null(minCol)) {min=df[i, minCol]}
    if (!is.null(secCol)) {sec=df[i, secCol]}

    res = F(
        date=date,
        day=day,
        month=month,
        year=year,
        hr=hr,
        min=min,
        sec=sec)

    out[i] = res
    #res = as.POSIXct(res,format="%Y-%m-%d %H:%M:%S")
    #out[i] = asch(res)
  }
  return(out)
}

## creates a grid between given dates
CreateTemporalGrid = function(
  startDate,
  endDate,
  startTime,
  endTime,
  timeDelta) {

  period_dates = seq(
    as.Date(startDate),
    as.Date(endDate),
    by="days")

  date = period_dates
  hr = 0:23
  min = 0:(round(60/timeDelta)-1)
  period_grid = expand.grid(min=min, hr=hr, date=period_dates)

  period_grid = data.frame(
      date=as.Date(period_grid[ , 3]),
      hr=period_grid[ , 2],
      min=period_grid[ , 1])

  date = period_grid[ , 1]
  hr = period_grid[ , 2]
  min = period_grid[ , 3]
  time_of_day =  hr + timeDelta*(min/(60))

  conti_time = date - as.Date(startDate) + hr/24 + timeDelta*(min/(24*60))
  conti_time = as.numeric(conti_time)

  week_days =  weekdays(as.Date(date, "%d-%m-%Y"))
  num_week_days = weekday_to_numday(week_days)
  time_of_week = num_week_days +  hr/24 + timeDelta*(min/(24*60))

  year =  strftime(date, format="%Y")
  month = strftime(date, format="%m")
  day = strftime(date, format="%d")

  timestamp = ISOdatetime(
      year=year, month=month, day=day, hour=hr,
      min=min*timeDelta, sec=0, tz = "")

  period_grid = data.frame(
    timestamp=timestamp, date=date, hr=hr,
    min=min*timeDelta, min_aggr=min, time=conti_time,
    week_day=week_days, num_day=num_week_days,
    time_of_week=time_of_week,
    time_of_day=time_of_day)

  t1_ind = which(time_of_day >= startTime)
  t2_ind = which(time_of_day <= endTime)
  ind1 = t1_ind[1]
  ind2 = t2_ind[length(t2_ind)]

  period_grid = period_grid[ind1:ind2, ]
  return(period_grid)
}

Example = function() {

  timeDelta = 30
  startDate = "2013-12-01"
  endDate = "2013-12-25"
  startTime = 9.5
  endTime = 11.5
  fut_grid = CreateTemporalGrid(
      startDate, endDate, startTime, endTime, timeDelta)
  #grid_day_categ = date_NH_fcn(fut_grid$date)
  fut_grid = cbind(fut_grid, day_categ=grid_day_categ)
}

## gets the timestamps and National Holidays to build predictors
Build_predictorTemporalNH = function(
  timestamps=NA, ord=NA, NHdatesR=NA) {

  asn = as.numeric
  dates = as.Date(timestamps)
  hr =  format(timestamps,"%H")
  hr = asn(hr)
  min =  format(timestamps,"%M")
  min = asn(min)
  week_day =  format(timestamps,"%A")
  num_day =  format(timestamps,"%w")
  num_day = asn(num_day) %% 7
  time_of_week = num_day + hr/24 + min/(24*60)
  time_of_day = time_of_week %% 1

  day_categ = date_NH_fcn(dates, NHdatesR=NHdatesR)
  week_day_binary = (as.character(week_day) %in% c("Monday", "Tuesday",
    "Wednesday", "Thursday","Friday"))
  Fri_binary = (as.character(week_day) %in% c("Friday"))
  weekend_binary = (as.character(week_day) %in% c("Saturday","Sunday"))
  Sun_binary = (as.character(week_day) %in% c("Sunday"))

  ### define national holidays indicator
  NH = !day_categ %in% c("regular", "BNH", "BNH2")

  ## define before national holiday indicator
  BNH = day_categ %in% c("BNH")

  ### define NH but not weekend!
  NH_Reg = NH * (!week_day %in% c("Sunday", "Saturday"))

  ### define BNH but not weekend!
  BNH_Reg = BNH * (!week_day %in% c("Sunday", "Saturday"))

  n = length(timestamps)

  ## defining the frequencies
  omega_week = (2*pi)/(7)
  omega_day = (2*pi)/24

  sinMat1 = matrix(NA, n, ord[1])
  cosMat1 = matrix(NA, n, ord[1])

  ### weekly patterns (1)
  for (i in 1:ord[1]) {
    sinMat1[ , i] = sin(i*omega_week*time_of_week)
    cosMat1[ , i] = cos(i*omega_week*time_of_week)
  }

  ### week-day patterns (2)
  sinMat2 = matrix(NA, n, ord[2])
  cosMat2 = matrix(NA, n, ord[2])
  for (i in 1:ord[2]) {
    sinMat2[ , i] = week_day_binary*sin(i*omega_day*time_of_day)
    cosMat2[ , i] = week_day_binary*cos(i*omega_day*time_of_day)
  }

  ### Weekend (3)
  sinMat3 = matrix(NA, n, ord[3])
  cosMat3 = matrix(NA, n, ord[3])
  for (i in 1:ord[3]) {
    sinMat3[ , i] = weekend_binary*sin(i*omega_day*time_of_day)
    cosMat3[ , i] = weekend_binary*cos(i*omega_day*time_of_day)
  }

  ### Friday patterns (4)
  sinMat4 = matrix(NA, n, ord[4])
  cosMat4 = matrix(NA, n, ord[4])
  for (i in 1:ord[4]) {
    sinMat4[ , i] = Fri_binary*sin(i*omega_day*time_of_day)
    cosMat4[ , i] = Fri_binary*cos(i*omega_day*time_of_day)
  }

  ### Sunday (5)
  sinMat5 = matrix(NA, n, ord[5])
  cosMat5 = matrix(NA, n, ord[5])
  for (i in 1:ord[5]) {
    sinMat5[ , i] = Sun_binary*sin(i*omega_day*time_of_day)
    cosMat5[ , i] = Sun_binary*cos(i*omega_day*time_of_day)
  }

  ### NH (6)
  sinMat6 = matrix(NA, n, ord[6])
  cosMat6 = matrix(NA, n, ord[6])

  for (i in 1:ord[6]) {
    sinMat6[ , i] = NH_Reg*sin(i*omega_day*time_of_day)
    cosMat6[ , i] = NH_Reg*cos(i*omega_day*time_of_day)
  }

  ### BNH (7)
  sinMat7 = matrix(NA, n, ord[7])
  cosMat7 = matrix(NA, n, ord[7])

  for (i in 1:ord[7]) {
    sinMat7[,i] = BNH_Reg*sin(i*omega_day*time_of_day)
    cosMat7[,i] = BNH_Reg*cos(i*omega_day*time_of_day)
  }

  ### 8
  X_NH_Reg = cbind(NH_Reg, BNH_Reg)

  Reg_sin = sinMat1
  Reg_cos = cosMat1

  Reg_within_day_sin = sinMat2
  Reg_within_day_cos = cosMat2

  Fri_within_day_sin = sinMat3
  Fri_within_day_cos = cosMat3

  Weekend_within_day_sin = sinMat4
  Weekend_within_day_cos = cosMat4

  Sun_within_day_sin = sinMat5
  Sun_within_day_cos = cosMat5

  NH_Reg_sin = sinMat6
  NH_Reg_cos = cosMat6

  BNH_Reg_sin = sinMat7
  BNH_Reg_cos = cosMat7

  pred_data = list(
    Reg_sin=Reg_sin,
    Reg_cos=Reg_cos,
    Reg_within_day_sin=Reg_within_day_sin,
    Reg_within_day_cos=Reg_within_day_cos,
    Fri_within_day_sin = Fri_within_day_sin,
    Fri_within_day_cos = Fri_within_day_cos,
    Weekend_within_day_sin = Weekend_within_day_sin,
    weekend_within_day_cos = Weekend_within_day_cos,
    Sun_within_day_sin = Sun_within_day_sin,
    Sun_within_day_cos = Sun_within_day_cos,
    X_NH_Reg=X_NH_Reg,
    NH_Reg_sin=NH_Reg_sin,
    NH_Reg_cos=NH_Reg_cos,
    BNH_Reg_sin=BNH_Reg_sin,
    BNH_Reg_cos=BNH_Reg_cos)

  return(pred_data)
}

Example = function() {

  O = 11
  D = 31
  startTime = 0
  endTime = 24
  startDate = "2012-12-01"
  endDate = "2013-02-01"
  pred_data_bricks = CreateTemporalGrid(
    startDate, endDate, startTime, endTime, timeDelta)
  ord = c(6, 10, 5, 5, 5, 5, 5)
  NHdatesR = NHdatesR
  timestamps = pred_data_bricks[ , "timestamp"]
  T1 = Sys.time()
  res = Build_predictorTemporalNH2(timestamps, ord, NHdatesR=NHdatesR)
  T2 = Sys.time()
  T2-T1
  #end
}

## this function calculates a progress in an event
# which happens between timeStart and timeEnd at timestamp
CalcTimestampProgress = function(
  timeStart=NA, timeEnd=NA, timestamp1=NA) {

  timestamp1 = as.POSIXct(timestamp1)
  timeStart = as.POSIXct(timeStart)
  timeEnd = as.POSIXct(timeEnd)
  asn = as.numeric
  a = asn(difftime(timestamp1, timeStart, unit="secs"))
  b = asn(difftime(timeEnd, timeStart, unit="secs"))
  return(a/b)
}

Example = function() {

  startTime = 0
  endTime = 24
  startDate = "2013-09-17"
  endDate = "2013-09-24"
  timeDelta = 30
  pred_data_bricks = CreateTemporalGrid(startDate, endDate,
    startTime, endTime, timeDelta)
  timestamps = pred_data_bricks$timestamp
  timeStart = timestamps[1]
  timeEnd = timestamps[10]
  CalcTimestampProgress(timeStart, timeEnd, timestamps[9])
}

## build event timestamps: this file creates a variable
# which tells the event status for a given timestamp
Build_eventTimestamp = function(timestamps, events_info) {

  Events = events_info[["EventID"]]
  EventsSet = unique(Events)
  event_num = length(EventsSet)

  timestamp1 = events_info$timestamp_start
  timestamp2 = events_info$timestamp_end
  timestamp1 = as.POSIXct(timestamp1)
  timestamp2 = as.POSIXct(timestamp2)

  n = length(timestamps)
  eventMatrix = matrix(0, n, event_num)
  eventDf = data.frame(eventMatrix)
  names(eventDf) = EventsSet
  eventDf = cbind(timestamp=timestamps, eventDf)
  #event_status = rep("None",n)
  #event_progress = rep(0,n)

  for (i in 1:event_num) {
    Event = EventsSet[i]
    ind = which(Events == Event)
    for (k in ind) {
      t1 = timestamp1[k]
      t2 = timestamp2[k]
      event_ind = which(timestamps >= t1 & timestamps <= t2)

      if (length(event_ind) > 0) {
      eventTimestamps = timestamps[event_ind]
      event_prog = CalcTimestampProgress(t1, t2, eventTimestamps)
      eventDf[event_ind, (i + 1)] = event_prog
      }
    }
  }

  return(eventDf)
}

Example = function() {

  startTime = 0
  endTime = 24
  startDate = "2013-09-19"
  endDate = "2013-09-24"
  timeDelta = 30
  pred_data_bricks = CreateTemporalGrid(startDate, endDate,
    startTime, endTime, timeDelta)
  timestamps = pred_data_bricks$timestamp
  events_status = Build_eventTimestamp(timestamps, events_info)
}


## (old function, check) the weighted moving average with missing allowed
MovingAvg = function(
  vec,
  filterNum=NULL,
  filterPpns=NULL,
  missNum=0,
  periodic=FALSE) {

  if (!is.null(filterNum) && filterNum > length(vec)) {
    'Warning: filterNum bigger than vec size'
    return(rep(mean(vec, na.rm=TRUE), length(vec)))
  }

  l = length(filterPpns)
  if (is.null(filterNum)) {
    filterNum = 0
  }
  if (!is.null(filterPpns)) {
    filterNum = l
  }
  if (is.null(filterPpns)) {
    filterPpns = rep(1, filterNum)
  }
  l = length(filterPpns)
  filterWeights = c(filterPpns[l:1], 1, filterPpns[1:l])
  filterWeights = filterWeights/sum(filterWeights)
  #print(filterWeights)
  L = length(vec)
  out = rep(NA, L)
  v = rep(NA, L)
  if (periodic) {
    v = vec
  }

  vecvec = c(v, vec, v)
  for (q in 1:L) {
    part = vecvec[(L+q-filterNum):(L+q+filterNum)]
    miss = is.na(part)
    partMissNum = sum(miss)
    if (partMissNum <= missNum) {
      ind = which(is.na(part) == FALSE)
      subPart = part[ind]
      filterWeights2 = filterWeights[ind]
      filterWeights2 = filterWeights2/sum(filterWeights2)
      subPart = subPart*filterWeights2
      out[q] = sum(subPart)
    }
  }
  return(out)
}

##Example
Example = function() {
  vec = rnorm(100)
  filterNum = 5
  filterPpns = c(1/2, 1/3, 1/4, 1/5, 1/6)
  missNum = 2
  y = MovingAvg(vec, filterNum=filterNum, filterPpns=filterPpns, missNum=missNum)
  plot(vec)
  lines(y,col='red')

  y[5:6] = c(NA, NA)
  y = MovingAvg(y, filterNum=filterNum, filterPpns=filterPpns, missNum=missNum)
  plot(vec)
  lines(y,col='red')


  y[5:6] = c(NA, NA)
  y = MovingAvg(y, filterNum=filterNum, filterPpns=filterPpns, missNum=missNum)
  plot(vec)
  lines(y, col='red')

  ## periodic Examples
  vec = rnorm(100)
  filterNum = NULL
  filterPpns = c(1/2, 1/3, 1/4, 1/5, 1/6)
  missNum = 2
  #par(mfrow=c(3,2))
  y = MovingAvg(vec, filterNum=filterNum, filterPpns=filterPpns,
    missNum=missNum, periodic=TRUE)
  plot(vec)
  lines(y,col='red')

  y[5:6] = c(NA, NA)
  y = MovingAvg(y, filterNum=filterNum, filterPpns=filterPpns,
    missNum=missNum, periodic=TRUE)
  plot(vec)
  lines(y,col='red')

  y[5:6] = c(NA, NA)
  y = MovingAvg(y, filterNum=filterNum, filterPpns=filterPpns,
    missNum=missNum, periodic=TRUE)
  plot(vec)
  lines(y,col='red')

  y[5:6] = c(NA, NA)
  y = MovingAvg(y, filterNum=filterNum, filterPpns=filterPpns,
    missNum=missNum, periodic=TRUE)
  plot(vec)
  lines(y,col='red')


  y[5:6] = c(NA, NA)
  y = MovingAvg(y, filterNum=filterNum, filterPpns=filterPpns,
    missNum=missNum, periodic=TRUE)
  plot(vec)
  lines(y,col='red')

  y[5:6] = c(NA, NA)
  y = MovingAvg(y, filterNum=filterNum, filterPpns=filterPpns,
    missNum=missNum, periodic=TRUE)
  plot(vec)
  lines(y, col='red')
}

## this function gets event status which is the progress of the event
# for Example 0,0,0,0,0.1,0.2,...,1....
# this returns the Fourier series with cos terms in even positions
# and sin terms in odd
Build_eventPredictor = function(event_status, ord) {

  n = length(event_status)
  sinMat = matrix(NA, n, ord)
  cosMat = matrix(NA, n, ord)

  for (i in 1:ord) {
    sinMat[ , i] = sin(i*2*pi*event_status)*(event_status>0)
    cosMat[ , i] = cos(i*2*pi*event_status)*(event_status>0)
  }

  trigMat = matrix(NA, nrow=n, ncol=2*ord)
  trigMat = data.frame(trigMat)
  for (i in 1:ord) {
    trigMat[ , (2*i-1)] = sinMat[ , i]
    names(trigMat)[2*i-1] = paste("sin", i, sep="")
    trigMat[ , 2*i] = cosMat[ , i]
    names(trigMat)[2*i] = paste("cos", i, sep="")
  }
  #pred_data = list(event_sin=sinMat,event_cos=cosMat)
  return(trigMat)
}

Example = function() {
  startTime = 0
  endTime = 24
  startDate = "2013-09-19"
  endDate = "2013-09-24"
  timeDelta = 30
  pred_data_bricks = CreateTemporalGrid(startDate, endDate,
    startTime, endTime, timeDelta)

  timestamps = pred_data_bricks[["timestamp"]]
  events_status = Build_eventTimestamp(timestamps, events_info)

  event_status = events_status[ , 2]
  ord = 2
  pred_data = Build_eventPredictor(event_status, ord)
  pred_data
}

## this file will build features for pre-specified events
# event times must include the event start and end timestamps
Build_eventExplanList = function(
  timestamps=NA, events_info=NA, ord_event=NA) {

  events_status = Build_eventTimestamp(timestamps, events_info)
  event_names = names(events_status)[-1]
  timestamps = events_status[ , 1]
  d = dim(events_status)[2] - 1
  l = list(timestamp=timestamps)
  for (i in 1:d) {
    event_status = events_status[ , i+1]
    pred = Build_eventPredictor(event_status, ord_event[i])
    l[[i+1]] = pred
  }
  return(l)
}

Example = function() {

  startTime = 0
  endTime = 24
  startDate = "2013-09-19"
  endDate = "2013-09-24"
  timeDelta = 30
  pred_data_bricks = CreateTemporalGrid(startDate, endDate,
    startTime, endTime, timeDelta)

  timestamps = pred_data_bricks[["timestamp"]]
  events_status = Build_eventTimestamp(timestamps, events_info)

  event_status = events_status[ , 2]
  ord = 2
  pred_data = Build_eventPredictor(event_status, ord)
  #pred_data
  ord = c(2,2)
  event_preds = build_event_explanList(
    timestamps, events_info=events_info, ord=ord)
}


## plot time series chunks in R.
# some times the time series is not continuous
# and thus should be plotted in chunks
# delta is the difference in seconds
FindSeriesChunks = function(data, timeCol, delta) {

  asn = as.numeric
  data = data[order(data[ , timeCol]), ]
  timestamps = data[ , timeCol]
  timestampsLag = c(timestamps[1], timestamps[-length(timestamps)])
  #timestampsLag
  Diff = difftime(timestamps, timestampsLag, unit='hours')
  break_ind = which(Diff > delta)
  myList = list(data=data)
  if (length(break_ind ) > 0) {
    outList = list()
    if (break_ind[length(break_ind)] < length(timestamps)) {
      break_ind = c(break_ind, length(timestamps))
    }
    k = length(break_ind)
    start_ind = 1
    end_ind = 2
    for (i in 1:k) {
      end_ind = break_ind[i] - 1
      outList[[i]] = data[(start_ind:end_ind),]
      start_ind = end_ind + 1
    }
  }
  return(outList)
}

Example = function() {
  date1 =  "2015-09-17 00:20:00 MYT"
  date2 =  "2015-09-18 03:00:00 MYT"
  date1 = as.POSIXlt(date1)
  date2 = as.POSIXlt(date2)
  delta = 1/3
  timestamps3 = seq(date1, date2, by=(delta*60*60*12))

  date1 =  "2014-09-17 00:20:00 MYT"
  date2 =  "2014-09-18 03:00:00 MYT"
  date1 = as.POSIXlt(date1)
  date2 = as.POSIXlt(date2)
  delta = 1/3
  timestamps3 = seq(date1, date2, by=(delta*60*60*12))

  timestamps = c(timestamps3, timestamps4)
  y = c(1:(length(timestamps4) + length(timestamps3)))
  delta = 13
  data = data.frame(timestamp=timestamps, y=y, z=y)
  dataList = FindSeriesChunks(data, timeCol='timestamp', delta)
}

## gets a list of dataframes and from plots col valueCol vs xCol
PlotMultiCurves = function(
    dataList,
    dataListAdd=NULL,
    xCol,
    valueCol,
    valueColAdd,
    dataListInd=NULL,
    type='l',
    cols=c(1, 2),
    mfrow=NULL,
    xlab='x',
    ylab='y',
    mainTitles=NULL,
    Title=NULL) {

  n = length(dataList)
  if (is.null(dataListInd)) {
    dataListInd=1:n
  }
  if (is.null(mfrow)) {
    mfrow=c(1, length(dataListInd))
  }
  oldpar = par(mfrow=mfrow, mar=c(3, 2, 3, 2), oma=c(2, 2, 5, 2))
  ### find the right y axis limits
  ymax = -Inf
  ymin = Inf

  for (j in 1:length(dataListInd)) {
    i = dataListInd[j]
    df = dataList[[i]]
    ymax1 = max(df[ , valueCol], na.rm=TRUE)
    ymax2 = ymax1
    ymin1 = min(df[ , valueCol], na.rm=TRUE)
    ymin2 = ymin1
    if (!is.null(dataListAdd)) {
      dfAdd=dataListAdd[[i]]
      ymin2 = max(dfAdd[ , valueColAdd], na.rm=TRUE)
    }
    ymax = max(c(ymax, ymax1, ymax2))
    ymin = min(c(ymin, ymin1, ymin2))
  }


  for (j in 1:length(dataListInd)) {
    i = dataListInd[j]
    df = dataList[[i]]
    main1 = df[1, 1]
    if (!is.null(mainTitles)) {
      main1=mainTitles[j]
    }

    plot(df[ , timeCol], df[ , valueCol], type=type, main=main1, xlab=xlab,
      ylab=ylab, col=cols[1], ylim=c(ymin, ymax))

    if (!is.null(dataListAdd)) {
      dfAdd=dataListAdd[[i]]
      lines(dfAdd[ , xCol], dfAdd[ , valueColAdd], col=cols[2])
    }
  }

  if (!is.null(Title)) {
    mtext(Title, side=3, line=1, outer=TRUE, cex=1, font=1)
  }
  par(oldpar)
}

Example = function() {
  date1 =  "2012-09-17 00:20:00 MYT"
  date2 =  "2012-09-17 03:00:00 MYT"
  date1 = as.POSIXlt(date1)
  date2 = as.POSIXlt(date2)
  delta = 1/3
  timestamps1 = seq(date1, date2, by=(delta*60*60))

  date1 =  "2013-09-17 00:20:00 MYT"
  date2 =  "2013-09-17 03:00:00 MYT"
  date1 = as.POSIXlt(date1)
  date2 = as.POSIXlt(date2)
  delta = 1/3
  timestamps2 = seq(date1, date2, by=(delta*60*60))
  timestamps = c(timestamps2, timestamps1)

  date1 =  "2000-09-17 00:20:00 MYT"
  date2 =  "2000-09-17 03:00:00 MYT"
  date1 = as.POSIXlt(date1)
  date2 = as.POSIXlt(date2)
  delta = 1/3
  timestamps3 = seq(date1, date2, by=(delta*60*60))

  date1 =  "2015-09-17 00:20:00 MYT"
  date2 =  "2015-09-18 03:00:00 MYT"
  date1 = as.POSIXlt(date1)
  date2 = as.POSIXlt(date2)
  delta = 1/3
  timestamps4 = seq(date1, date2, by=(delta*60*60*12))

  timestamps = c(timestamps2, timestamps1, timestamps3, timestamps4)

  y = c(1:length(timestamps1), 1:length(timestamps2),
  1:length(timestamps3), 1:length(timestamps4))
  data = data.frame(timestamp=timestamps, y=y)

  delta = 13
  dataList = FindSeriesChunks(data=data, timeCol='timestamp', delta)
  z = y + 1
  data = data.frame(timestamp=timestamps, z=z)
  dataListAdd = FindSeriesChunks(data=data, timeCol='timestamp', delta)

  PlotMultiCurves(dataList, dataListAdd, timeCol='timestamp',
    valueCol='y', valueColAdd='z',
    dataListInd=1:3, mainTitles=c('name1', 'name2', 'name3'),
    xlab='time', ylab='y', mfrow=c(3, 1), Title='Fit')
}



## this function gets a list of data.frames of same row length and cbinds them
# it only cbinds the ones given in ind
MergeDfList = function(dfList=NA, ind=NULL, fcn=cbind) {

  if (is.null(ind)) {
    ind = 1:length(dfList)
  }

  out = NULL
  for (i in 1:length(ind)) {
    j = ind[i]

    if (i == 1) {
      out = dfList[[j]]
    }

    if (i > 1) {
      out = fcn(out, dfList[[j]])
    }
  }
  out = data.frame(out)
  return(out)
}

Example = function() {
  startTime = 0
  endTime = 24
  startDate = "2013-09-19"
  endDate = "2013-09-24"
  timeDelta = 30

  pred_data_bricks = CreateTemporalGrid(startDate, endDate,
    startTime, endTime, timeDelta)

  timestamps = pred_data_bricks[["timestamp"]]
  events_status = Build_eventTimestamp(timestamps, events_info)

  plot(timestamps,events_status[ , 2], type="l")
  lines(timestamps,events_status[ , 3], col=2)

  event_status = events_status[ , 2]
  ord = 2
  pred_data = Build_eventPredictor(event_status, ord)

  plot(timestamps,pred_data[ , 1], type="l")
  lines(timestamps,pred_data[ , 2], col=2)
  lines(timestamps,pred_data[ , 3], col=3)
  lines(timestamps,pred_data[ , 4], col=4)


  #pred_data
  ord = c(1, 1)
  event_preds = build_event_explanList(
    timestamps, events_info=events_info, ord=ord)

  explan_events = MergeDfList(dfList=event_preds[-1], ind=NA)

  plot(timestamps, explan_events[ , 1], type="l")
  lines(timestamps, explan_events[ , 2], col=1)

  lines(timestamps, explan_events[ , 3], col=2)
  lines(timestamps, explan_events[ , 4], col=2)
}

## packages for lag:
Example = function() {

  x = 1:100
  install.packages("DataCombine")
  slide(x, 1)
  install.packages("quantmod")
  Lag(x, 1)

  library(plyr)
  adf = ddply(adf, "Site",
       function(my_data) lag(my_data, "Time", "Value"))

  set.seed(13)
  dt = data.frame(location = rep(letters[1:2], each = 4), time = rep(1:4, 2),
    var = rnorm(8))

  lg = function(x)c(NA, x[1:(length(x)-1)])
  ## 1
  unlist(tapply(dt$var, dt$location, lg))

  ## 2
  ddply(dt, ~location, transform, lvar = lg(var))

  ## 3
  ddt <- data.table(dt)
  ddt[ , lvar := lg(var), by = c("location")]

  ## 4
  dt %>% group_by(location) %>% mutate(lvar = lag(var))
}

## this function acts on a timeseries y and creates a data.frame of
# lags upto lag=lagNum
# this is deprecated in favor of the faster function below
CreateLagMat2 = function(y, lagNum, colName=NULL, naFill=NA,
  fut=FALSE) {

  if (fut) {
    y = y[(length(y)-lagNum+1):length(y)]
    y = c(y, NA)
  }

  n = length(y)
  lagMat = matrix(NA, n, lagNum)
  lagMat = data.frame(lagMat)
  y1 = y
  colNames = NULL
  for (i in 1:lagNum) {
    y1 = c(naFill, y1)
    y1 = y1[-(n + 1)]
    lagMat[ , i] = y1

    colName2 = paste(colName, "lag", i, sep="")
    colNames = c(colNames, colName2)
  }

  if (!is.null(colName)) {
    colnames(lagMat) = colNames}

  if (fut) {
    lagMat = lagMat[dim(lagMat)[1], , drop=FALSE]
  }

  names(lagMat) = colNames
  return(lagMat)
}

## this function acts on a timeseries y
# and creates a data.frame of lags upto lag=lagNum
CreateLagMat = function(
    y, lagNum, colName=NULL, naFill=NA,
    fut=FALSE) {

  if (fut) {
    lagMat = matrix(y[length(y):(length(y)-lagNum+1)], 1, lagNum)
    colNames = paste(colName, "lag", 1:lagNum, sep="")
    colnames(lagMat) = colNames
    return(lagMat)
  }

  n = length(y)

  Fcn = function(i) {
    c(rep(naFill, i), y[1:(n-i)])
  }

  lagMat = sapply(1:lagNum, Fcn)

  colNames = paste(colName, "lag", 1:lagNum, sep="")
  colnames(lagMat) = colNames
  #lagMat = data.frame(lagMat)
  return(lagMat)
}

Example = function() {
  m = 100000
  k = 2000
  fut=TRUE
  t1 = Sys.time()
  x = CreateLagMat(1:m, k, colName="zereshk", fut=fut)
  t2 = Sys.time()
  print(t2-t1)

  t1 = Sys.time()
  x = CreateLagMat2(1:m, k, colName="zereshk", fut=fut)
  t2 = Sys.time()
  print(t2-t1)

  set.seed(1)
  email_data <- data.frame(dates=1:10, email_reach=1:10)

  library(data.table)
  setDT(email_data)[, paste0("email_reach", 1:3) := shift(email_reach, 1:3)][]

  set.seed(1)
  data <- data.table(time = c(1:3, 1:4),
           groups = c(rep(c("b", "a"), c(3, 4))),
           value = rnorm(7))


  library(dplyr)
  data2 <-
    data %>%
    group_by(groups) %>%
    mutate(lag.value=lag(value, 1:2))
}

## this function acts on a timeseries y and creates a data.frame of avg
# lags upto lag=lagNum
CreateLongtMat2 = function(y, lagNum, colName=NULL, naFill=NA,
  AvgFcn=rowMeans, fut=FALSE) {

  ylagMat = CreateLagMat(y, lagNum, colName, naFill=naFill, fut=fut)
  n = dim(ylagMat)[1]
  longtMat = matrix(NA, n, lagNum)
  longtMat = data.frame(longtMat)
  colNames = NULL
  for (i in 1:lagNum) {
    ylagMat1 = ylagMat[ , 1:i, drop=FALSE]
    avg_col = AvgFcn(ylagMat1)
    longtMat[ , i] = avg_col
    colName2 = paste(colName, "longt", i, sep="")
    colNames = c(colNames, colName2)
  }
  colnames(longtMat) = colNames
  return(longtMat)
}

# faster
CreateLongtMat = function(
  y,
  lagNum,
  colName=NULL,
  naFill=NA,
  AvgFcn=rowMeans,
  fut=FALSE) {

  ylagMat = CreateLagMat(y, lagNum, colName=NULL, naFill=naFill, fut=fut)
  Fcn = function(i) {
    AvgFcn(ylagMat[ , 1:i, drop=FALSE])
  }
  longtMat = sapply(1:lagNum, Fcn)

  if (fut) {
    longtMat = matrix(longtMat, 1, lagNum)
  }
  colnames(longtMat) = paste(colName, "longt", 1:lagNum, sep="")
  return(longtMat)
}

Example = function() {

  M = 2000
  k = 200
  fut = FALSE
  t1 = Sys.time()
  x = CreateLongtMat(1:M, k, colName="zereshk", fut=fut)
  t2 = Sys.time()
  print(t2-t1)

  t1 = Sys.time()
  x = CreateLongtMat2(1:M, k, colName="zereshk", fut=fut)
  t2 = Sys.time()
  print(t2-t1)
}

## this function creates a function which applies on a time series y
# and gives back a data.frame of needs lags or averaged lags
# lagIndex: the desired lags
# longtIndex: averaged lagIndex
CreateLagIndFcn = function(
  lagIndex=NULL,
  longtIndex=NULL,
  AvgFcn=rowMeans,
  LagTrans=function(x){x}) {

  lagNum = 0

  if (!is.null(lagIndex)) {
    lagNum = max(lagIndex)
  }

  if (!is.null(longtIndex)) {
    lagNum = max(c(lagNum, longtIndex))
  }

  Fcn = function(y, colName="", naFill=NA, fut=FALSE) {
    explanLag = NULL
    if (!is.null(lagIndex)) {
      lagMat = CreateLagMat(
        y=y,
        lagNum=lagNum,
        colName=colName,
        naFill=naFill,
        fut=fut)
      lagMat = lagMat[ , lagIndex, drop=FALSE]
      if (is.null(explanLag)) {
        explanLag = lagMat
      } else {
        explanLag = cbind(explanLag, lagMat)
      }
    }
    if (!is.null(longtIndex)) {
      longtMat = CreateLongtMat(
        y=y,
        lagNum=lagNum,
        colName=colName,
        naFill=naFill,
        AvgFcn=AvgFcn,
        fut=fut)

      longtMat = longtMat[ , longtIndex, drop=FALSE]
      if (is.null(explanLag)) {
        explanLag = longtMat
      } else {
        explanLag = cbind(explanLag, longtMat)
      }
    }
    explanLag = LagTrans(explanLag)
    return(explanLag)
  }
  return(Fcn)
}

Example = function() {

  y = 1:10
  ordLag = 3
  lagIndex = 1:3
  longtIndex = 3

  Fcn = CreateLagIndFcn(
    lagIndex=lagIndex,
    longtIndex=longtIndex)

  df1 = Fcn(y, colName="a", naFill=y[1])

  Fcn = CreateLagIndFcn(
    lagIndex=lagIndex,
    longtIndex=longtIndex,
    LagTrans=log)

  df1 = Fcn(y, colName="a", naFill=y[1])

  Fcn = CreateLagIndFcn(lagIndex=lagIndex, longtIndex=longtIndex)
  df1 = Fcn(y, colName="a", naFill=y[1], fut=TRUE)


  Fcn2 = CreateLagIndFcn(lagIndex=1:3, longtIndex=3)
  df2 = Fcn2(y)

  Fcn2 = CreateLagIndFcn(lagIndex=1:3, longtIndex=1)
  df2 = Fcn2(y)
}

## creates a function which get time values and creates a data.frame of
# fourier series terms
# ord: is the order of the series
# omega is the fourier series Freq: omega = 2pi/T where T is the period
# the odd columns of the data frame are sin terms
# the even columns of the data frame are even terms
CreateFsFcn = function(ord, omega, colName="") {

  Fcn = function(x) {
    n = length(x)
    #sinMat = matrix(NA,n,ord)
    #cosMat = matrix(NA,n,ord)
    mat = matrix(NA, n, 2*ord)
    df = data.frame(mat)
    for (i in 1:ord[1]) {
      sinColName = paste(colName, "sin", i, sep="")
      cosColName = paste(colName, "cos", i, sep="")
      df[ , (2*i-1)] = sin(i*omega*x)
      df[ , 2*i] = cos(i*omega*x)
      names(df)[(2*i-1)] = sinColName
      names(df)[2*i] = cosColName
    }
    return(df)
  }
  return(Fcn)
}

Example = function() {

  ord = 3
  omega = 2*pi/10
  x = seq(0, 10, by=0.001)
  explanFourier = CreateFsFcn(ord, omega)(x)
  explanFourier[1:5, ]

  plot(explanFourier[ , 1], type="l")
  lines(explanFourier[ , 2], col=2)
  lines(explanFourier[ , 3], col=3)
}

CreateFsIndFcn = function(sinOrd, cosOrd, omega, colName="") {

  ord = length(sinOrd) + length(cosOrd)
  Fcn = function(x) {
    n = length(x)
    mat = matrix(NA, n, ord)
    df = data.frame(mat)
    for (i in 1:length(sinOrd)) {
      j = sinOrd[i]
      sinColName = paste(colName, "sin", j, sep="")
      df[ , i] = sin(j*omega*x)
      names(df)[i] = sinColName
    }

    for (i in 1:length(cosOrd)) {
      j = cosOrd[i]
      cosColName = paste(colName, "cos", j, sep="")
      df[ , length(sinOrd) + i] = cos(i*omega*x)
      names(df)[length(sinOrd) + i] = cosColName
    }
    return(df)
  }
  return(Fcn)
}

Example = function() {
  omega = 2*pi/10
  x = seq(0, 10, by=0.1)
  explanFourier = CreateFsIndFcn(sinOrd=c(1, 10), cosOrd=c(2, 3),
    omega, colName="zereshk")(x)
  explanFourier[1:5, ]

  plot(explanFourier[ , 1], type="l")
  lines(explanFourier[ , 2], col=2)
  lines(explanFourier[ , 3], col=3)
}

CreateLagFsFcn = function(
  lagIndex,
  longtIndex,
  sinOrd,
  cosOrd,
  omega,
  AvgFcn=rowMeans,
  fut=FALSE) {

  Fcn = function(x, y) {

    F1 = CreateFsIndFcn(sinOrd, cosOrd, omega)

    F2 = CreateLagIndFcn(
      lagIndex=lagIndex,
      longtIndex=longtIndex,
      AvgFcn=AvgFcn)

    out = cbind(F1(x), F2(y, fut=fut))

    return(out)
  }
  return(Fcn)
}

Example = function() {
  omega = 2*pi/10
  x = seq(0, 5, by=0.1)

  explanFourier = CreateFsIndFcn(sinOrd=c(1, 10), cosOrd=c(2, 3), omega)(x)
  y = explanFourier[ , 1] + 2*explanFourier[ , 2]
  Fcn = CreateLagFsFcn(lagIndex=1:3, longtIndex=2,
    sinOrd=1:2, cosOrd=c(3, 4), omega=omega, AvgFcn=rowMeans)

  df = Fcn(x, y)
  Fcn2 = CreateLagFsFcn(lagIndex=1:3, longtIndex=2,
    sinOrd=1:2, cosOrd=c(3, 4), omega=omega, AvgFcn=rowMeans)

  Fcn2(x=x[1], y=y)
}

################ gaussian time-varying volatility time series model functions
## this return Beta for linear regression when y = Beta X + error,
# when error is dependent
# and error var-cov matrix is denoted by Omega
# this function seems broken in some cases when input is omegaInvChol
# it does not return the right thing which is the beta estimate
GaussLinRegSol = function(
  y, x, omegaInvCholVec=NULL,
  omegaInvChol=NULL, omegaInv=NULL) {

  d = dim(x)
  n = d[1]
  if (is.null(omegaInvCholVec) & is.null(omegaInvChol) & is.null(omegaInv)) {
    xTild = x
    yTild = y
  }
  if (!is.null(omegaInvCholVec)) {
    xTild = omegaInvCholVec*x
    yTild = omegaInvCholVec*y
  } else
  if (!is.null(omegaInvChol)) {
    xTild = omegaInvChol%*%x
    yTild = omegaInvChol
  } else
  if (!is.null(omegaInv)) {
    w = chol(omegaInv)
    xTild = w%*%x
    yTild = w%*%y
  }

  #SVD = svd(xTild)
  #U = SVD$u
  #V = SVD$v
  #D.inv = diag(SVD$d^{-1})
  betaHat = solve((t(xTild) %*% xTild)) %*% t(xTild) %*% yTild
  return(betaHat)
}

Example = function() {

  n = 100
  p_1 = 2
  p_2 = 3
  x1 = matrix(rnorm(n*p_1), n, p_1)
  x2 = matrix(rnorm(n*p_2), n, p_2)
  y1 = 2*x1[ , 1] + 3*x1[ , 2] + rnorm(n)
  y2 = -3*x2[ , 1] - 4*x2[ , 2] + 5*x2[ , 3] + rnorm(n)
  x = cbind(y1, y2)
  xList = list(x1, x2)
  xbig = XbigList(xList)
  x = xbig
  yCol = as.matrix((as.vector(t(y))))
  y = yCol
  omegaInv = NA
  GaussLinRegSol(y=yCol, y=xBig, omegaInvChol=NA)
  GaussLinRegSol(y=yCol, y=xBig, omegaInvChol=diag(200))

  ###### testing with error
  p = 10
  n = 1000

  x = matrix(rnorm(p*n), n, p)
  beta = round(10*rnorm(p))
  y = rowSums(t(beta*t(x)))
  sdVec = seq((1:n)/n)
  error = rnorm(n, sd=sdVec)
  y = y + error
  y = as.matrix(y)

  omegaInvCholVec = 1/sdVec
  #omegaInvChol = diag(omegaInvCholVec)

  beta
  t1 = Sys.time()
  GaussLinRegSol(y=y, x=x, omegaInvCholVec=omegaInvCholVec)
  t2 = Sys.time()
  t2-t1

  t1 = Sys.time()
  GaussLinRegSolSvd(y=y, x=x, omegaInvCholVec=omegaInvCholVec)
  t2 = Sys.time()
  t2-t1

  GaussLinRegSol(y=y, x=x, omegaInvChol=NA)
}

### estimate Beta for dependent errors given the volatility
# parameters and structure
BetaGiven_volatParams = function(y, x, omegaPred, omegaParams) {

  b = omegaParams
  logOmegaVec = rowSums(t(b*t(omegaPred)))
  omegaVec = exp(logOmegaVec)
  omegaInvCholVec = sqrt(1/omegaVec)

  out = GaussLinRegSol(y=y, x=x, omegaInvCholVec=omegaInvCholVec)
  return(out)
}

############## Seems like this is where we need to start #################
LlikGaussReg = function(
  y, x, beta, omegaInvCholVec=NULL, omegaInvChol=NULL) {

  n = dim(x)[1]
  beta = as.matrix(beta)
  p = dim(beta)[1]
  z = y - x%*%beta
  if (is.null(omegaInvCholVec) & is.null(omegaInvChol)) {
    zTild = Z
    logDet = 0
  } else
  if (!is.null(omegaInvCholVec)) {
    zTild = omegaInvCholVec * z
    logDet = sum(log(omegaInvCholVec))
  } else
  if (!is.null(omegaInvChol)) {
    zTild = omegaInvChol%*%z
    logDet = log(det(omegaInvChol))
  }
  #logDet.Omega =  -2*logDet
  zz = t(zTild) %*% zTild
  out1 = (-n/2)*log(2*pi) + logDet - (1/2)*zz
  out2 = 2*p - 2*out1
  out3 = log(n)*p - 2*out1
  out = list(llik=out1, AIC=out2, BIC=out3)
  return(out)
}

Example = function() {
  p = 4
  n = 50
  x = matrix(rnorm(p*n), n, p)
  beta = round(10*rnorm(p))
  y = rowSums(t(beta*t(x)))

  T = (1:n)/n
  sint = sin(2*pi*t)
  cost = cos(2*pi*t)
  omegaPred = cbind(1, sint, cost)
  omegaParams = c(1, 5, 10)
  b = omegaParams
  logomegaVec = rowSums(t(b*t(omegaPred)))

  omegaVec = exp(logomegaVec)
  sdVec = sqrt(omegaVec)
  error = rnorm(n, sd=sdVec)
  y = y + error
  y = as.matrix(y)
  omegaInvCholVec = 1/sdVec
  omegaInvChol = diag(omegaVec)

  beta1 = GaussLinRegSol(y=y, x=x, omegaInvCholVec=omegaInvCholVec)
  e1 = y - x %*% beta1
  e1 = as.vector(e1)
  sd1 = sd(e1)
  sdVec1 = rep(sd1, n)
  O1 = 1/sdVec1

  beta2 = GaussLinRegSol(y=y, x=x)
  e2 = y - x %*% beta2
  e2 = as.vector(e2)
  sd2 = sd(e2)
  sdVec2 = rep(sd2, n)
  O2 = 1/sdVec2

  LlikGaussReg(y, x, beta=beta1, omegaInvCholVec=O1)
  #LlikGaussReg(Y,X,Beta1,omegaInvCholVec)

  #LlikGauss(Y=Y,X=X,Beta=Beta1,omegaInvCholVec=O1)
  #LlikGaussReg(Y,X,Beta1,omegaInvCholVec=omegaInvCholVec)

  LlikGaussReg(y, x, beta, omegaInvCholVec=O1)
  LlikGaussReg(y, x, beta, omegaInvCholVec)

  LlikGaussReg(y, x, beta2, omegaInvCholVec=O2)
  LlikGaussReg(y, x, beta2, omegaInvCholVec)

  LlikGaussReg(y, x, beta=beta2, omegaInvCholVec=O2)
  mod = glm(y ~ -1+x)
  mod$aic


  #### Testing manually the log likelihood function
  p = 4
  n = 50
  X = matrix(rnorm(p*n), n, p)
  beta = round(10*rnorm(p))
  y = rowSums(t(beta*t(x)))

  t = (1:n)/n
  sint = sin(2*pi*t)
  cost = cos(2*pi*t)
  omegaPred = cbind(1, sint, cost)
  omegaParams = c(1, 0, 0)
  b = omegaParams
  logomegaVec = rowSums(t(b*t(omegaPred)))

  omegaVec = exp(logomegaVec)
  sdVec = sqrt(omegaVec)
  error = rnorm(n, sd=sdVec)
  y = y + error
  y = as.matrix(y)
  omegaInvCholVec = 1/sdVec
  omegaInvChol = diag(omegaInvCholVec)

  omegaVec = sdVec^2
  Omega = diag(omegaVec)

  varcov = Omega

  LlikGaussReg(y, x, beta, omegaInvCholVec)
  dmnorm(as.vector(y), mean=as.vector(x %*% beta), varcov=varcov, log=TRUE)
}

## likelihood with varying volatility
LlikGaussRegVarVolat = function(y, x, beta, omegaPred, omegaParams) {

  b = omegaParams
  logomegaVec = rowSums(t(b*t(omegaPred)))
  omegaVec = exp(logomegaVec)
  omegaInvCholVec = sqrt(1/omegaVec)
  out = LlikGaussReg(
    y, x, beta, omegaInvCholVec=omegaInvCholVec, omegaInvChol=NA)
  return(out)
}

Example = function() {
  p = 3
  n = 100
  x = matrix(rnorm(p*n), n, p)
  y = -7*x[ , 1] + 5*x[ , 2] + 10*x[ , 3]

  t = (1:n)/n
  sint = sin(2*pi*t)
  cost = cos(2*pi*t)
  omegaPred = cbind(1, sint, cost)
  omegaParams = c(0, 5, 10)
  b = omegaParams
  logomegaVec = rowSums(t(b*t(omegaPred)))

  omegaVec = exp(logomegaVec)
  sdVec = sqrt(omegaVec)
  error = rnorm(n, sd=sdVec)
  y = y + error
  y = as.matrix(y)
  omegaInvCholVec = 1 / sdVec

  beta = GaussLinRegSol(y=y, x=x, omegaInvCholVec=omegaInvCholVec)
  #BetaGiven_volatParams(Y,X,omegaPred,omegaParams)
  LlikGaussReg(y, x, beta, omegaInvCholVec)
  LlikGaussRegVarVolat(y, x, beta, omegaPred, omegaParams)
}

##
MaxGaussLlikGivenBeta = function(y, x, beta, omegaPred, init=NULL) {

  z = y - x %*% beta

  F = function(b) {
    logomegaVec = rowSums(t(b*t(omegaPred)))
    omegaVec = exp(logomegaVec)
    omegaInvCholVec = sqrt(1/omegaVec)
    out = LlikGaussReg(y, x, beta, omegaInvCholVec=omegaInvCholVec,
      omegaInvChol=NULL)
    out1 = out$llik
    return(-out1)
  }

  bLength = dim(omegaPred)[2]
  b = init
  if (is.null(b)) {
    b = rep(0, bLength)
  }

  res = optim(par=b, F)
  return(res)
}

Example = function() {

  p = 3
  n = 200
  x = matrix(rnorm(p*n), n, p)
  beta = c(-7, 5, 10)
  y = rowSums(t(beta*t(x)))

  t = (1:n)/n
  sint = sin(2*pi*t)
  cost = cos(2*pi*t)
  omegaPred = cbind(1, sint, cost)
  omegaParams = c(1, 5, 1)
  b = omegaParams
  logomegaVec = rowSums(t(b*t(omegaPred)))

  omegaVec = exp(logomegaVec)
  sdVec = sqrt(omegaVec)
  error = rnorm(n, sd=sdVec)
  y = y + error
  y = as.matrix(y)
  omegaInvCholVec = 1/sdVec
  res1 = MaxGaussLlikGivenBeta(y, x, beta, omegaPred)
  MaxGaussLlikGivenBeta(y, x, beta, omegaPred, init=c(0, 0, 0))
}

## fit with varying-volatility
FitGaussLin_varyingVol = function(x, y, omegaPred, iter=1, init=NULL) {
  #print(y)
  p = dim(x)[2]
  p2 = dim(omegaPred)[2]
  n = dim(x)[1]
  beta = GaussLinRegSol(y=y, x=x)
  volatSample = NULL
  betaSample = NULL

  for (i in 1:iter) {
    res = MaxGaussLlikGivenBeta(y, x, beta, omegaPred, init=init)
    omegaParams = res[["par"]]
    value = res[["value"]]
    ll =  - value
    volatSample = rbind(volatSample, omegaParams)
    beta = BetaGiven_volatParams(y, x, omegaPred, omegaParams)
    volatSample = rbind(volatSample, omegaParams)
    betaSample = rbind(betaSample, as.vector(beta))
  }

  aic = 2*(p + p2) - 2*ll
  bic = log(n)*(p + p2) - 2*ll
  betaEst = betaSample[iter, ]
  volEst = volatSample[iter, ]
  fitted =  x %*% betaEst
  out = list(
    "y"=y,
    "x"=x,
    "fitted.values"=fitted,
    "residual"=(y-fitted),
    "betaSample"=betaSample,
    "volatSample"=volatSample,
    "betaEst"=betaEst,
    "volEst"=volEst,
    "llik"=ll,
    "AIC"=aic,
    "BIC"=bic)
}

## fit the model via formula
FitGaussVolatMod = function(
  df,
  yCol,
  explanCols=NULL,
  omegaPredCols=NULL,
  iter=10) {

  if (is.null(explanCols)) {
    explanCols = "1"
  }

  if (is.null(omegaPredCols)) {
    omegaPredCols = "1"
  }
  ## find the overall SD
  bigVol = sd(df[ , yCol], na.rm=TRUE)
  formulaString = paste(yCol, "~", PlusVec(explanCols))
  ModDesignMat1 = DesignMatFcn(df, formulaString)
  x = ModDesignMat1(df)
  omegaPred = as.matrix(rep(1, dim(df)[1]))

  if (!is.null(omegaPredCols)) {
    formulaString = paste(yCol, "~", PlusVec(omegaPredCols))
    ModDesignMat2 = DesignMatFcn(df, formulaString)
    omegaPred = ModDesignMat2(df)
  }

  y = df[ , yCol]
  res = FitGaussLin_varyingVol(x=x, y=y, omegaPred=omegaPred, iter=iter)
  betaEst = res[["betaEst"]]
  volEst = res[["volEst"]]

  PredMeanFcn = function(newdata) {
    newX = ModDesignMat1(newdata)
    out = newX %*% betaEst
    return(out)
  }

  PredVolFcn = function(newdata) {

    newOmegaPred = ModDesignMat2(newdata)
    out = newOmegaPred %*% volEst
    out = MinComp(x=c(sqrt(exp(out))), y=bigVol)
    return(out)
  }

  fitVol = PredVolFcn(df)
  res[["PredMeanFcn"]] = PredMeanFcn
  res[["PredVolFcn"]] = PredVolFcn
  res[["omegaPred"]] = omegaPred
  res[["fitVol"]] = fitVol
  res[["ModDesignMatMean"]] = ModDesignMat1
  res[["ModDesignMatVol"]] = ModDesignMat2
  class(res) = "gaussVolatMod"
  return(res)
}

## define the predict function for the class gaussVolatMod
predict.gaussVolatMod = function(gaussVolatMod, newdata) {

  value = gaussVolatMod[["PredMeanFcn"]](newdata)
  vol = gaussVolatMod[["PredVolFcn"]](newdata)
  return(list("mean"=value, "vol"=vol))
}

## print the model
print.gaussVolatMod = function(gaussVolatMod) {

  print(gaussVolatMod[c("betaEst", "volEst", "AIC", "BIC", "llik")])
}

Example = function() {
  p = 3
  n = 3000
  x = 3*matrix(abs(runif(p*n)), n, p)
  beta = c(0, 3, 0, 0.5)
  data = data.frame(x)
  yCol = "y"

  explanCols =  c("X1", "X2", "X3")
  formulaString = paste("~", PlusVec(explanCols))
  ModDesignMat1 = DesignMatFcn(data, formulaString)
  xOld = x = ModDesignMat1(data)
  yOld = y = rowSums(t(beta*t(x)))
  data[ , yCol] = y

  omegaPredCols = c("X1", "X3")
  formulaString = paste(yCol, "~", PlusVec(omegaPredCols))
  ModDesignMat2 = DesignMatFcn(data, formulaString)
  omegaPredOld = omegaPred = ModDesignMat2(data)

  omegaParams = c(0, 1, -0.5)
  b = omegaParams
  logOmegaVec = rowSums(t(b*t(omegaPred)))
  omegaVec = exp(logOmegaVec)
  sdVec = sqrt(omegaVec)
  error = rnorm(n, sd=sdVec)
  y = y + error
  y = as.matrix(y)
  omegaInvCholVec = 1/sdVec
  iter = 20
  data[ , yCol] = y

  indTrain = 1:round(n/2)
  indTest = (round(n/2)+1):n
  dataTrain = data[indTrain, ]
  dataTest = data[indTest, ]

  xTrain = x[indTrain, , drop=FALSE]
  yTrain = y[indTrain]
  xTest = x[indTest, , drop=FALSE]
  yTest = y[indTest]
  omegaPredTrain = omegaPred[indTrain, , drop=FALSE]
  omegaPredTest = omegaPred[indTest, , drop=FALSE]

  ### via FitGaussLin_varyingVol
  res = FitGaussLin_varyingVol(x=xTrain, y=yTrain, omegaPred=omegaPredTrain,
    iter=iter)
  round(res[["betaEst"]], 2)
  round(res[["volEst"]], 2)
  fittedSig2 = exp(omegaPredTrain%*%res[["volEst"]])
  fittedSd = sqrt(fittedSig2)
  mod = lm(res[["y"]] ~ res[["x"]])
  summary(mod)$sigma
  fittedSd[1, ]
  e = res[["y"]] - res[["x"]] %*% res[["betaEst"]]
  sqrt(var(e))
  betaCov = BetaVarCov(x=res[["x"]], betaCoef=res[["betaEst"]],
    volCoef=res[["volEst"]], omegaPred=omegaPred)
  sqrt(diag(betaCov))
  mod = lm(y ~ -1 + x)
  summary(mod)
  plot(yTrain, res[["fitted.values"]])

  yPred = xTest %*% res[["betaEst"]]
  plot(yTest, yPred)


  ### Same model Via FitGaussVolatMod
  res = FitGaussVolatMod(data=data[indTrain, ], yCol, explanCols,
    omegaPredCols, iter=iter)
  round(res[["betaEst"]], 2)
  round(res[["volEst"]], 2)
  fittedSig2 = exp(res[["omegaPred"]]%*%res[["volEst"]])
  fittedSd = sqrt(fittedSig2)
  yFit = res[["fitted.values"]]
  plot(data[indTrain , yCol], yFit)
  fitVol = res[["fitVol"]]
  yFitUpper = yFit + 1.96*fitVol
  yFitLower = yFit - 1.96*fitVol

  #ord = order(yTrain)
  ord = order(xTrain[ , 2])
  plot(xTrain[ , 2][ord], yTrain[ord], col=1)
  lines(xTrain[ , 2][ord], yFit[ord], col="blue")
  lines(xTrain[ , 2][ord], yFitUpper[ord], col="red")
  lines(xTrain[ , 2][ord], yFitLower[ord], col="green")

  ## compare true vol with fit vol
  plot(sdVec[indTrain], fitVol)

  pred = predict(res, newdata=data[indTrain, ])
  yFit = pred[["mean"]]
  vol = pred[["vol"]]
  plot(yTrain, yFit)

  testPred = predict(res, newdata=data[indTest, ])
  yPred = testPred[["mean"]]
  testVol = testPred[["vol"]]
  plot(y[indTest], yPred)
  testResidual = yTest - yPred
  plot(abs(testResidual), testVol)

  ## compare true vol with test vol
  plot(sdVec[indTest], testVol)


  ## corner cases of no variables in mean or volat explans
  res = FitGaussVolatMod(data=data[indTrain, ], yCol, explanCols=NULL,
    omegaPredCols, iter=iter)

  plot(res[["fitted.values"]], yTrain)

  res = FitGaussVolatMod(data=data[indTrain, ], yCol, explanCols=explanCols,
    omegaPredCols=NULL, iter=iter)

  plot(res[["fitted.values"]], yTrain)
}

BetaVarCov = function(x, betaCoef, volCoef, omegaPred) {

  b = volCoef
  logomegaVec = rowSums(t(b*t(omegaPred)))
  omegaVec = exp(logomegaVec) #omegaVec
  omegaInvVec = (1/omegaVec)
  omegaInv = diag(omegaInvVec)
  betaCov = solve(t(x) %*% omegaInv %*% x)
  return(betaCov)
}

BetaPrec = function(x, betaCoef, volCoef, omegaPred) {

  b = volCoef
  logomegaVec = rowSums(t(b*t(omegaPred)))
  omegaVec = exp(logomegaVec) #omegaVec
  omegaInvVec = (1/omegaVec)
  omegaInv = diag(omegaInvVec)
  betaPrec = (t(x) %*% omegaInv %*% x)
  return(betaPrec)
}

Example = function() {

  p = 3
  n = 100
  x = matrix(rnorm(p*n), n, p)
  beta = round(10*rnorm(p))
  y = rowSums(t(beta*t(x)))
  t = (1:n)/n
  sint = sin(2*pi*t)
  cost = cos(2*pi*t)
  #omegaPred = cbind(1,sint,cost);omegaParams = c(1,3,4)
  omegaPred = matrix(1, n, 1)
  omegaParams = c(1)
  b = omegaParams
  logomegaVec = rowSums(t(b*t(omegaPred)))

  omegaVec = exp(logomegaVec)
  sdVec = sqrt(omegaVec)
  error = rnorm(n, sd=sdVec)
  y = y + error
  y = as.matrix(y)
  omegaInvCholVec = 1/sdVec

  iter = 20
  res = FitGaussLin_varyingVol(x, y, omegaPred, iter=iter)

  betaSample = res[["betaSample"]]
  volatSample = res[["volatSample"]]
  res[["llik"]]
  res[["AIC"]]
  res[["BIC"]]
  signif(betaSample[iter, ], 3)
  signif(volatSample[iter, ], 3)
  betaCoef = betaSample[iter, ]
  volCoef = volatSample[iter, ]
  betaCov = BetaVarCov(x, betaCoef, volCoef, omegaPred)
  betaCov

  mod = lm(y ~ -1+x)
  AIC(mod)

  ### Example 2
  p = 3
  n = 40
  x = matrix(rnorm(p*n), n, p)
  beta = round(10*rnorm(p))
  beta
  Y = rowSums(t(beta*t(x)))
  #U1 = rnorm(n)
  #U2 = rnorm(n)
  omegaPred = matrix(round(rnorm(2*n)), n, 2)
  omegaParams = c(1, -1)
  b = omegaParams
  logomegaVec = rowSums(t(b*t(omegaPred)))
  omegaVec = exp(logomegaVec)
  sdVec = sqrt(omegaVec)
  error = rnorm(n, sd=sdVec)
  y = y + error
  y = as.matrix(y)
  omegaInvCholVec = 1/sdVec
  iter = 20
  res = FitGaussLin_varyingVol(x, y, omegaPred, iter=iter, init=NA)
  betaSample = res[["betaSample"]]
  volatSample = res[["volatSample"]]
  beta
  b
  round(betaSample[iter, ])
  round(volatSample[iter, ])
  betaEstim = betaSample[iter, ]
  volatEstim = volatSample[iter, ]
  fittedSig2 = exp(omegaPred%*%volatEstim)
  fittedSd = sqrt(fittedSig2)
  mod = lm(y ~ x)
  summary(mod)$sigma
  fittedSd[1, ]
  e = y - x %*% betaEstim
  sqrt(var(e))

  betaCoef = betaSample[iter, ]
  volCoef = volatSample[iter, ]
  betaCov = BetaVarCov(x, betaCoef, volCoef, omegaPred)
  betaCov
  sqrt(diag(betaCov))

  mod = lm(Y ~ -1+x)
  summary(mod)
}

## generalized model selection
ModSelSubsetGen = function(
  yCol,
  explanColsF=NULL,
  explanCols=NULL,
  FitFcn,
  m=1,
  crit="AIC",
  family="gaussian",
  varNum=NULL,
  critSummaryDelta=0.1)  {

  d = length(explanCols)
  varNum2 = d
  if (!is.null(varNum)) {
    if (varNum > d) {
      print("Warning: varNum > data dimension so it was reset to dim[2].")
    }
    varNum2 = min(d, varNum)
  }
  varNum = varNum2

  MsFcn = function(x) {
    if (is.null(x)) {
      explanCols1 = explanColsF
    } else {
      explanCols1 = explanCols[x]
      if (!is.null(explanColsF)) {
        explanCols1 = c(explanColsF, explanCols1)
      }
    }
    res = FitFcn(explanCols1)
    out = c(res, explanCols[x], rep(NA, (varNum - length(x))))
    return(out)
  }

  nullModRes = MsFcn(x=NULL)

  ## subSets of up to order varNum
  subSets = lapply(1:varNum, function(x) as.list(data.frame(combn(d, x))))
  subSets = unlist(subSets, recursive=FALSE)

  resList = lapply(subSets, MsFcn)
  resDf = matrix(NA, length(resList), varNum+1)
  resDf = rbind(resDf, nullModRes)
  n = length(resList)
  for (i in 1:n) {
    resDf[i, ] = resList[[i]]
  }

  colnames(resDf) = NULL
  resDf = data.frame(resDf)
  resDf[ , 1] = as.numeric(as.character(resDf[ ,1]))
  colnames(resDf)[1] = crit

  ord = order(resDf[ , 1])
  resDf = resDf[ord, ]
  topModMatrix = resDf[1:m, ]
  critVec = resDf[ , crit]
  critVec = na.omit(critVec)
  critSummary = quantile(critVec, seq(0, 1, critSummaryDelta))
  outList = list(topModMatrix=topModMatrix, critSummary=critSummary)
  return(outList)
}

################ END #####################

## calculation functions
VarOnMean = function(x) {
  var(x)/mean(x)
}

CoefOfVar = function(x) {
  sd(x)/mean(x)
}

############# time-varying distribution functions

## Create lag data
CreateLagData = function(
  data,
  respCols,
  lagIndex,
  longtIndex) {

  F = CreateLagIndFcn(lagIndex=lagIndex, longtIndex=longtIndex)
  lagList = list()

  for (i in 1:length(respCols)) {
    resp = respCols[i]
    y = data[ , resp]
    lagList[[resp]] = F(y, resp, naFill=NA)
  }

  lagDf = MergeDfList(dfList=lagList)
  l = list(lagList=lagList, lagDf=lagDf)
  return(l)
}


## should be deprecated since ggplot does now allow adding legends
GenBoundsDfGg = function(
    df,
    wrtCols,
    valueCol,
    splitCol=NULL,
    boundQuantiles=c(0.001, 0.999),
    boundQuantileDelta=0.05) {

  probs = sort(c(boundQuantiles, 0.01, 0.1, 0.25, 0.5, 0.75, 0.9, 0.99))
  boundsDf = SummWrt(
    data=df,
    valueCol=valueCol,
    wrtCols=wrtCols,
    probs=probs,
    fcnList=list(mean, sd, min, max),
    fcnNames=c('mean', 'sd', 'min', 'max'),
    addValuePrefix=TRUE)

  qStr1 = paste0('_', as.character(boundQuantiles[1]))
  qStr2 = paste0('_', as.character(boundQuantiles[2]))

  deltaStr = paste0('qr_', as.character(boundQuantileDelta*100), 'pcnt')

  boundsDf[ , paste0(valueCol, '_right_qr')] = (
    boundsDf[ , paste0(valueCol, '_0.99')] -
    boundsDf[ , paste0(valueCol, '_0.9')])/0.09

  boundsDf[ , paste0(valueCol, '_left_qr')] = (
    boundsDf[ , paste0(valueCol, '_0.1')] -
    boundsDf[ , paste0(valueCol, '_0.01')])/0.09

  lowerCol = paste0(valueCol, qStr1, '_minus_', deltaStr)
  upperCol = paste0(valueCol, qStr2, '_plus_', deltaStr)

  boundsDf[ , lowerCol] = (
    boundsDf[ , paste0(valueCol, '_', as.character(boundQuantiles[1]))] -
    boundQuantileDelta*boundsDf[ , paste0(valueCol, '_left_qr')])

  boundsDf[ , upperCol] = (
    boundsDf[ , paste0(valueCol, '_', as.character(boundQuantiles[2]))] +
    boundQuantileDelta*boundsDf[ , paste0(valueCol, '_right_qr')])

  boundsDfSumm = boundsDf[ ,
    c(
      wrtCols,
      lowerCol,
      upperCol),
    drop=FALSE]

  F = function(newDf) {
    outDf = merge(newDf, boundsDfSumm, on=wrtCols, sort=FALSE)
    outDf[ , valueCol] = MinComp(
      outDf[ , valueCol],
      outDf[ , upperCol])

    outDf[ , valueCol] = MaxComp(
      outDf[ , valueCol],
      outDf[ , lowerCol])
    outDf = outDf[ , colnames(newDf), drop=FALSE]
    return(outDf)
  }

  p = NULL
  colours = c("blue", "red")
  if (!is.null(splitCol)) {
    p = Plt_splitDfOverlay(
      df=df, splitCol=splitCol, wrtCol=wrtCols, yCol=valueCol)
    cols = c(lowerCol, upperCol)

    boundsDf[ , "lower"] = boundsDf[ , lowerCol]
    boundsDf[ , "upper"] = boundsDf[ , upperCol]

    p = p + geom_line(
      data=boundsDf,
      aes(x=get(wrtCol), y=lower),
      size=0.75, colour=colours[1], linetype=1)

    p = p + geom_line(
      data=boundsDf,
      aes(x=get(wrtCol), y=upper),
      size=0.75, colour=colours[2], linetype=2)

    p = p + theme(legend.position="top")
  }

  return(list(
    "F"=F,
    "boundsDf"=boundsDf,
    "boundsDfSumm"=boundsDfSumm,
    "p"=p))
}


## generates a function which calcualtes bounds
# based on df for each value on wrtCols
# such a function applies to new data
# and bounds the values of the new df
GenBoundsDf = function(
    df,
    wrtCols,
    valueCol,
    splitCol=NULL,
    boundQuantiles=c(0.001, 0.999),
    boundQuantileDelta=0.05,
    lwd=2) {

  probs = sort(c(boundQuantiles, 0.01, 0.1, 0.25, 0.5, 0.75, 0.9, 0.99))
  boundsDf = SummWrt(
      data=df,
      valueCol=valueCol,
      wrtCols=wrtCols,
      probs=probs,
      fcnList=list(mean, sd, min, max),
      fcnNames=c('mean', 'sd', 'min', 'max'),
      addValuePrefix=TRUE)

  qStr1 = paste0('_', as.character(boundQuantiles[1]))
  qStr2 = paste0('_', as.character(boundQuantiles[2]))

  deltaStr = paste0('qr_', as.character(boundQuantileDelta*100), 'pcnt')

  boundsDf[ , paste0(valueCol, '_right_qr')] = (
      boundsDf[ , paste0(valueCol, '_0.99')] -
      boundsDf[ , paste0(valueCol, '_0.9')])/0.09

  boundsDf[ , paste0(valueCol, '_left_qr')] = (
      boundsDf[ , paste0(valueCol, '_0.1')] -
      boundsDf[ , paste0(valueCol, '_0.01')])/0.09

  lowerCol = paste0(valueCol, qStr1, '_minus_', deltaStr)
  upperCol = paste0(valueCol, qStr2, '_plus_', deltaStr)

  boundsDf[ , lowerCol] = (
      boundsDf[ , paste0(valueCol, '_', as.character(boundQuantiles[1]))] -
      boundQuantileDelta*boundsDf[ , paste0(valueCol, '_left_qr')])

  boundsDf[ , upperCol] = (
      boundsDf[ , paste0(valueCol, '_', as.character(boundQuantiles[2]))] +
      boundQuantileDelta*boundsDf[ , paste0(valueCol, '_right_qr')])

  boundsDfSumm = boundsDf[ ,
      c(
        wrtCols,
        lowerCol,
        upperCol),
      drop=FALSE]

  F = function(newDf) {
    outDf = merge(newDf, boundsDfSumm, on=wrtCols, sort=FALSE)
    outDf[ , valueCol] = MinComp(
        outDf[ , valueCol],
        outDf[ , upperCol])

    outDf[ , valueCol] = MaxComp(
        outDf[ , valueCol],
        outDf[ , lowerCol])
    outDf = outDf[ , colnames(newDf), drop=FALSE]
    return(outDf)
  }

  Plt = function() {

    if (length(wrtCols) > 1) {
      return(NULL)
    }

    colours = c("blue", "red")
    if (!is.null(splitCol)) {

      df0 = df[order(df[ , splitCol], df[ , wrtCols]),  ]

      Plt_splitDfOverlay(
        df=df0, splitCol=splitCol, wrtCol=wrtCols, yCol=valueCol)
      cols = c(lowerCol, upperCol)
    } else {
        yMax = max(df[ , valueCol], na.rm=TRUE)
        yMin = min(df[ , valueCol], na.rm=TRUE)
        xMax = max(df[ , wrtCols], na.rm=TRUE)
        xMin = min(df[ , wrtCols], na.rm=TRUE)
        yDelta = (yMax - yMin) / 4
        yMax = yMax + yDelta
        yMin = yMin - yDelta

      plot(
        df[ , wrtCols], df[ , valueCol], xlab=wrtCols, ylab=valueCol,
        xlim=c(xMin, xMax), ylim=c(yMin, yMax), col=ColAlpha("black", 0.3))
    }
    boundsDf = boundsDf[order(boundsDf[ , wrtCols]), ]
    boundsDf[ , "lower"] = boundsDf[ , lowerCol]
    boundsDf[ , "upper"] = boundsDf[ , upperCol]

    lines(boundsDf[ , wrtCols], boundsDf[ , lowerCol], col=colours[1],
      lty=2, lwd=lwd)
    lines(boundsDf[ , wrtCols], boundsDf[ , upperCol], col=colours[2],
      lty=3, lwd=lwd)

    legend(
      x="topright", legend=c("obs", "lower", "upper"),
      col=c("grey", colours), lty=c(1, 2, 3))

  }


  return(list(
    "F"=F,
    "boundsDf"=boundsDf,
    "boundsDfSumm"=boundsDfSumm,
    "Plt"=Plt))
}

Example = function() {

  m = 500
  x = rep(1:100, m)/100
  y = sin(x*2*pi) + rnorm(length(x))
  z = unlist(lapply(X=1:m, FUN=function(i){rep(i, 100)}))

  df = data.frame(x=x, y=y, z=z)
  res = GenBoundsDf(
    df=df,
    wrtCols="x",
    valueCol="y",
    splitCol="z",
    boundQuantiles=c(0.01, 0.99),
    boundQuantileDelta=0.1)

  res[["Plt"]]()

  res = GenBoundsDf(
      df=df,
      wrtCols="x",
      valueCol="y",
      splitCol=NULL,
      boundQuantiles=c(0.01, 0.99),
      boundQuantileDelta=0.1)

  res[["Plt"]]()
}


GenBoundsFcn = function(
    df,
    wrtCols,
    valueCol,
    boundQuantiles=c(0.001, 0.999),
    boundQuantileDelta=0.05) {

  res = GenBoundsDf(
    df=df,
    wrtCols=wrtCols,
    valueCol=valueCol,
    boundQuantiles=boundQuantiles,
    boundQuantileDelta=boundQuantileDelta)
  return(res[["F"]])
}

Example = function() {
  x = sample(1:10, 1000, replace=TRUE)
  y = rnorm(1000)
  df = data.frame(x, y)
  F = GenBoundsFcn(
    df,
    wrtCols="x",
    valueCol="y",
    boundQuantiles=c(0.001, 0.999),
    boundQuantileDelta=0.05)

  df2 = df[1:10, ]
  df2[1, ] = c(1, 100)
  F(df2)
}


## make a longterm avg plot by taking the average in a window
# and moving that across the time
# that window can be a week, month, year etc
LongtAvgPlt = function(
  df,
  timeCol,
  windowCol,
  respCol,
  ylab=NULL,
  ylim=NULL,
  pltTitle="",
  color=ColAlpha("blue", 0.5),
  addToPlot=FALSE,
  addPoints=FALSE,
  cex.lab=1.5,
  cex.axis=1.5,
  cex.main=1.5,
  lty=1) {

    # this is for choosing the major color
    # after averaging
    Func = function(x) {
      x[1]
    }

    df0 = df[ , c(timeCol, windowCol, respCol)]
    df0[ , "color"] = color
    names(df0) = c("timeCol", "windowCol", "col", "color")
    df0 = data.table(df0)
    dfW = df0[ ,
      .(
        contiTime=mean(timeCol),
        col=mean(col),
        color=Func(color),
        num=.N),
      by=.(windowCol)]

    if (is.null(ylab)) {
      ylab = "avg resp across window"
    }

    dfW = data.frame(dfW)
    Plt = plot

    if (addToPlot) {
      Plt = lines
    }

    Plt(
      dfW[ , "contiTime"],
      dfW[ , "col"],
      ylim=ylim,
      type="l",
      xlab="time",
      ylab=ylab,
      main=pltTitle,
      col=dfW[ , "color"],
      lwd=3,
      cex.lab=cex.lab,
      cex.axis=cex.axis,
      cex.main=cex.main,
      lty=lty)
    if (addPoints) {
      points(
        dfW[ , "contiTime"],
        dfW[ , "col"],
        col=ColAlpha("black", 0.6),
        pch=20,
        cex=1)
    }

    return(dfW)
}

Example = function() {

  y1 = 5 * rnorm(100)
  y2 = y1 + 1 + rnorm(100)
  contiTime = 1:100

  window = c(
    rep(1, 20),
    rep(2, 20),
    rep(3, 20),
    rep(4, 20),
    rep(5, 20))

  color = window

  df = data.frame(
    "contiTime"=contiTime,
    "windowCol"=window,
    "color"=color,
    "y1"=y1,
    "y2"=y2)

  ltDf = LongtAvgPlt(
    df=df,
    timeCol="contiTime",
    windowCol="windowCol",
    respCol="y1",
    ylab=NULL,
    pltTitle="",
    color=df[ , "color"])

  LongtAvgPlt(
    df=df,
    timeCol="contiTime",
    windowCol="windowCol",
    respCol="y2",
    ylab=NULL,
    pltTitle="",
    color=df[ , "color"],
    addToPlot=TRUE,
    lty=2)
}

######## fit Box-Cox
## pivot the data wrt appropriate variables.
# for daily weather it should be pivotPar=c("doy","year")
BuildDataPivot = function(df, resp, pivotVar=c("week", "year")) {
  castText = paste(
    "dfPivot = cast(df,",
    pivotVar[1],
    "~",
    pivotVar[2],
    ",value=\"",
    resp,
    "\", fun.aggregate=mean)",
    sep="")
  eval(parse(text=castText))
  return(dfPivot)
}

## build a function to estimate boxcox
EstimBoxCoxFcn = function(
  OptCrit,
  shiftIt,
  powRange=seq(1/2, 2, 1/4),
  shiftGridNum=10,
  prefCoef=c(1, 0.1, 1)) {
  function(x, ...) {

    x = x[-1]
    res = EstimBoxCox(
        x,
        OptCrit=OptCrit,
        powRange=powRange,
        shiftIt=shiftIt,
        shiftGridNum=shiftGridNum,
        prefCoef=prefCoef)

    y = res[["transX"]]
    c(
      res[["param"]],
      res[["value"]],
      res[["newValue"]],
      mean(x, na.rm=TRUE),
      sd(x, na.rm=TRUE),
      mean(y, na.rm=TRUE),
      sd(y, na.rm=TRUE))
  }
}

## Apply BoxCox pointwise appropriately,
# we choose BoxCox parameters with respect to
# e.g. week/day (or day of year in this case)
BoxCox1Var = function(boxCoxRes) {
  Asn = as.numeric
  x = boxCoxRes[[resp]]
  x = Asn(x)
  pow = boxCoxRes[["pow"]]
  pow = Asn(pow)
  shift = boxCoxRes[["shift"]]
  shift = Asn(shift)
  out = NA
  if (!is.na(x)) {
    out = BoxCox(x=x, pow=pow, shift=shift)
  }
  return(out)
}

## manual way to apply BoxCox pointwise appropriately
Example = function() {
  dfValues = rawDf
  n = dim(dfValues)[1]
  dfValues[ , "tempOrder"] = 1:n
  data = merge(dfValues,dfColsArgs,by="week",all.x=TRUE)
  data = data[order(data[,"tempOrder"]), ]
  cols = c(wrtCols, valueCols, colnames(dfColsArgs))
  cols = unique(cols)

  data2 = data[ , cols]
  #data2 = data.table(data2)
  transX1 = apply(data2, 1, fcn)
}

## smooth boxCox parameters over time (of year)
SmoothBoxCoxRes = function(boxCoxRes, method="smoothSpline", ord=10) {

  pow = boxCoxRes[ , "pow"]
  shift = boxCoxRes[ , "shift"]
  timeLen = length(pow)
  data = data.frame(t=1:timeLen, pow=pow, shift=shift)
  y1 = data[ , "pow"]
  y2 = data[ , "shift"]

  if (method == "regression") {
    fsData = Fseries(data=data, colName="t",
      omega=(2*pi)/timeLen, ord=ord)[["df"]]
    mod1 = GlmFitMatr(y=y1, explan=fsData, family="gaussian")
    mod2 = GlmFitMatr(y=y2, explan=fsData, family="gaussian")
    yPred1 = predict(mod1, explan=fsData)
    yPred2 = predict(mod2, explan=fsData)
  }

  if (method == "smoothSpline") {
    mod1 = smooth.spline(x=1:timeLen, y=y1, spar=0.7)
    mod2 = smooth.spline(x=1:timeLen, y=y2, spar=0.7)
    yPred1 = predict(mod1, explan=fsData)[["y"]]
    yPred2 = predict(mod2, explan=fsData)[["y"]]
  }

  if (method == "GPfit") {
    InstallLoad("GPfit")
    #xMat = matrix((1:timeLen)/timeLen)
    fsData = Fseries(data=data, colName="t",
      omega=(2*pi)/timeLen, ord=1)[["df"]]
    xMat = fsData
    fsData = (fsData + 1)/2
    y = y1
    mod = GP_fit(xMat, y)
    res = predict(mod)
    yPred1 = res[["Y_hat"]]
    y = y2
    mod = GP_fit(xMat, y)
    res = predict(mod)
    yPred2 = res[["Y_hat"]]
  }

  if (method == "filt") {
    yPred1 = MovingAvg(vec=y1, filterNum=ord, filterPpns=NULL,
      missNum=100, periodic=TRUE)
    yPred2 = MovingAvg(vec=y2, filterNum=ord, filterPpns=NULL,
      missNum=100, periodic=TRUE)
  }


  par(mfrow=c(1,2))
  plot(1:timeLen, y1, pch=20, col="grey")
  lines(1:timeLen, yPred1, col="red")
  plot(1:timeLen, y2, pch=20, col="grey")
  lines(1:timeLen, yPred2, col="red")

  return(list(powPred=yPred1, shiftPred=yPred2))
}

## plot boxcox trans results
PlotBoxCoxRes = function(
    dfPivot,
    powPred,
    shiftPred,
    OptCrit=FitGoodness) {

  n = dim(dfPivot)[1]
  m1 = m2 = rep(NA, dim(dfPivot)[1])
  s1 = s2 = rep(NA, dim(dfPivot)[1])
  valueVec = rep(NA, dim(dfPivot)[1])
  newValueVec = rep(NA, dim(dfPivot)[1])

  Func = function(i) {
    x = dfPivot[i, ]
    x = as.numeric(x)
    x = x[-1]
    y = BoxCox(x, pow=powPred[i], shift=shiftPred[i])
    m1 = mean(x, na.rm=TRUE)
    m2 = mean(y, na.rm=TRUE)
    s1 = sd(x, na.rm=TRUE)
    s2 = sd(y, na.rm=TRUE)
    qsk1 = QskewAbs(na.omit(x))
    qsk2 = QskewAbs(na.omit(y))
    sk1 = SkewnessAbs(na.omit(x))
    sk2 = SkewnessAbs(na.omit(y))
    kur1 = kurtosis(x, na.rm=TRUE)
    kur2 = kurtosis(y, na.rm=TRUE)

    value = OptCrit(x)
    newValue = OptCrit(y)
    return(c(m1, m2, s1, s2, sk1, sk2, qsk1, qsk2, kur1, kur2, value,
      newValue))
  }

  x = matrix(1:dim(dfPivot)[1])
  mat = apply(X=x, MARGIN=1, FUN=Func)
  mat = t(mat)

  m1 = mat[ , 1]
  m2 = mat[ , 2]
  s1 = mat[ , 3]
  s2 = mat[ , 4]
  sk1 = mat[ , 5]
  sk2 = mat[ , 6]
  qsk1 = mat[ , 7]
  qsk2 = mat[ , 8]
  kur1 = mat[ , 9]
  kur2 = mat[ , 10]
  valueVec = mat[ , 11]
  newValueVec = mat[ , 12]

  Max2 = function(x) {
    max(x, na.rm=TRUE)
  }

  Min2 = function(x) {
    min(x, na.rm=TRUE)
  }

  YlimFcn = function(x1, x2) {
    c(Min2(c(x1, x2)), Max2(c(x1, x2)))
  }


  par(mfrow=c(2, 7))
  plot2 = function(x,...) {
    plot(x, xlab="day of year", type="l", col=ColAlpha("red"), ...)
  }

  plot(powPred, main="BoxCox pow", type="l", lwd=2)
  plot2(m1, ylim=YlimFcn(m1, m2), main="mean bef BC")
  plot2(s1, ylim=YlimFcn(s1, s2), main="sd bef BC")
  plot2(sk1, ylim=YlimFcn(sk1, sk2), main="skew bef BC")
  plot2(qsk1, ylim=YlimFcn(qsk1, qsk2), main="qskew bef BC")
  plot2(kur1, ylim=YlimFcn(kur1, kur2), main="kurt bef BC")
  plot2(valueVec, ylim=YlimFcn(valueVec, newValueVec), main="GOF bef BC")

  plot(shiftPred, main="BoxCox shift", type="l",lwd=2)

  plot2(m2, ylim=YlimFcn(m1, m2), main="mean aft BC")
  plot2(s2, ylim=YlimFcn(s1, s2), main="sd aft BC")
  plot2(sk2, ylim=YlimFcn(sk1, sk2), main="skew aft BC")
  plot2(qsk2, ylim=YlimFcn(qsk1, qsk2), main="qskew aft BC")
  plot2(kur2, ylim=YlimFcn(kur1, kur2), main="kurt aft BC")
  plot2(newValueVec, ylim=YlimFcn(valueVec, newValueVec), main="GOF aft BC")
}

################ time series weather functions
## source function depending on online or off-line
# source(paste(addr.source,"extract_rawData_frost.r",sep=""),local=TRUE)

## this is a function to build explan data for the data set
# we can then later use it for explan generation for future data as well
BuildLagFsDf = function(
  df,
  respCols,
  LagFcn=NULL,
  fsFcnList=NULL,
  fut=FALSE,
  contiTimeFut=NULL) {

  lagMatList = list()
  outDf = NULL

  if (!is.null(LagFcn)) {
    for (r in respCols) {
      lagMatList[[r]] = LagFcn(df[ , r], colName=r, fut=fut)
    }

    for (r in respCols) {
      if (!is.null(outDf)) {
        outDf = cbind(outDf, lagMatList[[r]])
        } else {
          outDf = lagMatList[[r]]
        }
    }
  }


  if (!is.null(fsFcnList)) {
    timeCols = names(fsFcnList)

    for (i in 1:length(timeCols)) {

        FsFcn = fsFcnList[[i]]
        if (!fut) {
          sinCosMat = FsFcn(df[ , timeCols[i]])
        } else {
          sinCosMat = FsFcn(contiTimeFut[i])
        }

        if (is.null(outDf))  {
          outDf = sinCosMat
        } else {
          outDf = cbind(outDf, sinCosMat)
        }
      }

  }

  return(outDf)
}

Example = function() {

  longtIndex = c(1, 2, 5)
  LagFcn = CreateLagIndFcn(lagIndex=1:3,
    longtIndex=longtIndex, AvgFcn=rowMeans)
  FsFcnTod = CreateFsIndFcn(sinOrd=1:2, cosOrd=1:2,
    omega=(2*pi/24), colName="tod")
  FsFcnTow = CreateFsIndFcn(sinOrd=1:3, cosOrd=1:3,
    omega=(2*pi/7), colName="tow")

  fsFcnList = list("tod"=FsFcnTod, "tow"=FsFcnTow)

  yCol = "count"
  t1 = strptime("01/01/2011 11:23", "%d/%m/%Y %H:%M", tz = "Europe/London")
  t1 = chron::as.chron(t1)
  t2 = t1 +  365
  timeDf = CreateTimeGrid(t1, t2, delta=1/(24))
  df = timeDf
  df[ , yCol] = rpois(dim(df)[1], lambda=10)

  futTs = t2 + 1/24
  futTimeInfo = BuildTimeDf(futTs)
  contiTimeFut = futTimeInfo[ , names(fsFcnList)]

  modDf = BuildLagFsDf(
    df=df,
    respCols=c("count"),
    LagFcn=LagFcn,
    fsFcnList=fsFcnList,
    fut=FALSE,
    contiTimeFut=NULL)

  modDfFut = BuildLagFsDf(
    df=df,
    respCols=c("count"),
    LagFcn=LagFcn,
    fsFcnList=fsFcnList,
    fut=TRUE,
    contiTimeFut=contiTimeFut)
}

## from basic model builds a data with lags and fourier series terms
# this also imputes the data: the imputation will replace
# the missing value with avg wrt wrtCols
BuildTsModDf = function(
  df,
  respCols,
  LagFcn,
  fsFcnList,
  wrtCols=NULL,
  omitNa=FALSE) {

  outDf = BuildLagFsDf(
    df=df,
    respCols=respCols,
    LagFcn=LagFcn,
    fsFcnList=fsFcnList,
    fut=FALSE,
    contiTimeFut=NULL)

  cols = names(outDf)
  augDf = cbind(df, outDf)

  if (!is.null(wrtCols)) {
    augData = ImputeDf(
        df=augDf,
        valueCols=cols,
        wrtCols=wrtCols,
        AggFunc=NULL,
        replaceCol=TRUE)
  }

  if (omitNa) {
    augDf = na.omit(augDf)
  }

  return(augDf)
}

## this is for generic seasonal time series eg traffic
BuildTsModDf_lagFourierParams = function(
    df,
    todK=NULL,
    todK_wknd=NULL,
    towK=NULL,
    toyK=NULL,
    lagK=NULL,
    longtIndex=NULL,
    lagTrans=function(x){x},
    imputeWrtCols,
    omitNa=FALSE
    ) {

  LagFcn = NULL

  if (!is.null(lagK) || !is.null(longtIndex)) {
    LagFcn = CreateLagIndFcn(
        lagIndex=1:lagK,
        longtIndex=longtIndex,
        AvgFcn=rowMeans,
        LagTrans=LagTrans)
  }

  fsFcnList = list()

  if (!is.null(todK)) {
    FsFcnTod = CreateFsIndFcn(
        sinOrd=1:todK,
        cosOrd=1:todK,
        omega=(2*pi/24),
        colName="tod")

    fsFcnList[["tod"]] = FsFcnTod
  }

  if (!is.null(todK_wknd)) {

    FsFcnTod_wknd = CreateFsIndFcn(
        sinOrd=1:todK_wknd,
        cosOrd=1:todK_wknd,
        omega=(2*pi/24),
        colName="tod_wknd")

    fsFcnList[["tod_wknd"]] = FsFcnTod_wknd
  }

  if (!is.null(towK)) {
    FsFcnTow = CreateFsIndFcn(
        sinOrd=1:towK,
        cosOrd=1:towK,
        omega=(2*pi/7),
        colName="tow")

    fsFcnList[["tow"]] = FsFcnTow
  }


  if (!is.null(toyK)) {
    FsFcnToy = CreateFsIndFcn(
        sinOrd=1:toyK,
        cosOrd=1:toyK,
        omega=(2*pi/366),
        colName="toy")

    fsFcnList[["toy"]] = FsFcnToy
  }

  modDf = BuildTsModDf(
      df=df,
      respCols=yCol,
      LagFcn=LagFcn,
      fsFcnList=fsFcnList,
      wrtCols=imputeWrtCols,
      omitNa=FALSE)

  return(list(
      "modDf"=modDf,
      "fsFcnList"=fsFcnList,
      "LagFcn"=LagFcn))
}

## adds the explan data to a data set.
# this is useful for fut explan generation
Add_LagFs_explanDf = function(
  df,
  respCols,
  LagFcn,
  fsFcnList,
  fut=FALSE,
  futDfRow=NULL) {

  contiTimeFut = futDfRow[ , names(fsFcnList)]
  contiTimeFut = as.vector(contiTimeFut)
  outDf = BuildLagFsDf(
    df=df,
    respCols=respCols,
    LagFcn=LagFcn,
    fsFcnList=fsFcnList,
    fut=fut,
    contiTimeFut=contiTimeFut)

  if (!fut) {
    augDf = cbind(df, outDf)
  } else {
    #data[dim(data)[1] , , drop=FALSE]
    augDf = cbind(futDfRow, outDf)
  }
  return(augDf)
}

Example = function() {
  longtIndex = c(1, 2, 5)
  LagFcn = CreateLagIndFcn(
    lagIndex=1:3,
    longtIndex=longtIndex,
    AvgFcn=rowMeans)
  FsFcnTod = CreateFsIndFcn(
    sinOrd=1:2,
    cosOrd=1:2,
    omega=(2*pi/24),
    colName="tod")
  FsFcnTow = CreateFsIndFcn(
    sinOrd=1:3,
    cosOrd=1:3,
    omega=(2*pi/7),
    colName="tow")
  fsFcnList = list("tod"=FsFcnTod, "tow"=FsFcnTow)

  yCol = "count"
  t1 = strptime("01/01/2011 11:00", "%d/%m/%Y %H:%M", tz="Europe/London")
  t1 = chron::as.chron(t1)
  t2 = t1 +  365
  timeDf = CreateTimeGrid(t1, t2, delta=1/(24))
  df = timeDf
  df[ , yCol] = rpois(dim(df)[1], lambda=10)

  futTs = t2 + 1/24
  futTimeInfo = BuildTimeDf(futTs)
  futDfRow = futTimeInfo
  contiTimeFut = futTimeInfo[ , names(fsFcnList)]

  modDf = Add_LagFs_explanDf(
    data=df,
    respCols=c("count"),
    LagFcn=LagFcn,
    fsFcnList=fsFcnList,
    fut=FALSE,
    futDfRow=NULL)

  modDfFut = Add_LagFs_explanDf(
    data=df,
    respCols=c("count"),
    LagFcn=LagFcn,
    fsFcnList=fsFcnList,
    fut=TRUE,
    futDfRow=futDfRow)
}

#### Auto-correlation plot by season ######
CreateAutoCSeas = function(
  data,
  resp="tmin",
  Fcn=pacf,
  fcnName="PAC",
  lagNum=5) {

  G = function(seas) {
    data_seas = subset(data, is.element(data[ , "month"], seas))
    val = data_seas[ , resp]
    acf = Fcn(na.omit(val), plot=FALSE)
    return(acf[["acf"]])
    #lags = acf[["lag"]]
  }

  valWin = G(c(12, 1, 2))
  valSpr = G(4:5)
  valSum = G(6:8)
  valAut = G(9:11)

  plot(1:lagNum, valWin[1:lagNum], type="l", xlab="lag",
    ylab=fcnName, cex.lab=1.2, cex.axis=1.2, ylim=c(0, 1), col=2)
  lines(1:lagNum, valSpr[1:lagNum], lwd=2, lty=2, col=3)
  lines(1:lagNum, valSum[1:lagNum], lwd=2, lty=3, col=4)
  lines(1:lagNum, valAut[1:lagNum], lwd=2, lty=4, col=5)
  abline(0, 0, lty=2)
  legend(x=lagNum-2, y=max(valWin, valSpr, valSum, valAut),
    legend=c("Win", "Spr", "Sum", "Fall"), lwd=c(2, 2, 2, 2),
    col=2:5, lty=1:4)
}

Example = function() {
  # CreateAutoCSeas(data=data, resp="tmin", Fcn=pacf, fcnName="PAC", lagNum=5)
  # CreateAutoCSeas(data=data, resp="tmax", Fcn=pacf, fcnName="PAC", lagNum=5)
  fn = paste0(odir, "pac_per_season.png")
  png(fn)
  CreateAutoCSeas(data=data, resp="tmin", Fcn=pacf, fcnName="PAC", lagNum=5)
  dev.off()
}

##### Distribution
# y
# doyVec
CheckHistMonthly = function(data, doyCol="doy", resp, Fcn=hist) {
  monthVecStarts = c(1, 31, 61, 91, 121, 151, 181, 211, 241, 271,
    301, 331, 361)
  par(mfrow=c(3, 4))

  for (i in 1:12) {
    minn = monthVecStarts[i]
    maxx = monthVecStarts[i+1]
    ind = which(data[ , doyCol] >= minn & data[ , doyCol]<=maxx)
    yres = data[ind, resp]
    Fcn(yres, main=resp)
  }
}
# CheckHistMonthly(data, resp)

CheckQqMonthly = function(data, doyCol="doy", resp) {
  monthVecStarts = c(1, 31, 61, 91, 121, 151, 181, 211, 241, 271, 301,
    331, 361)
  par(mfrow=c(3, 4))
  for (i in 1:12) {
    minn = monthVecStarts[i]
    maxx = monthVecStarts[i+1]
    ind = which(data[ , doyCol] >= minn & data[ , doyCol] <= maxx)
    yres = data[ind, resp]
    qqnorm(yres, main=resp, col=ColAlpha("black", 0.5))
    qqline(yres)
  }
}

CheckHistBiMonthly = function(data, doyCol="doy", resp, Fcn=hist) {
  bimonthVecStarts = c(1, 61, 121, 181, 241, 301, 361)
  par(mfrow=c(3, 2))
  for (i in (1:6)) {
    minn = bimonthVecStarts[i]
    maxx = bimonthVecStarts[i+1]

    ind = which(data[ , "doy"] >= minn & data[ , "doy"] <= maxx)
    z = data[ind, resp]

    z = na.omit(z)
    i1 = 2*i-1
    i2 = 2*i
    textt = paste("months ", i1, "and", i2)
    Fcn(z, main=textt, probability=TRUE, xlab="temp(C)")
  }
}
# CheckHistBiMonthly(data, resp)

CheckQqBiMonthly = function(data, resp) {
  bimonthVecStarts = c(1, 61, 121, 181, 241, 301, 361)
  par(mfrow=c(3,2))
  for (i in (1:6)) {
    minn = bimonthVecStarts[i]
    maxx = bimonthVecStarts[i+1]
    ind = which(data[ , "doy"] >= minn & data[ , "doy"] <= maxx)
    z = data[ind, resp]
    z = na.omit(z)
    i1 = 2*i-1
    i2 = 2*i
    textt = paste("monthVecs ", i1, "and", i2)
    qqnorm(z, main=textt, ColAlpha("black", 0.5))
    qqline(z)
  }
}

# CheckQqBiMonthly(data=data, resp="tmax")
TestDistShapiro = function(data) {
  monthVecStarts = c(1, 31, 61, 91, 121, 151, 181, 211, 241,
    271, 301, 331, 361)
  for (i in (1:12)) {
    minn = monthVecStarts[i]
    maxx = monthVecStarts[i]+1

    ind = which(data[ , "doy"] >= minn & data[ , "doy"] <= maxx)
    yres = data[ind , resp]

    test = shapiro.test(yres)
    print(test)
  }
}
# TestDistShapiro(data)

## extracting optimal columns from msRes
MsExtractOptCols = function(msRes, critNum=2) {

  topMod = msRes[["topModMatrix"]]
  optMod = topMod[1, ]
  explanColsOpt = optMod[(critNum + 1):length(optMod)]
  explanColsOpt = as.matrix(explanColsOpt)
  names(explanColsOpt) = NULL
  explanColsOpt = na.omit((explanColsOpt)[1, ])
  #print(explanColsOpt)
  crit = round(optMod[1:critNum], 3)
  #print(crit)
  return(explanColsOpt)
}

## plot a model given the optimal columns
FitPlotMod = function(explanColsOpt) {
  formulaString = paste(resp, "~", PlusVec(explanColsOpt))
  mod = glm(formula(formulaString), data=data)
  AIC(mod)
  BIC(mod)
  yFit = predict(mod)
  plot(data[ , "contiTime"], yFit, type="l")
}

## selects models and extracts columns
ModSelExtract = function(
  df, yCol, explanCols, explanColsF=NULL, crit="BIC", family="gaussian",
  ModelFcn=glm) {

  msRes = ModSelSubsetFormula(
    df=df,
    yCol=yCol,
    explanCols=explanCols,
    explanColsF=explanColsF,
    m=1,
    crit=crit,
    family=family,
    ModelFcn=glm)

  explanColsOpt = MsExtractOptCols(msRes)
  return(explanColsOpt)
}

Example = function() {
  l = list()
  crit = "AIC"
  l[["S1"]] = ModSelExtract(
    data=data, yCol=resp, explanCols=groupsList[["seas"]],
    explanColsF=NULL, crit=crit)

  ###### Extend to AR:
  l[["S2"]] = ModSelExtract(
    data=data, yCol=resp, explanCols=groupsList[["lag"]],
    explanColsF=l[["S1"]], crit=crit)


  l[["S3"]] = ModSelExtract(
    data=data, yCol=resp, explanCols=groupsList[["longt"]],
    explanColsF=l[["S1"]], crit=crit)

  l[["S4"]] = ModSelExtract(
    data=data, yCol=resp, explanCols=groupsList[["pow"]],
    explanColsF=l[["S1"]], crit=crit)

  l[["S5"]] = ModSelExtract(
    data=data, yCol=resp, explanCols=c(l[["S2"]], l[["S3"]], l[["S4"]]),
    explanColsF=l[["S1"]], crit=crit)
}

## specify model selection sequence
# in each step we may refer to a previous step or the groups
ApplySeqMs = function(
  df,
  yCol,
  crit,
  msSteps,
  family="gaussian",
  ModelFcn=glm) {

  l = list()
  n = length(msSteps)

  for (i in 1:n) {
    #print(paste("*** Model Selection: Step", i))
    step = msSteps[[i]]
    name = step[["name"]]
    explanColsF = step[["explanColsF"]]
    explanCols = step[["explanCols"]]
    explanColsFInd = step[["explanColsFInd"]]
    explanColsInd = step[["explanColsInd"]]
    #if (is.null(explanColsF) & !is.null(explanColsFInd))
    if (!is.null(explanColsFInd)) {
      explanColsF = c(explanColsF, unlist(l[explanColsFInd]))
    }
    if (is.null(explanCols) & !is.null(explanColsInd)) {
      explanCols = unlist(l[explanColsInd])
    }

    chosenCols = ModSelExtract(
      df=df,
      yCol=yCol,
      explanCols=explanCols,
      explanColsF=explanColsF,
      crit=crit,
      family=family,
      ModelFcn=ModelFc)

    #Mark(chosenCols)
    #Mark(explanColsF)

    l[[name]] = list("explanColsF"=explanColsF, "chosenCols"=chosenCols)
  }
  return(l)
}

## builds a FitFcn for generalized model selection via ModSelSubsetGen
# this is to pick best volatility
FitFcnBuild = function(df, yCol, explanCols, iter=10) {
  function(omegaPredCols) {
    out = FitGaussVolatMod(
      df=df,
      yCol=yCol,
      explanCols=explanCols,
      omegaPredCols=omegaPredCols,
      iter=10)[["BIC"]]
  }
}

## this function finds the optimal explan for autoregressive trend
# then it finds the optimal volatility structure
ApplySeqMs_tvv = function(
  df,
  yCol,
  msSteps,
  crit="BIC",
  omegaPredCols,
  critVolat="BIC",
  varNumVolat=2,
  volatDeviance=FALSE,
  writePath=NULL,
  fileNameExtra="_raw") {

  msRes_tvv = ApplySeqMs(
    df=df,
    yCol=yCol,
    crit=crit,
    msSteps=msSteps,
    family="gaussian")

  #print("*** result of ms")
  #print(res)

  optModExplan = sort(unique(as.vector(unlist(msRes_tvv))))
  seasModExplan = res[[1]]
  explanMsRes = res

  if (!is.null(writePath)) {
    write.csv(
      x=optModExplan,
      file=paste0(writePath,"opt_model", fileNameExtra, ".csv"),
      row.names=FALSE)
  }

  df2 = df

  if (volatDeviance) {
    formulaString = paste(yCol, "~", PlusVec(seasModExplan))
    seasMod = glm(as.formula(formulaString), data=modDf)
    d = df[ , yCol] - predict(seasMod)
    d = d[-length(d)]
    d = c(0, d)
    absD = abs(d)
    df2[ , "devFromUsual"] = d
    df2[ , "absDevFromUsual"] = absD
    omegaPredCols = c(omegaPredCols, "absDevFromUsual")
  }

  ## fitting varying volatility models
  res = ModSelSubsetGen(
    yCol,
    explanColsF=NULL,
    explanCols=omegaPredCols,
    FitFcn=FitFcnBuild(
      df=df2,
      yCol=yCol,
      explanCols=optModExplan),
    m=3,
    crit=critVolat,
    varNum=varNumVolat,
    critSummaryDelta=0.1)

  omegaPredColsOpt = MsExtractOptCols(res, critNum=1)

  volatMsRes = res

  if (!is.null(writePath)) {
    write.csv(
      x=omegaPredColsOpt,
      file=paste0(writePath, "opt_omega", fileNameExtra, ".csv"),
      row.names=FALSE)
  }

  outList = list()
  outList[["optModExplan"]] = optModExplan
  outList[["seasModExplan"]] = seasModExplan
  outList[["explanMsRes"]] = explanMsRes
  outList[["omegaPredColsOpt"]] = omegaPredColsOpt
  outList[["volatMsRes"]] = volatMsRes
  outList[["expandedData"]] = df2

  return(outList)
}

## initial can be the series upto the date we simulate from
## it augments the data with lag info
SimulNextWithModFcn = function(mod) {

  if ("gaussVolatMod" %in% class(mod)) {
    Fcn = function(futData) {
      pred = predict.gaussVolatMod(gaussVolatMod=mod, newdata=futData)
      ySim = pred[["mean"]] + rnorm(1, mean=0, sd=pred[["vol"]])
      return(ySim)
    }
  }  else if (paste(class(mod), collapse=",") == "glm,lm") {
    modFamily = family(mod)
    familyStr = family(mod)["family"]
    linkStr = family(mod)["link"]
    disp = mod[["dispersion"]]

    if (familyStr == "quasi" & linkStr == "log") {
      VarFcn = family(mod)["variance"]["variance"]
      Fcn = function(futData) {
        mu = exp(predict(mod, newdata=futData))
        theta = mod[["theta"]]
        ySim = rqpois(n=1, mu=mu, theta=theta)
        return(ySim)
      }
    }

    if (familyStr == "quasipoisson" & linkStr == "log") {
      rqpois = function(n, mu, theta) {
          rnbinom(n=n, mu=mu, size=mu/(theta-1))
      }
      Fcn = function(futData) {
        mu = exp(predict(mod, newdata=futData))
        var = disp * mu
        #ySim = rpois(1, lambda=yPred)
        #ySim = rqpois(n=1, mu=pred, theta=theta)
        ySim = rnorm(1, mean=mu, sd=sqrt(var))
        ySim = max(0, ySim)
        return(ySim)
      }
    }

    if (familyStr == "poisson" & linkStr == "log") {
      Fcn = function(futData) {
        mu = exp(predict(mod, newdata=futData))
        ySim = rpois(1, lambda=mu)
        return(ySim)
      }
    }

    if (familyStr == "poisson" & linkStr == "sqrt") {
      Fcn = function(futData) {
        mu = predict(mod, newdata=futData)^2
        ySim = rpois(1, lambda=mu)
        return(ySim)
      }
    }

    if (familyStr == "Gamma" & linkStr == "log") {
      alpha = 1/disp
      Fcn = function(futData) {
        mu = exp(predict(mod, newdata=futData))
        beta = alpha/mu
        ySim = rgamma(1, shape=alpha, rate=beta)
        return(ySim)
      }
    }

    if (familyStr == "Gamma" & linkStr == "identity") {
      alpha = 1/disp
      Fcn = function(futData) {
        mu = predict(mod, newdata=futData)
        beta = alpha/mu
        ySim = rgamma(1, shape=alpha, rate=beta)
        return(ySim)
      }
    }

    if (familyStr == "Gamma" & linkStr == "inverse") {
      alpha = 1/disp
      Fcn = function(futData) {
        mu = 1/predict(mod, newdata=futData)
        beta = alpha/mu
        ySim = rgamma(1, shape=alpha, rate=beta)
        return(ySim)
      }
    }
  } else if (paste(class(mod), collapse=",") == "negbin,glm,lm") {
    modFamily = family(mod)
    familyStr = family(mod)["family"]
    linkStr = family(mod)["link"]

    if (grepl("Negative Binomial", familyStr) & linkStr == "log") {
      #print("DUH")
      Fcn = function(futData) {
        mu = exp(predict(mod, newdata=futData))
        theta = mod[["theta"]]
        theta = theta + 1
        ySim = rnegbin(1, mu=mu, theta=theta)
        return(ySim)
      }
    }
  } else {
    print("Warning: prediction is not implemented for your model.")
    Fcn = function(x) NULL
  }
  return(Fcn)
}

# SimulNext(t=2007, data=basicData, respCols=respCols, mod=mod, TransFcn=function(x)x)
SimulateSeries = function(
  futDf,
  prevDf,
  mod,
  respCols,
  fsFcnList,
  LagFcn,
  simNum=1,
  limitSimValue=FALSE,
  yMax=NULL,
  yMin=NULL,
  Transf=function(x)x,
  applyBounds=FALSE,
  BoundDf=NULL,
  boundsWrtCols=NULL) {

  SimViaModel = SimulNextWithModFcn(mod)
  ## a function to simulate next value
  SimulNext = function(
    futDfRow, df, respCols, mod,
    fsFcnList, LagFcn, TransFcn=function(x)x) {

    futDf2 = Add_LagFs_explanDf(
      df=df,
      respCols=respCols,
      LagFcn=LagFcn,
      fsFcnList=fsFcnList,
      fut=TRUE,
      futDfRow=futDfRow)

    ySim = SimViaModel(futDf2)
    ySim = Transf(ySim)

    changedValue_global = 0
    changedValue_wrt = 0
    if (limitSimValue) {
      ySim2 = MinComp(ySim, yMax)
      ySim2 = MaxComp(ySim2, yMin)
      if (ySim2 != ySim) {
        ySim = ySim2
        changedValue_global = 1
      }
    }

    if (applyBounds) {
      newDf = setNames(
        data.frame(matrix(
          ncol=length(boundsWrtCols) + 1,
          nrow=0)),
        c(boundsWrtCols, respCols))
      newDf[1, ] = c(futDf2[1, boundsWrtCols], ySim)
      boundedDf = BoundDf(newDf=newDf)
      ySim2 = boundedDf[1, respCols]
      if (ySim2 != ySim) {
        ySim = ySim2
        changedValue_wrt = 1
      }
    }
    return(list(
      "ySim"=ySim,
      "changedValue_global"=changedValue_global,
      "changedValue_wrt"=changedValue_wrt))
  }


  SimFutVec = function(x) {
    futDf[ , respCols] = 0
    cols = names(futDf)
    df = prevDf[ , cols, drop=FALSE]

    changedNum_global = 0
    changedNum_wrt = 0
    changedNum_both = 0

    for (i in 1:dim(futDf)[1]) {
      futDfRow = futDf[i, , drop=FALSE]
      l = SimulNext(
        futDfRow=futDfRow, df=df, respCols=respCols,
        mod=mod, LagFcn=LagFcn, fsFcnList=fsFcnList,
        TransFcn=TransFcn)
      futDfRow[ , respCols] = l[["ySim"]]

      changedNum_global = changedNum_global + l[["changedValue_global"]]
      changedNum_wrt = changedNum_wrt + l[["changedValue_wrt"]]
      changedNum_both = (
        changedNum_both +
        l[["changedValue_global"]]*l[["changedValue_wrt"]])

      futDf[i, ] = futDfRow
      df = rbind(df, futDfRow)
    }
    return(list(
        "futDf"=futDf,
        "changedNum_global"=changedNum_global,
        "changedNum_wrt"=changedNum_wrt,
        "changedNum_both"=changedNum_both))
  }

  resList = lapply(X=1:simNum, FUN=SimFutVec)

  changedNum_global = 0
  changedNum_wrt = 0
  changedNum_both = 0

  futDf2 = futDf
  for (i in 1:simNum) {
    res = resList[[i]]
    futDf2[ , paste0(respCols, "_sim", i)] = res[["futDf"]][ , respCols]

    changedNum_global = changedNum_global + res[["changedNum_global"]]
    changedNum_wrt = changedNum_wrt + res[["changedNum_wrt"]]
    changedNum_both = changedNum_both + res[["changedNum_both"]]
  }

  futDf2[ , respCols] = futDf2[ , paste0(respCols, "_sim", 1)]

  Q1 = function(x) {quantile(x=x, p=0.25)}
  Q3 = function(x) {quantile(x=x, p=0.75)}

  for (respCol in respCols) {
    cols = paste0(respCol, "_sim", 1:simNum)
    futDf2[ , paste0(respCol, "_sim_mean")] = apply(
      futDf2[ , cols, drop=FALSE], 1, mean)
    futDf2[ , paste0(respCol, "_sim_sd")] =  apply(
      futDf2[ , cols, drop=FALSE], 1, sd)
    futDf2[ , paste0(respCol, "_sim_Q1")] =  apply(
      futDf2[ , cols, drop=FALSE], 1, Q1)
    futDf2[ , paste0(respCol, "_sim_Q3")] =  apply(
      futDf2[ , cols, drop=FALSE], 1, Q3)
  }

  n = dim(futDf)[1] * simNum

  return(list(
    "futDf"=futDf2,
    "changedPcnt_global"=100*changedNum_global / n,
    "changedPcnt_wrt"=100*changedNum_wrt / n,
    "changedPcnt_both"=100*changedNum_both / n))
}

## generate timeseries
CreateYearDoyIndex = function(timeStart, timeEnd, origin_forTimeVars=2000) {

  timeGrid = seq(timeStart, timeEnd, 1)
  timeDf = BuildTimeDf(
    timeGrid, origin_forTimeVars=origin_forTimeVars)
  out = timeDf[ , c("year", "doy", "contiTime")]
  return(out)
}

Example = function() {

  lastDay = basicData[dim(basicData)[1], "date"]
  timeStart = lastDay + 1
  timeEnd = lastDay + 366
  contiTime = CreateYearDoyIndex(timeStart, timeEnd)[ , "contiTime"]
}

## it tests constructed time series data
SanityCheck_tsModDf = function(obsDf, modDf, fsFcnList) {
  plot(modDf[ , "doy"])
  lines(plot(modDf[ , "toy"]))
  #Pause()
  plot(modDf[ , "contiTime"], obsDf[ , "contiTime"])
  #Pause()
  ind = 380:420
  plot(obsDf[ind, "contiTime"], modDf[ind, "tod"], type="p")
  #Pause()
  ind = 1:2000
  par(mfrow=c(1, length(names(fsFcnList))))
  #Pause()
  for (name in names(fsFcnList)) {
    plot(
      modDf[ind, "contiTime"],
      modDf[ind, paste0(name, "sin1")], type="p")
  }
  name = "tow"
  ind = 1:dim(modDf)[1]
  plot(modDf[ind, name], modDf[ind, paste0(name, "sin2")])
  #Pause()
  par(mfrow=c(1, 2))
  ind = 1:(24*7)
  plot(modDf[ind, "contiTime"], modDf[ind, "tow"])
  #Pause()
  ind = 1:(24)
  plot(modDf[ind, "contiTime"], modDf[ind, "tod"])
  #Pause()
}

## set up commom model selection steps (works for both glm and tvv)
# it includes, tod, tow, toy
BuildMsStepsTodTowToy = function(
  resp,
  todK,
  towK,
  toyK,
  todK_wknd,
  lagK,
  longtIndex,
  todK_fixed=0,
  towK_fixed=0,
  toyK_fixed=0,
  todK_wknd_fixed=0,
  lagK_fixed=0,
  contiTimeVars=c("ct1", "ct2", "ctSqrt", "ctCbrt"),
  explanColsF=NULL) {

  if (todK_fixed > 0) {
    explanColsF = c(
      explanColsF,
      paste0("tod", "sin", 1:todK_fixed),
      paste0("tod", "cos", 1:todK_fixed)
    )
  }

  if (towK_fixed > 0) {
    explanColsF = c(
      explanColsF,
      paste0("tow", "sin", 1:towK_fixed),
      paste0("tow", "cos", 1:towK_fixed)
    )
  }

  if (toyK_fixed > 0) {
    explanColsF = c(
      explanColsF,
      paste0("toy", "sin", 1:toyK_fixed),
      paste0("toy", "cos", 1:toyK_fixed)
    )
  }

  if (todK_wknd_fixed > 0) {
    explanColsF = c(
      explanColsF,
      paste0("tod_wknd", "sin", 1:todK_wknd_fixed),
      paste0("tod_wknd", "cos", 1:todK_wknd_fixed)
    )
  }

  if (lagK_fixed > 0) {
    explanColsF = c(
      explanColsF,
      paste0(resp, "lag", 1:lagK_fixed))
  }

  groupsList = list()
  groupsList[["tod"]] = c(
    paste0("tod", "sin", (todK_fixed + 1):todK),
    paste0("tod", "cos", (todK_fixed + 1):todK))
  groupsList[["tod_wknd"]] = c(
    paste0("tod_wknd", "sin", (todK_fixed + 1):todK),
    paste0("tod_wknd", "cos", (todK_fixed + 1):todK))
  groupsList[["tow"]] = c(
    paste0("tow", "sin", (1 + towK_fixed):towK),
    paste0("tow", "cos", (1 + towK_fixed):towK))
  groupsList[["toy"]] = c(
    paste0("toy", "sin", (1 + toyK_fixed):toyK),
    paste0("toy", "cos", (1 + toyK_fixed):toyK))
  groupsList[["contiTime"]] = contiTimeVars
  groupsList[["lag"]] = paste0(resp, "lag", (lagK_fixed + 1):lagK)
  groupsList[["longt"]] = paste0(resp, "longt", longtIndex)

  l = list()
  msSteps = list(
    list(
      name="S1",
      explanColsF=explanColsF,
      explanColsFInd=NULL,
      explanCols=groupsList[["tod"]],
      explanColsInd=NULL),
    list(
      name="S2",
      explanColsF=explanColsF,
      explanColsFInd="S1",
      explanCols=groupsList[["tod_wknd"]],
      explanColsInd=NULL),
    list(
      name="S3",
      explanColsF=explanColsF,
      explanColsFInd=c("S1", "S2"),
      explanCols=groupsList[["tow"]],
      explanColsInd=NULL),
    list(name="S4",
      explanColsF=explanColsF,
      explanColsFInd=c("S1", "S2", "S3"),
      explanCols=groupsList[["toy"]],
      explanColsInd=NULL),
    list(name="S5",
      explanColsF=explanColsF,
      explanColsFInd=c("S1", "S2", "S3", "S4"),
      explanCols=groupsList[["contiTime"]],
      explanColsInd=NULL),
    list(
      name="S6",
      explanColsF=explanColsF,
      explanColsFInd=c("S1", "S2", "S3", "S4", "S5"),
      explanCols=groupsList[["longt"]],
      explanColsInd=NULL),
    list(
      name="S7",
      explanColsF=explanColsF,
      explanColsFInd=c("S1", "S2", "S3", "S4", "S5"),
      explanCols=groupsList[["lag"]],
      explanColsInd=NULL) #,
    #list(
    #    name="S7",
    #    explanColsF=explanColsF,
    #    explanColsFInd=c("S1", "S2", "S3", "S4"),
    #    explanCols=NULL,
    #    explanColsInd=c("S5", "S6")),
    )

  return(msSteps)
}

## fits a glm model (nb, glm, ...)
FitGlmMod = function(
  familyStr,
  link,
  modDf,
  yCol,
  optModExplan_glm,
  dropExtraComponents=TRUE) {

  ## fitting neg binomial
  if (familyStr == "nb") {
    model = glm.nb(
      formula=formula(paste0(yCol, "~", AddSepFcn("+")(optModExplan_glm))),
      data=modDf,
      link=log)
  } else {
    model = glm(
      formula(paste0(yCol, "~", AddSepFcn("+")(optModExplan_glm))),
      data=modDf,
      family=eval(parse(text=paste0(familyStr, "(", link,")"))))
  }

  aic = AIC(model)
  bic = BIC(model)
  dev = deviance(model)
  n = dim(modDf)[1]

  corFitObs = cor(modDf[ , yCol], model[["fitted.values"]])
  model[["dispersion"]] = summary(model)[["dispersion"]]

  if (dropExtraComponents) {
    unnecess = c(
      "data",
      "y",
      "weights",
      "prior.weights",
      "residuals",
      #"linear.predictors",
      "values",
      "na.action")

    model[unnecess] = NULL
    model[["qr"]][["qr"]] = NULL

    #objList = ls(envir=attr(model[["terms"]], ".Environment"))
    #objList = setdiff(
    #  objList,
    #  c("aic", "bic", "dev", "n", "outList", "corFitObs"))
    objList = "modDf"

    #print(ls(envir=attr(model[["terms"]], ".Environment")))

    rm(list=objList, envir=attr(model[["terms"]], ".Environment"))
  }

  outList = list(
    "aic_stdized"=aic / n,
    "bic_stdized"=bic / n,
    "dev"=dev,
    "corFitObs"=corFitObs)

  outList[["model"]] = model

  return(outList)
}

Example = function() {

  n = 10^4
  x1 = rnorm(n)
  x2 = rnorm(n)
  y = rpois(n, lambda=exp(x1 + x2))
  plot(x1, y)
  df = data.frame(x1=x1, x2=x2, y=y)
  k = n/2
  trainDf = df[1:k, ]
  testDf = df[(k+1):n, ]

  res = FitGlmMod(
    familyStr="poisson",
    link="log",
    modDf=trainDf,
    yCol="y",
    optModExplan_glm=c("x1", "x2"))

  model = res[["model"]]
  ReportObjectSize(model)
  ReportObjectSize_true(model)

  yPred = predict(model, testDf)
  plot(testDf[ , "y"], yPred)
  summary(model)

}

## fitting a list of models
FitGlmModList = function(
  yCol,
  modDf,
  familyLinkStrVec,
  optModExplanList,
  modList=list(),
  modInfoDf = setNames(
        data.frame(matrix(
          ncol=9,
          nrow=0)),
        c(
          "modStr",
          "response",
          "familyStr",
          "link",
          "parameter_complexity",
          "aic_stdized",
          "bic_stdized",
          "dev",
          "corFitObs"))) {

  for (familyLinkStr in familyLinkStrVec) {
    for (j in 1:length(optModExplanList)) {
      optModExplan_glm = optModExplanList[[j]]
      model_complexity = names(optModExplanList)[j]
      familyStr = strsplit(familyLinkStr, "-")[[1]][1]
      link = strsplit(familyLinkStr, "-")[[1]][2]
      modStr = paste0(
        yCol, "-", familyLinkStr,
        "-", model_complexity)
      #Mark(modStr, message="modStr")

      modList[[modStr]] = FitGlmMod(
        familyStr= familyStr,
        link=link,
        modDf=modDf,
        yCol=yCol,
        optModExplan_glm=optModExplan_glm)


      modInfoDf[dim(modInfoDf)[1] + 1, ] = c(
        modStr,
        yCol,
        familyStr,
        link,
        model_complexity,
        modList[[modStr]][["aic_stdized"]],
        modList[[modStr]][["bic_stdized"]],
        modList[[modStr]][["dev"]],
        modList[[modStr]][["corFitObs"]])
    }
  }
  return(list("modList"=modList, "modInfoDf"=modInfoDf))
}

## compare fitted coefficients
CompareModelsCoefs = function(
  modList,
  modStrVec=NULL) {

  if (is.null(modStrVec)) {
    modStrVec = names(modList)
  }
  modStr = modStrVec[1]
  x = 1:length(modList[[modStr]][["model"]][["coefficients"]])
  plot(
    x,
    modList[[modStr]][["model"]][["coefficients"]],
    col=ColAlpha(1, 0.4),
    pch=1,
    cex=1,
    lwd=2)

  for (i in 2:length(modStrVec)) {
    modStr = modStrVec[i]

    points(
      x,
      modList[[modStr]][["model"]][["coefficients"]],
      col=ColAlpha(i, 0.4),
      pch=i,
      cex=1,
      lwd=2)
  }

  legend(
    "topright", modStrVec,
    col=ColAlpha(1:length(modStrVec), 0.7),
    pch=1:length(modStrVec), bty="n")
}

## plots a fitted model and compares it with obs data
# by pivoting Data
Plt_fittedModWrt = function(
  model,
  yCol,
  modDf,
  pivotVar=c("tow", "ywoy"),
  pltTitle="") {
  plot(
    modDf[ , yCol],
    model[["fitted.values"]],
    col=ColAlpha("blue", 0.05),
    pch=5)

  #print(cor(modDf[ , yCol], model[["fitted.values"]]))
  fitData = modDf
  fitData[ , yCol] = model[["fitted.values"]]
  fitPivotDf = PlotSummPivot(
    df=fitData,
    resp=yCol,
    pivotVar=pivotVar,
    pltTitle=pltTitle)

  obsOverlayDf = Plt_overlayFcnList(
    df=modDf,
    resp=yCol,
    fcnList=list(mean, sd),
    fcnNames=c("mean", "sd"),
    wrtCol=pivotVar[1],
    pltTitle="",
    colSuffix="_obs",
    colPrefix="",
    addToPlot=FALSE)

  fitOverlayDf = Plt_overlayFcnList(
    df=fitData,
    resp=yCol,
    fcnList=list(mean, sd),
    fcnNames=c("mean", "sd"),
    wrtCol=pivotVar[1],
    pltTitle="",
    colSuffix="_fit",
    colPrefix="",
    addToPlot=TRUE)

  return(list(
    "obsOverlayDf"=obsOverlayDf,
    "fitOverlayDf"=fitOverlayDf))
}

## simulation of data using fitted models
SimGlmModList = function(
  yCol,
  modList,
  familyLinkStrVec,
  optModExplanList,
  futDf,
  prevDf,
  fsFcnList,
  LagFcn,
  simNum=1,
  limitSimValue=FALSE,
  yMax=NULL,
  yMin=NULL,
  Transf=function(x)x,
  applyBounds=FALSE,
  BoundDf=NULL,
  boundsWrtCols=NULL,
  simDataList=list(),
  simInfoDf = setNames(
    data.frame(matrix(
      ncol=4,
      nrow=0)),
    c(
      "modStr",
      "changedPcnt_global",
      "changedPcnt_wrt",
      "changedPcnt_both")),
  parallel=FALSE,
  parallel_outfile="") {

  modStrVec = NULL
  for (familyLinkStr in familyLinkStrVec) {
    for (j in 1:length(optModExplanList)) {
      model_complexity = names(optModExplanList)[j]
      modStr = paste0(
        yCol, "-", familyLinkStr,
        "-", model_complexity)
      modStrVec = c(modStrVec, modStr)
    }
  }

  Func = function(modStr) {
    Src()
    print(Sys.time())
    print("inside the simulation: memory usage")

    print(ReportObjectSize(prevDf, "prevDf inside the sims"))

    model = modList[[modStr]][["model"]]

    res = SimulateSeries(
      futDf=futDf,
      prevDf=prevDf,
      mod=model,
      respCols=yCol,
      fsFcnList=fsFcnList,
      LagFcn=LagFcn,
      Transf=Transf,
      simNum=simNum,
      limitSimValue=limitSimValue,
      yMax=yMax,
      yMin=yMin,
      applyBounds=applyBounds,
      BoundDf=BoundDf,
      boundsWrtCols=boundsWrtCols)
    #names(res) = modStr
    return(res)
  }

  if (!parallel) {
    simDataList2 = lapply(X=modStrVec, FUN=Func)
  } else {
    library("parallel")
    closeAllConnections()
    if (nrow(prevDf) > 500) {
      prevDf = prevDf[(nrow(prevDf)-500):nrow(prevDf), ]
    }

    # Calculate the number of cores
    no_cores = detectCores() - 2
    no_cores = min(no_cores, length(modStrVec) + 1)
    Mark(no_cores, "no_cores")
    # Initiate cluster
    cl = makeCluster(no_cores, outfile=parallel_outfile)

    clusterExport(
      cl=cl,
      list(
        "yCol", "modList", "familyLinkStrVec", "Transf",
        "futDf", "prevDf", "fsFcnList", "LagFcn", "limitSimValue",
        "yMax", "yMin", "applyBounds", "BoundDf", "boundsWrtCols",
        "Src", "simNum"),
      envir=environment())

    simDataList2 = parLapply(cl=cl, X=modStrVec, fun=F)

    stopCluster(cl)
    closeAllConnections()
  }


  names(simDataList2) = modStrVec
  simDataList = c(simDataList, simDataList2)

  for (modStr in modStrVec) {
    simInfoDf[dim(simInfoDf)[1] + 1, ] = c(
      modStr,
      simDataList[[modStr]][["changedPcnt_global"]],
      simDataList[[modStr]][["changedPcnt_wrt"]],
      simDataList[[modStr]][["changedPcnt_both"]])
  }

  return(list("simDataList"=simDataList, "simInfoDf"=simInfoDf))
}

## plots sim data and obsdata
# it compares data across wrtCols
# also it comapres longterm trends
PltTsSimData = function(
  yCol,
  simDf,
  trainDf,
  testDf,
  windowCol,
  overlayWrtCol,
  timeThresh=-1,
  pltTitleVec=c("", "", "", ""),
  cex.main=NULL,
  mfrow=NULL) {

  trainDf[ , "color"] = ColAlpha("blue", 0.3)
  testDf[ , "color"] = ColAlpha("blue", 0.3)
  obsDf_all = rbind(trainDf, testDf)
  #simData[ , "count"] = exp(simData[ , "logCount"])

  simDf[ , "color"] = ColAlpha("red", 0.3)
  cols = c("contiTime", windowCol, yCol, "color")
  combinDf = rbind(trainDf[ , cols], simDf[ , cols])
  ind = combinDf[ , "contiTime"] > timeThresh

  if (is.null(mfrow)) {
    mfrow = c(2, 2)
  }
  par(mfrow=mfrow)

  plot(
      combinDf[ind, "contiTime"], combinDf[ind, yCol],
      col=ColAlpha("blue", 0.3), type="l", ylab=yCol,
      xlab="time", main=pltTitleVec[1], cex.main=1.5,
      cex.axis=1.5, cex.lab=1.5)

  lines(
    simDf[ , "contiTime"], simDf[ , yCol],
    col=ColAlpha("red", 0.3), lty=2)

  legend(
      "topleft",
      legend=c("obs", "sim"),
      lty=c(1, 2),
      col=ColAlpha(c("blue", "red"), 0.3))

  ## obs data
  obsOverlayDf = Plt_overlayFcnList(
    df=testDf,
    resp=yCol,
    fcnList=list(mean, sd),
    fcnNames=c("mean obs", "sd obs"),
    wrtCol=overlayWrtCol,
    pltTitle=pltTitleVec[2],
    colSuffix="_obs",
    colPrefix="",
    addToPlot=FALSE)

  ## sim data
  simOverlayDf = Plt_overlayFcnList(
    df=simDf,
    resp=yCol,
    fcnList=list(mean, sd),
    fcnNames=c("mean sim", "sd sim"),
    wrtCol=overlayWrtCol,
    pltTitle="",
    colSuffix="_sim",
    colPrefix="",
    addToPlot=TRUE)


  ## weekly pattern comparison plot
  res = LongtAvgPlt(
      df=obsDf_all,
      timeCol="contiTime",
      windowCol=windowCol,
      respCol=yCol,
      ylab="weekly avg count",
      color=ColAlpha("blue", 0.4),
      pltTitle=pltTitleVec[3])

  res = LongtAvgPlt(
      df=simDf,
      timeCol="contiTime",
      windowCol=windowCol,
      respCol=yCol,
      ylab="weekly avg count",
      color=ColAlpha("red", 0.4),
      addToPlot=TRUE,
      lty=2)

  legend(
      "topleft",
      legend=c("obs", "sim"),
      lty=c(1, 2),
      col=ColAlpha(c("blue", "red"), 0.3))

  ## Annual avg comparison plot
  res = LongtAvgPlt(
      df=obsDf_all,
      timeCol="contiTime",
      windowCol="year",
      respCol=yCol,
      addPoints=TRUE,
      ylab="avg annual count",
      ylim=c(0, 600),
      color=ColAlpha("blue", 0.4),
      pltTitle=pltTitleVec[4])

  res = LongtAvgPlt(
      df=simDf,
      timeCol="contiTime",
      windowCol="year",
      respCol=yCol,
      addPoints=TRUE,
      addToPlot=TRUE,
      ylab="avg annual count",
      color=ColAlpha("red", 0.5),
      lty=2)

  legend(
      "topleft",
      legend=c("obs", "sim"),
      lty=c(1, 2),
      col=ColAlpha(c("blue", "red"), 0.3))
}

AssessSimErr = function(
  yCol,
  simDf,
  testDf) {

  yObs = testDf[ , yCol]
  ySim = simDf[ , yCol]
  rmse = sqrt(mean((yObs - ySim)^2))
  mae = mean(abs(yObs - ySim))
  corr = cor(yObs, ySim)
  out = c(rmse, mae, corr)
  names(out) = c("RMSE", "MAE", "cor(obs, sim)")
  return(out)
}
Reza1317/funcly documentation built on Feb. 5, 2020, 4:06 a.m.