#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# https://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# authors: Alireza Hosseini, Reza Hosseini
## Time series manipulation and forecasting functions
## testing to check if the timestamps are consecutive with no gaps
Check_forTimeGaps = function(df, timeCol) {
x = NULL
Func = function(i) {
x = difftime(df[i + 1, timeCol], df[i, timeCol])
return(x)
}
x = lapply(1:(nrow(obsDf)-1), FUN=Func)
x = unlist(x)
return(summary(x))
}
## build a time data frame with useful
# temporal attributes for plotting and modeling
BuildTimeDf = function(
ts,
format="%Y-%m-%d %H:%M:%S",
tz="GMT",
origin_forTimeVars=2000) {
ts = as.POSIXct(ts, format=format, tz=tz)
## function to decide leap year, we delibrately avoid is_leap in lubridate
IsLeap = function(year) {
out = (year %% 400 == 0) + (year %% 100 != 0)*(year %% 4 == 0)
return(out > 0)
}
second = lubridate::second(ts)
minute = lubridate::minute(ts)
hour = lubridate::hour(ts)
doy = lubridate::yday(ts)
#as.numeric(strftime(ts, format = "%j"))
# time of day (ranges in 0-24)
tod = hour + minute / 60 + second / 3600
dom = chron::days(ts)
week = lubridate::week(ts)
weekday = weekdays(ts)
isWeekend = weekday %in% c("Saturday", "Sunday")
# day of week
dow = with(chron::month.day.year(ts), chron::day.of.week(month, day, year))
# time of week (ranges from 0-7)
tow = dow + tod / 24
month = lubridate::month(ts)
year = lubridate::year(ts)
yearLength = 365 + IsLeap(year)
# time of year
toy = (doy - 1) + tod / 24
# continuous time scale (unit is in years)
contiTime = year + toy / yearLength
ct1 = contiTime - origin_forTimeVars
ct2 = (ct1^2) * sign(ct1)
ctSqrt = sqrt(abs(ct1)) * sign(ct1)
ctCbrt = abs(ct1)^(1/3) * sign(ct1)
date = as.Date(ts)
timeDf = data.frame(
date=date,
contiTime=contiTime,
ct1=ct1,
ct2=ct2,
ctSqrt=ctSqrt,
ctCbrt=ctCbrt,
year=year,
month=month,
dom=dom,
doy=doy,
weekday=weekday,
isWeekend=isWeekend,
week=week,
dow=dow,
tow=tow,
tod=tod,
toy=toy,
hour=hour,
minute=minute,
second=second)
timeDf[ , "ywoy"] = paste(timeDf[ , "year"], timeDf[ , "week"], sep=":")
timeDf[ , "ydoy"] = paste(timeDf[ , "year"], timeDf[ , "doy"], sep=":")
timeDf[ , 'tod_wknd'] =timeDf[ , 'tod'] * timeDf[ , 'isWeekend']
timeDf[ , "dow_hr"] = paste(timeDf[ , "dow"], timeDf[ , "hour"], sep="_")
ind = minute > 30
half = rep("1sthalf", nrow(timeDf))
half[ind] = "2ndhalf"
timeDf[ , "dow_30min"] = paste(timeDf[ , "dow_hr"], half, sep="_")
return(timeDf)
}
Example = function() {
ts = strptime("01/01/2011 11:43", "%d/%m/%Y %H:%M", tz = "Europe/London")
ts = chron::as.chron(ts)
BuildTimeDf(ts)
}
## build a time data frame given the two end points and delta
# wrote this for uber originally
CreateTimeGrid = function(t1, t2, delta, origin_forTimeVars=2000) {
tsGrid = seq.POSIXt(
as.POSIXct(t1, tz="GMT"),
as.POSIXct(t2, tz="GMT"),
by=delta)
tsGrid = data.frame(ts=tsGrid)
timeDf = BuildTimeDf(
tsGrid[ , "ts"], origin_forTimeVars=origin_forTimeVars)
df = cbind(tsGrid, timeDf)
return(df)
}
TestCreateTimeGrid = function() {
t1 = chron::as.chron(strptime(
"01/01/2011 23:53", "%d/%m/%Y %H:%M", tz = "Europe/London"))
t2 = chron::as.chron(strptime(
"02/01/2011 01:03", "%d/%m/%Y %H:%M", tz = "Europe/London"))
timeDf = CreateTimeGrid(t1, t2, delta=1/(24*60))
}
## aggregates timestamp data and return counts per time interval of length delta
# it also fills in the gaps where there are no timestamps with zero count
# was originally written for uber homework
TimestampDf_toCounts = function(
ts,
delta,
timeColName='ts') {
tsDelta = trunc(ts, delta)
tab = table(as.character(tsDelta))
df = as.data.frame(tab)
## change back the factor class to chron
df[ , 1] = as.POSIXct(
as.character(df[ , 1]),
format="%Y-%m-%d %H:%M:%S", tz="GMT")
names(df) = c(timeColName, 'count')
## for some time intervals we may have zero logins.
# Therefore we need to fill in the gaps
tsDelta = df[ , 1]
tmin = min(tsDelta)
tmax = max(tsDelta)
tsGrid = seq.POSIXt(
as.POSIXlt(tmin, tz="GMT"),
as.POSIXlt(tmax, tz="GMT"), by=delta)
## we may see a mismatch in size:
# that means some time intervals are missing in our df
## that indicates we have some intervals with zero login
## in order to fill in the gaps we merge the tsGrid and df
tsGridDf = as.data.frame(tsGrid)
names(tsGridDf) = timeColName
tsGridDf[ , 1] = as.character(tsGridDf[ , 1])
df[ , 1] = as.character(df[ , 1])
dfGapFilled = merge(tsGridDf, df, by=timeColName, all.x=TRUE, from='left')
dfGapFilled[ , 1] = as.POSIXct(
dfGapFilled[ , 1], format="%Y-%m-%d %H:%M:%S",
tz="GMT")
## lets find out which timestamps has no logins (NA)
indNA = which(is.na(dfGapFilled[ , 2]))
tsNA = dfGapFilled[indNA , 1]
## setting NA to zero
dfGapFilled[indNA, 2] = 0
# adjust for the fact that the last interval count is under-estimated
# coef below is between 0 and 1.
# coef ~ 1; is when we have a login close to the end of the last interval
# coef ~ 0; is when the last login happens to spill over to the last interval
coef = as.numeric(difftime(max(ts), tmax, unit=delta))
# if coef < 1/2 the the data for the last interval is not reliable
# because the last obs login is half way through the interval
# and we delete the last row of the df
if (coef < 1/2) {
df = dfGapFilled[-dim(df)[1] , , drop=FALSE]
}
#print(coef)
# if coef > 1/2 we have a login which is more than half way through
# the last interval
# therefore we proportionaly adjust (bump the count up)
# we round down because there is an upward bias with bumpig up with coef
if (coef < 1 & coef > 1/2) {
#print(dfGapFilled[dim(dfGapFilled)[1], 'count'])
x = dfGapFilled[dim(dfGapFilled)[1], 'count'] * (coef^(-1))
newx = round(x)
dfGapFilled[dim(dfGapFilled)[1], 'count'] = newx
#print(dfGapFilled[dim(dfGapFilled)[1], 'count'])
}
outList = list(df=dfGapFilled, tsNA=tsNA)
}
## aggregates timestamp data with an associated value
# it creates a gridded data table
# and returns an aggregate eg counts (AggFunc = sum) for each time grid
# it also fills in the gaps where there are no timestamps with "naFiller"
# the naFiller is zero if the value count
# ts is a series
# timeRounding is the time rounding
# since the last timestamp is not necessarily and end point to the obsetrvation
# window, for example it could be the last time an event has occurred
# it tried to weight the last time grids value
# if the last time observed timestamp is close to the max grid point
# it does nothing
# if its more than half grid off it adjust the last value
EventTimestamp_toAggrDt = function(
dt,
timeCol,
valueCol=NULL,
timeRounding,
newTimeCol="ts",
timeFormat="(%Y-%m-%d %H:%M:%S)",
AggFunc=function(x)sum(x, na.rm=TRUE),
naFiller=0,
roundLastValue=FALSE) {
#tsDelta = trunc(ts, timeRounding)
if (is.null(valueCol)) {
valueCol = "cnt"
dt[ , "cnt"] = 1
}
if (is.null(newTimeCol)) {
newTimeCol = "ts"
}
ts = dt[ , get(newTimeCol)]
dt[ , newTimeCol] = round_date(Col(dt=dt , col=timeCol), timeRounding)
dt = DtSimpleAgg(
dt=dt,
gbCols=c(newTimeCol),
valueCols=valueCol,
AggFunc=AggFunc)
## for some time intervals we may have zero logins.
# Therefore we need to fill in the gaps
tsDelta = dt[ , get(newTimeCol)]
tmin = min(tsDelta)
tmax = max(tsDelta)
delta = as.numeric(lubridate::duration(timeRounding))
tsGrid = seq(tmin, tmax, by=delta)
## we may see a mismatch in size: that means some time intervals are missing
# in our df
# that indicates we have some intervals with zero login
# in order to fill in the gaps we merge the tsGrid and df
tsGridDt = data.table(tsGrid)
names(tsGridDt) = newTimeCol
#tsGridDt[ , 1] = as.character(tsGridDt[ , 1])
#dt[ , 1] = as.character(dt[ , 1])
gridDt = merge(
tsGridDt,
dt,
by=newTimeCol,
all.x=TRUE,
from='left')
#gridDt[ , 1] = as.chron(gridDt[ , 1], format=timeFormat)
## lets find out which timestamps has no logins (NA)
indNA = which(is.na(gridDt[ , 2]))
tsNA = gridDt[indNA , 1]
## setting NA to zero
gridDt[indNA, 2] = naFiller
# adjust for the fact that the last interval count is under-estimated
# lastTs_gapCoef below is between 0 and 1.
# lastTs_gapCoef ~ 1; is when we have a login close to the end of the last interval
# lastTs_gapCoef ~ 0; is when the last login happens to spill over to the last interval
lastTs_gapCoef = as.numeric(max(ts) - tmax) / delta
# if lastTs_gapCoef < 1/2 the the data for the last interval is not reliable
# because the last obs login is half way through the interval
# and we delete the last row of the df
if (lastTs_gapCoef < 1/2) {
df = gridDt[-nrow(gridDt) , , drop=FALSE]
}
# if lastTs_gapCoef > 1/2 we have a login which is more than half way through
# the last interval
# therefore we proportionaly adjust (bump the count up)
# we round down because there is an upward bias with bumpig up with lastTs_gapCoef
if (lastTs_gapCoef < 1 & lastTs_gapCoef > 1/2) {
x = gridDt[nrow(gridDt), get(valueCol)] * (lastTs_gapCoef^(-1))
if (roundLastValue) {
x = round(x)
}
gridDt[dim(gridDt)[1], get(valueCol)] = x
}
return(list(
"dt"=gridDt,
"tsNA"=tsNA,
"lastTs_gapCoef"=lastTs_gapCoef))
}
TestEventTimestamp_toAggrDt = function() {
ts = seq(
from=as.POSIXlt("2011-01-01 1:53"),
to=as.POSIXlt("2011-01-01 05:53"),
by=60)
n = length(ts)
df = data.frame(
"ts"=ts,
"event_cnt"=sample(1:10, n, replace=TRUE))
m = round(n / 3)
ind = sort(sample(1:n, m, replace=FALSE))
df = df[ind, ]
ts = df[ , "ts"]
dt = data.table(df)
res = EventTimestamp_toAggrDt(
dt=dt,
timeCol="ts",
valueCol=NULL,
timeRounding="5 min",
newTimeCol=NULL,
timeFormat="(%Y-%m-%d %H:%M:%S)")
}
## this function build time data frame
BuildTimeDf2 = function(dt) {
library("lubridate")
n = length(dt)
hourVec = rep(NA, n)
mdayVec = rep(NA, n)
monthVec = rep(NA, n)
quarterVec = rep(NA, n)
wdayVec = rep(NA, n)
weekVec = rep(NA, n)
doyVec = rep(NA, n)
yearVec = rep(NA, n)
minuteVec = rep(NA, n)
secondVec = rep(NA, n)
todVec = rep(NA, n)
towVec = rep(NA, n)
toyVec = rep(NA, n)
contiTimeVec = rep(NA, n)
## year, week of yea
ywoyVec = rep(NA, n)
ydoyVec = rep(NA, n)
for (i in 1:n) {
dt1 = dt[i]
hourVec[i] = hour(dt1)
mdayVec[i] = mday(dt1)
monthVec[i] = month(dt1)
quarterVec[i] = quarter(dt1)
wdayVec[i] = wday(dt1)
weekVec[i] = week(dt1)
doyVec[i] = yday(dt1)
yearVec[i] = year(dt1)
minuteVec[i] = minute(dt1)
secondVec[i] = second(dt1)
# tod (time of day) is between 0 and 24
todVec[i] = hourVec[i] + minuteVec[i]/60 + secondVec[i]/3600
# tow (time of week) between 0 to 7
towVec[i] = (wdayVec[i]-1) + todVec[i]/24
# toy (time of year)
toyVec[i] = (doyVec[i]-1) + todVec[i]/24
contiTimeVec[i] = yearVec[i] + toyVec[i]/365
ywoyVec[i] = paste(yearVec[i], weekVec[i], sep=":")
ydoyVec[i] = paste(yearVec[i], doyVec[i], sep=":")
}
timeDf = data.frame(
hour=hourVec, mday=mdayVec, month=monthVec,
quarter=quarterVec, wday=wdayVec, week=weekVec, doy=doyVec, year=yearVec,
minute=minuteVec, second=secondVec, tod=todVec, tow=towVec, toy=toyVec,
contiTime=contiTimeVec, ywoy=ywoyVec, ydoy=ydoyVec)
return(timeDf)
}
## build timestamp (datetime) from date, hr, minute, sec
BuildDatetime = function(
df,
dateCol=NULL,
yearCol=NULL,
monthCol=NULL,
dayCol=NULL,
hrCol=NULL,
minCol=NULL,
secCol=NULL,
dateFormat="%Y-%m-%d") {
asch = as.character
Func = function(
date=NULL, day=NULL, month=NULL, year=NULL,
hr="00", min="00", sec="00") {
if (is.null(date)) {
date=paste(year, month, day, sep="-")
dateFormat = "%Y-%m-%d"
}
dt = date
if (is.null(hr)) {hr="00"}
if (is.null(min)) {min="00"}
if (is.null(sec)) {sec="00"}
dt = paste(dt, hr, sep=" ")
dt = paste(dt, min, sep=":")
dt = paste(dt, sec, sep=":")
#dtFormat = paste(dateFormat,"%H:%M:%S",sep=" ")
#out = strptime(dt, dtFormat)
out = dt
return(out)
}
n = dim(df)[1]
out = rep(NA, n)
for (i in 1:n) {
date=NULL
year=NULL
month=NULL
day=NULL
hr=NULL
min=NULL
sec=NULL
if (!is.null(dateCol)) {date = df[i, dateCol]}
if (!is.null(yearCol)) {year = df[i, yearCol]}
if (!is.null(monthCol)) {month = df[i, monthCol]}
if (!is.null(dayCol)) {day = df[i, dayCol]}
if (!is.null(hrCol)) {hr=df[i, hrCol]}
if (!is.null(minCol)) {min=df[i, minCol]}
if (!is.null(secCol)) {sec=df[i, secCol]}
res = F(
date=date,
day=day,
month=month,
year=year,
hr=hr,
min=min,
sec=sec)
out[i] = res
#res = as.POSIXct(res,format="%Y-%m-%d %H:%M:%S")
#out[i] = asch(res)
}
return(out)
}
## creates a grid between given dates
CreateTemporalGrid = function(
startDate,
endDate,
startTime,
endTime,
timeDelta) {
period_dates = seq(
as.Date(startDate),
as.Date(endDate),
by="days")
date = period_dates
hr = 0:23
min = 0:(round(60/timeDelta)-1)
period_grid = expand.grid(min=min, hr=hr, date=period_dates)
period_grid = data.frame(
date=as.Date(period_grid[ , 3]),
hr=period_grid[ , 2],
min=period_grid[ , 1])
date = period_grid[ , 1]
hr = period_grid[ , 2]
min = period_grid[ , 3]
time_of_day = hr + timeDelta*(min/(60))
conti_time = date - as.Date(startDate) + hr/24 + timeDelta*(min/(24*60))
conti_time = as.numeric(conti_time)
week_days = weekdays(as.Date(date, "%d-%m-%Y"))
num_week_days = weekday_to_numday(week_days)
time_of_week = num_week_days + hr/24 + timeDelta*(min/(24*60))
year = strftime(date, format="%Y")
month = strftime(date, format="%m")
day = strftime(date, format="%d")
timestamp = ISOdatetime(
year=year, month=month, day=day, hour=hr,
min=min*timeDelta, sec=0, tz = "")
period_grid = data.frame(
timestamp=timestamp, date=date, hr=hr,
min=min*timeDelta, min_aggr=min, time=conti_time,
week_day=week_days, num_day=num_week_days,
time_of_week=time_of_week,
time_of_day=time_of_day)
t1_ind = which(time_of_day >= startTime)
t2_ind = which(time_of_day <= endTime)
ind1 = t1_ind[1]
ind2 = t2_ind[length(t2_ind)]
period_grid = period_grid[ind1:ind2, ]
return(period_grid)
}
Example = function() {
timeDelta = 30
startDate = "2013-12-01"
endDate = "2013-12-25"
startTime = 9.5
endTime = 11.5
fut_grid = CreateTemporalGrid(
startDate, endDate, startTime, endTime, timeDelta)
#grid_day_categ = date_NH_fcn(fut_grid$date)
fut_grid = cbind(fut_grid, day_categ=grid_day_categ)
}
## gets the timestamps and National Holidays to build predictors
Build_predictorTemporalNH = function(
timestamps=NA, ord=NA, NHdatesR=NA) {
asn = as.numeric
dates = as.Date(timestamps)
hr = format(timestamps,"%H")
hr = asn(hr)
min = format(timestamps,"%M")
min = asn(min)
week_day = format(timestamps,"%A")
num_day = format(timestamps,"%w")
num_day = asn(num_day) %% 7
time_of_week = num_day + hr/24 + min/(24*60)
time_of_day = time_of_week %% 1
day_categ = date_NH_fcn(dates, NHdatesR=NHdatesR)
week_day_binary = (as.character(week_day) %in% c("Monday", "Tuesday",
"Wednesday", "Thursday","Friday"))
Fri_binary = (as.character(week_day) %in% c("Friday"))
weekend_binary = (as.character(week_day) %in% c("Saturday","Sunday"))
Sun_binary = (as.character(week_day) %in% c("Sunday"))
### define national holidays indicator
NH = !day_categ %in% c("regular", "BNH", "BNH2")
## define before national holiday indicator
BNH = day_categ %in% c("BNH")
### define NH but not weekend!
NH_Reg = NH * (!week_day %in% c("Sunday", "Saturday"))
### define BNH but not weekend!
BNH_Reg = BNH * (!week_day %in% c("Sunday", "Saturday"))
n = length(timestamps)
## defining the frequencies
omega_week = (2*pi)/(7)
omega_day = (2*pi)/24
sinMat1 = matrix(NA, n, ord[1])
cosMat1 = matrix(NA, n, ord[1])
### weekly patterns (1)
for (i in 1:ord[1]) {
sinMat1[ , i] = sin(i*omega_week*time_of_week)
cosMat1[ , i] = cos(i*omega_week*time_of_week)
}
### week-day patterns (2)
sinMat2 = matrix(NA, n, ord[2])
cosMat2 = matrix(NA, n, ord[2])
for (i in 1:ord[2]) {
sinMat2[ , i] = week_day_binary*sin(i*omega_day*time_of_day)
cosMat2[ , i] = week_day_binary*cos(i*omega_day*time_of_day)
}
### Weekend (3)
sinMat3 = matrix(NA, n, ord[3])
cosMat3 = matrix(NA, n, ord[3])
for (i in 1:ord[3]) {
sinMat3[ , i] = weekend_binary*sin(i*omega_day*time_of_day)
cosMat3[ , i] = weekend_binary*cos(i*omega_day*time_of_day)
}
### Friday patterns (4)
sinMat4 = matrix(NA, n, ord[4])
cosMat4 = matrix(NA, n, ord[4])
for (i in 1:ord[4]) {
sinMat4[ , i] = Fri_binary*sin(i*omega_day*time_of_day)
cosMat4[ , i] = Fri_binary*cos(i*omega_day*time_of_day)
}
### Sunday (5)
sinMat5 = matrix(NA, n, ord[5])
cosMat5 = matrix(NA, n, ord[5])
for (i in 1:ord[5]) {
sinMat5[ , i] = Sun_binary*sin(i*omega_day*time_of_day)
cosMat5[ , i] = Sun_binary*cos(i*omega_day*time_of_day)
}
### NH (6)
sinMat6 = matrix(NA, n, ord[6])
cosMat6 = matrix(NA, n, ord[6])
for (i in 1:ord[6]) {
sinMat6[ , i] = NH_Reg*sin(i*omega_day*time_of_day)
cosMat6[ , i] = NH_Reg*cos(i*omega_day*time_of_day)
}
### BNH (7)
sinMat7 = matrix(NA, n, ord[7])
cosMat7 = matrix(NA, n, ord[7])
for (i in 1:ord[7]) {
sinMat7[,i] = BNH_Reg*sin(i*omega_day*time_of_day)
cosMat7[,i] = BNH_Reg*cos(i*omega_day*time_of_day)
}
### 8
X_NH_Reg = cbind(NH_Reg, BNH_Reg)
Reg_sin = sinMat1
Reg_cos = cosMat1
Reg_within_day_sin = sinMat2
Reg_within_day_cos = cosMat2
Fri_within_day_sin = sinMat3
Fri_within_day_cos = cosMat3
Weekend_within_day_sin = sinMat4
Weekend_within_day_cos = cosMat4
Sun_within_day_sin = sinMat5
Sun_within_day_cos = cosMat5
NH_Reg_sin = sinMat6
NH_Reg_cos = cosMat6
BNH_Reg_sin = sinMat7
BNH_Reg_cos = cosMat7
pred_data = list(
Reg_sin=Reg_sin,
Reg_cos=Reg_cos,
Reg_within_day_sin=Reg_within_day_sin,
Reg_within_day_cos=Reg_within_day_cos,
Fri_within_day_sin = Fri_within_day_sin,
Fri_within_day_cos = Fri_within_day_cos,
Weekend_within_day_sin = Weekend_within_day_sin,
weekend_within_day_cos = Weekend_within_day_cos,
Sun_within_day_sin = Sun_within_day_sin,
Sun_within_day_cos = Sun_within_day_cos,
X_NH_Reg=X_NH_Reg,
NH_Reg_sin=NH_Reg_sin,
NH_Reg_cos=NH_Reg_cos,
BNH_Reg_sin=BNH_Reg_sin,
BNH_Reg_cos=BNH_Reg_cos)
return(pred_data)
}
Example = function() {
O = 11
D = 31
startTime = 0
endTime = 24
startDate = "2012-12-01"
endDate = "2013-02-01"
pred_data_bricks = CreateTemporalGrid(
startDate, endDate, startTime, endTime, timeDelta)
ord = c(6, 10, 5, 5, 5, 5, 5)
NHdatesR = NHdatesR
timestamps = pred_data_bricks[ , "timestamp"]
T1 = Sys.time()
res = Build_predictorTemporalNH2(timestamps, ord, NHdatesR=NHdatesR)
T2 = Sys.time()
T2-T1
#end
}
## this function calculates a progress in an event
# which happens between timeStart and timeEnd at timestamp
CalcTimestampProgress = function(
timeStart=NA, timeEnd=NA, timestamp1=NA) {
timestamp1 = as.POSIXct(timestamp1)
timeStart = as.POSIXct(timeStart)
timeEnd = as.POSIXct(timeEnd)
asn = as.numeric
a = asn(difftime(timestamp1, timeStart, unit="secs"))
b = asn(difftime(timeEnd, timeStart, unit="secs"))
return(a/b)
}
Example = function() {
startTime = 0
endTime = 24
startDate = "2013-09-17"
endDate = "2013-09-24"
timeDelta = 30
pred_data_bricks = CreateTemporalGrid(startDate, endDate,
startTime, endTime, timeDelta)
timestamps = pred_data_bricks$timestamp
timeStart = timestamps[1]
timeEnd = timestamps[10]
CalcTimestampProgress(timeStart, timeEnd, timestamps[9])
}
## build event timestamps: this file creates a variable
# which tells the event status for a given timestamp
Build_eventTimestamp = function(timestamps, events_info) {
Events = events_info[["EventID"]]
EventsSet = unique(Events)
event_num = length(EventsSet)
timestamp1 = events_info$timestamp_start
timestamp2 = events_info$timestamp_end
timestamp1 = as.POSIXct(timestamp1)
timestamp2 = as.POSIXct(timestamp2)
n = length(timestamps)
eventMatrix = matrix(0, n, event_num)
eventDf = data.frame(eventMatrix)
names(eventDf) = EventsSet
eventDf = cbind(timestamp=timestamps, eventDf)
#event_status = rep("None",n)
#event_progress = rep(0,n)
for (i in 1:event_num) {
Event = EventsSet[i]
ind = which(Events == Event)
for (k in ind) {
t1 = timestamp1[k]
t2 = timestamp2[k]
event_ind = which(timestamps >= t1 & timestamps <= t2)
if (length(event_ind) > 0) {
eventTimestamps = timestamps[event_ind]
event_prog = CalcTimestampProgress(t1, t2, eventTimestamps)
eventDf[event_ind, (i + 1)] = event_prog
}
}
}
return(eventDf)
}
Example = function() {
startTime = 0
endTime = 24
startDate = "2013-09-19"
endDate = "2013-09-24"
timeDelta = 30
pred_data_bricks = CreateTemporalGrid(startDate, endDate,
startTime, endTime, timeDelta)
timestamps = pred_data_bricks$timestamp
events_status = Build_eventTimestamp(timestamps, events_info)
}
## (old function, check) the weighted moving average with missing allowed
MovingAvg = function(
vec,
filterNum=NULL,
filterPpns=NULL,
missNum=0,
periodic=FALSE) {
if (!is.null(filterNum) && filterNum > length(vec)) {
'Warning: filterNum bigger than vec size'
return(rep(mean(vec, na.rm=TRUE), length(vec)))
}
l = length(filterPpns)
if (is.null(filterNum)) {
filterNum = 0
}
if (!is.null(filterPpns)) {
filterNum = l
}
if (is.null(filterPpns)) {
filterPpns = rep(1, filterNum)
}
l = length(filterPpns)
filterWeights = c(filterPpns[l:1], 1, filterPpns[1:l])
filterWeights = filterWeights/sum(filterWeights)
#print(filterWeights)
L = length(vec)
out = rep(NA, L)
v = rep(NA, L)
if (periodic) {
v = vec
}
vecvec = c(v, vec, v)
for (q in 1:L) {
part = vecvec[(L+q-filterNum):(L+q+filterNum)]
miss = is.na(part)
partMissNum = sum(miss)
if (partMissNum <= missNum) {
ind = which(is.na(part) == FALSE)
subPart = part[ind]
filterWeights2 = filterWeights[ind]
filterWeights2 = filterWeights2/sum(filterWeights2)
subPart = subPart*filterWeights2
out[q] = sum(subPart)
}
}
return(out)
}
##Example
Example = function() {
vec = rnorm(100)
filterNum = 5
filterPpns = c(1/2, 1/3, 1/4, 1/5, 1/6)
missNum = 2
y = MovingAvg(vec, filterNum=filterNum, filterPpns=filterPpns, missNum=missNum)
plot(vec)
lines(y,col='red')
y[5:6] = c(NA, NA)
y = MovingAvg(y, filterNum=filterNum, filterPpns=filterPpns, missNum=missNum)
plot(vec)
lines(y,col='red')
y[5:6] = c(NA, NA)
y = MovingAvg(y, filterNum=filterNum, filterPpns=filterPpns, missNum=missNum)
plot(vec)
lines(y, col='red')
## periodic Examples
vec = rnorm(100)
filterNum = NULL
filterPpns = c(1/2, 1/3, 1/4, 1/5, 1/6)
missNum = 2
#par(mfrow=c(3,2))
y = MovingAvg(vec, filterNum=filterNum, filterPpns=filterPpns,
missNum=missNum, periodic=TRUE)
plot(vec)
lines(y,col='red')
y[5:6] = c(NA, NA)
y = MovingAvg(y, filterNum=filterNum, filterPpns=filterPpns,
missNum=missNum, periodic=TRUE)
plot(vec)
lines(y,col='red')
y[5:6] = c(NA, NA)
y = MovingAvg(y, filterNum=filterNum, filterPpns=filterPpns,
missNum=missNum, periodic=TRUE)
plot(vec)
lines(y,col='red')
y[5:6] = c(NA, NA)
y = MovingAvg(y, filterNum=filterNum, filterPpns=filterPpns,
missNum=missNum, periodic=TRUE)
plot(vec)
lines(y,col='red')
y[5:6] = c(NA, NA)
y = MovingAvg(y, filterNum=filterNum, filterPpns=filterPpns,
missNum=missNum, periodic=TRUE)
plot(vec)
lines(y,col='red')
y[5:6] = c(NA, NA)
y = MovingAvg(y, filterNum=filterNum, filterPpns=filterPpns,
missNum=missNum, periodic=TRUE)
plot(vec)
lines(y, col='red')
}
## this function gets event status which is the progress of the event
# for Example 0,0,0,0,0.1,0.2,...,1....
# this returns the Fourier series with cos terms in even positions
# and sin terms in odd
Build_eventPredictor = function(event_status, ord) {
n = length(event_status)
sinMat = matrix(NA, n, ord)
cosMat = matrix(NA, n, ord)
for (i in 1:ord) {
sinMat[ , i] = sin(i*2*pi*event_status)*(event_status>0)
cosMat[ , i] = cos(i*2*pi*event_status)*(event_status>0)
}
trigMat = matrix(NA, nrow=n, ncol=2*ord)
trigMat = data.frame(trigMat)
for (i in 1:ord) {
trigMat[ , (2*i-1)] = sinMat[ , i]
names(trigMat)[2*i-1] = paste("sin", i, sep="")
trigMat[ , 2*i] = cosMat[ , i]
names(trigMat)[2*i] = paste("cos", i, sep="")
}
#pred_data = list(event_sin=sinMat,event_cos=cosMat)
return(trigMat)
}
Example = function() {
startTime = 0
endTime = 24
startDate = "2013-09-19"
endDate = "2013-09-24"
timeDelta = 30
pred_data_bricks = CreateTemporalGrid(startDate, endDate,
startTime, endTime, timeDelta)
timestamps = pred_data_bricks[["timestamp"]]
events_status = Build_eventTimestamp(timestamps, events_info)
event_status = events_status[ , 2]
ord = 2
pred_data = Build_eventPredictor(event_status, ord)
pred_data
}
## this file will build features for pre-specified events
# event times must include the event start and end timestamps
Build_eventExplanList = function(
timestamps=NA, events_info=NA, ord_event=NA) {
events_status = Build_eventTimestamp(timestamps, events_info)
event_names = names(events_status)[-1]
timestamps = events_status[ , 1]
d = dim(events_status)[2] - 1
l = list(timestamp=timestamps)
for (i in 1:d) {
event_status = events_status[ , i+1]
pred = Build_eventPredictor(event_status, ord_event[i])
l[[i+1]] = pred
}
return(l)
}
Example = function() {
startTime = 0
endTime = 24
startDate = "2013-09-19"
endDate = "2013-09-24"
timeDelta = 30
pred_data_bricks = CreateTemporalGrid(startDate, endDate,
startTime, endTime, timeDelta)
timestamps = pred_data_bricks[["timestamp"]]
events_status = Build_eventTimestamp(timestamps, events_info)
event_status = events_status[ , 2]
ord = 2
pred_data = Build_eventPredictor(event_status, ord)
#pred_data
ord = c(2,2)
event_preds = build_event_explanList(
timestamps, events_info=events_info, ord=ord)
}
## plot time series chunks in R.
# some times the time series is not continuous
# and thus should be plotted in chunks
# delta is the difference in seconds
FindSeriesChunks = function(data, timeCol, delta) {
asn = as.numeric
data = data[order(data[ , timeCol]), ]
timestamps = data[ , timeCol]
timestampsLag = c(timestamps[1], timestamps[-length(timestamps)])
#timestampsLag
Diff = difftime(timestamps, timestampsLag, unit='hours')
break_ind = which(Diff > delta)
myList = list(data=data)
if (length(break_ind ) > 0) {
outList = list()
if (break_ind[length(break_ind)] < length(timestamps)) {
break_ind = c(break_ind, length(timestamps))
}
k = length(break_ind)
start_ind = 1
end_ind = 2
for (i in 1:k) {
end_ind = break_ind[i] - 1
outList[[i]] = data[(start_ind:end_ind),]
start_ind = end_ind + 1
}
}
return(outList)
}
Example = function() {
date1 = "2015-09-17 00:20:00 MYT"
date2 = "2015-09-18 03:00:00 MYT"
date1 = as.POSIXlt(date1)
date2 = as.POSIXlt(date2)
delta = 1/3
timestamps3 = seq(date1, date2, by=(delta*60*60*12))
date1 = "2014-09-17 00:20:00 MYT"
date2 = "2014-09-18 03:00:00 MYT"
date1 = as.POSIXlt(date1)
date2 = as.POSIXlt(date2)
delta = 1/3
timestamps3 = seq(date1, date2, by=(delta*60*60*12))
timestamps = c(timestamps3, timestamps4)
y = c(1:(length(timestamps4) + length(timestamps3)))
delta = 13
data = data.frame(timestamp=timestamps, y=y, z=y)
dataList = FindSeriesChunks(data, timeCol='timestamp', delta)
}
## gets a list of dataframes and from plots col valueCol vs xCol
PlotMultiCurves = function(
dataList,
dataListAdd=NULL,
xCol,
valueCol,
valueColAdd,
dataListInd=NULL,
type='l',
cols=c(1, 2),
mfrow=NULL,
xlab='x',
ylab='y',
mainTitles=NULL,
Title=NULL) {
n = length(dataList)
if (is.null(dataListInd)) {
dataListInd=1:n
}
if (is.null(mfrow)) {
mfrow=c(1, length(dataListInd))
}
oldpar = par(mfrow=mfrow, mar=c(3, 2, 3, 2), oma=c(2, 2, 5, 2))
### find the right y axis limits
ymax = -Inf
ymin = Inf
for (j in 1:length(dataListInd)) {
i = dataListInd[j]
df = dataList[[i]]
ymax1 = max(df[ , valueCol], na.rm=TRUE)
ymax2 = ymax1
ymin1 = min(df[ , valueCol], na.rm=TRUE)
ymin2 = ymin1
if (!is.null(dataListAdd)) {
dfAdd=dataListAdd[[i]]
ymin2 = max(dfAdd[ , valueColAdd], na.rm=TRUE)
}
ymax = max(c(ymax, ymax1, ymax2))
ymin = min(c(ymin, ymin1, ymin2))
}
for (j in 1:length(dataListInd)) {
i = dataListInd[j]
df = dataList[[i]]
main1 = df[1, 1]
if (!is.null(mainTitles)) {
main1=mainTitles[j]
}
plot(df[ , timeCol], df[ , valueCol], type=type, main=main1, xlab=xlab,
ylab=ylab, col=cols[1], ylim=c(ymin, ymax))
if (!is.null(dataListAdd)) {
dfAdd=dataListAdd[[i]]
lines(dfAdd[ , xCol], dfAdd[ , valueColAdd], col=cols[2])
}
}
if (!is.null(Title)) {
mtext(Title, side=3, line=1, outer=TRUE, cex=1, font=1)
}
par(oldpar)
}
Example = function() {
date1 = "2012-09-17 00:20:00 MYT"
date2 = "2012-09-17 03:00:00 MYT"
date1 = as.POSIXlt(date1)
date2 = as.POSIXlt(date2)
delta = 1/3
timestamps1 = seq(date1, date2, by=(delta*60*60))
date1 = "2013-09-17 00:20:00 MYT"
date2 = "2013-09-17 03:00:00 MYT"
date1 = as.POSIXlt(date1)
date2 = as.POSIXlt(date2)
delta = 1/3
timestamps2 = seq(date1, date2, by=(delta*60*60))
timestamps = c(timestamps2, timestamps1)
date1 = "2000-09-17 00:20:00 MYT"
date2 = "2000-09-17 03:00:00 MYT"
date1 = as.POSIXlt(date1)
date2 = as.POSIXlt(date2)
delta = 1/3
timestamps3 = seq(date1, date2, by=(delta*60*60))
date1 = "2015-09-17 00:20:00 MYT"
date2 = "2015-09-18 03:00:00 MYT"
date1 = as.POSIXlt(date1)
date2 = as.POSIXlt(date2)
delta = 1/3
timestamps4 = seq(date1, date2, by=(delta*60*60*12))
timestamps = c(timestamps2, timestamps1, timestamps3, timestamps4)
y = c(1:length(timestamps1), 1:length(timestamps2),
1:length(timestamps3), 1:length(timestamps4))
data = data.frame(timestamp=timestamps, y=y)
delta = 13
dataList = FindSeriesChunks(data=data, timeCol='timestamp', delta)
z = y + 1
data = data.frame(timestamp=timestamps, z=z)
dataListAdd = FindSeriesChunks(data=data, timeCol='timestamp', delta)
PlotMultiCurves(dataList, dataListAdd, timeCol='timestamp',
valueCol='y', valueColAdd='z',
dataListInd=1:3, mainTitles=c('name1', 'name2', 'name3'),
xlab='time', ylab='y', mfrow=c(3, 1), Title='Fit')
}
## this function gets a list of data.frames of same row length and cbinds them
# it only cbinds the ones given in ind
MergeDfList = function(dfList=NA, ind=NULL, fcn=cbind) {
if (is.null(ind)) {
ind = 1:length(dfList)
}
out = NULL
for (i in 1:length(ind)) {
j = ind[i]
if (i == 1) {
out = dfList[[j]]
}
if (i > 1) {
out = fcn(out, dfList[[j]])
}
}
out = data.frame(out)
return(out)
}
Example = function() {
startTime = 0
endTime = 24
startDate = "2013-09-19"
endDate = "2013-09-24"
timeDelta = 30
pred_data_bricks = CreateTemporalGrid(startDate, endDate,
startTime, endTime, timeDelta)
timestamps = pred_data_bricks[["timestamp"]]
events_status = Build_eventTimestamp(timestamps, events_info)
plot(timestamps,events_status[ , 2], type="l")
lines(timestamps,events_status[ , 3], col=2)
event_status = events_status[ , 2]
ord = 2
pred_data = Build_eventPredictor(event_status, ord)
plot(timestamps,pred_data[ , 1], type="l")
lines(timestamps,pred_data[ , 2], col=2)
lines(timestamps,pred_data[ , 3], col=3)
lines(timestamps,pred_data[ , 4], col=4)
#pred_data
ord = c(1, 1)
event_preds = build_event_explanList(
timestamps, events_info=events_info, ord=ord)
explan_events = MergeDfList(dfList=event_preds[-1], ind=NA)
plot(timestamps, explan_events[ , 1], type="l")
lines(timestamps, explan_events[ , 2], col=1)
lines(timestamps, explan_events[ , 3], col=2)
lines(timestamps, explan_events[ , 4], col=2)
}
## packages for lag:
Example = function() {
x = 1:100
install.packages("DataCombine")
slide(x, 1)
install.packages("quantmod")
Lag(x, 1)
library(plyr)
adf = ddply(adf, "Site",
function(my_data) lag(my_data, "Time", "Value"))
set.seed(13)
dt = data.frame(location = rep(letters[1:2], each = 4), time = rep(1:4, 2),
var = rnorm(8))
lg = function(x)c(NA, x[1:(length(x)-1)])
## 1
unlist(tapply(dt$var, dt$location, lg))
## 2
ddply(dt, ~location, transform, lvar = lg(var))
## 3
ddt <- data.table(dt)
ddt[ , lvar := lg(var), by = c("location")]
## 4
dt %>% group_by(location) %>% mutate(lvar = lag(var))
}
## this function acts on a timeseries y and creates a data.frame of
# lags upto lag=lagNum
# this is deprecated in favor of the faster function below
CreateLagMat2 = function(y, lagNum, colName=NULL, naFill=NA,
fut=FALSE) {
if (fut) {
y = y[(length(y)-lagNum+1):length(y)]
y = c(y, NA)
}
n = length(y)
lagMat = matrix(NA, n, lagNum)
lagMat = data.frame(lagMat)
y1 = y
colNames = NULL
for (i in 1:lagNum) {
y1 = c(naFill, y1)
y1 = y1[-(n + 1)]
lagMat[ , i] = y1
colName2 = paste(colName, "lag", i, sep="")
colNames = c(colNames, colName2)
}
if (!is.null(colName)) {
colnames(lagMat) = colNames}
if (fut) {
lagMat = lagMat[dim(lagMat)[1], , drop=FALSE]
}
names(lagMat) = colNames
return(lagMat)
}
## this function acts on a timeseries y
# and creates a data.frame of lags upto lag=lagNum
CreateLagMat = function(
y, lagNum, colName=NULL, naFill=NA,
fut=FALSE) {
if (fut) {
lagMat = matrix(y[length(y):(length(y)-lagNum+1)], 1, lagNum)
colNames = paste(colName, "lag", 1:lagNum, sep="")
colnames(lagMat) = colNames
return(lagMat)
}
n = length(y)
Fcn = function(i) {
c(rep(naFill, i), y[1:(n-i)])
}
lagMat = sapply(1:lagNum, Fcn)
colNames = paste(colName, "lag", 1:lagNum, sep="")
colnames(lagMat) = colNames
#lagMat = data.frame(lagMat)
return(lagMat)
}
Example = function() {
m = 100000
k = 2000
fut=TRUE
t1 = Sys.time()
x = CreateLagMat(1:m, k, colName="zereshk", fut=fut)
t2 = Sys.time()
print(t2-t1)
t1 = Sys.time()
x = CreateLagMat2(1:m, k, colName="zereshk", fut=fut)
t2 = Sys.time()
print(t2-t1)
set.seed(1)
email_data <- data.frame(dates=1:10, email_reach=1:10)
library(data.table)
setDT(email_data)[, paste0("email_reach", 1:3) := shift(email_reach, 1:3)][]
set.seed(1)
data <- data.table(time = c(1:3, 1:4),
groups = c(rep(c("b", "a"), c(3, 4))),
value = rnorm(7))
library(dplyr)
data2 <-
data %>%
group_by(groups) %>%
mutate(lag.value=lag(value, 1:2))
}
## this function acts on a timeseries y and creates a data.frame of avg
# lags upto lag=lagNum
CreateLongtMat2 = function(y, lagNum, colName=NULL, naFill=NA,
AvgFcn=rowMeans, fut=FALSE) {
ylagMat = CreateLagMat(y, lagNum, colName, naFill=naFill, fut=fut)
n = dim(ylagMat)[1]
longtMat = matrix(NA, n, lagNum)
longtMat = data.frame(longtMat)
colNames = NULL
for (i in 1:lagNum) {
ylagMat1 = ylagMat[ , 1:i, drop=FALSE]
avg_col = AvgFcn(ylagMat1)
longtMat[ , i] = avg_col
colName2 = paste(colName, "longt", i, sep="")
colNames = c(colNames, colName2)
}
colnames(longtMat) = colNames
return(longtMat)
}
# faster
CreateLongtMat = function(
y,
lagNum,
colName=NULL,
naFill=NA,
AvgFcn=rowMeans,
fut=FALSE) {
ylagMat = CreateLagMat(y, lagNum, colName=NULL, naFill=naFill, fut=fut)
Fcn = function(i) {
AvgFcn(ylagMat[ , 1:i, drop=FALSE])
}
longtMat = sapply(1:lagNum, Fcn)
if (fut) {
longtMat = matrix(longtMat, 1, lagNum)
}
colnames(longtMat) = paste(colName, "longt", 1:lagNum, sep="")
return(longtMat)
}
Example = function() {
M = 2000
k = 200
fut = FALSE
t1 = Sys.time()
x = CreateLongtMat(1:M, k, colName="zereshk", fut=fut)
t2 = Sys.time()
print(t2-t1)
t1 = Sys.time()
x = CreateLongtMat2(1:M, k, colName="zereshk", fut=fut)
t2 = Sys.time()
print(t2-t1)
}
## this function creates a function which applies on a time series y
# and gives back a data.frame of needs lags or averaged lags
# lagIndex: the desired lags
# longtIndex: averaged lagIndex
CreateLagIndFcn = function(
lagIndex=NULL,
longtIndex=NULL,
AvgFcn=rowMeans,
LagTrans=function(x){x}) {
lagNum = 0
if (!is.null(lagIndex)) {
lagNum = max(lagIndex)
}
if (!is.null(longtIndex)) {
lagNum = max(c(lagNum, longtIndex))
}
Fcn = function(y, colName="", naFill=NA, fut=FALSE) {
explanLag = NULL
if (!is.null(lagIndex)) {
lagMat = CreateLagMat(
y=y,
lagNum=lagNum,
colName=colName,
naFill=naFill,
fut=fut)
lagMat = lagMat[ , lagIndex, drop=FALSE]
if (is.null(explanLag)) {
explanLag = lagMat
} else {
explanLag = cbind(explanLag, lagMat)
}
}
if (!is.null(longtIndex)) {
longtMat = CreateLongtMat(
y=y,
lagNum=lagNum,
colName=colName,
naFill=naFill,
AvgFcn=AvgFcn,
fut=fut)
longtMat = longtMat[ , longtIndex, drop=FALSE]
if (is.null(explanLag)) {
explanLag = longtMat
} else {
explanLag = cbind(explanLag, longtMat)
}
}
explanLag = LagTrans(explanLag)
return(explanLag)
}
return(Fcn)
}
Example = function() {
y = 1:10
ordLag = 3
lagIndex = 1:3
longtIndex = 3
Fcn = CreateLagIndFcn(
lagIndex=lagIndex,
longtIndex=longtIndex)
df1 = Fcn(y, colName="a", naFill=y[1])
Fcn = CreateLagIndFcn(
lagIndex=lagIndex,
longtIndex=longtIndex,
LagTrans=log)
df1 = Fcn(y, colName="a", naFill=y[1])
Fcn = CreateLagIndFcn(lagIndex=lagIndex, longtIndex=longtIndex)
df1 = Fcn(y, colName="a", naFill=y[1], fut=TRUE)
Fcn2 = CreateLagIndFcn(lagIndex=1:3, longtIndex=3)
df2 = Fcn2(y)
Fcn2 = CreateLagIndFcn(lagIndex=1:3, longtIndex=1)
df2 = Fcn2(y)
}
## creates a function which get time values and creates a data.frame of
# fourier series terms
# ord: is the order of the series
# omega is the fourier series Freq: omega = 2pi/T where T is the period
# the odd columns of the data frame are sin terms
# the even columns of the data frame are even terms
CreateFsFcn = function(ord, omega, colName="") {
Fcn = function(x) {
n = length(x)
#sinMat = matrix(NA,n,ord)
#cosMat = matrix(NA,n,ord)
mat = matrix(NA, n, 2*ord)
df = data.frame(mat)
for (i in 1:ord[1]) {
sinColName = paste(colName, "sin", i, sep="")
cosColName = paste(colName, "cos", i, sep="")
df[ , (2*i-1)] = sin(i*omega*x)
df[ , 2*i] = cos(i*omega*x)
names(df)[(2*i-1)] = sinColName
names(df)[2*i] = cosColName
}
return(df)
}
return(Fcn)
}
Example = function() {
ord = 3
omega = 2*pi/10
x = seq(0, 10, by=0.001)
explanFourier = CreateFsFcn(ord, omega)(x)
explanFourier[1:5, ]
plot(explanFourier[ , 1], type="l")
lines(explanFourier[ , 2], col=2)
lines(explanFourier[ , 3], col=3)
}
CreateFsIndFcn = function(sinOrd, cosOrd, omega, colName="") {
ord = length(sinOrd) + length(cosOrd)
Fcn = function(x) {
n = length(x)
mat = matrix(NA, n, ord)
df = data.frame(mat)
for (i in 1:length(sinOrd)) {
j = sinOrd[i]
sinColName = paste(colName, "sin", j, sep="")
df[ , i] = sin(j*omega*x)
names(df)[i] = sinColName
}
for (i in 1:length(cosOrd)) {
j = cosOrd[i]
cosColName = paste(colName, "cos", j, sep="")
df[ , length(sinOrd) + i] = cos(i*omega*x)
names(df)[length(sinOrd) + i] = cosColName
}
return(df)
}
return(Fcn)
}
Example = function() {
omega = 2*pi/10
x = seq(0, 10, by=0.1)
explanFourier = CreateFsIndFcn(sinOrd=c(1, 10), cosOrd=c(2, 3),
omega, colName="zereshk")(x)
explanFourier[1:5, ]
plot(explanFourier[ , 1], type="l")
lines(explanFourier[ , 2], col=2)
lines(explanFourier[ , 3], col=3)
}
CreateLagFsFcn = function(
lagIndex,
longtIndex,
sinOrd,
cosOrd,
omega,
AvgFcn=rowMeans,
fut=FALSE) {
Fcn = function(x, y) {
F1 = CreateFsIndFcn(sinOrd, cosOrd, omega)
F2 = CreateLagIndFcn(
lagIndex=lagIndex,
longtIndex=longtIndex,
AvgFcn=AvgFcn)
out = cbind(F1(x), F2(y, fut=fut))
return(out)
}
return(Fcn)
}
Example = function() {
omega = 2*pi/10
x = seq(0, 5, by=0.1)
explanFourier = CreateFsIndFcn(sinOrd=c(1, 10), cosOrd=c(2, 3), omega)(x)
y = explanFourier[ , 1] + 2*explanFourier[ , 2]
Fcn = CreateLagFsFcn(lagIndex=1:3, longtIndex=2,
sinOrd=1:2, cosOrd=c(3, 4), omega=omega, AvgFcn=rowMeans)
df = Fcn(x, y)
Fcn2 = CreateLagFsFcn(lagIndex=1:3, longtIndex=2,
sinOrd=1:2, cosOrd=c(3, 4), omega=omega, AvgFcn=rowMeans)
Fcn2(x=x[1], y=y)
}
################ gaussian time-varying volatility time series model functions
## this return Beta for linear regression when y = Beta X + error,
# when error is dependent
# and error var-cov matrix is denoted by Omega
# this function seems broken in some cases when input is omegaInvChol
# it does not return the right thing which is the beta estimate
GaussLinRegSol = function(
y, x, omegaInvCholVec=NULL,
omegaInvChol=NULL, omegaInv=NULL) {
d = dim(x)
n = d[1]
if (is.null(omegaInvCholVec) & is.null(omegaInvChol) & is.null(omegaInv)) {
xTild = x
yTild = y
}
if (!is.null(omegaInvCholVec)) {
xTild = omegaInvCholVec*x
yTild = omegaInvCholVec*y
} else
if (!is.null(omegaInvChol)) {
xTild = omegaInvChol%*%x
yTild = omegaInvChol
} else
if (!is.null(omegaInv)) {
w = chol(omegaInv)
xTild = w%*%x
yTild = w%*%y
}
#SVD = svd(xTild)
#U = SVD$u
#V = SVD$v
#D.inv = diag(SVD$d^{-1})
betaHat = solve((t(xTild) %*% xTild)) %*% t(xTild) %*% yTild
return(betaHat)
}
Example = function() {
n = 100
p_1 = 2
p_2 = 3
x1 = matrix(rnorm(n*p_1), n, p_1)
x2 = matrix(rnorm(n*p_2), n, p_2)
y1 = 2*x1[ , 1] + 3*x1[ , 2] + rnorm(n)
y2 = -3*x2[ , 1] - 4*x2[ , 2] + 5*x2[ , 3] + rnorm(n)
x = cbind(y1, y2)
xList = list(x1, x2)
xbig = XbigList(xList)
x = xbig
yCol = as.matrix((as.vector(t(y))))
y = yCol
omegaInv = NA
GaussLinRegSol(y=yCol, y=xBig, omegaInvChol=NA)
GaussLinRegSol(y=yCol, y=xBig, omegaInvChol=diag(200))
###### testing with error
p = 10
n = 1000
x = matrix(rnorm(p*n), n, p)
beta = round(10*rnorm(p))
y = rowSums(t(beta*t(x)))
sdVec = seq((1:n)/n)
error = rnorm(n, sd=sdVec)
y = y + error
y = as.matrix(y)
omegaInvCholVec = 1/sdVec
#omegaInvChol = diag(omegaInvCholVec)
beta
t1 = Sys.time()
GaussLinRegSol(y=y, x=x, omegaInvCholVec=omegaInvCholVec)
t2 = Sys.time()
t2-t1
t1 = Sys.time()
GaussLinRegSolSvd(y=y, x=x, omegaInvCholVec=omegaInvCholVec)
t2 = Sys.time()
t2-t1
GaussLinRegSol(y=y, x=x, omegaInvChol=NA)
}
### estimate Beta for dependent errors given the volatility
# parameters and structure
BetaGiven_volatParams = function(y, x, omegaPred, omegaParams) {
b = omegaParams
logOmegaVec = rowSums(t(b*t(omegaPred)))
omegaVec = exp(logOmegaVec)
omegaInvCholVec = sqrt(1/omegaVec)
out = GaussLinRegSol(y=y, x=x, omegaInvCholVec=omegaInvCholVec)
return(out)
}
############## Seems like this is where we need to start #################
LlikGaussReg = function(
y, x, beta, omegaInvCholVec=NULL, omegaInvChol=NULL) {
n = dim(x)[1]
beta = as.matrix(beta)
p = dim(beta)[1]
z = y - x%*%beta
if (is.null(omegaInvCholVec) & is.null(omegaInvChol)) {
zTild = Z
logDet = 0
} else
if (!is.null(omegaInvCholVec)) {
zTild = omegaInvCholVec * z
logDet = sum(log(omegaInvCholVec))
} else
if (!is.null(omegaInvChol)) {
zTild = omegaInvChol%*%z
logDet = log(det(omegaInvChol))
}
#logDet.Omega = -2*logDet
zz = t(zTild) %*% zTild
out1 = (-n/2)*log(2*pi) + logDet - (1/2)*zz
out2 = 2*p - 2*out1
out3 = log(n)*p - 2*out1
out = list(llik=out1, AIC=out2, BIC=out3)
return(out)
}
Example = function() {
p = 4
n = 50
x = matrix(rnorm(p*n), n, p)
beta = round(10*rnorm(p))
y = rowSums(t(beta*t(x)))
T = (1:n)/n
sint = sin(2*pi*t)
cost = cos(2*pi*t)
omegaPred = cbind(1, sint, cost)
omegaParams = c(1, 5, 10)
b = omegaParams
logomegaVec = rowSums(t(b*t(omegaPred)))
omegaVec = exp(logomegaVec)
sdVec = sqrt(omegaVec)
error = rnorm(n, sd=sdVec)
y = y + error
y = as.matrix(y)
omegaInvCholVec = 1/sdVec
omegaInvChol = diag(omegaVec)
beta1 = GaussLinRegSol(y=y, x=x, omegaInvCholVec=omegaInvCholVec)
e1 = y - x %*% beta1
e1 = as.vector(e1)
sd1 = sd(e1)
sdVec1 = rep(sd1, n)
O1 = 1/sdVec1
beta2 = GaussLinRegSol(y=y, x=x)
e2 = y - x %*% beta2
e2 = as.vector(e2)
sd2 = sd(e2)
sdVec2 = rep(sd2, n)
O2 = 1/sdVec2
LlikGaussReg(y, x, beta=beta1, omegaInvCholVec=O1)
#LlikGaussReg(Y,X,Beta1,omegaInvCholVec)
#LlikGauss(Y=Y,X=X,Beta=Beta1,omegaInvCholVec=O1)
#LlikGaussReg(Y,X,Beta1,omegaInvCholVec=omegaInvCholVec)
LlikGaussReg(y, x, beta, omegaInvCholVec=O1)
LlikGaussReg(y, x, beta, omegaInvCholVec)
LlikGaussReg(y, x, beta2, omegaInvCholVec=O2)
LlikGaussReg(y, x, beta2, omegaInvCholVec)
LlikGaussReg(y, x, beta=beta2, omegaInvCholVec=O2)
mod = glm(y ~ -1+x)
mod$aic
#### Testing manually the log likelihood function
p = 4
n = 50
X = matrix(rnorm(p*n), n, p)
beta = round(10*rnorm(p))
y = rowSums(t(beta*t(x)))
t = (1:n)/n
sint = sin(2*pi*t)
cost = cos(2*pi*t)
omegaPred = cbind(1, sint, cost)
omegaParams = c(1, 0, 0)
b = omegaParams
logomegaVec = rowSums(t(b*t(omegaPred)))
omegaVec = exp(logomegaVec)
sdVec = sqrt(omegaVec)
error = rnorm(n, sd=sdVec)
y = y + error
y = as.matrix(y)
omegaInvCholVec = 1/sdVec
omegaInvChol = diag(omegaInvCholVec)
omegaVec = sdVec^2
Omega = diag(omegaVec)
varcov = Omega
LlikGaussReg(y, x, beta, omegaInvCholVec)
dmnorm(as.vector(y), mean=as.vector(x %*% beta), varcov=varcov, log=TRUE)
}
## likelihood with varying volatility
LlikGaussRegVarVolat = function(y, x, beta, omegaPred, omegaParams) {
b = omegaParams
logomegaVec = rowSums(t(b*t(omegaPred)))
omegaVec = exp(logomegaVec)
omegaInvCholVec = sqrt(1/omegaVec)
out = LlikGaussReg(
y, x, beta, omegaInvCholVec=omegaInvCholVec, omegaInvChol=NA)
return(out)
}
Example = function() {
p = 3
n = 100
x = matrix(rnorm(p*n), n, p)
y = -7*x[ , 1] + 5*x[ , 2] + 10*x[ , 3]
t = (1:n)/n
sint = sin(2*pi*t)
cost = cos(2*pi*t)
omegaPred = cbind(1, sint, cost)
omegaParams = c(0, 5, 10)
b = omegaParams
logomegaVec = rowSums(t(b*t(omegaPred)))
omegaVec = exp(logomegaVec)
sdVec = sqrt(omegaVec)
error = rnorm(n, sd=sdVec)
y = y + error
y = as.matrix(y)
omegaInvCholVec = 1 / sdVec
beta = GaussLinRegSol(y=y, x=x, omegaInvCholVec=omegaInvCholVec)
#BetaGiven_volatParams(Y,X,omegaPred,omegaParams)
LlikGaussReg(y, x, beta, omegaInvCholVec)
LlikGaussRegVarVolat(y, x, beta, omegaPred, omegaParams)
}
##
MaxGaussLlikGivenBeta = function(y, x, beta, omegaPred, init=NULL) {
z = y - x %*% beta
F = function(b) {
logomegaVec = rowSums(t(b*t(omegaPred)))
omegaVec = exp(logomegaVec)
omegaInvCholVec = sqrt(1/omegaVec)
out = LlikGaussReg(y, x, beta, omegaInvCholVec=omegaInvCholVec,
omegaInvChol=NULL)
out1 = out$llik
return(-out1)
}
bLength = dim(omegaPred)[2]
b = init
if (is.null(b)) {
b = rep(0, bLength)
}
res = optim(par=b, F)
return(res)
}
Example = function() {
p = 3
n = 200
x = matrix(rnorm(p*n), n, p)
beta = c(-7, 5, 10)
y = rowSums(t(beta*t(x)))
t = (1:n)/n
sint = sin(2*pi*t)
cost = cos(2*pi*t)
omegaPred = cbind(1, sint, cost)
omegaParams = c(1, 5, 1)
b = omegaParams
logomegaVec = rowSums(t(b*t(omegaPred)))
omegaVec = exp(logomegaVec)
sdVec = sqrt(omegaVec)
error = rnorm(n, sd=sdVec)
y = y + error
y = as.matrix(y)
omegaInvCholVec = 1/sdVec
res1 = MaxGaussLlikGivenBeta(y, x, beta, omegaPred)
MaxGaussLlikGivenBeta(y, x, beta, omegaPred, init=c(0, 0, 0))
}
## fit with varying-volatility
FitGaussLin_varyingVol = function(x, y, omegaPred, iter=1, init=NULL) {
#print(y)
p = dim(x)[2]
p2 = dim(omegaPred)[2]
n = dim(x)[1]
beta = GaussLinRegSol(y=y, x=x)
volatSample = NULL
betaSample = NULL
for (i in 1:iter) {
res = MaxGaussLlikGivenBeta(y, x, beta, omegaPred, init=init)
omegaParams = res[["par"]]
value = res[["value"]]
ll = - value
volatSample = rbind(volatSample, omegaParams)
beta = BetaGiven_volatParams(y, x, omegaPred, omegaParams)
volatSample = rbind(volatSample, omegaParams)
betaSample = rbind(betaSample, as.vector(beta))
}
aic = 2*(p + p2) - 2*ll
bic = log(n)*(p + p2) - 2*ll
betaEst = betaSample[iter, ]
volEst = volatSample[iter, ]
fitted = x %*% betaEst
out = list(
"y"=y,
"x"=x,
"fitted.values"=fitted,
"residual"=(y-fitted),
"betaSample"=betaSample,
"volatSample"=volatSample,
"betaEst"=betaEst,
"volEst"=volEst,
"llik"=ll,
"AIC"=aic,
"BIC"=bic)
}
## fit the model via formula
FitGaussVolatMod = function(
df,
yCol,
explanCols=NULL,
omegaPredCols=NULL,
iter=10) {
if (is.null(explanCols)) {
explanCols = "1"
}
if (is.null(omegaPredCols)) {
omegaPredCols = "1"
}
## find the overall SD
bigVol = sd(df[ , yCol], na.rm=TRUE)
formulaString = paste(yCol, "~", PlusVec(explanCols))
ModDesignMat1 = DesignMatFcn(df, formulaString)
x = ModDesignMat1(df)
omegaPred = as.matrix(rep(1, dim(df)[1]))
if (!is.null(omegaPredCols)) {
formulaString = paste(yCol, "~", PlusVec(omegaPredCols))
ModDesignMat2 = DesignMatFcn(df, formulaString)
omegaPred = ModDesignMat2(df)
}
y = df[ , yCol]
res = FitGaussLin_varyingVol(x=x, y=y, omegaPred=omegaPred, iter=iter)
betaEst = res[["betaEst"]]
volEst = res[["volEst"]]
PredMeanFcn = function(newdata) {
newX = ModDesignMat1(newdata)
out = newX %*% betaEst
return(out)
}
PredVolFcn = function(newdata) {
newOmegaPred = ModDesignMat2(newdata)
out = newOmegaPred %*% volEst
out = MinComp(x=c(sqrt(exp(out))), y=bigVol)
return(out)
}
fitVol = PredVolFcn(df)
res[["PredMeanFcn"]] = PredMeanFcn
res[["PredVolFcn"]] = PredVolFcn
res[["omegaPred"]] = omegaPred
res[["fitVol"]] = fitVol
res[["ModDesignMatMean"]] = ModDesignMat1
res[["ModDesignMatVol"]] = ModDesignMat2
class(res) = "gaussVolatMod"
return(res)
}
## define the predict function for the class gaussVolatMod
predict.gaussVolatMod = function(gaussVolatMod, newdata) {
value = gaussVolatMod[["PredMeanFcn"]](newdata)
vol = gaussVolatMod[["PredVolFcn"]](newdata)
return(list("mean"=value, "vol"=vol))
}
## print the model
print.gaussVolatMod = function(gaussVolatMod) {
print(gaussVolatMod[c("betaEst", "volEst", "AIC", "BIC", "llik")])
}
Example = function() {
p = 3
n = 3000
x = 3*matrix(abs(runif(p*n)), n, p)
beta = c(0, 3, 0, 0.5)
data = data.frame(x)
yCol = "y"
explanCols = c("X1", "X2", "X3")
formulaString = paste("~", PlusVec(explanCols))
ModDesignMat1 = DesignMatFcn(data, formulaString)
xOld = x = ModDesignMat1(data)
yOld = y = rowSums(t(beta*t(x)))
data[ , yCol] = y
omegaPredCols = c("X1", "X3")
formulaString = paste(yCol, "~", PlusVec(omegaPredCols))
ModDesignMat2 = DesignMatFcn(data, formulaString)
omegaPredOld = omegaPred = ModDesignMat2(data)
omegaParams = c(0, 1, -0.5)
b = omegaParams
logOmegaVec = rowSums(t(b*t(omegaPred)))
omegaVec = exp(logOmegaVec)
sdVec = sqrt(omegaVec)
error = rnorm(n, sd=sdVec)
y = y + error
y = as.matrix(y)
omegaInvCholVec = 1/sdVec
iter = 20
data[ , yCol] = y
indTrain = 1:round(n/2)
indTest = (round(n/2)+1):n
dataTrain = data[indTrain, ]
dataTest = data[indTest, ]
xTrain = x[indTrain, , drop=FALSE]
yTrain = y[indTrain]
xTest = x[indTest, , drop=FALSE]
yTest = y[indTest]
omegaPredTrain = omegaPred[indTrain, , drop=FALSE]
omegaPredTest = omegaPred[indTest, , drop=FALSE]
### via FitGaussLin_varyingVol
res = FitGaussLin_varyingVol(x=xTrain, y=yTrain, omegaPred=omegaPredTrain,
iter=iter)
round(res[["betaEst"]], 2)
round(res[["volEst"]], 2)
fittedSig2 = exp(omegaPredTrain%*%res[["volEst"]])
fittedSd = sqrt(fittedSig2)
mod = lm(res[["y"]] ~ res[["x"]])
summary(mod)$sigma
fittedSd[1, ]
e = res[["y"]] - res[["x"]] %*% res[["betaEst"]]
sqrt(var(e))
betaCov = BetaVarCov(x=res[["x"]], betaCoef=res[["betaEst"]],
volCoef=res[["volEst"]], omegaPred=omegaPred)
sqrt(diag(betaCov))
mod = lm(y ~ -1 + x)
summary(mod)
plot(yTrain, res[["fitted.values"]])
yPred = xTest %*% res[["betaEst"]]
plot(yTest, yPred)
### Same model Via FitGaussVolatMod
res = FitGaussVolatMod(data=data[indTrain, ], yCol, explanCols,
omegaPredCols, iter=iter)
round(res[["betaEst"]], 2)
round(res[["volEst"]], 2)
fittedSig2 = exp(res[["omegaPred"]]%*%res[["volEst"]])
fittedSd = sqrt(fittedSig2)
yFit = res[["fitted.values"]]
plot(data[indTrain , yCol], yFit)
fitVol = res[["fitVol"]]
yFitUpper = yFit + 1.96*fitVol
yFitLower = yFit - 1.96*fitVol
#ord = order(yTrain)
ord = order(xTrain[ , 2])
plot(xTrain[ , 2][ord], yTrain[ord], col=1)
lines(xTrain[ , 2][ord], yFit[ord], col="blue")
lines(xTrain[ , 2][ord], yFitUpper[ord], col="red")
lines(xTrain[ , 2][ord], yFitLower[ord], col="green")
## compare true vol with fit vol
plot(sdVec[indTrain], fitVol)
pred = predict(res, newdata=data[indTrain, ])
yFit = pred[["mean"]]
vol = pred[["vol"]]
plot(yTrain, yFit)
testPred = predict(res, newdata=data[indTest, ])
yPred = testPred[["mean"]]
testVol = testPred[["vol"]]
plot(y[indTest], yPred)
testResidual = yTest - yPred
plot(abs(testResidual), testVol)
## compare true vol with test vol
plot(sdVec[indTest], testVol)
## corner cases of no variables in mean or volat explans
res = FitGaussVolatMod(data=data[indTrain, ], yCol, explanCols=NULL,
omegaPredCols, iter=iter)
plot(res[["fitted.values"]], yTrain)
res = FitGaussVolatMod(data=data[indTrain, ], yCol, explanCols=explanCols,
omegaPredCols=NULL, iter=iter)
plot(res[["fitted.values"]], yTrain)
}
BetaVarCov = function(x, betaCoef, volCoef, omegaPred) {
b = volCoef
logomegaVec = rowSums(t(b*t(omegaPred)))
omegaVec = exp(logomegaVec) #omegaVec
omegaInvVec = (1/omegaVec)
omegaInv = diag(omegaInvVec)
betaCov = solve(t(x) %*% omegaInv %*% x)
return(betaCov)
}
BetaPrec = function(x, betaCoef, volCoef, omegaPred) {
b = volCoef
logomegaVec = rowSums(t(b*t(omegaPred)))
omegaVec = exp(logomegaVec) #omegaVec
omegaInvVec = (1/omegaVec)
omegaInv = diag(omegaInvVec)
betaPrec = (t(x) %*% omegaInv %*% x)
return(betaPrec)
}
Example = function() {
p = 3
n = 100
x = matrix(rnorm(p*n), n, p)
beta = round(10*rnorm(p))
y = rowSums(t(beta*t(x)))
t = (1:n)/n
sint = sin(2*pi*t)
cost = cos(2*pi*t)
#omegaPred = cbind(1,sint,cost);omegaParams = c(1,3,4)
omegaPred = matrix(1, n, 1)
omegaParams = c(1)
b = omegaParams
logomegaVec = rowSums(t(b*t(omegaPred)))
omegaVec = exp(logomegaVec)
sdVec = sqrt(omegaVec)
error = rnorm(n, sd=sdVec)
y = y + error
y = as.matrix(y)
omegaInvCholVec = 1/sdVec
iter = 20
res = FitGaussLin_varyingVol(x, y, omegaPred, iter=iter)
betaSample = res[["betaSample"]]
volatSample = res[["volatSample"]]
res[["llik"]]
res[["AIC"]]
res[["BIC"]]
signif(betaSample[iter, ], 3)
signif(volatSample[iter, ], 3)
betaCoef = betaSample[iter, ]
volCoef = volatSample[iter, ]
betaCov = BetaVarCov(x, betaCoef, volCoef, omegaPred)
betaCov
mod = lm(y ~ -1+x)
AIC(mod)
### Example 2
p = 3
n = 40
x = matrix(rnorm(p*n), n, p)
beta = round(10*rnorm(p))
beta
Y = rowSums(t(beta*t(x)))
#U1 = rnorm(n)
#U2 = rnorm(n)
omegaPred = matrix(round(rnorm(2*n)), n, 2)
omegaParams = c(1, -1)
b = omegaParams
logomegaVec = rowSums(t(b*t(omegaPred)))
omegaVec = exp(logomegaVec)
sdVec = sqrt(omegaVec)
error = rnorm(n, sd=sdVec)
y = y + error
y = as.matrix(y)
omegaInvCholVec = 1/sdVec
iter = 20
res = FitGaussLin_varyingVol(x, y, omegaPred, iter=iter, init=NA)
betaSample = res[["betaSample"]]
volatSample = res[["volatSample"]]
beta
b
round(betaSample[iter, ])
round(volatSample[iter, ])
betaEstim = betaSample[iter, ]
volatEstim = volatSample[iter, ]
fittedSig2 = exp(omegaPred%*%volatEstim)
fittedSd = sqrt(fittedSig2)
mod = lm(y ~ x)
summary(mod)$sigma
fittedSd[1, ]
e = y - x %*% betaEstim
sqrt(var(e))
betaCoef = betaSample[iter, ]
volCoef = volatSample[iter, ]
betaCov = BetaVarCov(x, betaCoef, volCoef, omegaPred)
betaCov
sqrt(diag(betaCov))
mod = lm(Y ~ -1+x)
summary(mod)
}
## generalized model selection
ModSelSubsetGen = function(
yCol,
explanColsF=NULL,
explanCols=NULL,
FitFcn,
m=1,
crit="AIC",
family="gaussian",
varNum=NULL,
critSummaryDelta=0.1) {
d = length(explanCols)
varNum2 = d
if (!is.null(varNum)) {
if (varNum > d) {
print("Warning: varNum > data dimension so it was reset to dim[2].")
}
varNum2 = min(d, varNum)
}
varNum = varNum2
MsFcn = function(x) {
if (is.null(x)) {
explanCols1 = explanColsF
} else {
explanCols1 = explanCols[x]
if (!is.null(explanColsF)) {
explanCols1 = c(explanColsF, explanCols1)
}
}
res = FitFcn(explanCols1)
out = c(res, explanCols[x], rep(NA, (varNum - length(x))))
return(out)
}
nullModRes = MsFcn(x=NULL)
## subSets of up to order varNum
subSets = lapply(1:varNum, function(x) as.list(data.frame(combn(d, x))))
subSets = unlist(subSets, recursive=FALSE)
resList = lapply(subSets, MsFcn)
resDf = matrix(NA, length(resList), varNum+1)
resDf = rbind(resDf, nullModRes)
n = length(resList)
for (i in 1:n) {
resDf[i, ] = resList[[i]]
}
colnames(resDf) = NULL
resDf = data.frame(resDf)
resDf[ , 1] = as.numeric(as.character(resDf[ ,1]))
colnames(resDf)[1] = crit
ord = order(resDf[ , 1])
resDf = resDf[ord, ]
topModMatrix = resDf[1:m, ]
critVec = resDf[ , crit]
critVec = na.omit(critVec)
critSummary = quantile(critVec, seq(0, 1, critSummaryDelta))
outList = list(topModMatrix=topModMatrix, critSummary=critSummary)
return(outList)
}
################ END #####################
## calculation functions
VarOnMean = function(x) {
var(x)/mean(x)
}
CoefOfVar = function(x) {
sd(x)/mean(x)
}
############# time-varying distribution functions
## Create lag data
CreateLagData = function(
data,
respCols,
lagIndex,
longtIndex) {
F = CreateLagIndFcn(lagIndex=lagIndex, longtIndex=longtIndex)
lagList = list()
for (i in 1:length(respCols)) {
resp = respCols[i]
y = data[ , resp]
lagList[[resp]] = F(y, resp, naFill=NA)
}
lagDf = MergeDfList(dfList=lagList)
l = list(lagList=lagList, lagDf=lagDf)
return(l)
}
## should be deprecated since ggplot does now allow adding legends
GenBoundsDfGg = function(
df,
wrtCols,
valueCol,
splitCol=NULL,
boundQuantiles=c(0.001, 0.999),
boundQuantileDelta=0.05) {
probs = sort(c(boundQuantiles, 0.01, 0.1, 0.25, 0.5, 0.75, 0.9, 0.99))
boundsDf = SummWrt(
data=df,
valueCol=valueCol,
wrtCols=wrtCols,
probs=probs,
fcnList=list(mean, sd, min, max),
fcnNames=c('mean', 'sd', 'min', 'max'),
addValuePrefix=TRUE)
qStr1 = paste0('_', as.character(boundQuantiles[1]))
qStr2 = paste0('_', as.character(boundQuantiles[2]))
deltaStr = paste0('qr_', as.character(boundQuantileDelta*100), 'pcnt')
boundsDf[ , paste0(valueCol, '_right_qr')] = (
boundsDf[ , paste0(valueCol, '_0.99')] -
boundsDf[ , paste0(valueCol, '_0.9')])/0.09
boundsDf[ , paste0(valueCol, '_left_qr')] = (
boundsDf[ , paste0(valueCol, '_0.1')] -
boundsDf[ , paste0(valueCol, '_0.01')])/0.09
lowerCol = paste0(valueCol, qStr1, '_minus_', deltaStr)
upperCol = paste0(valueCol, qStr2, '_plus_', deltaStr)
boundsDf[ , lowerCol] = (
boundsDf[ , paste0(valueCol, '_', as.character(boundQuantiles[1]))] -
boundQuantileDelta*boundsDf[ , paste0(valueCol, '_left_qr')])
boundsDf[ , upperCol] = (
boundsDf[ , paste0(valueCol, '_', as.character(boundQuantiles[2]))] +
boundQuantileDelta*boundsDf[ , paste0(valueCol, '_right_qr')])
boundsDfSumm = boundsDf[ ,
c(
wrtCols,
lowerCol,
upperCol),
drop=FALSE]
F = function(newDf) {
outDf = merge(newDf, boundsDfSumm, on=wrtCols, sort=FALSE)
outDf[ , valueCol] = MinComp(
outDf[ , valueCol],
outDf[ , upperCol])
outDf[ , valueCol] = MaxComp(
outDf[ , valueCol],
outDf[ , lowerCol])
outDf = outDf[ , colnames(newDf), drop=FALSE]
return(outDf)
}
p = NULL
colours = c("blue", "red")
if (!is.null(splitCol)) {
p = Plt_splitDfOverlay(
df=df, splitCol=splitCol, wrtCol=wrtCols, yCol=valueCol)
cols = c(lowerCol, upperCol)
boundsDf[ , "lower"] = boundsDf[ , lowerCol]
boundsDf[ , "upper"] = boundsDf[ , upperCol]
p = p + geom_line(
data=boundsDf,
aes(x=get(wrtCol), y=lower),
size=0.75, colour=colours[1], linetype=1)
p = p + geom_line(
data=boundsDf,
aes(x=get(wrtCol), y=upper),
size=0.75, colour=colours[2], linetype=2)
p = p + theme(legend.position="top")
}
return(list(
"F"=F,
"boundsDf"=boundsDf,
"boundsDfSumm"=boundsDfSumm,
"p"=p))
}
## generates a function which calcualtes bounds
# based on df for each value on wrtCols
# such a function applies to new data
# and bounds the values of the new df
GenBoundsDf = function(
df,
wrtCols,
valueCol,
splitCol=NULL,
boundQuantiles=c(0.001, 0.999),
boundQuantileDelta=0.05,
lwd=2) {
probs = sort(c(boundQuantiles, 0.01, 0.1, 0.25, 0.5, 0.75, 0.9, 0.99))
boundsDf = SummWrt(
data=df,
valueCol=valueCol,
wrtCols=wrtCols,
probs=probs,
fcnList=list(mean, sd, min, max),
fcnNames=c('mean', 'sd', 'min', 'max'),
addValuePrefix=TRUE)
qStr1 = paste0('_', as.character(boundQuantiles[1]))
qStr2 = paste0('_', as.character(boundQuantiles[2]))
deltaStr = paste0('qr_', as.character(boundQuantileDelta*100), 'pcnt')
boundsDf[ , paste0(valueCol, '_right_qr')] = (
boundsDf[ , paste0(valueCol, '_0.99')] -
boundsDf[ , paste0(valueCol, '_0.9')])/0.09
boundsDf[ , paste0(valueCol, '_left_qr')] = (
boundsDf[ , paste0(valueCol, '_0.1')] -
boundsDf[ , paste0(valueCol, '_0.01')])/0.09
lowerCol = paste0(valueCol, qStr1, '_minus_', deltaStr)
upperCol = paste0(valueCol, qStr2, '_plus_', deltaStr)
boundsDf[ , lowerCol] = (
boundsDf[ , paste0(valueCol, '_', as.character(boundQuantiles[1]))] -
boundQuantileDelta*boundsDf[ , paste0(valueCol, '_left_qr')])
boundsDf[ , upperCol] = (
boundsDf[ , paste0(valueCol, '_', as.character(boundQuantiles[2]))] +
boundQuantileDelta*boundsDf[ , paste0(valueCol, '_right_qr')])
boundsDfSumm = boundsDf[ ,
c(
wrtCols,
lowerCol,
upperCol),
drop=FALSE]
F = function(newDf) {
outDf = merge(newDf, boundsDfSumm, on=wrtCols, sort=FALSE)
outDf[ , valueCol] = MinComp(
outDf[ , valueCol],
outDf[ , upperCol])
outDf[ , valueCol] = MaxComp(
outDf[ , valueCol],
outDf[ , lowerCol])
outDf = outDf[ , colnames(newDf), drop=FALSE]
return(outDf)
}
Plt = function() {
if (length(wrtCols) > 1) {
return(NULL)
}
colours = c("blue", "red")
if (!is.null(splitCol)) {
df0 = df[order(df[ , splitCol], df[ , wrtCols]), ]
Plt_splitDfOverlay(
df=df0, splitCol=splitCol, wrtCol=wrtCols, yCol=valueCol)
cols = c(lowerCol, upperCol)
} else {
yMax = max(df[ , valueCol], na.rm=TRUE)
yMin = min(df[ , valueCol], na.rm=TRUE)
xMax = max(df[ , wrtCols], na.rm=TRUE)
xMin = min(df[ , wrtCols], na.rm=TRUE)
yDelta = (yMax - yMin) / 4
yMax = yMax + yDelta
yMin = yMin - yDelta
plot(
df[ , wrtCols], df[ , valueCol], xlab=wrtCols, ylab=valueCol,
xlim=c(xMin, xMax), ylim=c(yMin, yMax), col=ColAlpha("black", 0.3))
}
boundsDf = boundsDf[order(boundsDf[ , wrtCols]), ]
boundsDf[ , "lower"] = boundsDf[ , lowerCol]
boundsDf[ , "upper"] = boundsDf[ , upperCol]
lines(boundsDf[ , wrtCols], boundsDf[ , lowerCol], col=colours[1],
lty=2, lwd=lwd)
lines(boundsDf[ , wrtCols], boundsDf[ , upperCol], col=colours[2],
lty=3, lwd=lwd)
legend(
x="topright", legend=c("obs", "lower", "upper"),
col=c("grey", colours), lty=c(1, 2, 3))
}
return(list(
"F"=F,
"boundsDf"=boundsDf,
"boundsDfSumm"=boundsDfSumm,
"Plt"=Plt))
}
Example = function() {
m = 500
x = rep(1:100, m)/100
y = sin(x*2*pi) + rnorm(length(x))
z = unlist(lapply(X=1:m, FUN=function(i){rep(i, 100)}))
df = data.frame(x=x, y=y, z=z)
res = GenBoundsDf(
df=df,
wrtCols="x",
valueCol="y",
splitCol="z",
boundQuantiles=c(0.01, 0.99),
boundQuantileDelta=0.1)
res[["Plt"]]()
res = GenBoundsDf(
df=df,
wrtCols="x",
valueCol="y",
splitCol=NULL,
boundQuantiles=c(0.01, 0.99),
boundQuantileDelta=0.1)
res[["Plt"]]()
}
GenBoundsFcn = function(
df,
wrtCols,
valueCol,
boundQuantiles=c(0.001, 0.999),
boundQuantileDelta=0.05) {
res = GenBoundsDf(
df=df,
wrtCols=wrtCols,
valueCol=valueCol,
boundQuantiles=boundQuantiles,
boundQuantileDelta=boundQuantileDelta)
return(res[["F"]])
}
Example = function() {
x = sample(1:10, 1000, replace=TRUE)
y = rnorm(1000)
df = data.frame(x, y)
F = GenBoundsFcn(
df,
wrtCols="x",
valueCol="y",
boundQuantiles=c(0.001, 0.999),
boundQuantileDelta=0.05)
df2 = df[1:10, ]
df2[1, ] = c(1, 100)
F(df2)
}
## make a longterm avg plot by taking the average in a window
# and moving that across the time
# that window can be a week, month, year etc
LongtAvgPlt = function(
df,
timeCol,
windowCol,
respCol,
ylab=NULL,
ylim=NULL,
pltTitle="",
color=ColAlpha("blue", 0.5),
addToPlot=FALSE,
addPoints=FALSE,
cex.lab=1.5,
cex.axis=1.5,
cex.main=1.5,
lty=1) {
# this is for choosing the major color
# after averaging
Func = function(x) {
x[1]
}
df0 = df[ , c(timeCol, windowCol, respCol)]
df0[ , "color"] = color
names(df0) = c("timeCol", "windowCol", "col", "color")
df0 = data.table(df0)
dfW = df0[ ,
.(
contiTime=mean(timeCol),
col=mean(col),
color=Func(color),
num=.N),
by=.(windowCol)]
if (is.null(ylab)) {
ylab = "avg resp across window"
}
dfW = data.frame(dfW)
Plt = plot
if (addToPlot) {
Plt = lines
}
Plt(
dfW[ , "contiTime"],
dfW[ , "col"],
ylim=ylim,
type="l",
xlab="time",
ylab=ylab,
main=pltTitle,
col=dfW[ , "color"],
lwd=3,
cex.lab=cex.lab,
cex.axis=cex.axis,
cex.main=cex.main,
lty=lty)
if (addPoints) {
points(
dfW[ , "contiTime"],
dfW[ , "col"],
col=ColAlpha("black", 0.6),
pch=20,
cex=1)
}
return(dfW)
}
Example = function() {
y1 = 5 * rnorm(100)
y2 = y1 + 1 + rnorm(100)
contiTime = 1:100
window = c(
rep(1, 20),
rep(2, 20),
rep(3, 20),
rep(4, 20),
rep(5, 20))
color = window
df = data.frame(
"contiTime"=contiTime,
"windowCol"=window,
"color"=color,
"y1"=y1,
"y2"=y2)
ltDf = LongtAvgPlt(
df=df,
timeCol="contiTime",
windowCol="windowCol",
respCol="y1",
ylab=NULL,
pltTitle="",
color=df[ , "color"])
LongtAvgPlt(
df=df,
timeCol="contiTime",
windowCol="windowCol",
respCol="y2",
ylab=NULL,
pltTitle="",
color=df[ , "color"],
addToPlot=TRUE,
lty=2)
}
######## fit Box-Cox
## pivot the data wrt appropriate variables.
# for daily weather it should be pivotPar=c("doy","year")
BuildDataPivot = function(df, resp, pivotVar=c("week", "year")) {
castText = paste(
"dfPivot = cast(df,",
pivotVar[1],
"~",
pivotVar[2],
",value=\"",
resp,
"\", fun.aggregate=mean)",
sep="")
eval(parse(text=castText))
return(dfPivot)
}
## build a function to estimate boxcox
EstimBoxCoxFcn = function(
OptCrit,
shiftIt,
powRange=seq(1/2, 2, 1/4),
shiftGridNum=10,
prefCoef=c(1, 0.1, 1)) {
function(x, ...) {
x = x[-1]
res = EstimBoxCox(
x,
OptCrit=OptCrit,
powRange=powRange,
shiftIt=shiftIt,
shiftGridNum=shiftGridNum,
prefCoef=prefCoef)
y = res[["transX"]]
c(
res[["param"]],
res[["value"]],
res[["newValue"]],
mean(x, na.rm=TRUE),
sd(x, na.rm=TRUE),
mean(y, na.rm=TRUE),
sd(y, na.rm=TRUE))
}
}
## Apply BoxCox pointwise appropriately,
# we choose BoxCox parameters with respect to
# e.g. week/day (or day of year in this case)
BoxCox1Var = function(boxCoxRes) {
Asn = as.numeric
x = boxCoxRes[[resp]]
x = Asn(x)
pow = boxCoxRes[["pow"]]
pow = Asn(pow)
shift = boxCoxRes[["shift"]]
shift = Asn(shift)
out = NA
if (!is.na(x)) {
out = BoxCox(x=x, pow=pow, shift=shift)
}
return(out)
}
## manual way to apply BoxCox pointwise appropriately
Example = function() {
dfValues = rawDf
n = dim(dfValues)[1]
dfValues[ , "tempOrder"] = 1:n
data = merge(dfValues,dfColsArgs,by="week",all.x=TRUE)
data = data[order(data[,"tempOrder"]), ]
cols = c(wrtCols, valueCols, colnames(dfColsArgs))
cols = unique(cols)
data2 = data[ , cols]
#data2 = data.table(data2)
transX1 = apply(data2, 1, fcn)
}
## smooth boxCox parameters over time (of year)
SmoothBoxCoxRes = function(boxCoxRes, method="smoothSpline", ord=10) {
pow = boxCoxRes[ , "pow"]
shift = boxCoxRes[ , "shift"]
timeLen = length(pow)
data = data.frame(t=1:timeLen, pow=pow, shift=shift)
y1 = data[ , "pow"]
y2 = data[ , "shift"]
if (method == "regression") {
fsData = Fseries(data=data, colName="t",
omega=(2*pi)/timeLen, ord=ord)[["df"]]
mod1 = GlmFitMatr(y=y1, explan=fsData, family="gaussian")
mod2 = GlmFitMatr(y=y2, explan=fsData, family="gaussian")
yPred1 = predict(mod1, explan=fsData)
yPred2 = predict(mod2, explan=fsData)
}
if (method == "smoothSpline") {
mod1 = smooth.spline(x=1:timeLen, y=y1, spar=0.7)
mod2 = smooth.spline(x=1:timeLen, y=y2, spar=0.7)
yPred1 = predict(mod1, explan=fsData)[["y"]]
yPred2 = predict(mod2, explan=fsData)[["y"]]
}
if (method == "GPfit") {
InstallLoad("GPfit")
#xMat = matrix((1:timeLen)/timeLen)
fsData = Fseries(data=data, colName="t",
omega=(2*pi)/timeLen, ord=1)[["df"]]
xMat = fsData
fsData = (fsData + 1)/2
y = y1
mod = GP_fit(xMat, y)
res = predict(mod)
yPred1 = res[["Y_hat"]]
y = y2
mod = GP_fit(xMat, y)
res = predict(mod)
yPred2 = res[["Y_hat"]]
}
if (method == "filt") {
yPred1 = MovingAvg(vec=y1, filterNum=ord, filterPpns=NULL,
missNum=100, periodic=TRUE)
yPred2 = MovingAvg(vec=y2, filterNum=ord, filterPpns=NULL,
missNum=100, periodic=TRUE)
}
par(mfrow=c(1,2))
plot(1:timeLen, y1, pch=20, col="grey")
lines(1:timeLen, yPred1, col="red")
plot(1:timeLen, y2, pch=20, col="grey")
lines(1:timeLen, yPred2, col="red")
return(list(powPred=yPred1, shiftPred=yPred2))
}
## plot boxcox trans results
PlotBoxCoxRes = function(
dfPivot,
powPred,
shiftPred,
OptCrit=FitGoodness) {
n = dim(dfPivot)[1]
m1 = m2 = rep(NA, dim(dfPivot)[1])
s1 = s2 = rep(NA, dim(dfPivot)[1])
valueVec = rep(NA, dim(dfPivot)[1])
newValueVec = rep(NA, dim(dfPivot)[1])
Func = function(i) {
x = dfPivot[i, ]
x = as.numeric(x)
x = x[-1]
y = BoxCox(x, pow=powPred[i], shift=shiftPred[i])
m1 = mean(x, na.rm=TRUE)
m2 = mean(y, na.rm=TRUE)
s1 = sd(x, na.rm=TRUE)
s2 = sd(y, na.rm=TRUE)
qsk1 = QskewAbs(na.omit(x))
qsk2 = QskewAbs(na.omit(y))
sk1 = SkewnessAbs(na.omit(x))
sk2 = SkewnessAbs(na.omit(y))
kur1 = kurtosis(x, na.rm=TRUE)
kur2 = kurtosis(y, na.rm=TRUE)
value = OptCrit(x)
newValue = OptCrit(y)
return(c(m1, m2, s1, s2, sk1, sk2, qsk1, qsk2, kur1, kur2, value,
newValue))
}
x = matrix(1:dim(dfPivot)[1])
mat = apply(X=x, MARGIN=1, FUN=Func)
mat = t(mat)
m1 = mat[ , 1]
m2 = mat[ , 2]
s1 = mat[ , 3]
s2 = mat[ , 4]
sk1 = mat[ , 5]
sk2 = mat[ , 6]
qsk1 = mat[ , 7]
qsk2 = mat[ , 8]
kur1 = mat[ , 9]
kur2 = mat[ , 10]
valueVec = mat[ , 11]
newValueVec = mat[ , 12]
Max2 = function(x) {
max(x, na.rm=TRUE)
}
Min2 = function(x) {
min(x, na.rm=TRUE)
}
YlimFcn = function(x1, x2) {
c(Min2(c(x1, x2)), Max2(c(x1, x2)))
}
par(mfrow=c(2, 7))
plot2 = function(x,...) {
plot(x, xlab="day of year", type="l", col=ColAlpha("red"), ...)
}
plot(powPred, main="BoxCox pow", type="l", lwd=2)
plot2(m1, ylim=YlimFcn(m1, m2), main="mean bef BC")
plot2(s1, ylim=YlimFcn(s1, s2), main="sd bef BC")
plot2(sk1, ylim=YlimFcn(sk1, sk2), main="skew bef BC")
plot2(qsk1, ylim=YlimFcn(qsk1, qsk2), main="qskew bef BC")
plot2(kur1, ylim=YlimFcn(kur1, kur2), main="kurt bef BC")
plot2(valueVec, ylim=YlimFcn(valueVec, newValueVec), main="GOF bef BC")
plot(shiftPred, main="BoxCox shift", type="l",lwd=2)
plot2(m2, ylim=YlimFcn(m1, m2), main="mean aft BC")
plot2(s2, ylim=YlimFcn(s1, s2), main="sd aft BC")
plot2(sk2, ylim=YlimFcn(sk1, sk2), main="skew aft BC")
plot2(qsk2, ylim=YlimFcn(qsk1, qsk2), main="qskew aft BC")
plot2(kur2, ylim=YlimFcn(kur1, kur2), main="kurt aft BC")
plot2(newValueVec, ylim=YlimFcn(valueVec, newValueVec), main="GOF aft BC")
}
################ time series weather functions
## source function depending on online or off-line
# source(paste(addr.source,"extract_rawData_frost.r",sep=""),local=TRUE)
## this is a function to build explan data for the data set
# we can then later use it for explan generation for future data as well
BuildLagFsDf = function(
df,
respCols,
LagFcn=NULL,
fsFcnList=NULL,
fut=FALSE,
contiTimeFut=NULL) {
lagMatList = list()
outDf = NULL
if (!is.null(LagFcn)) {
for (r in respCols) {
lagMatList[[r]] = LagFcn(df[ , r], colName=r, fut=fut)
}
for (r in respCols) {
if (!is.null(outDf)) {
outDf = cbind(outDf, lagMatList[[r]])
} else {
outDf = lagMatList[[r]]
}
}
}
if (!is.null(fsFcnList)) {
timeCols = names(fsFcnList)
for (i in 1:length(timeCols)) {
FsFcn = fsFcnList[[i]]
if (!fut) {
sinCosMat = FsFcn(df[ , timeCols[i]])
} else {
sinCosMat = FsFcn(contiTimeFut[i])
}
if (is.null(outDf)) {
outDf = sinCosMat
} else {
outDf = cbind(outDf, sinCosMat)
}
}
}
return(outDf)
}
Example = function() {
longtIndex = c(1, 2, 5)
LagFcn = CreateLagIndFcn(lagIndex=1:3,
longtIndex=longtIndex, AvgFcn=rowMeans)
FsFcnTod = CreateFsIndFcn(sinOrd=1:2, cosOrd=1:2,
omega=(2*pi/24), colName="tod")
FsFcnTow = CreateFsIndFcn(sinOrd=1:3, cosOrd=1:3,
omega=(2*pi/7), colName="tow")
fsFcnList = list("tod"=FsFcnTod, "tow"=FsFcnTow)
yCol = "count"
t1 = strptime("01/01/2011 11:23", "%d/%m/%Y %H:%M", tz = "Europe/London")
t1 = chron::as.chron(t1)
t2 = t1 + 365
timeDf = CreateTimeGrid(t1, t2, delta=1/(24))
df = timeDf
df[ , yCol] = rpois(dim(df)[1], lambda=10)
futTs = t2 + 1/24
futTimeInfo = BuildTimeDf(futTs)
contiTimeFut = futTimeInfo[ , names(fsFcnList)]
modDf = BuildLagFsDf(
df=df,
respCols=c("count"),
LagFcn=LagFcn,
fsFcnList=fsFcnList,
fut=FALSE,
contiTimeFut=NULL)
modDfFut = BuildLagFsDf(
df=df,
respCols=c("count"),
LagFcn=LagFcn,
fsFcnList=fsFcnList,
fut=TRUE,
contiTimeFut=contiTimeFut)
}
## from basic model builds a data with lags and fourier series terms
# this also imputes the data: the imputation will replace
# the missing value with avg wrt wrtCols
BuildTsModDf = function(
df,
respCols,
LagFcn,
fsFcnList,
wrtCols=NULL,
omitNa=FALSE) {
outDf = BuildLagFsDf(
df=df,
respCols=respCols,
LagFcn=LagFcn,
fsFcnList=fsFcnList,
fut=FALSE,
contiTimeFut=NULL)
cols = names(outDf)
augDf = cbind(df, outDf)
if (!is.null(wrtCols)) {
augData = ImputeDf(
df=augDf,
valueCols=cols,
wrtCols=wrtCols,
AggFunc=NULL,
replaceCol=TRUE)
}
if (omitNa) {
augDf = na.omit(augDf)
}
return(augDf)
}
## this is for generic seasonal time series eg traffic
BuildTsModDf_lagFourierParams = function(
df,
todK=NULL,
todK_wknd=NULL,
towK=NULL,
toyK=NULL,
lagK=NULL,
longtIndex=NULL,
lagTrans=function(x){x},
imputeWrtCols,
omitNa=FALSE
) {
LagFcn = NULL
if (!is.null(lagK) || !is.null(longtIndex)) {
LagFcn = CreateLagIndFcn(
lagIndex=1:lagK,
longtIndex=longtIndex,
AvgFcn=rowMeans,
LagTrans=LagTrans)
}
fsFcnList = list()
if (!is.null(todK)) {
FsFcnTod = CreateFsIndFcn(
sinOrd=1:todK,
cosOrd=1:todK,
omega=(2*pi/24),
colName="tod")
fsFcnList[["tod"]] = FsFcnTod
}
if (!is.null(todK_wknd)) {
FsFcnTod_wknd = CreateFsIndFcn(
sinOrd=1:todK_wknd,
cosOrd=1:todK_wknd,
omega=(2*pi/24),
colName="tod_wknd")
fsFcnList[["tod_wknd"]] = FsFcnTod_wknd
}
if (!is.null(towK)) {
FsFcnTow = CreateFsIndFcn(
sinOrd=1:towK,
cosOrd=1:towK,
omega=(2*pi/7),
colName="tow")
fsFcnList[["tow"]] = FsFcnTow
}
if (!is.null(toyK)) {
FsFcnToy = CreateFsIndFcn(
sinOrd=1:toyK,
cosOrd=1:toyK,
omega=(2*pi/366),
colName="toy")
fsFcnList[["toy"]] = FsFcnToy
}
modDf = BuildTsModDf(
df=df,
respCols=yCol,
LagFcn=LagFcn,
fsFcnList=fsFcnList,
wrtCols=imputeWrtCols,
omitNa=FALSE)
return(list(
"modDf"=modDf,
"fsFcnList"=fsFcnList,
"LagFcn"=LagFcn))
}
## adds the explan data to a data set.
# this is useful for fut explan generation
Add_LagFs_explanDf = function(
df,
respCols,
LagFcn,
fsFcnList,
fut=FALSE,
futDfRow=NULL) {
contiTimeFut = futDfRow[ , names(fsFcnList)]
contiTimeFut = as.vector(contiTimeFut)
outDf = BuildLagFsDf(
df=df,
respCols=respCols,
LagFcn=LagFcn,
fsFcnList=fsFcnList,
fut=fut,
contiTimeFut=contiTimeFut)
if (!fut) {
augDf = cbind(df, outDf)
} else {
#data[dim(data)[1] , , drop=FALSE]
augDf = cbind(futDfRow, outDf)
}
return(augDf)
}
Example = function() {
longtIndex = c(1, 2, 5)
LagFcn = CreateLagIndFcn(
lagIndex=1:3,
longtIndex=longtIndex,
AvgFcn=rowMeans)
FsFcnTod = CreateFsIndFcn(
sinOrd=1:2,
cosOrd=1:2,
omega=(2*pi/24),
colName="tod")
FsFcnTow = CreateFsIndFcn(
sinOrd=1:3,
cosOrd=1:3,
omega=(2*pi/7),
colName="tow")
fsFcnList = list("tod"=FsFcnTod, "tow"=FsFcnTow)
yCol = "count"
t1 = strptime("01/01/2011 11:00", "%d/%m/%Y %H:%M", tz="Europe/London")
t1 = chron::as.chron(t1)
t2 = t1 + 365
timeDf = CreateTimeGrid(t1, t2, delta=1/(24))
df = timeDf
df[ , yCol] = rpois(dim(df)[1], lambda=10)
futTs = t2 + 1/24
futTimeInfo = BuildTimeDf(futTs)
futDfRow = futTimeInfo
contiTimeFut = futTimeInfo[ , names(fsFcnList)]
modDf = Add_LagFs_explanDf(
data=df,
respCols=c("count"),
LagFcn=LagFcn,
fsFcnList=fsFcnList,
fut=FALSE,
futDfRow=NULL)
modDfFut = Add_LagFs_explanDf(
data=df,
respCols=c("count"),
LagFcn=LagFcn,
fsFcnList=fsFcnList,
fut=TRUE,
futDfRow=futDfRow)
}
#### Auto-correlation plot by season ######
CreateAutoCSeas = function(
data,
resp="tmin",
Fcn=pacf,
fcnName="PAC",
lagNum=5) {
G = function(seas) {
data_seas = subset(data, is.element(data[ , "month"], seas))
val = data_seas[ , resp]
acf = Fcn(na.omit(val), plot=FALSE)
return(acf[["acf"]])
#lags = acf[["lag"]]
}
valWin = G(c(12, 1, 2))
valSpr = G(4:5)
valSum = G(6:8)
valAut = G(9:11)
plot(1:lagNum, valWin[1:lagNum], type="l", xlab="lag",
ylab=fcnName, cex.lab=1.2, cex.axis=1.2, ylim=c(0, 1), col=2)
lines(1:lagNum, valSpr[1:lagNum], lwd=2, lty=2, col=3)
lines(1:lagNum, valSum[1:lagNum], lwd=2, lty=3, col=4)
lines(1:lagNum, valAut[1:lagNum], lwd=2, lty=4, col=5)
abline(0, 0, lty=2)
legend(x=lagNum-2, y=max(valWin, valSpr, valSum, valAut),
legend=c("Win", "Spr", "Sum", "Fall"), lwd=c(2, 2, 2, 2),
col=2:5, lty=1:4)
}
Example = function() {
# CreateAutoCSeas(data=data, resp="tmin", Fcn=pacf, fcnName="PAC", lagNum=5)
# CreateAutoCSeas(data=data, resp="tmax", Fcn=pacf, fcnName="PAC", lagNum=5)
fn = paste0(odir, "pac_per_season.png")
png(fn)
CreateAutoCSeas(data=data, resp="tmin", Fcn=pacf, fcnName="PAC", lagNum=5)
dev.off()
}
##### Distribution
# y
# doyVec
CheckHistMonthly = function(data, doyCol="doy", resp, Fcn=hist) {
monthVecStarts = c(1, 31, 61, 91, 121, 151, 181, 211, 241, 271,
301, 331, 361)
par(mfrow=c(3, 4))
for (i in 1:12) {
minn = monthVecStarts[i]
maxx = monthVecStarts[i+1]
ind = which(data[ , doyCol] >= minn & data[ , doyCol]<=maxx)
yres = data[ind, resp]
Fcn(yres, main=resp)
}
}
# CheckHistMonthly(data, resp)
CheckQqMonthly = function(data, doyCol="doy", resp) {
monthVecStarts = c(1, 31, 61, 91, 121, 151, 181, 211, 241, 271, 301,
331, 361)
par(mfrow=c(3, 4))
for (i in 1:12) {
minn = monthVecStarts[i]
maxx = monthVecStarts[i+1]
ind = which(data[ , doyCol] >= minn & data[ , doyCol] <= maxx)
yres = data[ind, resp]
qqnorm(yres, main=resp, col=ColAlpha("black", 0.5))
qqline(yres)
}
}
CheckHistBiMonthly = function(data, doyCol="doy", resp, Fcn=hist) {
bimonthVecStarts = c(1, 61, 121, 181, 241, 301, 361)
par(mfrow=c(3, 2))
for (i in (1:6)) {
minn = bimonthVecStarts[i]
maxx = bimonthVecStarts[i+1]
ind = which(data[ , "doy"] >= minn & data[ , "doy"] <= maxx)
z = data[ind, resp]
z = na.omit(z)
i1 = 2*i-1
i2 = 2*i
textt = paste("months ", i1, "and", i2)
Fcn(z, main=textt, probability=TRUE, xlab="temp(C)")
}
}
# CheckHistBiMonthly(data, resp)
CheckQqBiMonthly = function(data, resp) {
bimonthVecStarts = c(1, 61, 121, 181, 241, 301, 361)
par(mfrow=c(3,2))
for (i in (1:6)) {
minn = bimonthVecStarts[i]
maxx = bimonthVecStarts[i+1]
ind = which(data[ , "doy"] >= minn & data[ , "doy"] <= maxx)
z = data[ind, resp]
z = na.omit(z)
i1 = 2*i-1
i2 = 2*i
textt = paste("monthVecs ", i1, "and", i2)
qqnorm(z, main=textt, ColAlpha("black", 0.5))
qqline(z)
}
}
# CheckQqBiMonthly(data=data, resp="tmax")
TestDistShapiro = function(data) {
monthVecStarts = c(1, 31, 61, 91, 121, 151, 181, 211, 241,
271, 301, 331, 361)
for (i in (1:12)) {
minn = monthVecStarts[i]
maxx = monthVecStarts[i]+1
ind = which(data[ , "doy"] >= minn & data[ , "doy"] <= maxx)
yres = data[ind , resp]
test = shapiro.test(yres)
print(test)
}
}
# TestDistShapiro(data)
## extracting optimal columns from msRes
MsExtractOptCols = function(msRes, critNum=2) {
topMod = msRes[["topModMatrix"]]
optMod = topMod[1, ]
explanColsOpt = optMod[(critNum + 1):length(optMod)]
explanColsOpt = as.matrix(explanColsOpt)
names(explanColsOpt) = NULL
explanColsOpt = na.omit((explanColsOpt)[1, ])
#print(explanColsOpt)
crit = round(optMod[1:critNum], 3)
#print(crit)
return(explanColsOpt)
}
## plot a model given the optimal columns
FitPlotMod = function(explanColsOpt) {
formulaString = paste(resp, "~", PlusVec(explanColsOpt))
mod = glm(formula(formulaString), data=data)
AIC(mod)
BIC(mod)
yFit = predict(mod)
plot(data[ , "contiTime"], yFit, type="l")
}
## selects models and extracts columns
ModSelExtract = function(
df, yCol, explanCols, explanColsF=NULL, crit="BIC", family="gaussian",
ModelFcn=glm) {
msRes = ModSelSubsetFormula(
df=df,
yCol=yCol,
explanCols=explanCols,
explanColsF=explanColsF,
m=1,
crit=crit,
family=family,
ModelFcn=glm)
explanColsOpt = MsExtractOptCols(msRes)
return(explanColsOpt)
}
Example = function() {
l = list()
crit = "AIC"
l[["S1"]] = ModSelExtract(
data=data, yCol=resp, explanCols=groupsList[["seas"]],
explanColsF=NULL, crit=crit)
###### Extend to AR:
l[["S2"]] = ModSelExtract(
data=data, yCol=resp, explanCols=groupsList[["lag"]],
explanColsF=l[["S1"]], crit=crit)
l[["S3"]] = ModSelExtract(
data=data, yCol=resp, explanCols=groupsList[["longt"]],
explanColsF=l[["S1"]], crit=crit)
l[["S4"]] = ModSelExtract(
data=data, yCol=resp, explanCols=groupsList[["pow"]],
explanColsF=l[["S1"]], crit=crit)
l[["S5"]] = ModSelExtract(
data=data, yCol=resp, explanCols=c(l[["S2"]], l[["S3"]], l[["S4"]]),
explanColsF=l[["S1"]], crit=crit)
}
## specify model selection sequence
# in each step we may refer to a previous step or the groups
ApplySeqMs = function(
df,
yCol,
crit,
msSteps,
family="gaussian",
ModelFcn=glm) {
l = list()
n = length(msSteps)
for (i in 1:n) {
#print(paste("*** Model Selection: Step", i))
step = msSteps[[i]]
name = step[["name"]]
explanColsF = step[["explanColsF"]]
explanCols = step[["explanCols"]]
explanColsFInd = step[["explanColsFInd"]]
explanColsInd = step[["explanColsInd"]]
#if (is.null(explanColsF) & !is.null(explanColsFInd))
if (!is.null(explanColsFInd)) {
explanColsF = c(explanColsF, unlist(l[explanColsFInd]))
}
if (is.null(explanCols) & !is.null(explanColsInd)) {
explanCols = unlist(l[explanColsInd])
}
chosenCols = ModSelExtract(
df=df,
yCol=yCol,
explanCols=explanCols,
explanColsF=explanColsF,
crit=crit,
family=family,
ModelFcn=ModelFc)
#Mark(chosenCols)
#Mark(explanColsF)
l[[name]] = list("explanColsF"=explanColsF, "chosenCols"=chosenCols)
}
return(l)
}
## builds a FitFcn for generalized model selection via ModSelSubsetGen
# this is to pick best volatility
FitFcnBuild = function(df, yCol, explanCols, iter=10) {
function(omegaPredCols) {
out = FitGaussVolatMod(
df=df,
yCol=yCol,
explanCols=explanCols,
omegaPredCols=omegaPredCols,
iter=10)[["BIC"]]
}
}
## this function finds the optimal explan for autoregressive trend
# then it finds the optimal volatility structure
ApplySeqMs_tvv = function(
df,
yCol,
msSteps,
crit="BIC",
omegaPredCols,
critVolat="BIC",
varNumVolat=2,
volatDeviance=FALSE,
writePath=NULL,
fileNameExtra="_raw") {
msRes_tvv = ApplySeqMs(
df=df,
yCol=yCol,
crit=crit,
msSteps=msSteps,
family="gaussian")
#print("*** result of ms")
#print(res)
optModExplan = sort(unique(as.vector(unlist(msRes_tvv))))
seasModExplan = res[[1]]
explanMsRes = res
if (!is.null(writePath)) {
write.csv(
x=optModExplan,
file=paste0(writePath,"opt_model", fileNameExtra, ".csv"),
row.names=FALSE)
}
df2 = df
if (volatDeviance) {
formulaString = paste(yCol, "~", PlusVec(seasModExplan))
seasMod = glm(as.formula(formulaString), data=modDf)
d = df[ , yCol] - predict(seasMod)
d = d[-length(d)]
d = c(0, d)
absD = abs(d)
df2[ , "devFromUsual"] = d
df2[ , "absDevFromUsual"] = absD
omegaPredCols = c(omegaPredCols, "absDevFromUsual")
}
## fitting varying volatility models
res = ModSelSubsetGen(
yCol,
explanColsF=NULL,
explanCols=omegaPredCols,
FitFcn=FitFcnBuild(
df=df2,
yCol=yCol,
explanCols=optModExplan),
m=3,
crit=critVolat,
varNum=varNumVolat,
critSummaryDelta=0.1)
omegaPredColsOpt = MsExtractOptCols(res, critNum=1)
volatMsRes = res
if (!is.null(writePath)) {
write.csv(
x=omegaPredColsOpt,
file=paste0(writePath, "opt_omega", fileNameExtra, ".csv"),
row.names=FALSE)
}
outList = list()
outList[["optModExplan"]] = optModExplan
outList[["seasModExplan"]] = seasModExplan
outList[["explanMsRes"]] = explanMsRes
outList[["omegaPredColsOpt"]] = omegaPredColsOpt
outList[["volatMsRes"]] = volatMsRes
outList[["expandedData"]] = df2
return(outList)
}
## initial can be the series upto the date we simulate from
## it augments the data with lag info
SimulNextWithModFcn = function(mod) {
if ("gaussVolatMod" %in% class(mod)) {
Fcn = function(futData) {
pred = predict.gaussVolatMod(gaussVolatMod=mod, newdata=futData)
ySim = pred[["mean"]] + rnorm(1, mean=0, sd=pred[["vol"]])
return(ySim)
}
} else if (paste(class(mod), collapse=",") == "glm,lm") {
modFamily = family(mod)
familyStr = family(mod)["family"]
linkStr = family(mod)["link"]
disp = mod[["dispersion"]]
if (familyStr == "quasi" & linkStr == "log") {
VarFcn = family(mod)["variance"]["variance"]
Fcn = function(futData) {
mu = exp(predict(mod, newdata=futData))
theta = mod[["theta"]]
ySim = rqpois(n=1, mu=mu, theta=theta)
return(ySim)
}
}
if (familyStr == "quasipoisson" & linkStr == "log") {
rqpois = function(n, mu, theta) {
rnbinom(n=n, mu=mu, size=mu/(theta-1))
}
Fcn = function(futData) {
mu = exp(predict(mod, newdata=futData))
var = disp * mu
#ySim = rpois(1, lambda=yPred)
#ySim = rqpois(n=1, mu=pred, theta=theta)
ySim = rnorm(1, mean=mu, sd=sqrt(var))
ySim = max(0, ySim)
return(ySim)
}
}
if (familyStr == "poisson" & linkStr == "log") {
Fcn = function(futData) {
mu = exp(predict(mod, newdata=futData))
ySim = rpois(1, lambda=mu)
return(ySim)
}
}
if (familyStr == "poisson" & linkStr == "sqrt") {
Fcn = function(futData) {
mu = predict(mod, newdata=futData)^2
ySim = rpois(1, lambda=mu)
return(ySim)
}
}
if (familyStr == "Gamma" & linkStr == "log") {
alpha = 1/disp
Fcn = function(futData) {
mu = exp(predict(mod, newdata=futData))
beta = alpha/mu
ySim = rgamma(1, shape=alpha, rate=beta)
return(ySim)
}
}
if (familyStr == "Gamma" & linkStr == "identity") {
alpha = 1/disp
Fcn = function(futData) {
mu = predict(mod, newdata=futData)
beta = alpha/mu
ySim = rgamma(1, shape=alpha, rate=beta)
return(ySim)
}
}
if (familyStr == "Gamma" & linkStr == "inverse") {
alpha = 1/disp
Fcn = function(futData) {
mu = 1/predict(mod, newdata=futData)
beta = alpha/mu
ySim = rgamma(1, shape=alpha, rate=beta)
return(ySim)
}
}
} else if (paste(class(mod), collapse=",") == "negbin,glm,lm") {
modFamily = family(mod)
familyStr = family(mod)["family"]
linkStr = family(mod)["link"]
if (grepl("Negative Binomial", familyStr) & linkStr == "log") {
#print("DUH")
Fcn = function(futData) {
mu = exp(predict(mod, newdata=futData))
theta = mod[["theta"]]
theta = theta + 1
ySim = rnegbin(1, mu=mu, theta=theta)
return(ySim)
}
}
} else {
print("Warning: prediction is not implemented for your model.")
Fcn = function(x) NULL
}
return(Fcn)
}
# SimulNext(t=2007, data=basicData, respCols=respCols, mod=mod, TransFcn=function(x)x)
SimulateSeries = function(
futDf,
prevDf,
mod,
respCols,
fsFcnList,
LagFcn,
simNum=1,
limitSimValue=FALSE,
yMax=NULL,
yMin=NULL,
Transf=function(x)x,
applyBounds=FALSE,
BoundDf=NULL,
boundsWrtCols=NULL) {
SimViaModel = SimulNextWithModFcn(mod)
## a function to simulate next value
SimulNext = function(
futDfRow, df, respCols, mod,
fsFcnList, LagFcn, TransFcn=function(x)x) {
futDf2 = Add_LagFs_explanDf(
df=df,
respCols=respCols,
LagFcn=LagFcn,
fsFcnList=fsFcnList,
fut=TRUE,
futDfRow=futDfRow)
ySim = SimViaModel(futDf2)
ySim = Transf(ySim)
changedValue_global = 0
changedValue_wrt = 0
if (limitSimValue) {
ySim2 = MinComp(ySim, yMax)
ySim2 = MaxComp(ySim2, yMin)
if (ySim2 != ySim) {
ySim = ySim2
changedValue_global = 1
}
}
if (applyBounds) {
newDf = setNames(
data.frame(matrix(
ncol=length(boundsWrtCols) + 1,
nrow=0)),
c(boundsWrtCols, respCols))
newDf[1, ] = c(futDf2[1, boundsWrtCols], ySim)
boundedDf = BoundDf(newDf=newDf)
ySim2 = boundedDf[1, respCols]
if (ySim2 != ySim) {
ySim = ySim2
changedValue_wrt = 1
}
}
return(list(
"ySim"=ySim,
"changedValue_global"=changedValue_global,
"changedValue_wrt"=changedValue_wrt))
}
SimFutVec = function(x) {
futDf[ , respCols] = 0
cols = names(futDf)
df = prevDf[ , cols, drop=FALSE]
changedNum_global = 0
changedNum_wrt = 0
changedNum_both = 0
for (i in 1:dim(futDf)[1]) {
futDfRow = futDf[i, , drop=FALSE]
l = SimulNext(
futDfRow=futDfRow, df=df, respCols=respCols,
mod=mod, LagFcn=LagFcn, fsFcnList=fsFcnList,
TransFcn=TransFcn)
futDfRow[ , respCols] = l[["ySim"]]
changedNum_global = changedNum_global + l[["changedValue_global"]]
changedNum_wrt = changedNum_wrt + l[["changedValue_wrt"]]
changedNum_both = (
changedNum_both +
l[["changedValue_global"]]*l[["changedValue_wrt"]])
futDf[i, ] = futDfRow
df = rbind(df, futDfRow)
}
return(list(
"futDf"=futDf,
"changedNum_global"=changedNum_global,
"changedNum_wrt"=changedNum_wrt,
"changedNum_both"=changedNum_both))
}
resList = lapply(X=1:simNum, FUN=SimFutVec)
changedNum_global = 0
changedNum_wrt = 0
changedNum_both = 0
futDf2 = futDf
for (i in 1:simNum) {
res = resList[[i]]
futDf2[ , paste0(respCols, "_sim", i)] = res[["futDf"]][ , respCols]
changedNum_global = changedNum_global + res[["changedNum_global"]]
changedNum_wrt = changedNum_wrt + res[["changedNum_wrt"]]
changedNum_both = changedNum_both + res[["changedNum_both"]]
}
futDf2[ , respCols] = futDf2[ , paste0(respCols, "_sim", 1)]
Q1 = function(x) {quantile(x=x, p=0.25)}
Q3 = function(x) {quantile(x=x, p=0.75)}
for (respCol in respCols) {
cols = paste0(respCol, "_sim", 1:simNum)
futDf2[ , paste0(respCol, "_sim_mean")] = apply(
futDf2[ , cols, drop=FALSE], 1, mean)
futDf2[ , paste0(respCol, "_sim_sd")] = apply(
futDf2[ , cols, drop=FALSE], 1, sd)
futDf2[ , paste0(respCol, "_sim_Q1")] = apply(
futDf2[ , cols, drop=FALSE], 1, Q1)
futDf2[ , paste0(respCol, "_sim_Q3")] = apply(
futDf2[ , cols, drop=FALSE], 1, Q3)
}
n = dim(futDf)[1] * simNum
return(list(
"futDf"=futDf2,
"changedPcnt_global"=100*changedNum_global / n,
"changedPcnt_wrt"=100*changedNum_wrt / n,
"changedPcnt_both"=100*changedNum_both / n))
}
## generate timeseries
CreateYearDoyIndex = function(timeStart, timeEnd, origin_forTimeVars=2000) {
timeGrid = seq(timeStart, timeEnd, 1)
timeDf = BuildTimeDf(
timeGrid, origin_forTimeVars=origin_forTimeVars)
out = timeDf[ , c("year", "doy", "contiTime")]
return(out)
}
Example = function() {
lastDay = basicData[dim(basicData)[1], "date"]
timeStart = lastDay + 1
timeEnd = lastDay + 366
contiTime = CreateYearDoyIndex(timeStart, timeEnd)[ , "contiTime"]
}
## it tests constructed time series data
SanityCheck_tsModDf = function(obsDf, modDf, fsFcnList) {
plot(modDf[ , "doy"])
lines(plot(modDf[ , "toy"]))
#Pause()
plot(modDf[ , "contiTime"], obsDf[ , "contiTime"])
#Pause()
ind = 380:420
plot(obsDf[ind, "contiTime"], modDf[ind, "tod"], type="p")
#Pause()
ind = 1:2000
par(mfrow=c(1, length(names(fsFcnList))))
#Pause()
for (name in names(fsFcnList)) {
plot(
modDf[ind, "contiTime"],
modDf[ind, paste0(name, "sin1")], type="p")
}
name = "tow"
ind = 1:dim(modDf)[1]
plot(modDf[ind, name], modDf[ind, paste0(name, "sin2")])
#Pause()
par(mfrow=c(1, 2))
ind = 1:(24*7)
plot(modDf[ind, "contiTime"], modDf[ind, "tow"])
#Pause()
ind = 1:(24)
plot(modDf[ind, "contiTime"], modDf[ind, "tod"])
#Pause()
}
## set up commom model selection steps (works for both glm and tvv)
# it includes, tod, tow, toy
BuildMsStepsTodTowToy = function(
resp,
todK,
towK,
toyK,
todK_wknd,
lagK,
longtIndex,
todK_fixed=0,
towK_fixed=0,
toyK_fixed=0,
todK_wknd_fixed=0,
lagK_fixed=0,
contiTimeVars=c("ct1", "ct2", "ctSqrt", "ctCbrt"),
explanColsF=NULL) {
if (todK_fixed > 0) {
explanColsF = c(
explanColsF,
paste0("tod", "sin", 1:todK_fixed),
paste0("tod", "cos", 1:todK_fixed)
)
}
if (towK_fixed > 0) {
explanColsF = c(
explanColsF,
paste0("tow", "sin", 1:towK_fixed),
paste0("tow", "cos", 1:towK_fixed)
)
}
if (toyK_fixed > 0) {
explanColsF = c(
explanColsF,
paste0("toy", "sin", 1:toyK_fixed),
paste0("toy", "cos", 1:toyK_fixed)
)
}
if (todK_wknd_fixed > 0) {
explanColsF = c(
explanColsF,
paste0("tod_wknd", "sin", 1:todK_wknd_fixed),
paste0("tod_wknd", "cos", 1:todK_wknd_fixed)
)
}
if (lagK_fixed > 0) {
explanColsF = c(
explanColsF,
paste0(resp, "lag", 1:lagK_fixed))
}
groupsList = list()
groupsList[["tod"]] = c(
paste0("tod", "sin", (todK_fixed + 1):todK),
paste0("tod", "cos", (todK_fixed + 1):todK))
groupsList[["tod_wknd"]] = c(
paste0("tod_wknd", "sin", (todK_fixed + 1):todK),
paste0("tod_wknd", "cos", (todK_fixed + 1):todK))
groupsList[["tow"]] = c(
paste0("tow", "sin", (1 + towK_fixed):towK),
paste0("tow", "cos", (1 + towK_fixed):towK))
groupsList[["toy"]] = c(
paste0("toy", "sin", (1 + toyK_fixed):toyK),
paste0("toy", "cos", (1 + toyK_fixed):toyK))
groupsList[["contiTime"]] = contiTimeVars
groupsList[["lag"]] = paste0(resp, "lag", (lagK_fixed + 1):lagK)
groupsList[["longt"]] = paste0(resp, "longt", longtIndex)
l = list()
msSteps = list(
list(
name="S1",
explanColsF=explanColsF,
explanColsFInd=NULL,
explanCols=groupsList[["tod"]],
explanColsInd=NULL),
list(
name="S2",
explanColsF=explanColsF,
explanColsFInd="S1",
explanCols=groupsList[["tod_wknd"]],
explanColsInd=NULL),
list(
name="S3",
explanColsF=explanColsF,
explanColsFInd=c("S1", "S2"),
explanCols=groupsList[["tow"]],
explanColsInd=NULL),
list(name="S4",
explanColsF=explanColsF,
explanColsFInd=c("S1", "S2", "S3"),
explanCols=groupsList[["toy"]],
explanColsInd=NULL),
list(name="S5",
explanColsF=explanColsF,
explanColsFInd=c("S1", "S2", "S3", "S4"),
explanCols=groupsList[["contiTime"]],
explanColsInd=NULL),
list(
name="S6",
explanColsF=explanColsF,
explanColsFInd=c("S1", "S2", "S3", "S4", "S5"),
explanCols=groupsList[["longt"]],
explanColsInd=NULL),
list(
name="S7",
explanColsF=explanColsF,
explanColsFInd=c("S1", "S2", "S3", "S4", "S5"),
explanCols=groupsList[["lag"]],
explanColsInd=NULL) #,
#list(
# name="S7",
# explanColsF=explanColsF,
# explanColsFInd=c("S1", "S2", "S3", "S4"),
# explanCols=NULL,
# explanColsInd=c("S5", "S6")),
)
return(msSteps)
}
## fits a glm model (nb, glm, ...)
FitGlmMod = function(
familyStr,
link,
modDf,
yCol,
optModExplan_glm,
dropExtraComponents=TRUE) {
## fitting neg binomial
if (familyStr == "nb") {
model = glm.nb(
formula=formula(paste0(yCol, "~", AddSepFcn("+")(optModExplan_glm))),
data=modDf,
link=log)
} else {
model = glm(
formula(paste0(yCol, "~", AddSepFcn("+")(optModExplan_glm))),
data=modDf,
family=eval(parse(text=paste0(familyStr, "(", link,")"))))
}
aic = AIC(model)
bic = BIC(model)
dev = deviance(model)
n = dim(modDf)[1]
corFitObs = cor(modDf[ , yCol], model[["fitted.values"]])
model[["dispersion"]] = summary(model)[["dispersion"]]
if (dropExtraComponents) {
unnecess = c(
"data",
"y",
"weights",
"prior.weights",
"residuals",
#"linear.predictors",
"values",
"na.action")
model[unnecess] = NULL
model[["qr"]][["qr"]] = NULL
#objList = ls(envir=attr(model[["terms"]], ".Environment"))
#objList = setdiff(
# objList,
# c("aic", "bic", "dev", "n", "outList", "corFitObs"))
objList = "modDf"
#print(ls(envir=attr(model[["terms"]], ".Environment")))
rm(list=objList, envir=attr(model[["terms"]], ".Environment"))
}
outList = list(
"aic_stdized"=aic / n,
"bic_stdized"=bic / n,
"dev"=dev,
"corFitObs"=corFitObs)
outList[["model"]] = model
return(outList)
}
Example = function() {
n = 10^4
x1 = rnorm(n)
x2 = rnorm(n)
y = rpois(n, lambda=exp(x1 + x2))
plot(x1, y)
df = data.frame(x1=x1, x2=x2, y=y)
k = n/2
trainDf = df[1:k, ]
testDf = df[(k+1):n, ]
res = FitGlmMod(
familyStr="poisson",
link="log",
modDf=trainDf,
yCol="y",
optModExplan_glm=c("x1", "x2"))
model = res[["model"]]
ReportObjectSize(model)
ReportObjectSize_true(model)
yPred = predict(model, testDf)
plot(testDf[ , "y"], yPred)
summary(model)
}
## fitting a list of models
FitGlmModList = function(
yCol,
modDf,
familyLinkStrVec,
optModExplanList,
modList=list(),
modInfoDf = setNames(
data.frame(matrix(
ncol=9,
nrow=0)),
c(
"modStr",
"response",
"familyStr",
"link",
"parameter_complexity",
"aic_stdized",
"bic_stdized",
"dev",
"corFitObs"))) {
for (familyLinkStr in familyLinkStrVec) {
for (j in 1:length(optModExplanList)) {
optModExplan_glm = optModExplanList[[j]]
model_complexity = names(optModExplanList)[j]
familyStr = strsplit(familyLinkStr, "-")[[1]][1]
link = strsplit(familyLinkStr, "-")[[1]][2]
modStr = paste0(
yCol, "-", familyLinkStr,
"-", model_complexity)
#Mark(modStr, message="modStr")
modList[[modStr]] = FitGlmMod(
familyStr= familyStr,
link=link,
modDf=modDf,
yCol=yCol,
optModExplan_glm=optModExplan_glm)
modInfoDf[dim(modInfoDf)[1] + 1, ] = c(
modStr,
yCol,
familyStr,
link,
model_complexity,
modList[[modStr]][["aic_stdized"]],
modList[[modStr]][["bic_stdized"]],
modList[[modStr]][["dev"]],
modList[[modStr]][["corFitObs"]])
}
}
return(list("modList"=modList, "modInfoDf"=modInfoDf))
}
## compare fitted coefficients
CompareModelsCoefs = function(
modList,
modStrVec=NULL) {
if (is.null(modStrVec)) {
modStrVec = names(modList)
}
modStr = modStrVec[1]
x = 1:length(modList[[modStr]][["model"]][["coefficients"]])
plot(
x,
modList[[modStr]][["model"]][["coefficients"]],
col=ColAlpha(1, 0.4),
pch=1,
cex=1,
lwd=2)
for (i in 2:length(modStrVec)) {
modStr = modStrVec[i]
points(
x,
modList[[modStr]][["model"]][["coefficients"]],
col=ColAlpha(i, 0.4),
pch=i,
cex=1,
lwd=2)
}
legend(
"topright", modStrVec,
col=ColAlpha(1:length(modStrVec), 0.7),
pch=1:length(modStrVec), bty="n")
}
## plots a fitted model and compares it with obs data
# by pivoting Data
Plt_fittedModWrt = function(
model,
yCol,
modDf,
pivotVar=c("tow", "ywoy"),
pltTitle="") {
plot(
modDf[ , yCol],
model[["fitted.values"]],
col=ColAlpha("blue", 0.05),
pch=5)
#print(cor(modDf[ , yCol], model[["fitted.values"]]))
fitData = modDf
fitData[ , yCol] = model[["fitted.values"]]
fitPivotDf = PlotSummPivot(
df=fitData,
resp=yCol,
pivotVar=pivotVar,
pltTitle=pltTitle)
obsOverlayDf = Plt_overlayFcnList(
df=modDf,
resp=yCol,
fcnList=list(mean, sd),
fcnNames=c("mean", "sd"),
wrtCol=pivotVar[1],
pltTitle="",
colSuffix="_obs",
colPrefix="",
addToPlot=FALSE)
fitOverlayDf = Plt_overlayFcnList(
df=fitData,
resp=yCol,
fcnList=list(mean, sd),
fcnNames=c("mean", "sd"),
wrtCol=pivotVar[1],
pltTitle="",
colSuffix="_fit",
colPrefix="",
addToPlot=TRUE)
return(list(
"obsOverlayDf"=obsOverlayDf,
"fitOverlayDf"=fitOverlayDf))
}
## simulation of data using fitted models
SimGlmModList = function(
yCol,
modList,
familyLinkStrVec,
optModExplanList,
futDf,
prevDf,
fsFcnList,
LagFcn,
simNum=1,
limitSimValue=FALSE,
yMax=NULL,
yMin=NULL,
Transf=function(x)x,
applyBounds=FALSE,
BoundDf=NULL,
boundsWrtCols=NULL,
simDataList=list(),
simInfoDf = setNames(
data.frame(matrix(
ncol=4,
nrow=0)),
c(
"modStr",
"changedPcnt_global",
"changedPcnt_wrt",
"changedPcnt_both")),
parallel=FALSE,
parallel_outfile="") {
modStrVec = NULL
for (familyLinkStr in familyLinkStrVec) {
for (j in 1:length(optModExplanList)) {
model_complexity = names(optModExplanList)[j]
modStr = paste0(
yCol, "-", familyLinkStr,
"-", model_complexity)
modStrVec = c(modStrVec, modStr)
}
}
Func = function(modStr) {
Src()
print(Sys.time())
print("inside the simulation: memory usage")
print(ReportObjectSize(prevDf, "prevDf inside the sims"))
model = modList[[modStr]][["model"]]
res = SimulateSeries(
futDf=futDf,
prevDf=prevDf,
mod=model,
respCols=yCol,
fsFcnList=fsFcnList,
LagFcn=LagFcn,
Transf=Transf,
simNum=simNum,
limitSimValue=limitSimValue,
yMax=yMax,
yMin=yMin,
applyBounds=applyBounds,
BoundDf=BoundDf,
boundsWrtCols=boundsWrtCols)
#names(res) = modStr
return(res)
}
if (!parallel) {
simDataList2 = lapply(X=modStrVec, FUN=Func)
} else {
library("parallel")
closeAllConnections()
if (nrow(prevDf) > 500) {
prevDf = prevDf[(nrow(prevDf)-500):nrow(prevDf), ]
}
# Calculate the number of cores
no_cores = detectCores() - 2
no_cores = min(no_cores, length(modStrVec) + 1)
Mark(no_cores, "no_cores")
# Initiate cluster
cl = makeCluster(no_cores, outfile=parallel_outfile)
clusterExport(
cl=cl,
list(
"yCol", "modList", "familyLinkStrVec", "Transf",
"futDf", "prevDf", "fsFcnList", "LagFcn", "limitSimValue",
"yMax", "yMin", "applyBounds", "BoundDf", "boundsWrtCols",
"Src", "simNum"),
envir=environment())
simDataList2 = parLapply(cl=cl, X=modStrVec, fun=F)
stopCluster(cl)
closeAllConnections()
}
names(simDataList2) = modStrVec
simDataList = c(simDataList, simDataList2)
for (modStr in modStrVec) {
simInfoDf[dim(simInfoDf)[1] + 1, ] = c(
modStr,
simDataList[[modStr]][["changedPcnt_global"]],
simDataList[[modStr]][["changedPcnt_wrt"]],
simDataList[[modStr]][["changedPcnt_both"]])
}
return(list("simDataList"=simDataList, "simInfoDf"=simInfoDf))
}
## plots sim data and obsdata
# it compares data across wrtCols
# also it comapres longterm trends
PltTsSimData = function(
yCol,
simDf,
trainDf,
testDf,
windowCol,
overlayWrtCol,
timeThresh=-1,
pltTitleVec=c("", "", "", ""),
cex.main=NULL,
mfrow=NULL) {
trainDf[ , "color"] = ColAlpha("blue", 0.3)
testDf[ , "color"] = ColAlpha("blue", 0.3)
obsDf_all = rbind(trainDf, testDf)
#simData[ , "count"] = exp(simData[ , "logCount"])
simDf[ , "color"] = ColAlpha("red", 0.3)
cols = c("contiTime", windowCol, yCol, "color")
combinDf = rbind(trainDf[ , cols], simDf[ , cols])
ind = combinDf[ , "contiTime"] > timeThresh
if (is.null(mfrow)) {
mfrow = c(2, 2)
}
par(mfrow=mfrow)
plot(
combinDf[ind, "contiTime"], combinDf[ind, yCol],
col=ColAlpha("blue", 0.3), type="l", ylab=yCol,
xlab="time", main=pltTitleVec[1], cex.main=1.5,
cex.axis=1.5, cex.lab=1.5)
lines(
simDf[ , "contiTime"], simDf[ , yCol],
col=ColAlpha("red", 0.3), lty=2)
legend(
"topleft",
legend=c("obs", "sim"),
lty=c(1, 2),
col=ColAlpha(c("blue", "red"), 0.3))
## obs data
obsOverlayDf = Plt_overlayFcnList(
df=testDf,
resp=yCol,
fcnList=list(mean, sd),
fcnNames=c("mean obs", "sd obs"),
wrtCol=overlayWrtCol,
pltTitle=pltTitleVec[2],
colSuffix="_obs",
colPrefix="",
addToPlot=FALSE)
## sim data
simOverlayDf = Plt_overlayFcnList(
df=simDf,
resp=yCol,
fcnList=list(mean, sd),
fcnNames=c("mean sim", "sd sim"),
wrtCol=overlayWrtCol,
pltTitle="",
colSuffix="_sim",
colPrefix="",
addToPlot=TRUE)
## weekly pattern comparison plot
res = LongtAvgPlt(
df=obsDf_all,
timeCol="contiTime",
windowCol=windowCol,
respCol=yCol,
ylab="weekly avg count",
color=ColAlpha("blue", 0.4),
pltTitle=pltTitleVec[3])
res = LongtAvgPlt(
df=simDf,
timeCol="contiTime",
windowCol=windowCol,
respCol=yCol,
ylab="weekly avg count",
color=ColAlpha("red", 0.4),
addToPlot=TRUE,
lty=2)
legend(
"topleft",
legend=c("obs", "sim"),
lty=c(1, 2),
col=ColAlpha(c("blue", "red"), 0.3))
## Annual avg comparison plot
res = LongtAvgPlt(
df=obsDf_all,
timeCol="contiTime",
windowCol="year",
respCol=yCol,
addPoints=TRUE,
ylab="avg annual count",
ylim=c(0, 600),
color=ColAlpha("blue", 0.4),
pltTitle=pltTitleVec[4])
res = LongtAvgPlt(
df=simDf,
timeCol="contiTime",
windowCol="year",
respCol=yCol,
addPoints=TRUE,
addToPlot=TRUE,
ylab="avg annual count",
color=ColAlpha("red", 0.5),
lty=2)
legend(
"topleft",
legend=c("obs", "sim"),
lty=c(1, 2),
col=ColAlpha(c("blue", "red"), 0.3))
}
AssessSimErr = function(
yCol,
simDf,
testDf) {
yObs = testDf[ , yCol]
ySim = simDf[ , yCol]
rmse = sqrt(mean((yObs - ySim)^2))
mae = mean(abs(yObs - ySim))
corr = cor(yObs, ySim)
out = c(rmse, mae, corr)
names(out) = c("RMSE", "MAE", "cor(obs, sim)")
return(out)
}
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.