#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# https://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# authors: Alireza Hosseini, Reza Hosseini
## a function which partitions the data into two parts
DivideData = function(df, m=NA) {
n = dim(df)[1]
if (is.na(m)) {
m = round(n/2)
}
ind1 = sample(1:n, m)
ind2 = setdiff(1:n, ind1)
df1 = df[ind1, ]
df2 = df[ind2, ]
outList = list(df1=df1, df2=df2)
return(outList)
}
DivideDataWrt = function(df, prop=1/2, colNames) {
l = list()
for (i in 1:length(colNames)) {
l[[i]] = df[ , colNames[i]]
}
dataList = split(df, l)
n = length(dataList)
m = max(round(n*prop), 1)
#print(n)
#print(m)
ind1 = sample(1:n, m, replace=FALSE)
ind2 = setdiff(1:n, ind1)
print(ind1)
print(ind2)
df1 = MergeDfList(dfList=dataList, ind=ind1, fcn=rbind)
df2 = MergeDfList(dfList=dataList, ind=ind2, fcn=rbind)
#print('***')
#print(dfList)
outList = list(df1=df1, df2=df2)
return(outList)
}
Example = function() {
n = 100
x = sample(c('boz', 'asb', 'sag'), n, replace=TRUE)
y = sample(c('yazd', 'rafsanjan', 'kerman'), n, replace=TRUE)
z = rnorm(n)
u = z + rnorm(n)
df = data.frame(x, y, z, u)
res = DivideDataWrt(df, colNames=c('x', 'y'))
res
}
## return info about the class of the df columns
## corrects the class from numeric and integer to factor
##if the levels are not many
CorrectClassDf = function(df, levelsCutoff=0) {
colNames = names(df)
n = length(colNames)
for (i in 1:n) {
colName = colNames[i]
c = class(df[ , colName])[1]
l = length(unique(df[ , colName]))
print(paste(colName, c, sep=': '))
print(paste('the cardinality of the levels of this variable is:', l))
if ((c=='integer' | c=='numeric') & l<levelsCutoff)
{
df[ , colName] = as.factor(df[ , colName])
}
}
return(df)
}
Example = function() {
df = trainData
df = CorrectClassDf(df=df, levelsCutoff=3)
}
## mapping some factors of x (given in 'from')
# to other factors (given in 'to')
FactorMap = function(x, from, to) {
if (length(from) != length(to)) {
print('Warning: from and to size do not match');
return(NULL)
}
x = factor(x)
levels(x) = c(levels(x), to)
k = length(from)
for (i in 1:k) {
fac = from[i]
newfac = to[i]
ind = which(x == fac)
if (length(ind) > 0) {
x[ind] = newfac
}
}
x=factor(as.character(x))
levels(x) = setdiff(levels(x), from)
return(x)
}
Example = function() {
x = sample(c('a', 'b', 'c'), 100, replace=TRUE)
x[100] = 'd'
x = factor(x)
from = c('a', 'd')
to = c('u', 'v')
y = FactorMap(x, from, to)
}
## builds a function which when
NonExistFactorMap = function(facNames, toFacName) {
Func = function(y) {
yFacNames = levels(y)
nonExistInd = which(!(yFacNames %in% facNames))
from = yFacNames[nonExistInd]
to = rep(toFacName, length(from))
FactorMap(y, from=from, to=to)
}
return(Func)
}
## deprecated
NonExistFactorMap2 = function(x, facNames, toFacName) {
xTab = table(x)
xFacNames = names(xTab)
nonExistInd = which(!(xFacNames %in% facNames))
from = xFacNames[nonExistInd]
to = rep(toFacName, length(from))
Func = function(y) {
FactorMap(y, from=from, to=to)
}
return(Func)
}
Example = function() {
facNames = c('a', 'b', 'd')
toFacName = 'zzz'
y = c('a', 'b', 'g')
y = factor(y)
Fcn = NonExistFactorMap(facNames, toFacName)
}
### find out the levels of a factor and assign
CorrectFactorFcn = function(x, cutoff=3) {
x = factor(x)
#levels(x)
#summary(x)
tab = table(x)
facNum = length(tab)
facNames = names(tab)
freqInd = which(tab == max(tab))[1]
freqFac = facNames[freqInd]
toFacName = freqFac
from = NULL
to = NULL
smallFreqInd = which(tab < cutoff)
from = facNames[smallFreqInd]
to = rep(freqFac, length(from))
SmallFreq = function(y) {
FactorMap(y, from=from, to=to)
}
x = SmallFreq(x)
NonExist = NonExistFactorMap(
facNames=facNames,
toFacName=toFacName)
Combine = function(y) {
SmallFreq(NonExist(y))
}
outList = list('x'=x, 'SmallFreq'=SmallFreq,
'NonExist'=NonExist, 'Combine'=Combine)
return(outList)
}
Example = function() {
x = sample(c('a', 'b', 'c'), 100, replace=TRUE)
x[100] = 'd'
res = CorrectFactorFcn(x)
newX= res$x
SmallFreq = res[[2]]
NonExist = res[[3]]
Combine = res[[4]]
y = c('d', 'd', 'd', 'd')
SmallFreq(y)
y = c('d', 'd', 'zereshk')
y = factor(y)
NonExist(y)
Combine(y)
}
### find out the levels of a factor and assign for dataframes
### this will assign little-appearing factors to most frequent factor
### this will also assign non-existing factors to most frequent factor
CorrectFactorDf = function(df, cutoff=3) {
colNames = names(df)
n = length(colNames)
fcnList = list()
for (i in 1:n) {
colName = colNames[i]
c = class(df[ , colName])[1]
fcnList[[i]] = (f=function(x){return(x)})
if (c == 'factor') {
out = CorrectFactorFcn(df[ , colName], cutoff=cutoff)
df[ , colName] = out[['x']]
fcnList[[i]] = out[['Combine']]
}
}
outList = list(df=df, fcnList=fcnList, Fcn=FcnListDfFcn(fcnList))
return(outList)
}
Example = function() {
x = sample(c('a', 'b', 'c'), 100, replace=TRUE)
x[100] = 'd'
y = sample(c('aa', 'bb', 'cc'), 100, replace=TRUE)
y[99]='dd'
y[100] = 'dd'
z = 1:100
df = data.frame(x=x, y=y, z=z)
res = CorrectFactorDf(df, cutoff=3)
df2 = res[['df']]
fcn = res[['Fcn']]
x = sample(c('a', 'b', 'e'), 100, replace=TRUE)
x[100] = 'd'
y = sample(c('aa', 'ee', 'cc'), 100, replace=TRUE)
y[99]='dd'
y[100] = 'dd'
z = 1:100
newDf = data.frame(x=x, y=y, z=z)
fcn(newDf)
fcnList = res[['fcnList']]
FcnListDfFcn(fcnList)(newDf)
}
### from a factor variable build numeric columns and replace in df
FactorDfNumeric = function(df) {
m = dim(df)[2]
outDf = NULL
ind = NULL
count = 0
for (i in 1:m) {
x = df[ , i]
if (class(x) == 'factor') {
name = colnames(df)[i]
colnames(df)[i] = paste(name, '_', sep='')
name = colnames(df)[i]
ind = c(ind, i)
#print(class(x))
string = paste('xTrans = model.matrix( ~ ', name, '-1, data=df)', sep='')
#print(string)
eval(parse(text=string))
xTrans = data.frame(xTrans)
xTrans2 = xTrans[ , -1, drop=FALSE]
if (count == 0) {
outDf = xTrans2
} else {
outDf = cbind(outDf, xTrans2)
}
count = count + 1
}
}
df2 = df
if (length(ind) > 0) {
df2 = df2[ , -ind, drop=FALSE]
df2 = cbind(df2, outDf)
}
return(df2)
}
Example = function() {
df = data.frame(x=1:10, y=c(rep('a', 5), rep('b', 5)),
z=c(rep('c', 5), rep('d', 5)))
FactorDfNumeric(df)
}
Example = function() {
dates = c("2011-01-01 ", "2011-01-01", "2011-01-01")
hrs = c('0', '1', '21')
df = data.frame(date=dates, hr=hrs)
BuildDatetime(df=df, dateCol='date', hrCol='hr')
}
### Fourier series
Fseries = function(data, colName, omega=2*pi, ord=1) {
asc = as.character
n = dim(data)[1]
outDf = matrix(NA, n, 2*ord)
outDf = data.frame(outDf)
x = data[ , colName]
columns = list()
ind = 0
for (k in 1:ord) {
sincoln = paste('sin', asc(k), '_', colName, sep='')
coscoln = paste('cos', asc(k), '_', colName, sep='')
columns = append(columns, sincoln)
columns = append(columns, coscoln)
u = omega*k*x
sin = sin(u)
cos = cos(u)
ind = ind + 1
outDf[ , ind] = sin
names(outDf)[ind] = sincoln
ind = ind + 1
outDf[ , ind] = cos
names(outDf)[ind] = coscoln
}
return(list('df'=outDf, 'columns'=columns))
}
Example = function() {
x = seq(0, 2, 0.01)
data = data.frame('x'=x)
df = Fseries(data, colName='x')
cols = df[['columns']]
df = df[['df']]
plot(x, df[ , 2])
}
### multiple F series
FseriesMulti = function(data, colNames, omegas=NA, ords=NA) {
k = length(colNames)
if (is.na(omegas[1])) {
omegas = rep(2*pi, length(colNames))
}
if (is.na(ords[1])) {
ords = rep(1, length(colNames))
}
outData = NULL
outColumns = list()
for (i in 1:k) {
colName = colNames[i]
omega = omegas[i]
ord = ords[i]
FS = Fseries(data=data, colName=colName, omega=omega, ord=ord)
FSdf = FS[['df']]
FScolumns = FS[['columns']]
if (i == 1) {
outData=FSdf
} else {
outData = cbind(outData, FSdf)
}
outColumns = append(outColumns, FScolumns)
}
return(list('df'=outData, 'columns'=outColumns))
}
Example = function() {
x = seq(0, 2, 0.01)
y = 2*seq(0, 2, 0.01)
z = 4*x
data = data.frame('x'=x, 'y'=y, 'z'=z)
colNames = colnames(data)
res = FseriesMulti(data, colNames=c('x', 'y', 'z'), omegas=NA, ord=NA)
df = res[['df']]
colNames = res['columns']
plot(df[ , 1] ,df[ , 2])
plot(x, df[ , 1])
}
### create an interaction matrix using two explan matrices: explan1, explan2
### the interactions needed are taken from interactList
### every tuple in interact list starts with an index in explan1
### then it follows by indices in interact2
CreateInteractFcn = function(
interactList=NULL,
interactMatrix=NULL,
sep='_') {
#print(interactList)
#print(interactMatrix)
if (is.null(interactList)) {
x = interactMatrix
interactList = split(t(x), rep(1:ncol(t(x)), each=nrow(t(x))))
}
print(interactList)
Func = function(explan1, explan2) {
k = length(interactList)
k1 = dim(explan1)[2]
k2 = dim(explan2)[2]
n1 = dim(explan1)[1]
n2 = dim(explan2)[1]
if (n1 != n2) {
print('explan1 and explan2 row nums do not match')
}
explanInteract = NULL
for (i in 1:k) {
index = interactList[[i]]
explan1_index = index[1]
explan2_index = index[-1]
for (j in explan2_index) {
explanInteract = cbind(explanInteract,
(explan1[ , explan1_index]*explan2[ , j]))
}
}
explanInteract = data.frame(explanInteract)
count = 0
for (i in 1:k) {
index = interactList[[i]]
explan1_index = index[1]
explan2_index = index[-1]
if (!is.null(explan2_index)) {
for (j in explan2_index) {
count = count + 1
name = paste('int', explan1_index, j, sep=sep)
name1 = names(explan1)[explan1_index]
name2 = names(explan2)[j]
#print(name1);print(j);print(names(explan2));print(name2)
if (!is.null(name1) & !is.null(name2)) {
name=paste(name, sep, name1, sep, name2, sep='')
}
names(explanInteract)[count] = name
}
}
}
return(explanInteract)
}
return(Func)
}
Example = function() {
explan1 = matrix(1:9, 3, 3)
explan2 = matrix(11:19, 3, 3)
interactList = list(c(1, 1, 2), c(2, 1, 2))
fcn = CreateInteractFcn(interactList)
fcn(explan1, explan2)
explan1 = matrix(1:9, 3, 3)
explan2 = matrix(11:19, 3, 3)
explan1 = data.frame(explan1)
explan2 = data.frame(explan2)
names(explan1) = c('a', 'b', 'c')
names(explan2) = c('t', 'u', 'v')
interactList = list(c(3, 1, 2, 3), c(1, 2))
CreateInteractFcn(interactList)(explan1, explan2)
interactMatrix = rbind(c(1, 2), c(2, 3), c(1, 1))
CreateInteractFcn(interactMatrix=interactMatrix)(explan1, explan2)
}
### randomForest prediction
RfFit = function(y, explan) {
InstallLoad('randomForest')
mod = randomForest(y ~ ., data=explan)
return(mod)
}
Example = function() {
n = 100
explan = matrix(rnorm(n*10), n, 10)
explan = data.frame(explan)
yObs = 5*explan[ , 1] + explan[ ,2]+rnorm(n)
mod = RfFit(yObs, explan)
}
## fitting via aggregate
AggFit = function(
y,
explan,
yName='value',
wrtCols=NULL,
AggFunc=mean,
aggrFuncName='mean') {
explan = data.frame(explan)
if (length(wrtCols) == 0) {
wrtCols=colnames(explan)
}
data = cbind(y,explan)
colnames(data)[1] = yName
aggModDf = SummWrt(
data,
valueCol=yName,
wrtCols=wrtCols,
probs=NULL,
fcnList=list(AggFunc),
fcnName='mean')
aggMod = list(df=aggModDf)
class(aggMod) = 'aggMod'
return(aggMod)
}
AggPred = function(aggMod, explan) {
colNames = colnames(aggMod$df)[-length(colnames(aggMod$df))]
yName = colnames(aggMod$df)[length(colnames(aggMod$df))]
#print(dim(explan))
explan[ , 'orderTemp'] = 1:dim(explan)[1]
out = merge(explan, aggMod[['df']], by=colNames, all.x=TRUE)
out = out[order(out[ , 'orderTemp']), ]
yPred = out[ , yName]
return(yPred)
}
predict.aggMod = function(aggMod, explan, ...) {
return(AggPred(aggMod, explan))
}
print.aggMod = function(aggMod) {
print(aggMod[['df']])
}
summary.aggMod = function(aggMod) {
return(aggMod[['df']])
}
## remove Agg Value such as mean, median
RemoveAggValue = function(
data,
colName,
wrtCols=NULL,
AggFunc=mean,
aggrFuncName='mean',
addToData=TRUE,
newColName=NULL) {
aggMod = AggFit(y=data[ , colName], explan=data, yName=colName,
wrtCols=wrtCols, AggFunc=AggFunc, aggrFuncName=aggrFuncName)
yPred = AggPred(aggMod, explan=data)
out = data[ , colName] - yPred
if (addToData) {
out = cbind(data, out)
if (is.null(newColName)) {
newColName = paste(colName, '_', aggrFuncName, '_removed', sep='')
}
colnames(out)[dim(out)[2]] = newColName
}
return(out)
}
Example = function() {
n = 50
u1 = sample(1:3, n, replace=TRUE)
x1 = factor(u1, labels = c("p1","p2","p3"))
u2 = sample(1:2, n, replace=TRUE)
x2 = factor(u2, labels = c("rich", "poor"))
y = (
0*(u1==1) + 8*(u1==2) + 10*(u1==3) + 15*(u1==4) +
20*(u1==5) + 1*(u2==1) + rnorm(n))
explan = data.frame(x1=x1, x2=x2)
data = cbind(y,explan)
RemoveAggValue(data,colName='y', wrtCols=c('x1', 'x2'), AggFunc=mean,
aggrFuncName='mean', addToData=TRUE)
}
###### imputation
### impute col
ImputeCol = function(
df,
valueCol,
wrtCols,
AggFunc=NULL,
aggrFuncName="agg") {
if (is.null(AggFunc)) {
AggFunc = function(x){mean(x, na.rm=TRUE)}
}
y = df[ , valueCol]
aggMod = AggFit(
y=y,
explan=df,
yName=valueCol,
wrtCols=wrtCols,
AggFunc=AggFunc,
aggrFuncName=aggrFuncName)
yPred = AggPred(aggMod, explan=df)
ind = which(is.na(y))
out = y
out[ind] = yPred[ind]
return(out)
}
Example = function() {
n = 50
u1 = sample(1:3, n, replace=TRUE)
x1 = factor(u1, labels = c("p1", "p2", "p3"))
u2 = sample(1:2, n, replace=TRUE)
x2 = factor(u2, labels = c("rich", "poor"))
y = 0*(u1==1) + 8*(u1==2) + 10*(u1==3) + 15*(u1==4) + 20*(u1==5) + 1*(u2==1) + rnorm(n)
y[1:5] = NA
explan = data.frame(x1=x1, x2=x2)
data = cbind(y, explan)
ImputeCol(df=df, valueCol='y', wrtCols=c('x1', 'x2'), AggFunc=NULL)
}
### impute data
ImputeDf = function(
df,
valueCols,
wrtCols,
AggFunc=NULL,
replaceCol=TRUE) {
out = NULL
for (i in 1:length(valueCols)) {
valueCol = valueCols[i]
newCol = ImputeCol(
df=df,
valueCol=valueCol,
wrtCols=wrtCols,
AggFunc=NULL)
out = cbind(out, newCol)
colnames(out)[i] = valueCol
if (replaceCol) {
df[ , valueCol] = newCol
}
}
if (replaceCol) {
out = df
}
return(out)
}
Example = function() {
n = 50
u1 = sample(1:3, n, replace=TRUE)
x1 = factor(u1, labels = c("p1","p2","p3"))
u2 = sample(1:2, n, replace=TRUE)
x2 = factor(u2, labels = c("rich", "poor"))
y1 = (
0*(u1 == 1) +
8*(u1 == 2) +
10*(u1 == 3) +
15*(u1 == 4) +
20*(u1 == 5) +
1*(u2 == 1) + rnorm(n))
y2 = (
0*(u1 == 1) +
3*(u1 == 2) +
1*(u1 == 3) +
5*(u1 == 4) +
25*(u1 == 5) +
1*(u2==1) + rnorm(n))
y1[1:5] = NA
y2[5:10] = NA
explan = data.frame(x1=x1, x2=x2)
df = cbind(y1, y2, explan)
ImputeDf(df=df, valueCols=c('y1', 'y2'), wrtCols=c('x1', 'x2'))
}
#setMethod(predict, aggMod, .aggMod)
Example = function() {
n = 500
u1 = sample(1:5, n, replace=TRUE)
x1 = factor(u1, labels = c("p1", "p2", "p3", "p4", "p5"))
u2 = sample(1:2, n, replace=TRUE)
x2 = factor(u2, labels = c("rich", "poor"))
y = (
0*(u1 == 1) +
8*(u1 == 2) +
10*(u1 == 3) +
15*(u1 == 4) +
20*(u1 == 5) +
1*(u2 == 1) + rnorm(n))
explan = data.frame(x1=x1,x2=x2)
aggMod = AggFit(y, explan)
yPred = AggPred(explan, aggMod)
plot(y, yPred)
yPred = predict(aggMod, explan)
}
Example = function() {
n = 500
u1 = sample(1:5, n, replace=TRUE)
x1 = factor(u1, labels = c("p1", "p2", "p3", "p4", "p5"))
u2 = sample(1:2, n, replace=TRUE)
x2 = factor(u2, labels = c("rich", "poor"))
y = (
0*(u1 == 1) +
8*(u1 == 2) +
10*(u1 == 3) +
15*(u1 == 4) +
20*(u1 == 5) +
1*(u2 == 1) + rnorm(n))
explan = data.frame(x1=x1, x2=x2)
aggMod = AggFit(y, explan)
yPred = predict(aggMod, explan)
plot(y, yPred)
FitAssess(yObs=y, explan=explan, model='Agg', wrtCols=NULL)
}
## fit dev models
DevModelFit = function(
y,
explanBasic,
explanDev,
basicModel,
devModel,
devFcn=NA,
devInvFcn=NA,
form='additive',
wrtCols=NULL,
family='gaussian') {
if (!is.null(form) & form == 'additive') {
devFcn = function(x, y)(x - y)
devInvFcn = function(x, y)(x + y)
}
if (!is.na(form) & form=='multi') {
devFcn = function(x, y)(x/y)
devInvFcn = function(x, y)(x*y)
}
bmod = basicModel(y, explan=explanBasic)
yFitBasic = predict(bmod, explan=explanBasic, wrtCols=wrtCols)
e = devFcn(y, yFitBasic)
dmod = devModel(e, explanDev)
eFit = predict(dmod, explanDev)
out = list(bmod=bmod, dmod=dmod, devFcn=devFcn, devInvFcn=devInvFcn)
class(out) = 'devMod'
return(out)
}
print.devMod = function(devMod) {
bmod = devMod[['bmod']]
dmod = devMod[['dmod']]
print('basic model')
print(bmod)
print('dev model')
print(dmod)
}
predict.devMod = function(devMod, explanBasic, explanDev, ...) {
bmod = devMod[['bmod']]
dmod = devMod[['dmod']]
devFcn = devMod[['devFcn']]
devInvFcn = devMod[['devInvFcn']]
yPredBasic = predict(bmod, explanBasic)
ePred = predict(dmod, explanDev)
yPred = devInvFcn(yPredBasic, ePred)
return(yPred)
}
Example = function() {
n = 500
u1 = sample(1:5, n, replace=TRUE)
x1 = factor(u1, labels = c("p1", "p2", "p3", "p4", "p5"))
u2 = sample(1:2, n, replace=TRUE)
x2 = factor(u2, labels = c("rich", "poor"))
x3 = rnorm(500)
x4 = rnorm(500)
y = 0*(u1==1) + 8*(u1==2) + 10*(u1==3) + 15*(u1==4) + 20*(u1==5) + 1*(u2==1) +
2*x3 + 3*x4 + rnorm(n)
explanBasic = data.frame(x1=x1, x2=x2)
explanDev = data.frame(x3, x4)
# glm
devMod = DevModelFit(
y, explanBasic=explanBasic, explanDev=explanDev,
basicModel=AggFit, devModel=GlmFitMatr, devFcn=NA,
devInvFcn=NA, form='additive')
# RF
devMod = DevModelFit(
y, explanBasic=explanBasic, explanDev=explanDev,
basicModel=AggFit, devModel=RfFit, devFcn=NA,
devInvFcn=NA, form='additive')
yFit = predict(
devMod, explanBasic=explanBasic, explanDev=explanDev)
plot(y, yFit)
print(devMod)
gmod = GlmFitMatr(y, explan)
yFit = predict(gmod, explan=explan)
plot(y, yFit)
}
## Fitting various algorithms in R
FitAssess = function(
yObs,
explan=NULL,
model,
indTrain=NULL,
indValid=NULL,
obsDamping=TRUE,
ymin=NULL,
ymax=NULL,
wrtCols=NULL,
explanBasic=NULL,
explanDev=NULL,
basicModel=NULL,
devModel=NULL,
devForm='additive',
family='gaussian',
larsType='lasso',
divideColNames=NULL,
divideProp=1/2,
explanDivide=NULL,
divideRangeCols=NULL,
divideRangeList=NULL) {
n = length(yObs)
divideOrder = 1:n
if (!is.null(divideColNames)) {
data = cbind(divideOrder, explanDivide)
colnames(data)[1] = 'divideOrder'
res = DivideDataWrt(df=data, prop=divideProp,
colNames=divideColNames)
dataTrain = res[[1]]
dataValid = res[[2]]
indTrain = dataTrain[ , 'divideOrder']
indValid = dataValid[ , 'divideOrder']
}
if (!is.null(divideRangeCols)) {
logicalInd = rep(1, n)
for (i in 1:length(divideRangeCols)) {
colName = divideRangeCols[i]
x = explanDivide[ , colName]
colRange = divideRangeList[[i]]
logicalInd = (x < colRange[2] & x > colRange[1])*logicalInd
}
indTrain = which(logicalInd>0)
indValid = setdiff(1:n, indTrain)
}
if (is.null(indTrain)) {
m = round(n/2)
indTrain = sample(1:n, m)
}
if (is.null(indValid)) {
indValid = setdiff(1:n, indTrain)
}
yTrain = yObs[indTrain]
yValid = yObs[indValid]
explanTrain = explan[indTrain, , drop=FALSE]
explanValid = explan[indValid, , drop=FALSE]
yminObs = min(yTrain)
ymaxObs = max(yTrain)
if (model == 'RF') {
InstallLoad('randomForest')
mod = RfFit(y=yTrain, explan=explanTrain)
yFit = predict(mod, newdata=explanTrain)
yPred = predict(mod, newdata=explanValid)
}
if (model == 'GLM') {
mod = GlmFitMatr(y=yTrain, explan=explanTrain, family=family)
yFit = predict(mod, explan=explanTrain)
yPred = predict(mod, explan=explanValid)
}
if (model == 'LARS') {
InstallIf('lars')
library('lars')
xTrain = FactorDfNumeric(explanTrain)
xTrain = as.matrix(xTrain)
p = dim(xTrain)[2]
mod = lars(x=xTrain, y=t(yTrain), type=larsType)
res = predict.lars(mod, newx=xTrain, type='fit')
fit = res[["fit"]]
yFit = fit[,(p+1)]
xValid = FactorDfNumeric(explanValid)
xValid = as.matrix(xValid)
res = predict.lars(mod, newx=xValid)
pred = res$fit
yPred = pred[ , (p+1)]
}
if (model == 'PLS') {
InstallIf('pls')
library('pls')
ncomp=10
#X1 = explan_all
xTrain = as.matrix(explan[indTrain, ])
#PLSmod = mvr(y1~X1, ncomp=6 , data = mydata)
mod = plsr(yTrain ~ xTrain, ncomp=ncomp)
fit = predict(mod, newdata=xTrain)
yFit = fit[ , , ncomp]
yFit = MaxComp(yFit, 0)
xValid = explan[indValid, ]
xValid = as.matrix(xValid)
pred = predict(mod, newdata=xValid)
yPred = pred[ , , ncomp]
}
if (model == 'Agg') {
mod = AggFit(y=yTrain, explan=explanTrain, wrtCols=wrtCols)
yFit = AggPred(mod, explan=explanTrain)
yPred = AggPred(mod, explan=explanValid)
}
if (model == 'Dev') {
if (is.null(explanBasic)) {explanBasic=explan}
if (is.null(explanDev)) {explanDev=explan}
explanBasicTrain = explanBasic[indTrain, ]
explanBasicValid = explanBasic[indValid, ]
explanDevTrain = explanDev[indTrain, ]
explanDevValid = explanDev[indValid, ]
mod = DevModelFit(
yTrain, explanBasic=explanBasicTrain, explanDev=explanDevTrain,
basicModel=basicModel, devModel=devModel, devFcn=NA, devInvFcn=NA,
form=devForm, wrtCols=wrtCols)
yFit = predict(mod, explanBasic=explanBasicTrain, explanDev=explanDevTrain)
yPred = predict(mod, explanBasic=explanBasicValid, explanDev=explanDevValid)
}
summary(mod)
if (obsDamping) {
yFit = MaxComp(yFit, yminObs)
yFit = MinComp(yFit, ymaxObs)
}
if (obsDamping) {
yPred = MaxComp(yPred, yminObs)
yPred = MinComp(yPred, ymaxObs)
}
if (!is.null(ymin)) {
yFit = MaxComp(yFit, ymin)
yPred = MaxComp(yPred, ymin)
}
if (!is.null(ymax)) {
yFit = MinComp(yFit, ymax)
yPred = MaxComp(yPred, ymin)
}
indOK = which(!is.na(yPred))
num = length(yPred) - length(indOK)
if (num > 0) {
print('*** Warning: this number of predictions are NA:')
print(num)
}
par(mfrow=c(1, 2))
plot(yTrain, yFit, col=rgb(0, 0, 0, alpha=0.1), pch=20)
print('fit cor and rmse')
print(cor(yTrain, yFit))
abline(0, 1, col='blue')
rmse = sqrt(mean((yTrain-yFit)^2))
print(rmse)
plot(yValid,yPred,col=rgb(0, 0, 0, alpha=0.1), pch=20)
abline(0, 1, col='blue')
print('test cor and rmse')
print(cor(yValid[indOK], yPred[indOK]))
rmse = sqrt(mean((yValid[indOK] - yPred[indOK])^2))
print(rmse)
return(mod)
}
Example = function() {
### TRY GLM
n = 100
explan = matrix(rnorm(n*10), n, 10)
explan = data.frame(explan)
yObs = 5*explan[ , 1] + explan[ , 2] + rnorm(n)
mod = FitAssess(yObs, explan, model='GLM')
### TRY LARS
n = 100
explan = matrix(rnorm(n*10), n, 10)
explan = data.frame(explan)
yObs = 5*explan[ , 1] + explan[ , 2] + rnorm(n)
mod = FitAssess(yObs, explan, model='LARS')
### TRY Random Forest
n = 100
explan = matrix(rnorm(n*10), n, 10)
explan = data.frame(explan)
yObs = 5*explan[ , 1] + explan[ , 2] + rnorm(n)
mod = FitAssess(yObs, explan, model='RF')
### TRY Dev models
n = 400
u1 = sample(1:3, n, replace=TRUE)
x1 = factor(u1, labels=c("p1", "p2", "p3"))
u2 = sample(1:2, n, replace=TRUE)
x2 = factor(u2, labels=c("rich", "poor"))
x3 = rnorm(n)
x4 = rnorm(n)
yObs = 0*(u1==1) + 8*(u1==2) + 10*(u1==3) + 15*(u1==4) + 20*(u1==5) + 1*(u2==1) +
2*x3 + 3*x4 + 4*x3*(x4>1) + 7*(x3>2)*x4 + 5*cos(2*pi*x3) * rnorm(n)
explanBasic = data.frame(x1=x1, x2=x2)
explanDev = data.frame(x3, x4)
explan = cbind(explanBasic, explanDev)
mod = FitAssess(yObs, explanBasic=explanBasic, explanDev=explanDev,
model='Dev', basicModel=AggFit, devModel=GlmFitMatr)
mod = FitAssess(yObs, explanBasic=explanBasic, explanDev=explanDev,
model='Dev', basicModel=AggFit, devModel=RfFit)
## divide data wrt specific columns
mod = FitAssess(yObs, explan=cbind(explanBasic, explanDev), model='GLM')
mod = FitAssess(yObs, explan=cbind(explanDev), model='GLM',
divideColNames=c('x1', 'x2'), divideProp=1/2, explanDivide=explanBasic)
mod = FitAssess(yObs, explan=cbind(explanDev), model='GLM',
explanDivide=explan[ , c('x3', 'x4')], divideRangeCols=c('x3', 'x4'),
divideRangeList=list(c(0, 1), c(0, 2)))
dig_deeper = function() {
basicModel = AggFit
devModel = GlmFitMatr
indTrain = 1:250
indValid = 250:500
explanBasicTrain = explanBasic[indTrain, ]
explanBasicValid = explanBasic[indValid, ]
explanDevTrain = explanDev[indTrain, ]
explanDevValid = explanDev[indValid, ]
mod = DevModelFit(
yTrain, explanBasic=explanBasicTrain, explanDev=explanDevTrain,
basicModel=basicModel, devModel=devModel, devFcn=NA, devInvFcn=NA,
form=devForm,
wrtCols=wrtCols)
yFit = predict(mod, explanBasic=explanBasicTrain, explanDev=explanDevTrain)
yPred = predict(mod, explanBasic=explanBasicValid, explanDev=explanDevValid)
}
}
### categorical
FitAssessCat = function(
y, explan, model, indTrain=NA, indValid=NA) {
n = dim(explan)[1]
if (is.na(sum(indTrain))) {
m=round(n/2)
}
indTrain = sample(1:n, m)
indValid = setdiff(1:n, indTrain)
yTrain = y[indTrain]
yValid = y[indValid]
explanTrain = explan[indTrain, ]
explanValid = explan[indValid, ]
if (model == 'RF') {
mod = RfFit(y=yTrain, explan=explanTrain)
summary(mod)
yFit = predict(mod)
yPred = predict(mod, newdata=explanValid)
}
if (model == 'nnet') {
library('nnet')
size = NA
# scale inputs: divide by 50 to get 0-1 range
mod = nnet(yTrain ~ ., data=X1, size=size)
# multiply 50 to restore original scale
fit = predict(mod)
L = length(yTrain)
yFit = rep(NA, L)
NAMES = levels(yTrain)
for (i in 1:L) {
ind = which(fit[i, ] == max(fit[i, ]))[1]
yFit[i] = NAMES[ind]
}
pred = predict(mod, newdata=xValid)
L = length(yValid)
yPred = rep(NA, L)
NAMES = levels(yValid)
for (i in 1:L) {
ind = which(pred[i, ]==max(pred[i, ]))[1]
yPred[i] = NAMES[ind]
}
}
if (model == 'RSNNS') {
library('RSNNS')
mod = mlp(x=xTrain, y=yTrain)
yFit = predict(mod)
yPred = predict(mod, newdata=xValid)
}
if (model == 'multiLogistic') {
mod = multinom(formula=yTrain ~ ., data=xTrain)
yFit = predict(mod)
yPred = predict(mod, data=xValid)
res = categPredErr(yTrain, yFit)
}
if (model == 'SVM') {
library('e1071')
mod = svm(y=yTrain, x=xTrain)
yFit = predict(mod)
yPred = predict(mod, newdata=xValid)
}
par(mfrow=c(1, 2))
plot(yTrain, yFit)
plot(yValid, yPred)
res = categPredErr(yTrain, yFit)
print('Fit errors')
print(res)
res = categPredErr(yValid, yPred)
print('Pred Errors')
return(mod)
}
### model matrix
## gets a list, for a list element which is a vector of variables build interactions
## using DesignMatFcn below should be better
ModelMatrixViaList = function(formulaList, data, dropIntercept=TRUE) {
colsToText = function(colNames) {
for (k in 1:length(colNames))
{
if (k == 1) {
out = colNames[1]
} else {
out = paste(out, '*', colNames[k], sep='')
}
}
return(out)
}
formulaText = '~'
n = length(formulaList)
for (i in 1:n) {
colNames = formulaList[[i]]
print(colNames)
Text = colsToText(colNames)
print(Text)
if (i == 1) {
formulaText = paste(formulaText, Text, sep='')
}
if (i > 1) {
formulaText = paste(formulaText,'+', Text, sep='')
}
}
print(formulaText)
runText = paste('outData = model.matrix(', formulaText, ', data=data)', sep='')
eval(parse(text=runText))
if (dropIntercept) {outData = outData[ , -1 , drop=FALSE]}
return(outData)
}
Example = function() {
### TRY Dev models
n = 50
u1 = sample(1:3, n, replace=TRUE)
x1 = factor(u1,labels=c("p1","p2","p3"))
u2 = sample(1:2, n, replace=TRUE)
x2 = factor(u2, labels=c("rich", "poor"))
x3 = rnorm(n)
x4 = rnorm(n)
yObs = 0*(u1==1) + 8*(u1==2) + 10*(u1==3) + 15*(u1==4) + 20*(u1==5) + 1*(u2==1) +
2*x3 + 3*x4 + 4*x3*(x4>1) + 7*(x3>2)*x4 + 5*cos(2*pi*x3) * rnorm(n)
data = data.frame(x1=x1, x2=x2, x3=x3, x4=x4)
formulaList = list('x1', c('x1', 'x2'))
formulaList = list('x1')
res = ModelMatrixViaList(formulaList, data)
formulaList = list('x1', 'x2')
ModelMatrixViaList(formulaList, data)
formulaList = list('x2')
ModelMatrixViaList(formulaList, data)
formulaList = list('x1*x2')
ModelMatrixViaList(formulaList, data)
formulaList = list('x1+x1*x2')
ModelMatrixViaList(formulaList, data)
formulaList = list('x1+x2+x1*x2')
ModelMatrixViaList(formulaList, data)
}
Example = function() {
### TRY Dev models
n = 50
x1 = rnorm(n)
x2 = rnorm(n)
x3 = rnorm(n)
x4 = rnorm(n)
data = data.frame(x1=x1, x2=x2, x3=x3, x4=x4)
formulaList = list('x1', c('x1', 'x2'))
formulaList = list('x1')
ModelMatrixViaList(formulaList, data)
formulaList = list('x1', 'x2')
ModelMatrixViaList(formulaList, data)
formulaList = list('x2')
ModelMatrixViaList(formulaList, data)
formulaList = list('x1*x2')
ModelMatrixViaList(formulaList, data)
formulaList = list('x1+x1*x2')
ModelMatrixViaList(formulaList, data)
formulaList = list('x1+x2+x1*x2')
ModelMatrixViaList(formulaList, data)
}
### build a new function which removes NA
RemoveNAfcn = function(f) {
G = function(x, ...) {
f(x, na.rm=TRUE, ...)
}
return(G)
}
Example = function() {
f = mean
x = c(1:10, NA)
f(x)
RemoveNAfcn(f)(x)
}
## written first for uber excercise
## if the model is not glm then we need to get the design matrix (numerical matrix)
DesignMatFcn = function(df, formulaString, DfTrans=function(x){x}) {
#print('DesignMatFcn: data')
formula = as.formula(formulaString)
## design matrix
#print('DesignMatFcn: data')
#print(data[1, ])
## this builds the design matrix for current data
## as well as future data
## for current data we pass nothing
Fcn = function(newdata=NULL) {
#print('DesignMatFcn: dim newdata'); print(dim(newdata))
newdata = DfTrans(newdata)
df2 = rbind(df, newdata)
# print('dim data2'); print(dim(data2))
m = model.frame(formula=formula, data=df2)
x2 = model.matrix(formula, m)
x2 = matrix(x2, dim(x2)[1], dim(x2)[2])
#print(dim(x2))
nData = dim(df)[1]
nNewdata = dim(newdata)[1]
#print('nData'); print(nData); print('nNewdata'); print(nNewdata); print('sum')
#print(nData+nNewdata); print('dim x2')
x = x2[(nData + 1):(nData + nNewdata), , drop=FALSE]
return(x)
}
return(Fcn)
}
Example = function() {
n = 100
m = 50
u1 = sample(1:3, n, replace=TRUE)
x1 = factor(u1,labels=c("p1","p2","p3"))
u2 = sample(1:2, n, replace=TRUE)
x2 = factor(u2, labels=c("rich", "poor"))
x3 = rnorm(n)
x4 = rnorm(n)
y = 0*(u1==1) + 8*(u1==2) + 10*(u1==3) + 15*(u1==4) + 20*(u1==5) + 1*(u2==1) +
2*x3 + 3*x4 + 4*x3*(x4>1) + 7*(x3>2)*x4 + 5*cos(2*pi*x3) * rnorm(n)
data = data.frame(x1=x1, x2=x2, x3=x3, x4=x4, y=y)[1:m, ]
newdata = data.frame(x1=x1, x2=x2, x3=x3, x4=x4, y=y)[(m+1):n, ]
formulaText = 'y~x1+x2'
formula = as.formula(formulaText)
m = model.frame(formula=formula, data=data)
}
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.