#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# https://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# authors: Reza Hosseini, Alireza Hosseini
### This source file contains functions that: fit linear models;
### calculate cross validation error; perform model selection ###
## need the source to calculate correlaltion with missing data
#### correct AIC and BIC of a glm model in R
## n sample size
## p is the number of covariates
CorrectAicBic = function(mod=NULL, p=NULL, n=NULL) {
if (is.null(mod)) {
aic = Inf
bic = Inf
logLik = -Inf
}
if (!is.null(mod)) {
aic = AIC(mod)
bic = BIC(mod)
pNA = sum(is.na(coef(mod)))
pModel = length(coef(mod)) - pNA
logLik = (2*pModel - aic) / 2
if (is.null(p)) {
p = length(coef(mod))
}
if (is.null(n)) {
n = length(mod$fitted)
}
aic = -2*logLik + 2*p
bic = -2*logLik + p*(log(n))
}
return(list('AIC'=aic, 'BIC'=bic, 'logLik'=logLik))
}
## warning BIC calculation in R does not match!!
TestCorrectAicBic = function() {
n = 100
x1 = 1:n
x2 = 1:n
y = 2*x1 + rnorm(n)
mod = glm(y ~ x1)
AIC(mod)
BIC(mod)
CorrectAicBic(mod, p=2, n=n)
mod = glm(y ~ x1 + x2)
AIC(mod)
BIC(mod)
CorrectAicBic(mod, p=3, n=n)
mod = glm(y ~ x1 + x2)
AIC(mod)
BIC(mod)
CorrectAicBic(mod)
mod = glm(y ~ x1)
AIC(mod)
BIC(mod)
}
################### fitting the model with glm and in a way that is suitable for cross validation with categorical
# automatic fitting of the linear model with continuous response start
################### fitting the model with glm and in a way that is suitable for cross validation with categorical
# automatic fitting of the linear model with continuous response start
GlmFitMatr = function(y, explan=NULL, family='gaussian', ...) {
glmMod = NULL
if (is.null(explan)) {
pNum = 0
string = paste('glmMod = glm(y ~ 1, family=', family, ')', sep='')
eval(parse(text=string))
}
if (!is.null(explan)) {
n = length(y)
explan = as.data.frame(explan)
pNum = dim(explan)[2]
for (j in (1:pNum)) {
str = paste("V", j+1, " = explan[ ,", j, "]", sep="")
eval(parse(text=str))
}
string = "(y ~ V2"
if (pNum > 1) {
for (i in 2:(pNum)) {
k = i + 1
smallString = paste("+", "V", k, sep="")
string = paste(string, smallString)
}
}
glmString = paste('glm', string, ',family=', family, ')', sep='')
tryCatch(
expr = {
glmMod = eval(parse(text=glmString))},
error = function(e){},
finally = {})
}
aic_bic = CorrectAicBic(glmMod)
aic = aic_bic[['AIC']]
bic = aic_bic[['AIC']]
logLik = aic_bic[['logLik']]
coeff = NA
fitted = NA
if (!is.null(glmMod)) {
coeff = glmMod$coeff
fitted = glmMod$fitted
}
out = list(glmMod=glmMod, logLik=logLik, BIC=bic, AIC=aic, coeff=coeff)
class(out) = 'gmod'
return(out)
}
TestGlmFitMatr = function() {
N = 80
num = 9
explan = matrix(rnorm(1:(num*N)), N, num)
y = rnorm(1:N) + 5*explan[ , 1] + 5*explan[ , 2] + 5*explan[ , 3]
res = GlmFitMatr(y, explan)
n = 1000
x = rep(1, n)
y = (1:n)
x1 = rep(2, n)
explan = cbind(x, x1, x, x, x, x, x, x, x, x, x)
GlmFitMatr(y, explan, family='poisson')
x = rnorm(10)
y = (1:10)
x1 = rnorm(10)
explan = cbind(x, x1)
GlmFitMatr(y, explan, family='poisson')
}
# Example y = rnorm(100); GlmFitMatr(y)
################# Quantile regression fitting and prediction in R
LmFitQR = function(y, explan, quant) {
library('quantreg')
n = length(y)
explan = data.frame(explan)
pNum = dim(explan)[2]
for (j in (1:pNum)) {
str = paste("V", j+1, " = explan[ , ", j, "]",sep="")
eval(parse(text=str))
}
string = "(y~V2"
if (pNum > 1) {
for (i in 2:(pNum)) {
k = i+1
smallString = paste("+", "V", k, sep="")
string = paste(string, smallString)
}
}
string = paste(string, ",", as.character(quant), ")")
rqString = paste("rq", string)
gmod = eval(parse(text=rqString))
coeff = gmod$coeff
fitted = gmod$fitted
out = list(gmod=gmod, coeff=coeff)
return(out)
}
## Examples
TestLmFitQR = function() {
n = 80
num = 9
explan = matrix(rnorm(1:(num*n)), n, num)
y = rnorm(1:n) + 5*explan[ , 1] + 5*explan[ , 2] + 5*explan[ , 3]
LmFitQR(y, explan, 0.5)
}
### a function that turns an explanatory matrix to V2,V3,... and create the prediction string
## need to provide data.frame as input
predict.gmod = function(mod, explan, ...) {
glmMod = mod[['glmMod']]
response = "response"
pNum = dim(explan)[2]
predStr = "out = predict(glmMod, newdata=data.frame(V2=explan[ , 1]"
if (pNum > 1) {
for (j in (2:pNum)) {
predStr = paste(predStr, ",V", j+1, "=explan[ , ", j, "]",sep="")
}
}
predStr = paste(predStr, "), type=response)")
eval(parse(text=predStr))
return(out)
}
Testpredict.gmod = function() {
n = 30
num = 2
explan = matrix(rnorm(1:(num*n)), n, num)
y = 6*rnorm(1:n) + 3*explan[ , 1]
gmod = GlmFitMatr(y, explan)
prediction = predict.gmod(gmod, explan)
plot(explan[ , 1], prediction)
points(explan[ , 1], explan[ , 1], col=2)
y = rnorm(100)
gmod = glm(y ~ 1)
prediction = pred(explan=matrix(NA, 5, 1), gmod)
plot(explan[ , 1], prediction)
points(explan[ , 1], explan[ , 1], col=2)
}
###################### cross validation function that can also handle categorical variables#############
###out of date version. fixed the big object issue next, i.e. ommitted fitted_MAT
############## preventing memory breakdown CV #########################
## modified to handle quantile regression as well
GlmCv = function(
y,
explan=NULL,
putOutNum=1,
model='lm',
tau=0.5,
family='gaussian') {
if (is.null(explan)) {
model = 'lm'
}
n = length(y)
m = putOutNum
no_explan = FALSE
if (is.null(explan)) {
no_explan = TRUE
explan = matrix(1, length(y), 1)
}
explan = data.frame(explan)
pNum = dim(explan)[2]
subs = combn(1:n, m, fun=NULL, simplify=TRUE)
N = dim(subs)[2]
fittedMat = matrix(NA, N, m)
for(i in 1:N) {
ind = subs[ , i]
valid = y[ind]
training = y[-ind]
explan2 = explan[-ind, ]
if (no_explan) {explan2 = NULL}
if (model == "lm") {
gmod = GlmFitMatr(y=training, explan=explan2, family=family)$gmod
}
if (model == "qr") {
gmod = LmFitQR(y=training, explan=explan2, quant=tau)$gmod
}
explan3 = explan[ind, ]
explan3 = data.frame(explan3)
fitted = glmPred(explan=explan3, gmod)
for (j in 1:m) {
fittedMat[i, j] = fitted[j]
}
}
fittedVec = rep(NA, n)
for (k in 1:n) {
tempFit = NULL
for (l in 1:N) {
if (sum(subs[ , l] == k) > 0) {
k2 = which(subs[ , l] == k)
tempFit[l]=fittedMat[l, k2]}
}
fittedVec[k] = mean(tempFit, na.rm=TRUE)
}
cc = (abs(y - fittedVec))
cv1 = mean(na.omit(cc))
cc = (y - fittedVec)^2
cc = na.omit(cc)
len = length(cc)
cv = sqrt((1/len)*sum(cc))
cc = (y - fittedVec)^4
cc = na.omit(cc)
len = length(cc)
cv4 = sqrt(sqrt((1/len)*sum(cc)))
cvr = cor2(fittedVec,y)
out = list(
cve1=cv1,
cve=cv,
cve4=cv4,
cvr=cvr,
predCv=fittedVec,
size=len)
return(out)
}
### automatic model selection procedure
### model selection using AIC, BIC, cve, find the optimal model of given number of parameters
## We can easily add cve1,cve4
### picking m top models
##################### Selecting models with covariates that are fixed
##################### We can also specify the number of accepted predictors by varNum
## explanF = fixed predictors
## explan = predictor set from which we take a subset
## explanF could be NULL
ModSelSubset = function(
y=NULL,
explanF=NULL,
explan=NULL,
m=1,
crit='AIC',
family='gaussian',
varNum=NULL,
critSummaryDelta=0.1) {
d = dim(explan)[2]
varNum2 = d
if (!is.null(varNum)) {
if (varNum > d) {
print('Warning: varNum > data dimension so it was reset to dim[2].')
}
varNum2 = min(d, varNum)
}
varNum = varNum2
MsFcn = function(x) {
if (is.null(x)) {
explan1 = explanF
} else {
explan1 = explan[ , x, drop=FALSE]
if (!is.null(explanF)) {
explan1 = cbind(explanF, explan1)
}
}
res = GlmFitMatr(y=y, explan=explan1, family=family)
mod = res[['glmMod']]
aic_bic = CorrectAicBic(mod=mod)
aic = aic_bic[['AIC']]
bic = aic_bic[['BIC']]
if (crit == "AIC") {
out = c(aic, bic, x, rep(NA, (varNum - length(x))))
}
if (crit == "BIC") {
out = c(bic, aic, x, rep(NA, (varNum - length(x))))
}
if (crit == "cve") {
res = GlmCv(y, explan=explan1, putOutNum=1)
out = c(res$cve, res$cvr, x, rep(NA, (varNum - length(x))))
}
return(out)
}
nullModRes = MsFcn(x=NULL)
## subSets of up to order varNum
subSets = lapply(1:varNum, function(x) as.list(data.frame(combn(d, x))))
subSets = unlist(subSets, recursive=FALSE)
resList = lapply(subSets, MsFcn)
resDf = data.frame()
n = length(resList)
for (i in 1:n) {resDf = rbind(resDf, resList[[i]])}
colnames(resDf) = NULL
resDf = rbind(resDf, nullModRes)
if (crit == 'AIC') {
colnames(resDf)[1:2] = c('AIC', 'BIC')
}
if (crit == 'BIC') {
colnames(resDf)[1:2] = c('BIC', 'AIC')
}
if (crit == 'cve') {
colnames(resDf)[1:2] = c('cve', 'cvr')
}
ord = order(resDf[ , 1])
resDf = resDf[ord, ]
topModMatrix = resDf[1:m, ]
critVec = resDf[ , crit]
critVec = na.omit(critVec)
critSummary = quantile(critVec, seq(0, 1, critSummaryDelta))
outList = list(topModMatrix=topModMatrix, critSummary=critSummary)
return(outList)
}
TestModSelSubset = function() {
n = 100
num = 5
fixedVarNum = 2
explan = matrix(rnorm(1:(num*n)), n, num)
explan = cbind(explan,explan[ , 5])
explanF = matrix(rnorm(1:(num*n)), n, fixedVarNum);
mu = explanF[ , 1] + 5*explan[ , 1] + 5*explan[ , 4] + 5*explan[ , 5]
lambda = exp(mu)
y = rpois(n, lambda)
ModSelSubset(
y,
explanF=explanF,
explan,
m=3,
crit="AIC",
family='poisson')
ModSelSubset(
y,
explanF=NULL,
explan=explan,
m=3,
crit="AIC",
family='poisson',
varNum=2)
ModSelSubset(
y,
explanF,
explan=explan,
m=3,
crit="BIC",
family='poisson',
varNum=4)
ModSelSubset(
y,
explanF=NULL,
explan=explan,
m=3,
crit="BIC",
family='poisson',
varNum=4)
ModSelSubset(
y,
explanF=NULL,
explan=explan,
m=3,
crit="BIC",
family='poisson',
varNum=5)
ModSelSubset(
y,
explanF=NULL,
explan=explan,
m=3,
crit="BIC",
family='poisson',
varNum=7)
n = 100
num = 5
fixedVarNum = 2
explan = matrix(rnorm(1:(num*n)), n, num)
explan = cbind(explan,explan[ , 5])
explanF = matrix(rnorm(1:(num*n)), n, fixedVarNum);
mu = explanF[ , 1] + 5*explan[ , 1] + 5*explan[ , 4] + 5*explan[ , 5]
lambda = exp(mu)
y = rpois(n, lambda)
ModSelSubset(y, explanF, explan, m=3, crit="AIC", family='poisson')
N = 100
num = 5
fixedVarNum = 2
explan = matrix(rnorm(1:(num*N)), N, num)
explanF = matrix(rnorm(1:(num*N)), N, fixedVarNum)
y = rnorm(1:N) + 5*explan[ , 1] + 5*explan[ , 4] + 5*explan[ , 5]
ModSelSubset(y, explanF, explan, m=3, crit="cve")
N = 50
num = 5
fixedVarNum = 2
explan = matrix(rnorm(1:(num*N)), N, num)
explanF = NULL
y = rnorm(1:N)
ModSelSubset(y, explanF, explan, m=3, crit="BIC")
N = 100
num = 5
fixedVarNum = 2
explan = matrix(rnorm(1:(num*N)), N, num)
explanF = matrix(rnorm(1:(num*N)), N, fixedVarNum)
y = rnorm(1:N)
ModSelSubset(y, explanF, explan, m=3, crit="BIC")
}
### model selection via GLM or other implemented models
# default is the regular GLM
## explan and explanF contain column names this time, or interaction terms
ModSel_subsetFormula = function(
df,
yCol,
explanColsF=NULL,
explanCols=NULL,
m=1,
crit='AIC',
family='gaussian',
ModelFcn=glm,
varNum=NULL,
critSummaryDelta=0.1) {
d = length(explanCols)
varNum2 = d
if (!is.null(varNum)) {
if (varNum > d) {
print('Warning: varNum > data dimension so it was reset to dim[2].')
}
varNum2 = min(d, varNum)
}
varNum = varNum2
MsFcn = function(x) {
if (is.null(x)) {
explanCols1 = explanColsF
} else {
explanCols1 = explanCols[x]
if (!is.null(explanColsF)) {
explanCols1 = c(explanColsF, explanCols1)
}
}
explanFormula = PlusVec(explanCols1)
if (explanFormula == '') {
explanFormula = '1'
}
formulaString = paste(yCol, ' ~ ', explanFormula, sep='')
mod = ModelFcn(
formula=as.formula(formulaString), data=df, family=family)
aic_bic = CorrectAicBic(mod=mod)
aic = aic_bic[['AIC']]
bic = aic_bic[['BIC']]
# AIC = AIC(mod)
# BIC = BIC(mod)
if (crit == "AIC") {
out = c(aic, bic, explanCols[x], rep(NA, (varNum - length(x))))
}
if (crit == "BIC") {
out = c(bic, aic, explanCols[x], rep(NA, (varNum - length(x))))
}
if (crit == "cve") {
res = GlmCv(y, explan=explan1, putOutNum=1)
out = c(res$cve, res$cvr, explanCols, rep(NA, (varNum - length(x))))
}
return(out)
}
nullModRes = MsFcn(x=NULL)
## subSets of up to order varNum
subSets = lapply(
1:varNum,
function(x) as.list(data.frame(combn(d, x))))
subSets = unlist(subSets, recursive=FALSE)
resList = lapply(subSets, MsFcn)
resDf = matrix(NA, length(resList), varNum + 2)
resDf = rbind(resDf, nullModRes)
n = length(resList)
for (i in 1:n) {
resDf[i, ] = resList[[i]]
}
colnames(resDf) = NULL
resDf = data.frame(resDf)
resDf[ , 1] = as.numeric(as.character(resDf[ ,1]))
resDf[ , 2] = as.numeric(as.character(resDf[ ,2]))
if (crit == 'AIC') {
colnames(resDf)[1:2] = c('AIC', 'BIC')
}
if (crit == 'BIC') {
colnames(resDf)[1:2] = c('BIC', 'AIC')
}
if (crit == 'cve') {
colnames(resDf)[1:2] = c('cve', 'cvr')
}
ord = order(resDf[ , 1])
resDf = resDf[ord, ]
topModMatrix = resDf[1:m, ]
critVec = resDf[ , crit]
critVec = na.omit(critVec)
critSummary = quantile(critVec, seq(0, 1, critSummaryDelta))
outList = list(topModMatrix=topModMatrix, critSummary=critSummary)
return(outList)
}
TestModSel_subsetFormula = function() {
n = 1000
num = 5
fixedVarNum = 2
explan = matrix(rnorm(1:(num*n)), n, num)
explan = cbind(explan, explan[ , 5])
explan = data.frame(explan)
explanF = matrix(rnorm(1:(num*n)), n, fixedVarNum)
explanF = data.frame(explanF)
names(explanF) = paste('F', 1:fixedVarNum, sep='')
mu = explanF[ , 1] + 5*(explan[ , 1]*explan[ , 1]) +
5*explan[ , 4]*explan[ , 3] + 5*explan[ , 5]
lambda = (1.1)^(mu)
y = rpois(n, lambda)
explanColsF = names(explanF)
explanCols = names(explan)
explanCols = c(explanCols, 'X1*X1', 'X1*X2', 'X3*X4', 'X2*X3')
df = cbind(explanF, explan)
df = cbind(df, y)
res = ModSel_subsetFormula(
df=df,
yCol='y',
explanColsF=explanColsF,
explanCols=explanCols,
m=3,
crit="BIC",
family='poisson')
res = ModSel_subsetFormula(
df=df,
yCol='y',
explanColsF=NULL,
explanCols=explanCols,
m=3,
crit="BIC",
family='poisson')
## y is independent of x's
n = 500
num = 5
fixedVarNum = 2
explan = matrix(rnorm(1:(num*n)), n, num)
explan = cbind(explan, explan[ , 5])
explan = data.frame(explan)
explanF = matrix(rnorm(1:(num*n)), n, fixedVarNum)
explanF = data.frame(explanF)
names(explanF) = paste('F', 1:fixedVarNum, sep='')
mu = rnorm(n)
lambda = exp(mu)
y = rpois(n, lambda)
explanColsF = names(explanF)
explanCols = names(explan)
df = cbind(explanF, explan)
df = cbind(df, y)
res = ModSel_subsetFormula(
df=df,
yCol='y',
explanColsF=NULL,
explanCols=explanCols,
m=3,
crit="BIC",
family='poisson')
}
##### sequential grouped step-wise model selection with various families
ModSel_mergeGroup = function(
y,
explanF=NULL,
explanList,
groupIndex=NULL,
crit,
family='gaussian') {
l = length(explanList)
## if no group identifier is passed we build a simple one below
if (is.null(groupIndex)) {
groupIndex = 1:l
}
explanList = explanList[groupIndex]
l = length(explanList)
explan1 = NULL
## this saves the index within an explan family
elementVec = NULL
## this saves which of the groups that predictor belongs to.
groupVec = NULL
for (i in 1:l) {
explan = explanList[[i]]
if (!is.null(explan1)) {
explan1 = cbind(explan1, explan)
} else {
explan1 = explan
}
## number of parameters of the i-th explan
p = dim(explan)[2]
elementVec = c(elementVec, 1:p)
## assigning the group index to keep track of the group
j = groupIndex[i]
groupVec = c(groupVec, rep(j, p))
}
res = ModSelSubset(
y,
explanF=explanF,
explan=explan1,
m=1,
crit=crit,
family=family)
topModMatrix = res[['topModMatrix']]
critSummary = res[['critSummary']]
topMod = as.numeric(topModMatrix)
crits = topMod[1:2]
optInd = topMod[3:length(topMod)]
optInd = na.omit(optInd)
groupInd = groupVec[optInd]
elementInd = elementVec[optInd]
explanInd = data.frame(group=groupInd, element=elementInd)
outList = list(groupElemInd=explanInd, critSummary=critSummary)
return(outList)
}
TestModSel_mergeGroup = function() {
n = 1000
num1 = 4
num2 = 2
num3 = 3
explan1 = matrix(rnorm(1:(num1*n)), n, num1)
explan2 = matrix(rnorm(1:(num2*n)), n, num2)
explan3 = matrix(rnorm(1:(num3*n)), n, num3)
explanF = matrix(rnorm(1:(num3*n)), n, num3)
explanList = list(explan1, explan2, explan3)
mu = explan1[ , 1] + 5*explan3[ , 1] + 5*explan3[ , 3]
lambda = 1.2^(mu) + 0.1
lambda = MinComp(lambda, 10000)
y = rpois(n, lambda)
ModSel_mergeGroup(
y,
explanF=explanF,
explanList=explanList,
crit="BIC",
family='poisson')
#####
ModSel_mergeGroup(
y,
explanF=NULL,
explanList,
groupIndex=c(1, 3),
crit='BIC',
family='poisson')
}
ModSel_mergeGroupFormula = function(
df,
yCol,
explanColsF=NULL,
explanColsList,
groupIndex=NULL,
crit,
family='gaussian') {
l = length(explanColsList)
## if no group identifier is passed we build a simple one below
if (is.null(groupIndex)) {
groupIndex = 1:l
}
explanColsList = explanColsList[groupIndex]
l = length(explanColsList)
explanCols1 = NULL
## this saves the index within an explan family
elementVec = NULL
## this saves which of the groups that predictor belongs to.
groupVec = NULL
for (i in 1:l) {
explanCols = explanColsList[[i]]
if (!is.null(explan1)) {
explanCols1 = c(explanCols1, explanCols)
} else {
explanCols1 = explanCols
}
## number of parameters of the i-th explan
p = length(explanCols)
elementVec = c(elementVec, 1:p)
## assigning the group index to keep track of the group
j = groupIndex[i]
groupVec = c(groupVec, rep(j, p))
}
res = ModSel_subsetFormula(df=df, yCol=yCol, explanColsF=explanColsF,
explanCols=explanCols1, m=1, crit=crit, family=family)
topModMatrix = res[['topModMatrix']]
topMod = topModMatrix[1, ]
names(topMod) = NULL
critSummary = res[['critSummary']]
crits = topMod[1:2]
optInd = topMod[3:length(topMod)]
optInd = as.character(unlist(optInd))
optInd = na.omit(optInd)
optInd = which(explanCols1 %in% optInd)
groupInd = groupVec[optInd]
elementInd = elementVec[optInd]
explanInd = data.frame(group=groupInd, element=elementInd)
outList = list(groupElemInd=explanInd, critSummary=critSummary)
return(outList)
}
TestModSel_mergeGroupFormula = function() {
n = 500
num1 = 4
num2 = 2
num3 = 3
num4 = 3
explan1 = matrix(rnorm(1:(num1*n)), n, num1)
explan2 = matrix(rnorm(1:(num2*n)), n, num2)
explan3 = matrix(rnorm(1:(num3*n)), n, num3)
explanF = matrix(rnorm(1:(num3*n)), n, num4)
explan1 = data.frame(explan1)
explan2 = data.frame(explan2)
explan3 = data.frame(explan3)
explanF = data.frame(explanF)
names(explan1) = paste('A', 1:num1, sep='')
names(explan2) = paste('B', 1:num2, sep='')
names(explan3) = paste('C', 1:num3, sep='')
names(explanF) = paste('F', 1:num4, sep='')
explanColsList = list(names(explan1), names(explan2), names(explan3))
mu = explan1[ , 1] + 5*explan3[ , 1] + 5*explan3[ , 3]
lambda = 2^(mu) + 0.1
lambda = MinComp(lambda, 10000)
y = rpois(n, lambda)
df = cbind(explan1, explan2, explan3, explanF, y)
ModSel_mergeGroupFormula(
df,
yCol=yCol,
explanColsF=explanF,
explanColsList=explanColsList,
crit="BIC",
family='poisson')
ModSel_mergeGroupFormula(
df,
yCol=yCol,
explanColsF=NULL,
explanColsList=explanColsList,
crit="BIC",
family='poisson')
ModSel_mergeGroupFormula(
df,
yCol,
explanColsF=NULL,
explanColsList=explanColsList,
crit="BIC",
family='poisson')
ModSel_mergeGroupFormula(
df,
yCol,
explanColsF=NULL,
explanColsList=explanColsList,
groupIndex=c(1, 3),
crit="BIC",
family='poisson')
}
####### Broke mans prediction
CvBmp = function(y) {
y = na.omit(y)
n = length(y)
fittedVec = rep(NA, n)
for (i in 1:n) {
fittedVec[i] = mean(y[-i])
}
cc = (abs(y - fittedVec))
cv1 = mean(na.omit(cc))
cc = (y - fittedVec)^2
cc = na.omit(cc)
len = length(cc)
cv = sqrt((1/len)*sum(cc))
cc = (y - fittedVec)^4
cc = na.omit(cc)
len = length(cc)
cv4 = sqrt(sqrt((1/len)*sum(cc)))
cvr = cor2(fittedVec, y)
out = list(cve1=cv1, cve=cv, cve4=cv4, cvr=cvr, size=len)
return(out)
}
########### True prediction error
## here we provide a very large validation set that can be used to find a true prediction error for a model
GlmPredErr = function(y, explan, vars, validation, family='gaussian') {
y1 = validation[ , 1]
explan1 = validation[ , -1]
explan1 = explan1[ , vars]
expl = explan[ , vars]
gmod = GlmFitMatr(y, expl, family=family)$gmod
fitted = glmPred(explan1, gmod)
tcv1 = (1/length(y1))*(sum(abs(y1 - fitted)))
tcv = (1/length(y1))*sqrt(sum((y1 - fitted)^2))
tcv4 = (1/length(y1))*sqrt(sqrt(sum((y1 - fitted)^4)))
tcvr2 = corNA(fitted, y)
cvout = GlmCv(y, expl, 1)
out1 = c(cvout$cve1, cvout$cve, cvout$cve4, cvout$cvr2)
out2 = c(tcv1, tcv, tcv4, tcvr2)
out = list(cvErr=out1, trueErr=out2)
return(OUT)
}
TestGlmPredErr = function() {
n = 100
num = 5
explan = matrix(rnorm(1:(num*n)), n, num)
y = rnorm(1:N) + 15*explan[ , 1] + 25*explan[ , 2] + 35*explan[ , 3]
n = 200
explan = matrix(rnorm(1:(num*n)), n, num)
y = rnorm(1:n) + 15*explan[ , 1] + 25*explan[ , 2] + 35*explan[ , 3]
validation = cbind(y, explan)
vars = 1:3
GlmPredErr(y, explan, vars, validation)
vars = 4:5
GlmPredErr(y, explan, vars, validation)
vars = 1:2
GlmPredErr(y, explan, vars, validation)
putOutNum = 2
n = 100
num = 5
explan = matrix(rnorm(1:(num*n)), n, num)
y = rnorm(1:n) + 15*explan[ , 1] + 25*explan[ , 2] + 35*explan[ , 3]
n = 200
explan = matrix(rnorm(1:(num*n)), n, num)
y = rnorm(1:n) + 15*explan[ , 1] + 25*explan[ , 2] + 35*explan[ , 3]
validation = cbind(y, explan)
vars = 1:3
GlmPredErr2(y, explan, vars, validation, 2)
vars = 4:5
GlmPredErr2(y, explan, vars, validation, 2)
vars = 1:2
GlmPredErr2(y, explan, vars, validation, 2)
}
#### Model selection sequential functions
## subsetting predictor groups according to given index pairs
## group index and element index
Subset_explanList = function(explanList, groupElemInd) {
pickedGroups = unique(groupElemInd[ , 1])
l = length(pickedGroups)
outExplan = NULL
for (i in 1:l) {
gr = pickedGroups[i]
ind = which(groupElemInd[ , 1] == gr)
elements = groupElemInd[ind, 2]
explan = explanList[[gr]]
explan1 = explan[ , elements]
if (i == 1) {
outExplan = explan1
} else {
outExplan = cbind(outExplan, explan1)
}
}
return(outExplan)
}
Subset_explanListFormula = function(explanColsList, groupElemInd) {
pickedGroups = unique(groupElemInd[ , 1])
l = length(pickedGroups)
outExplan = NULL
for (i in 1:l) {
gr = pickedGroups[i]
ind = which(groupElemInd[ , 1] == gr)
elements = groupElemInd[ind, 2]
explanCols = explanColsList[[gr]]
explanCols1 = explanCols[elements]
if (i == 1) {
outExplanCols = explanCols1
} else {
outExplanCols = c(outExplanCols, explanCols1)
}
}
return(outExplanCols)
}
TestSubset_explanListFormula = function() {
names(explan1) = paste('A', 1:num1, sep='')
names(explan2) = paste('B', 1:num2, sep='')
names(explan3) = paste('C', 1:num3, sep='')
names(explanF) = paste('F', 1:num4, sep='')
explanColsList = list(names(explan1), names(explan2), names(explan3))
groupElemInd = matrix(NA, 5, 2)
groupElemInd[ , 1] = c(1, 1, 2, 2, 3)
groupElemInd[ , 2] = c(1, 3, 1, 2, 2)
Subset_explanListFormula(explanColsList, groupElemInd=groupElemInd)
}
### this function will
### merge sequentially group of groups given in groupsIndList
ModSelSeqMerging = function(
y,
explanF=NULL,
explanList=NULL,
groupsIndList=NULL,
crit='AIC',
family='gaussian') {
## empty index set
groupElemInd = NULL
rowNum = length(groupsIndList)
critSummaryList = list()
for (i in 1:rowNum) {
groupIndex = groupsIndList[[i]]
res = ModSel_mergeGroup(
y=y,
explanF=explanF,
explanList=explanList,
groupIndex=groupIndex,
crit=crit,
family=family)
groupElemInd1 = res[['groupElemInd']]
critSummaryList[[i]] = res[['critSummary']]
#print(groupElemInd1)
## reset explanF
if (dim(groupElemInd1)[1] > 0) {
explanF1 = Subset_explanList(explanList, groupElemInd1)
if (is.null(explanF)) {
explanF=explanF1
} else {
explanF = cbind(explanF, explanF1)
}
groupElemInd = rbind(groupElemInd, groupElemInd1)
}
}
outList = list(
groupElemInd=groupElemInd,
critSummaryList=critSummaryList)
return(outList)
}
TestModSelSeqMerging = function() {
n = 1000
num1 = 4
num2 = 2
num3 = 3
num4 = 2
explan1 = matrix(rnorm(1:(num1*n)), n, num1)
explan2 = matrix(rnorm(1:(num2*n)), n, num2)
explan3 = matrix(rnorm(1:(num3*n)), n, num3)
explanF = matrix(rnorm(1:(num3*n)), n, num4)
explanList = list(explan1, explan2, explan3)
mu = explan1[ , 1] + 5*explan3[ , 1] + 5*explan3[ , 3]
lambda = 1.2^(mu) + 0.1
lambda = MinComp(lambda, 10000)
y = rpois(n, lambda)
groupsIndList = list(c(1), c(2,3))
ModSelSeqMerging(
y=y,
explanF=explanF,
explanList=explanList,
groupsIndList=groupsIndList,
crit='BIC',
family='poisson')
ModSelSeqMerging(
y=y,
explanF=NULL,
explanList=explanList,
groupsIndList=groupsIndList,
crit='BIC',
family='poisson')
}
ModSel_seqMergingFormula = function(
df,
yCol,
explanColsF=NULL,
explanColsList=NULL,
groupsIndList=NULL,
crit='AIC',
family='gaussian') {
# empty index set
groupElemInd = NULL
rowNum = length(groupsIndList)
critSummaryList = list()
for (i in 1:rowNum) {
groupIndex = groupsIndList[[i]]
res = ModSel_mergeGroupFormula(
df=df,
yCol,
explanColsF=explanColsF,
explanColsList=explanColsList,
groupIndex=groupIndex,
crit=crit,
family=family)
groupElemInd1 = res[['groupElemInd']]
critSummaryList[[i]] = res[['critSummary']]
# print(groupElemInd1)
## reset explanF
if (dim(groupElemInd1)[1] > 0) {
explanColsF1 = Subset_explanListFormula(explanColsList, groupElemInd1)
if (is.null(explanColsF)) {
explanColsF=explanColsF1
} else {
explanF = c(explanColsF, explanColsF1)
}
groupElemInd = rbind(groupElemInd, groupElemInd1)
}
}
cols = Subset_explanListFormula(explanColsList, groupElemInd)
outList = list(
groupElemInd=groupElemInd,
critSummaryList=critSummaryList,
cols=cols)
return(outList)
}
TestModSel_seqMergingFormula = function() {
n = 1000
num1 = 4
num2 = 2
num3 = 3
num4 = 3
explan1 = matrix(rnorm(1:(num1*n)), n, num1)
explan2 = matrix(rnorm(1:(num2*n)), n, num2)
explan3 = matrix(rnorm(1:(num3*n)), n, num3)
explanF = matrix(rnorm(1:(num3*n)), n, num4)
explan1 = data.frame(explan1)
explan2 = data.frame(explan2)
explan3 = data.frame(explan3)
explanF = data.frame(explanF)
names(explan1) = paste('A', 1:num1, sep='')
names(explan2) = paste('B', 1:num2, sep='')
names(explan3) = paste('C', 1:num3, sep='')
names(explanF) = paste('F', 1:num4, sep='')
explanColsList = list(c(names(explan1), 'A1*A1', 'A2*A3', 'A3*A3', 'A2*A2'),
names(explan2), names(explan3))
mu = explan1[ , 1]*explan1[ , 1] + explan1[ , 2]*explan1[ , 3] +
5*explan3[ , 1] + 5*explan3[ , 3]
# lambda = 2^(mu) + 0.1
# lambda = MinComp(lambda, 10000)
# y = rpois(n, lambda)
y = mu
df = cbind(explan1, explan2, explan3, explanF, y)
family = 'gaussian'
groupsIndList = list(c(1), c(2,3))
res = ModSel_seqMergingFormula(
df=df, yCol='y', explanColsF=explanColsF,
explanColsList=explanColsList,
groupsIndList=groupsIndList, crit='BIC', family=family)
res = ModSel_seqMergingFormula(
df=df, yCol='y', explanColsF=NULL,
explanColsList=explanColsList,
groupsIndList=groupsIndList, crit='BIC', family=family)
}
## grouped model selection
## y: response
## explanList: list of data frames, each representing a group of variables
## data of two columns, one is group, one is element
## groupsIndList:
ModSelSeqMerging_fixedGroups = function(
y=NULL,
explanList=NULL,
groupElemIndFixed=NULL,
groupsIndList=NULL,
crit='AIC',
family=NULL) {
if (is.null(groupsIndList)) {
groupsIndList=as.list(1:length(explanList))
}
explanF = NULL
if (!is.null(groupElemIndFixed)) {
explanF = Subset_explanList(explanList, groupElemIndFixed)
}
groupElemInd = NULL
if (!is.null(groupsIndList)) {
res = ModSelSeqMerging(
y=y,
explanF=explanF,
explanList=explanList,
groupsIndList=groupsIndList,
crit=crit,
family=family)
groupElemInd = res[['groupElemInd']]
critSummaryList = res[['critSummaryList']]
}
if (!is.null(groupElemIndFixed)) {
groupElemInd = rbind(as.matrix(groupElemIndFixed), groupElemInd)
}
outList = list(groupElemInd=groupElemInd, critSummaryList=critSummaryList)
return(outList)
}
TestModSelSeqMerging_fixedGroups = function() {
n = 500
ord = c(5, 3, 2, 2)
group = c(rep(1, ord[1]), rep(2, ord[2]), rep(3, ord[3]), rep(4, ord[4]))
element = c(1:ord[1], 1:ord[2], 1:ord[3], 1:ord[4])
groupElem = cbind(group, element)
explan1 = matrix(rnorm(ord[1]*n), n, ord[1])
explan2 = matrix(rnorm(ord[2]*n), n, ord[2])
explan3 = matrix(rnorm(ord[3]*n), n, ord[3])
explan4 = matrix(rnorm(ord[4]*n), n, ord[4])
explanList = list(explan1, explan2, explan3, explan4)
logy = 2*explan1[ , 5] + 2*explan2[ , 2] + 1*explan3[ , 1] + rnorm(n)
y = 1.1^(logy)
groupElemIndFixed = groupElem[c(1, 2), ]
groupsIndList = list(c(1), c(2), c(3, 4))
ModSelSeqMerging_fixedGroups(
y=y,
explanList=explanList,
groupElemIndFixed=groupElemIndFixed,
groupsIndList=groupsIndList,
crit='BIC',
family='gaussian')
###
groupElemIndFixed = NULL
groupsIndList = list(c(1), c(2), c(3, 4))
ModSelSeqMerging_fixedGroups(y=y, explanList=explanList,
groupElemIndFixed=groupElemIndFixed, groupsIndList=groupsIndList,
crit='BIC', family='gaussian')
}
ModSelSeqMerging_fixedGroupsFormula = function(
df,
yCol,
explanColsList=NULL,
groupElemIndFixed=NULL,
groupsIndList=NULL,
crit='AIC',
family=NULL) {
if (is.null(groupsIndList)) {
groupsIndList=as.list(1:length(explanColsList))
}
explanF = NULL
if (!is.null(groupElemIndFixed)) {
explanColsF = Subset_explanListFormula(explanColsList, groupElemIndFixed)
}
groupElemInd = NULL
if (!is.null(groupsIndList)) {
res = ModSel_seqMergingFormula(
df=df,
yCol=yCol,
explanColsF=explanColsF,
explanColsList=explanColsList,
groupsIndList=groupsIndList,
crit=crit,
family=family)
groupElemInd = res[['groupElemInd']]
critSummaryList = res[['critSummaryList']]
}
if (!is.null(groupElemIndFixed)) {
groupElemInd = rbind(as.matrix(groupElemIndFixed), groupElemInd)
}
cols = Subset_explanListFormula(explanColsList, groupElemInd)
outList = list(
groupElemInd=groupElemInd,
critSummaryList=critSummaryList,
cols=cols)
return(outList)
}
## stepwise
StepwiseGlm = function(df, yCol, predCols, familyStr) {
model = glm(
as.formula(paste0(yCol, "~1")), data=df, family=familyStr)
## building the maximal model formula
upperModFormula = as.formula(paste0(
"~",
paste0(predCols, collapse="+")))
stepModel = stepAIC(
model,
direction="both", trace=FALSE, k=log(nrow(df)),
scope=list(
upper=upperModFormula,
lower=~1))
terms = attr(terms(stepModel), "term.labels")
anova = stepModel[["anova"]]
return(list("terms"=terms, "anova"=anova))
}
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.