##################This function determines the regression model, according to the values of dependent and#######################
####independent variables, its labels, the typeofresult (mean is default), order and method. (tau required if quantile)#########
#####entire informatios of models are returned, like the model itself and uncertainties associated with each coefficient########
RegModel <-function(Results, allcandidates,datatuning,typeofresult,order,method,tau,weights){
y <- numeric(ncol(Results))
if (typeofresult=="mean") y <- apply(Results,FUN=mean,na.rm=TRUE,MARGIN=2)
else {
ranks <- matrix(ncol=ncol(Results),nrow=nrow(Results))
for(i in 1:nrow(Results)) ranks[i,] <- rank(Results[i,])
if (typeofresult=="sumrankings") y <- colSums(ranks)
else y <- apply(ranks,FUN=mean, na.rm=TRUE,MARGIN=2)
}
variables <- ncol(allcandidates)-1
dataformodel <- matrix(ncol=variables+2,nrow=ncol(Results))
dataformodel[,2:ncol(dataformodel)] <- allcandidates[,1:(ncol(allcandidates))]
#scaling the independent variables in a [0,1] scale
dataformodel[,2:(ncol(dataformodel)-1)] <- NormalizeData(dataformodel[,2:(ncol(dataformodel)-1)],direction="cols")
dataformodel[,1]<-y
columns <- c(c(as.character(datatuning$name)),"Id")
colnames(dataformodel)<-c("y",columns)
dataformodel <-as.data.frame(dataformodel)
if ((method!="lasso") & (method!="ridge")) {
if (method=="ols") expr <- "lm(dataformodel$y~poly("
else if (method=="irls") expr <- "rlm(dataformodel$y~poly("
else if (method=="quant") expr <- "rq(dataformodel$y~poly("
stringaux <- ""
for (i in 2:(ncol(dataformodel)-1)){
stringaux <- paste0(stringaux,"dataformodel$",collapse=NULL)
stringaux <- paste0(stringaux,as.character(colnames(dataformodel)[i]),collapse=NULL)
stringaux <- paste0(stringaux,",",collapse=NULL)
}
expr <- paste0(expr,stringaux,collapse=NULL)
expr <- paste0(expr,"degree=")
expr <- paste0(expr,as.character(order),collapse=NULL)
expr <- paste0(expr,",raw=TRUE)",collapse=NULL)
#if (method=="ols") {
# if (!is.na(weights[1])) expr <- paste0(expr,",weights=weights")
# expr <- paste0(expr,")",collapse=NULL)
#}
#else
if (method=="quant") {
expr <- paste0(expr,",tau=",collapse=NULL)
expr <- paste0(expr,tau,collapse=NULL)
}
if (!is.na(weights[1])) expr <- paste0(expr,",weights=weights")
expr <- paste0(expr,")",collapse=NULL)
# else {
# expr <- paste0(expr,",raw=TRUE),method=\"MM\",maxit=",collapse=NULL)
# maxit <- niterirls
# expr <- paste0(expr,as.character(maxit),collapse=NULL)
# expr <- paste0(expr,")",collapse=NULL)
# }
}
else {
termsreg <- Termsregression(c(as.character(datatuning$name)),order)
X<-Buildridgelasso(dataformodel,termsreg)
if (method=="lasso") alpha <- 1 else alpha <- 0
regridgelasso <- cv.hqreg(X,dataformodel$y,alpha=alpha) ###performs cross-validation for elastic-net penalized Huber loss regression
###and quantile regression (Humber loss is default) over a sequence of
###lambda values and find an optimal lambda
coefsridgelasso<-(coef(regridgelasso,regridgelasso$lambda.min))
print("Non-zero and zero coefficients: ")
print(coefsridgelasso)
if (length(which(coefsridgelasso[2:length(coefsridgelasso)]==0))==(length(coefsridgelasso)-1)) return(list(NA))
################call to the function that return the uncertainty associated to the ridge/lasso regression####################
modelridgelasso <- Buildmodelridgelasso(dataformodel,coefsridgelasso,termsreg,10)
}
if ((method!= "lasso") & (method!="ridge")) {
model1<-eval(parse(text=expr))
return(model1)
}
else return(modelridgelasso)
##an explicit second-order regression. I used this to verify if the use of "polym" function for second-order regression works, comparing their results (they are the same)
#r<- lm(as.formula("dataformodel$y ~ dataformodel$Mutation + dataformodel$Crossover + dataformodel$Selection +
# + dataformodel$Mutation*dataformodel$Crossover + dataformodel$Mutation*dataformodel$Selection + dataformodel$Crossover*dataformodel$Selection +
# I(dataformodel$Mutation^2) + I(dataformodel$Crossover^2) + I(dataformodel$Selection^2)"))
#print(summary(r))
}
###########This function is an auxiliar function called by RegModel. In the case of ridge and lasso regression, the uncertainty of ##########
###########ridge/lasso models are obtained generating a number "t" of new ridge/lasso models, each one generated by removing#################
###########one observation of the data. The uncertainties of the coefficients of original ridge/lasso model are based on the ################
############################variances of the coefficients obtained for these "t" new ridge/lasso models######################################
Buildmodelridgelasso <- function(dataformodel,coefsridgelasso,termsreg,nobsout){
if (coefsridgelasso[1]==0) {
k<- 1
intercept <- 0
}
else {
k<-2 ##if intercept==0 the first term is a regression-term, else if the intercept
intercept <- coefsridgelasso[1]
}
coefsout <- which((coefsridgelasso)==0)
if (length(coefsout)>0) coefsridgelasso <- coefsridgelasso[-coefsout]
validterms <- list()
l <- 1
for(j in 1:length(termsreg)){
for(i in 1:nrow(termsreg[[j]])) {
if (!(is.element(k,coefsout))) {
validterms[[l]] <- as.matrix(t(termsreg[[j]][i,]))
l <- l + 1
}
k <- k + 1
}
}
obsout <- sample.int(nrow(dataformodel),nobsout)
matcoefs <- matrix(ncol=length(coefsridgelasso), nrow=(nobsout))
matcoefs[1,] <- coefsridgelasso
for(i in 2:nobsout){
X1 <- Buildridgelasso(dataformodel, validterms)
X1 <- X1[-obsout[i],]
y <- dataformodel$y[-obsout[i]]
hqreg <- hqreg(as.matrix(X1),y,nlambda=2,lambda=c(0,0))
for (j in 1:nrow(hqreg$beta)) matcoefs[i,j] <- mean(hqreg$beta[j,])
}
sdcoefs <- apply(matcoefs,FUN=sd,MARGIN=2)
confintcoefs <- matrix(ncol=length(coefsridgelasso),nrow=2)
confintcoefs[1,] <- coefsridgelasso-(sdcoefs/sqrt(nobsout))
confintcoefs[2,] <- coefsridgelasso+(sdcoefs/sqrt(nobsout))
# print("Original Coefficients:")
# print(coefsridgelasso)
#
# print("Standard Errors using cross-validation:")
# print(sdcoefs)
#
# print("Confidence Interval using cross-validation:")
# print(confintcoefs)
#
#print("Non-zero coefficients:")
#print(validterms)
return(list(intercept,validterms,confintcoefs,coefsridgelasso))
}
###########This function generated perturbed models, based on the coefficient values of the original model and their uncertainties##########
GenerateModels <-function(model,k,method){
if (is.element(method,c("ols","quant","irls"))) {
models <- model
models <- replicate(n = k, expr = models, simplify = F)
if (is.matrix(models[[1]])){
for(i in 2:k){
for(j in 1:nrow(models[[i]]))
models[[i]][j,1] <-runif(1,(models[[1]][j,1]-models[[1]][j,2]),(models[[1]][j,1]+models[[1]][j,2]))
models[[i]] <- models[[i]][,1:1]
}
models[[1]] <- models[[1]][,1:1]
}
else {
for(i in 2:k){
models[[i]][1] <-runif(1,(models[[1]][1]-models[[1]][2]),(models[[1]][1]+models[[1]][2]))
models[[i]] <- models[[i]][1:1]
}
models[[1]] <- models[[1]][1:1]
}
}
else {
confint <- model[[1]]
valuescoefs <- numeric(ncol(confint))
models <- list(0)
models[[1]] <- model[[2]]
for(i in 2:k){
for(j in 1:ncol(confint)) valuescoefs[j] <- runif(1,confint[1,j],confint[2,j])
models[[i]] <- valuescoefs
}
}
##print("modelos:")
##print(models)
return(models)
}
########This function eliminates non-valid coefficients of ols, quant and IRLS models, based on informations about their significance#######
PruneModel <- function(model,method){
if (is.element(method,c("ols","quant"))){
pvalues <- model$coefficients[,"Pr(>|t|)"]
pvalues <- which(pvalues<0.05)
termsvalid <- rownames(model$coefficients)[pvalues]
validmodel <- model$coefficients[(model$coefficients[,"Pr(>|t|)"]<0.05),]
}
else {
tvalues <- model$coefficients[,"t value"]
tvalues <- which(tvalues>=2.09)
termsvalid <- rownames(model$coefficients)[tvalues]
validmodel <- model$coefficients[(model$coefficients[,"t value"]>=2.09),]
}
return(list(validmodel,termsvalid))
}
#lm
#nls
#model2<-lm(y~x+I(x^2)) --regression quadratic model with one variable
#anova(model1,model2) -- F-test of significance of difference between two models
#model1 is linear in regressors coefficients and variable x (y=Bo+B1x) and
#model2 linear in coefficients but quadratic in x (y=Bo+B1x+B2*x²)
#Example of using the gam function for multiple regression
#x1 <- runif(70,0,1)
#x2 <- runif(70,0,1)
#x3 <- runif(70,0,1)
#y <- 5 + 2*x1+3*x2+4*x3^2
#pairs(list(x1,x2,x3,y),panel=panel.smooth)
#library(mgcv)
#par(mfrow=c(2,2))
#model<-gam(y~s(x1)+s(x2)+s(x3))
#plot(model)
#par(mfrow=c(1,1)) you can see graphically the behavior of each regressor
#summary(model)
#using a tree model
#dados<-data.frame(x1,x3,x3,y)
#model<-rpart(y~.,data=dados)
#model
#or
#model<-tree(y~.,data=dados)
#plot(model)
#text(model)
#using lm for describing a model formulation
#model1<-lm(y~x1*x2*x3+I(x1^2)+I(x2^2)+I(x3^2))
#model2<-step(model1)
##Some important things to think about when doing multiple regression
#1 - the curvature of the response variable
#2 - interaction between explanatory variables
#3 - correlaction between explanatory variables
#4 - the risk of overparameterization
#5 - rule of thumb: don't try to estimate more than n/3 coefficients!!, where n is the number of
# observations (Crawley's book, page 949 - because there won't be degrees of freedom enough for explaining the model).
#In our case, initially with 3 parameters, we wish estimate 10 coefficients (linear,
# quadratic, interaction terms and intercept). Therefore we need of at least 33 observations.
#Common problems arising during multiple regression
#differences in the measurement scales of the explanatory variables, leading to large
#variation in the sums of squares and hence to an ill-conditioned matrix;
#multicollinearity, in which there is a near-linear relation between two of the explanatory
#variables, leading to unstable parameter estimates;
#rounding errors during the fitting procedure;
#non-independence of groups of measurements;
#temporal or spatial correlation amongst the explanatory variables;
#pseudoreplication.
#Another chapters I have to study later
#multiple regressions in the context of generalized linear models (Chapter 13),
#generalized additive models (Chapter 18), survival models (Chapter 25) and mixed-effects
#models (Chapter 19)
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.