BUILDING A LINEAR REGRESSION MODEL https://www.pluralsight.com/guides/linear-lasso-and-ridge-regression-with-r
Loading the data
# Read the training data, drop the unused lights column, and split off the target
train <- read.csv("../data/energy_data_train.csv")
train <- subset(train, select = -c(lights))
train_attributes <- subset(train, select = -c(logAppliances))

# Read the test data and split it the same way
test <- read.csv("../data/energy_data_test.csv")
test_attributes <- subset(test, select = -c(lights, logAppliances))
test_y <- subset(test, select = c(logAppliances))

# Keep the target as a plain vector and rebuild train with it as the last column
logAppliances <- train$logAppliances
train <- cbind(train_attributes, logAppliances)
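Before fitting anything, it is worth checking that the two attribute frames line up. A minimal sanity-check sketch (base R only; the object names come from the split above):

dim(train_attributes)                                     # rows x predictors, train
dim(test_attributes)                                      # rows x predictors, test
setdiff(names(train_attributes), names(test_attributes))  # expect character(0)
summary(logAppliances)                                    # distribution of the target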
Linear regression
# Fit a linear model of logAppliances on all remaining predictors
model1 <- lm(logAppliances ~ ., data = train, na.action = na.exclude)
summary(model1)
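summary() reports the coefficient table (estimates, standard errors, t statistics, p-values), the residual standard error, R-squared and adjusted R-squared, and the overall F statistic. To pull out pieces of that table programmatically, a small sketch using base R accessors:

coef(summary(model1))[, "Pr(>|t|)"]  # named vector of per-coefficient p-values
confint(model1)                      # 95% confidence intervals for the coefficients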
Prediction
prediction <- predict(model1, newdata = test_attributes)

# Residual plot: residuals against the observed target
res <- resid(model1)
plot(train$logAppliances, res, ylab = "Residuals", xlab = "Appliances Energy Usage")

# Step 1 - create the evaluation metrics function
eval_metrics <- function(model, y, predictions) {
  resids  <- y - predictions
  resids2 <- resids^2
  N <- length(predictions)
  adj_r2 <- as.character(round(summary(model)$adj.r.squared, 5))
  print(adj_r2)                                          # adjusted R-squared
  print(as.character(round(sqrt(sum(resids2) / N), 5)))  # RMSE
  return(round(sqrt(sum(resids2) / N), 5))               # RMSE
}
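The metric printed and returned by eval_metrics() is the root mean squared error, RMSE = sqrt(sum((y_i - yhat_i)^2) / N). A tiny self-contained illustration with made-up numbers:

y    <- c(3.2, 4.1, 2.8)                # hypothetical observed values
yhat <- c(3.0, 4.4, 2.9)                # hypothetical predictions
sqrt(sum((y - yhat)^2) / length(yhat))  # the same quantity eval_metrics() returns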
# Step 2 - predict and evaluate the model on the training data
predictions <- predict(model1, newdata = train_attributes)
eval_metrics(model1, train$logAppliances, predictions)

# Step 3 - predict and evaluate the model on the test data
predictions <- predict(model1, newdata = test_attributes)
eval_metrics(model1, test_y$logAppliances, predictions)  # pass the vector, not the data frame
# Refit after dropping the weakest humidity predictors
train <- read.csv("../data/energy_data_train.csv")
train <- subset(train, select = -c(lights, RH_6, RH_7, RH_9, RH_out))
model2 <- lm(logAppliances ~ ., data = train, na.action = na.exclude)
summary(model2)

prediction <- predict(model2, newdata = test_attributes)

# Residual plot for the reduced model
res <- resid(model2)
plot(train$logAppliances, res, ylab = "Residuals", xlab = "Appliances Energy Usage")

# Evaluate on train and test
predictions <- predict(model2, newdata = train_attributes)
eval_metrics(model2, train$logAppliances, predictions)
predictions <- predict(model2, newdata = test_attributes)
eval_metrics(model2, test_y$logAppliances, predictions)
F-test https://statisticsbyjim.com/regression/interpret-f-test-overall-significance-regression/ https://bookdown.org/ndphillips/YaRrr/comparing-regression-models-with-anova.html
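The loop below drops one predictor at a time and uses anova() to run a partial F-test of each reduced model against the full model; the null hypothesis is that the dropped predictor contributes nothing. A minimal standalone sketch on a single pair of nested fits (RH_6 is chosen only as an example):

train   <- subset(read.csv("../data/energy_data_train.csv"), select = -c(lights))
full    <- lm(logAppliances ~ .,        data = train)
reduced <- lm(logAppliances ~ . - RH_6, data = train)
anova(reduced, full)  # F statistic and Pr(>F) for dropping RH_6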
# Reload the full training set (lights dropped) for the nested-model tests
train <- read.csv("../data/energy_data_train.csv")
train <- subset(train, select = -c(lights))

# Rank predictors by the absolute value of their correlation with the target
library(corrplot)
res <- cor(train, use = "complete.obs")
res[is.na(res)] <- 0
res <- as.data.frame(res)
res <- abs(subset(res, select = c(logAppliances)))
res_names <- rownames(res)[order(-res$logAppliances)][-1]     # predictor names, strongest first
res <- res$logAppliances[order(-res$logAppliances)][-1]       # the matching sorted |correlations|

# For each predictor, fit a model without it and compare against the full model
rmse   <- list()
f_stat <- list()
p_val  <- list()
full_model <- lm(logAppliances ~ ., data = train, na.action = na.exclude)
for (i in 1:length(res_names)) {
  sub_train_attr <- as.data.frame(train[, res_names[-i]])
  sub_train_attr <- cbind(sub_train_attr, logAppliances)
  model1 <- lm(logAppliances ~ ., data = sub_train_attr, na.action = na.exclude)
  predictions <- predict(model1, newdata = as.data.frame(train[, res_names[-i]]))
  rmse   <- append(rmse, eval_metrics(model1, train$logAppliances, predictions))
  f_stat <- append(f_stat, anova(model1, full_model)$F[2])
  p_val  <- append(p_val, anova(model1, full_model)$Pr[2])
}

# Plot RMSE, F statistic, and p-value against which predictor was dropped
plot(seq_along(res_names), unlist(rmse),
     xlab = "number of predictor excluded from model", pch = 19)
print(f_stat)
plot(seq_along(res_names), unlist(f_stat),
     xlab = "number of predictor excluded from model", pch = 19)
plot(seq_along(res_names), unlist(p_val),
     xlab = "number of predictor excluded from model", pch = 19)

# Reload the cleaned training set for the following sections
train <- read.csv("../data/energy_data_train.csv")
train <- subset(train, select = -c(lights))
Likelihood Ratio Test (LRT) https://finnstats.com/index.php/2021/11/24/likelihood-ratio-test-in-r/
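lrtest() from the lmtest package compares nested fits through the statistic 2*(logLik(full) - logLik(reduced)), which is asymptotically chi-squared with degrees of freedom equal to the number of dropped parameters. A minimal standalone sketch, reusing the same example pair of nested models as above:

library(lmtest)
full    <- lm(logAppliances ~ .,        data = train)
reduced <- lm(logAppliances ~ . - RH_6, data = train)
lrtest(reduced, full)  # Chisq statistic and p-value for dropping RH_6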
# LRT: compare each reduced model against the full model
library(lmtest)
lrt   <- list()
p_val <- list()
for (i in 1:length(res_names)) {
  sub_train_attr <- as.data.frame(train[, res_names[-i]])
  sub_train_attr <- cbind(sub_train_attr, logAppliances)
  model1 <- lm(logAppliances ~ ., data = sub_train_attr, na.action = na.exclude)
  print(lrtest(full_model, model1))
  p_val <- append(p_val, lrtest(full_model, model1)$Pr[2])
  lrt   <- append(lrt, lrtest(full_model, model1)$Chisq[2])
}
print(lrt)
plot(seq_along(res_names), unlist(lrt),
     xlab = "number of predictor excluded from model", pch = 19)
plot(seq_along(res_names), unlist(p_val),
     xlab = "number of predictor excluded from model", pch = 19)
Akaike Information Criterion (AIC) https://www.scribbr.com/statistics/akaike-information-criterion/
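AIC = 2k - 2 ln(L), where k is the number of estimated parameters and L the maximised likelihood; lower values indicate a better trade-off between fit and complexity. (aictab() below ranks candidates by AICc, the small-sample correction.) For a single lm fit, base R computes the criterion directly, with no extra packages:

model <- lm(logAppliances ~ ., data = train)
AIC(model)     # 2k - 2*logLik
logLik(model)  # the maximised log-likelihood AIC is built from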
library(Rcpp)
library(AICcmodavg)

# Candidate models: drop every subset (of size 1 to 4) of four chosen
# predictors, identified by their positions in res_names
indexes <- c(2, 6, 11, 13)
com <- combn(indexes, 1, simplify = FALSE)
com <- append(com, combn(indexes, 2, simplify = FALSE))
com <- append(com, combn(indexes, 3, simplify = FALSE))
com <- append(com, combn(indexes, 4, simplify = FALSE))

models <- list()  # initialize before filling in the loop
for (i in 1:length(com)) {
  sub_train_attr <- as.data.frame(train[, res_names[-com[[i]]]])
  sub_train_attr <- cbind(sub_train_attr, logAppliances)
  models[[i]] <- lm(logAppliances ~ ., data = sub_train_attr, na.action = na.exclude)
}

# Rank the candidate models by AICc
aictab(cand.set = models, modnames = as.character(seq_along(com)))

# Refit with the predictor set suggested by the AIC comparison
train <- read.csv("../data/energy_data_train.csv")
train <- subset(train, select = -c(lights, RH_6, RH_7, RH_out))
model2 <- lm(logAppliances ~ ., data = train, na.action = na.exclude)
summary(model2)

prediction <- predict(model2, newdata = test_attributes)

# Residual plot
res <- resid(model2)
plot(train$logAppliances, res, ylab = "Residuals", xlab = "Appliances Energy Usage")

predictions <- predict(model2, newdata = train_attributes)
eval_metrics(model2, train$logAppliances, predictions)
predictions <- predict(model2, newdata = test_attributes)
eval_metrics(model2, test_y$logAppliances, predictions)
Bayes Information Criterion (BIC) https://www.statology.org/bic-in-r/
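BIC = k ln(n) - 2 ln(L); compared with AIC, the penalty per parameter grows with the sample size, so BIC favours smaller models once n exceeds roughly 8. Base R's BIC() handles lm objects directly, so a one-fit check needs no extra packages:

model <- lm(logAppliances ~ ., data = train)
BIC(model)  # k*log(n) - 2*logLik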
# BIC for the same candidate models
train <- read.csv("../data/energy_data_train.csv")
library(flexmix)
bics <- list()
for (i in 1:length(com)) {
  sub_train_attr <- as.data.frame(train[, res_names[-com[[i]]]])
  sub_train_attr <- cbind(sub_train_attr, logAppliances)
  model1 <- lm(logAppliances ~ ., data = sub_train_attr, na.action = na.exclude)
  bics <- append(bics, BIC(model1))
}
plot(seq_along(com), unlist(bics), xlab = "model no.", pch = 19)