TYP DANYCH

# Load the raw energy data set, keeping text columns as plain character vectors.
data <- read.csv("data/energydata_complete.csv", stringsAsFactors = FALSE)
# Put the rows in ascending order of the `date` column.
data <- data[order(data$date), , drop = FALSE]
# Print the structure (column names, types, first values) of the loaded data.
str(data)

OGÓLNE STATYSTYKI DANYCH

# Summary statistics for columns 2-29 of `data` (column 1 holds `date` at this
# point and is summarised separately).
numeric_cols <- data[, 2:29]
row_names <- colnames(numeric_cols)

# Apply one summary function to every column and return a named numeric vector.
# vapply() works column by column, so the data frame is never coerced to a
# matrix, and the results are not stored under names such as `mean` or `min`
# that would shadow the base functions.
summarise_cols <- function(fun) {
  vapply(numeric_cols, fun, numeric(1))
}

# The columns keep the names mean / median / min / max / std.
number_data.df <- data.frame(
  row_names,
  mean = summarise_cols(mean),
  median = summarise_cols(median),
  min = summarise_cols(min),
  max = summarise_cols(max),
  std = summarise_cols(sd)
)
# Round the five statistic columns to two decimal places.
number_data.df[, 2:6] <- round(number_data.df[, 2:6], 2)
# Print the table sorted by variable name.
number_data.df[order(number_data.df$row_names), ]

# Summary statistics for the date column (first column of `data`).
# Only the "%Y-%m-%d" part of each timestamp is parsed, so every statistic
# below works at whole-day resolution.
dates <- as.Date(data[, 1], format = "%Y-%m-%d")
mean_date <- mean(dates)
# The median is taken on the parsed dates, like the other statistics, instead
# of on the raw character strings.
median_date <- median(dates)
min_date <- min(dates)
max_date <- max(dates)
# Seconds since the epoch at midnight of each day; the time zone is fixed to
# UTC so the result does not depend on the local zone or daylight saving time.
date_numeric <- as.numeric(as.POSIXct(data[, 1], format = "%Y-%m-%d", tz = "UTC"))
# Standard deviation converted from seconds to days.
std_date_days <- sd(date_numeric, na.rm = TRUE) / 60 / 60 / 24

# One-row summary table; the label column is named like in number_data.df.
date_data.df <- data.frame(
  row_names = "date",
  mean_date,
  median_date,
  min_date,
  max_date,
  std_date_days
)
date_data.df

WARTOŚCI BRAKUJĄCE

# Number of columns that contain at least one missing value.
sum(colSums(is.na(data)) > 0)
# Total number of missing values in the whole data set.
sum(is.na(data))

WYKRESY ZMIENNYCH CELU W CZASIE

# Parse the complete timestamp (date and time of day) so that every reading
# gets its own position on the x axis; truncating to whole days would stack all
# readings of one day on a single x value. UTC keeps the parsing independent of
# the local time zone.
data1 <- as.POSIXct(data$date, format = "%Y-%m-%d %H:%M:%S", tz = "UTC")
# Line plots of the two target variables over time.
plot(data1, data$Appliances, type = "l", col = 3,
     main = "Power usage from Appliances in time")
plot(data1, data$lights, type = "l", col = 3,
     main = "Power usage from lights in time")

Histogram zmiennych

# Install Hmisc only when it is not available yet, instead of reinstalling it
# on every run of the script.
if (!requireNamespace("Hmisc", quietly = TRUE)) {
  install.packages("Hmisc")
}
library(Hmisc)
# Reorder the columns of `data` alphabetically by name. After this step the
# date column is no longer guaranteed to be column 1.
data <- data[, order(names(data))]
# All columns except `date`.
data_numeric <- subset(data, select = -date)
# Histograms of the 28 remaining columns, drawn in two batches of 14.
hist.data.frame(data_numeric[, 1:14])
hist.data.frame(data_numeric[, 15:28])

Quantile - Quantile plot

# Draw a normal Q-Q plot with its reference line for every column of
# data_numeric, titled with the column name. seq_along() yields an empty
# sequence when there are no columns, unlike 1:ncol(...).
for (i in seq_along(data_numeric)) {
  qqnorm(data_numeric[[i]], main = names(data_numeric)[i], col = 3)
  qqline(data_numeric[[i]])
}

Wykresy korelacji

# Install the plotting / date helper packages only when they are missing.
for (pkg in c("corrplot", "lubridate")) {
  if (!requireNamespace(pkg, quietly = TRUE)) {
    install.packages(pkg)
  }
}
library(lubridate)
library(corrplot)

# Hour of day of every reading. The date column is addressed by name because
# the columns of `data` were reordered alphabetically earlier, so it is not
# column 1 any more. UTC keeps the parsing independent of the local time zone.
# The result is stored as `hour_of_day` so that lubridate's hour() is not
# shadowed by a variable of the same name.
hour_of_day <- hour(
  as.POSIXct(data$date, format = "%Y-%m-%d %H:%M:%S", tz = "UTC")
)
data_numeric["hour"] <- hour_of_day

# Time-of-day codes:
#   22:00 - 06:00  night            1
#   06:00 - 10:00  morning          2
#   10:00 - 12:00  late morning     3
#   12:00 - 14:00  early afternoon  4
#   14:00 - 18:00  afternoon        5
#   18:00 - 22:00  evening          0  (the initial value, no assignment needed)
time_of_day <- rep(0, length(hour_of_day))
time_of_day[hour_of_day >= 22 | hour_of_day < 6] <- 1
time_of_day[hour_of_day >= 6 & hour_of_day < 10] <- 2
time_of_day[hour_of_day >= 10 & hour_of_day < 12] <- 3
time_of_day[hour_of_day >= 12 & hour_of_day < 14] <- 4
time_of_day[hour_of_day >= 14 & hour_of_day < 18] <- 5
data_numeric["time_of_day"] <- time_of_day

# Correlation matrix over rows without missing values, printed to two decimals
# and drawn with variables grouped by hierarchical clustering.
res <- cor(data_numeric, use = "complete.obs")
round(res, 2)
corrplot(res, type = "full", order = "hclust",
         tl.col = "black")

Możemy zauważyć, że atrybuty Appliances, Visibility, Windspeed, RH_out, Tdewpoint, T9, T3, T4, T5 mają rozkłady niebędące rozkładem normalnym.

Atrybuty rv1, rv2, r6 wydają się być atrybutami losowymi.

# Log-transformed copies of selected columns: new column name -> source column.
# logRh_out is computed from RH_out, as its name says; it was previously filled
# with log(Tdewpoint).
log_sources <- c(
  logAppliances = "Appliances",
  logVisibility = "Visibility",
  logWindspeed = "Windspeed",
  logRh_out = "RH_out",
  logT9 = "T9",
  logT3 = "T3",
  logT4 = "T4",
  logT5 = "T5"
)
# Append the eight log columns to `data` in the order listed above.
data[names(log_sources)] <- lapply(data[unname(log_sources)], log)
str(data)

# Rebuild data_numeric from `data` without `date`. It already contains the log
# columns, so they are not computed a second time. Columns added to the
# previous data_numeric (hour, time_of_day) are not carried over.
data_numeric <- subset(data, select = -date)
hist.data.frame(data_numeric[, 1:14])
hist.data.frame(data_numeric[, 15:28])
hist.data.frame(data_numeric[, 29:36])

# Replace missing values with 0 before correlating. is.na() is also TRUE for
# NaN, but infinite values are left untouched.
data_numeric[is.na(data_numeric)] <- 0
res <- cor(data_numeric, use = "complete.obs")
# Correlations that could not be computed are shown as 0.
res[is.na(res)] <- 0

round(res, 2)
corrplot(res, type = "full", order = "hclust",
         tl.col = "black")

Save all numeric data to the new file

# Export data_numeric as a CSV file using write.csv's default settings.
write.csv(x = data_numeric, file = '../data/energy_data_after_analysis.csv')


Demongo2009/ZUM_Appliances documentation built on May 31, 2022, 8:28 a.m.