TYP DANYCH
# Load the energy data set; text columns stay plain character vectors.
data <- read.csv("data/energydata_complete.csv", stringsAsFactors = FALSE)

# Sort the observations in ascending order of the `date` column.
data <- data[order(data[["date"]]), ]

# Print the structure (column names and types) of the loaded data frame.
str(data)
OGÓLNE STATYSTYKI DANYCH
# Summary statistics for the numeric columns (columns 2-29; column 1 holds
# the timestamp and is handled separately below).
numeric_cols <- data[, 2:29]
row_names <- colnames(numeric_cols)

# vapply() walks the data frame column by column and guarantees one number
# per column, instead of coercing the whole frame to a matrix as apply() does.
# The functions are namespace-qualified because the result variables below
# deliberately keep their original names, which shadow the base functions.
mean <- vapply(numeric_cols, base::mean, numeric(1))
median <- vapply(numeric_cols, stats::median, numeric(1))
min <- vapply(numeric_cols, base::min, numeric(1))
max <- vapply(numeric_cols, base::max, numeric(1))
std <- vapply(numeric_cols, stats::sd, numeric(1))

number_data.df <- data.frame(row_names, mean, median, min, max, std)
number_data.df[, 2:6] <- round(number_data.df[, 2:6], 2)
number_data.df[order(number_data.df$row_names), ]

# Summary statistics for the date column.
# Parse the timestamps to calendar dates once and reuse the result, so every
# statistic (including the median) is computed on real Date values rather
# than on the raw character strings.
date_parsed <- as.Date(data[, 1], format = "%Y-%m-%d")
mean_date <- base::mean(date_parsed)
median_date <- stats::median(date_parsed)
min_date <- base::min(date_parsed)
max_date <- base::max(date_parsed)

# Standard deviation of the dates: computed in seconds, then converted to days.
date_numeric <- as.numeric(as.POSIXct(data[, 1], format = "%Y-%m-%d"))
std_date_days <- stats::sd(date_numeric, na.rm = TRUE) / 60 / 60 / 24

date_data.df <- data.frame("date", mean_date, median_date, min_date, max_date, std_date_days)
#date_data.df[,2:6]<-round(number_data.df[,2:6], 2)
date_data.df
WARTOŚCI BRAKUJĄCE
# Logical mask of the missing cells, computed once and reused below.
na_mask <- is.na(data)

# Number of columns that contain at least one missing value.
# (The previous which(is.na(data)) returned the linear positions of the
# missing cells, not a column count.)
sum(colSums(na_mask) > 0)

# Total number of missing values in the whole data frame.
sum(na_mask)
WYKRESY ZMIENNYCH CELU W CZASIE
# Calendar date of each observation (time of day dropped), used as the x axis.
data1 <- as.Date(data$date, format = c("%Y-%m-%d"))

# Line plots of the two target variables over time.
plot(
  x = data1, y = data$Appliances,
  type = "l", col = 3,
  main = "Power usage from Appliances in time"
)
plot(
  x = data1, y = data$lights,
  type = "l", col = 3,
  main = "Power usage from lights in time"
)
Histogram zmiennych
# Install Hmisc only when it is missing, so that re-running the analysis does
# not download and reinstall the package every time.
if (!requireNamespace("Hmisc", quietly = TRUE)) {
  install.packages("Hmisc")
}
library(Hmisc)
# Reorder the columns of `data` alphabetically by column name.
data <- data[, order(colnames(data))]

# Working copy without the `date` column.
data_numeric <- subset(data, select = -date)

# Draw the histograms in two pages: columns 1-14, then columns 15-28.
for (page_cols in list(1:14, 15:28)) {
  hist.data.frame(data_numeric[, page_cols])
}
Quantile - Quantile plot
# Normal Q-Q plot, with reference line, for every column of `data_numeric`.
# seq_along() is used instead of 1:ncol(...) so the loop body is skipped
# (rather than run with indices 1 and 0) if the frame has no columns.
for (i in seq_along(data_numeric)) {
  column_values <- data_numeric[[i]]
  qqnorm(column_values, main = names(data_numeric)[i], col = 3)
  qqline(column_values)
}
Wykresy korelacji
# Install the plotting/date helper packages only when they are missing, so
# that re-running the analysis does not reinstall them every time.
for (pkg in c("corrplot", "lubridate")) {
  if (!requireNamespace(pkg, quietly = TRUE)) {
    install.packages(pkg)
  }
}
library(lubridate)
library(corrplot)

# Hour of day for every observation.
# The timestamp is addressed by name: the columns of `data` are reordered
# alphabetically earlier in this file, so column 1 is no longer `date`.
# Parsing in UTC keeps the hour exactly as written in the file and avoids
# daylight-saving gaps of the local time zone.
timestamps <- as.POSIXct(data$date, format = "%Y-%m-%d %H:%M:%S", tz = "UTC")
hour <- lubridate::hour(timestamps)
data_numeric["hour"] <- hour

# Time-of-day codes:
#   22:00 - 06:00  night            -> 1
#   06:00 - 10:00  morning          -> 2
#   10:00 - 12:00  late morning     -> 3
#   12:00 - 14:00  early afternoon  -> 4
#   14:00 - 18:00  afternoon        -> 5
#   18:00 - 22:00  evening          -> 0
# findInterval() maps each hour to the period whose start it has reached;
# the night period appears twice because it wraps around midnight.
# An hour that could not be parsed (NA) yields NA rather than a code.
period_starts <- c(0, 6, 10, 12, 14, 18, 22)
period_codes <- c(1, 2, 3, 4, 5, 0, 1)
time_of_day <- period_codes[findInterval(hour, period_starts)]
data_numeric["time_of_day"] <- time_of_day

# Correlation matrix over all columns, using only complete rows.
res <- cor(data_numeric, use = "complete.obs")
round(res, 2)
corrplot(res, type = "full", order = "hclust", tl.col = "black")
Możemy zauważyć, że atrybuty Appliances, Visibility, Windspeed, RH_out, Tdewpoint, T9, T3, T4, T5 mają rozkłady niebędące rozkładem normalnym.
Atrybuty rv1, rv2, r6 wydają się być atrybutami losowymi.
# Log-transform the attributes whose distributions look non-normal.
# Maps each new column name to the source column it is computed from.
# logRh_out is taken from RH_out (it was previously computed from Tdewpoint,
# contradicting its name).
# NOTE(review): assumes the humidity column is named "RH_out" - confirm
# against the CSV header.
log_sources <- c(
  logAppliances = "Appliances",
  logVisibility = "Visibility",
  logWindspeed = "Windspeed",
  logRh_out = "RH_out",
  logT9 = "T9",
  logT3 = "T3",
  logT4 = "T4",
  logT5 = "T5"
)
for (log_name in names(log_sources)) {
  data[[log_name]] <- log(data[[log_sources[[log_name]]]])
}
str(data)

# Rebuild the numeric working copy from `data`; it already contains the log
# columns, so they do not need to be computed a second time.
# NOTE(review): this discards columns that were added only to the previous
# `data_numeric` (e.g. `hour`, `time_of_day`) - confirm that is intended.
data_numeric <- subset(data, select = -date)
hist.data.frame(data_numeric[, 1:14])
hist.data.frame(data_numeric[, 15:28])
hist.data.frame(data_numeric[, 29:36])

# Replace missing values (NA/NaN) with 0 before computing correlations.
# NOTE(review): is.na() does not catch -Inf, which log() returns for a zero
# input - check whether any source column contains zeros.
data_numeric[is.na(data_numeric)] <- 0
res <- cor(data_numeric, use = "complete.obs")
# Undefined correlations are shown as 0.
res[is.na(res)] <- 0
round(res, 2)
corrplot(res, type = "full", order = "hclust", tl.col = "black")
Save all numeric data to the new file
# Write the numeric working copy to a CSV file.
# NOTE(review): the input is read from "data/..." elsewhere in this file,
# while the output goes to "../data/..." - confirm the directory is intended.
write.csv(
  x = data_numeric,
  file = '../data/energy_data_after_analysis.csv'
)
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.