############################################################
### Taking a look at the bikeshare data ###
############################################################
# Load the hourly and the daily version of the bikeshare data.
bikes_h <- read.csv(file = "data-raw/bikesharing-raw/hour.csv")
bikes_d <- read.csv(file = "data-raw/bikesharing-raw/day.csv")

# Goal: reverse engineer how the daily file is derived from the hourly one.

# Temperature: the daily value looks like the average over the whole day
# (although a really cold night would probably not influence renting
# behaviour on a sunny, warm day). Compare the mean of the first 24
# hourly rows with the first daily row.
mean(bikes_h[["temp"]][1:24])
bikes_d[["temp"]][1]

# Weather situation (categorical): the daily aggregate appears to have
# been calculated as a simple mean of the hourly codes. It is clearly not
# the mode, as the first day shows: compare the hourly frequency table,
# the daily value and the hourly mean.
table(bikes_h[["weathersit"]][1:24])
bikes_d[["weathersit"]][1]
mean(bikes_h[["weathersit"]][1:24])

# Days with missing hours: the daily temp is still the plain mean of the
# hourly temps that are present (rows 25 to 47 here, i.e. only 23 rows),
# so it is not really the daily average anymore.
# This is machine learning for sure...
# (Note that the daily value is cut to six decimals, also for the days
# without missing data.)
mean(bikes_h[["temp"]][25:47])
bikes_d[["temp"]][2]
############################################################
### Variable selection ###
############################################################
# First look at the daily data.
head(bikes_d)
str(bikes_d)

# temp and atemp are correlated at > .99, so one of them has to go.
with(bikes_d, cor(temp, atemp))

# casual + registered = cnt. We want to model the total count; one could
# argue for any one of the three, but cnt seems the least suspicious.
# instant and dteday are dropped as well: instant is just an ID variable,
# and there are way too few observations to include day-of-the-year as
# 364 indicators.
df <- subset(
  bikes_d,
  select = -c(instant, dteday, atemp, casual, registered)
)
head(df)

# OLS estimates of the regression coefficients using every remaining
# variable. All variables are significant at 10 %; workingday, however,
# is not significant at 5 %.
olsmod1 <- lm(formula = cnt ~ ., data = df)
summary(olsmod1)

# The same regression with workingday removed from the predictors.
olsmod <- lm(
  formula = cnt ~ .,
  data = subset(df, select = -c(workingday))
)
summary(olsmod)

# season is highly significant, and mnth only slightly. If we exclude
# season (arguing that mnth should capture seasonality better), the
# significance of mnth increases greatly.
# What if we include an interaction?
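# NOTE(review): the two lines below look like leftover web-page boilerplate,
# not part of the analysis; commented out so the script still parses.
# Add the following code to your website.
# For more information on customizing the embed code, read Embedding Snippets.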