# data-raw/bikesharing-scripts/01_bikes_explorative.R

############################################################
### Taking a look at the bikeshare data                  ###
############################################################

# Load the raw bike-sharing data: the daily version (bikes_d) and the
# hourly version (bikes_h). Both paths are relative to the working directory.
bikes_d <- read.csv(file = "data-raw/bikesharing-raw/day.csv")
bikes_h <- read.csv(file = "data-raw/bikesharing-raw/hour.csv")

# How is the daily version derived from the hourly version? Compare values
# in the daily data with aggregates computed by hand from the hourly rows.

# temp: the daily value is the average temperature over the whole day
# (even though a really cold night would probably not influence the renting
# behaviour if the day is sunny and warm).
mean(bikes_h$temp[1:24])
bikes_d$temp[1]

# weathersit: a categorical weather variable. The daily aggregate appears
# to have been calculated as a simple mean. It is clearly not the mode, as
# the first day shows.
table(bikes_h$weathersit[1:24])
bikes_d$weathersit[1]
mean(bikes_h$weathersit[1:24])

# Days with missing hours: temp is still the mean over the hours that are
# present, so it is not actually the daily average anymore.
# This is machine learning for sure...
# (Note that the daily value is cut to six decimals, also for the days
# without missing data.)
mean(bikes_h$temp[25:47])
bikes_d$temp[2]

############################################################
### Variable selection                                   ###
############################################################

# First look at the daily data: leading rows and column structure.
head(bikes_d)
str(bikes_d)

# temp and atemp are almost perfectly correlated (> .99), so one of them
# has to go.
with(bikes_d, cor(temp, atemp))

# Columns left out of the modelling data:
#   * casual, registered: casual + registered = cnt, and the total cnt is
#     what we want to model. One could argue for any one of the three, but
#     cnt seems the least suspicious.
#   * atemp: redundant given temp (see the correlation above).
#   * instant: just an ID variable.
#   * dteday: there are way too few observations to include
#     day-of-the-year as 364 indicators.
df <- bikes_d[, !(names(bikes_d) %in% c(
  "instant", "dteday", "atemp", "casual", "registered"
))]
head(df)

# OLS estimates of the regression coefficients using every remaining
# variable. All variables are significant at the 10 % level; workingday,
# however, is not significant at 5 %.
olsmod1 <- lm(formula = cnt ~ ., data = df)
summary(olsmod1)

# Same model refitted without workingday.
olsmod <- lm(formula = cnt ~ ., data = subset(df, select = -c(workingday)))
summary(olsmod)

# Season is highly significant, and mnth only slightly. If we exclude
# season (arguing that mnth should capture seasonality better), mnth
# becomes much more significant.

# what if we include an interaction
# ooelrich/oscbvar documentation built on Sept. 8, 2021, 3:31 p.m.