knitr::opts_chunk$set(echo = TRUE, message = F, warning = F, fig.width = 13, fig.height = 6, comment = "")
library(tidyverse) library(lubridate) library(e1071) library(gridExtra) library(janitor) library(data.table) library(funModeling) library(kableExtra) library(htmltools)
# Dataset of Sales df_sales_raw <- data.table::fread("/home/renato/repos/Rossmann/inst/Data/train.csv", stringsAsFactors = T, na.strings = c("","'","NA")) # Dataset of Stores df_store_raw <- data.table::fread("/home/renato/repos/Rossmann/inst/Data/store.csv", stringsAsFactors = T, na.strings=c("","'","NA")) # merge df_raw <- merge(df_sales_raw, df_store_raw, by= "Store")
#making a copy of the dataset df_raw df1 <- df_raw %>% clean_names() rmarkdown::paged_table(head(df1))
print(paste("Number of Rows: " ,nrow(df_raw))) print(paste("Number of Cols: " ,ncol(df_raw)))
# converting the "date" feature to datetime df1$date <- ymd(df1$date) glimpse(df1)
colSums(is.na(df1))
# removing missing values df1 <- df1 %>% mutate( # replace the missing values with the value of 200000 competition_distance = ifelse(is.na(competition_distance), 200000, competition_distance), # replace the missing values with the month in the date column competition_open_since_month = ifelse(is.na(competition_open_since_month), month(date), competition_open_since_month), # replace the missing values with the year in the date column competition_open_since_year = ifelse(is.na(competition_open_since_year), year(date), competition_open_since_year), # replace the missing values with the week in the date column promo2since_week = ifelse(is.na(promo2since_week ), week(date), promo2since_week ), # replace the missing values with the year in the date column promo2since_year = ifelse(is.na(promo2since_year), year(date), promo2since_year), month_map = month(date), month_map = month.abb[month_map]) # removing the blanks df1$promo_interval <- str_squish(df1$promo_interval) # replacing missing values with 0 df1$promo_interval[df1$promo_interval==""] = "0" # creating a column with the months that the promotion is active with value 1 and not active with value 0 df1 <- df1 %>% mutate(is_promo = ifelse(promo_interval == "0",0,ifelse(str_detect(promo_interval, month_map),1,0))) # viewing a random sample of the data kable(t(df1[sample(nrow(df1), 5), ])) %>% kable_styling(full_width = F, bootstrap_options = c("striped", "hover", "condesend", "responsive"),html_font = "Cambria")
df1 <- df1 %>% mutate(competition_distance = as.integer(competition_distance), month_map = as.factor(month_map))
# selecting only numeric features num_attributes <- df1 %>% keep(is.numeric) # selecting only categorical features cat_attributes <- df1 %>% keep(is.factor)
# Central Tendency - mean , median num_mean <- as.data.frame( t(lapply(num_attributes, mean))) num_median <- as.data.frame( t(lapply(num_attributes, median))) # dispersion - std, min, max, range, skew, kurtosis num_std <- as.data.frame( t(lapply(num_attributes, sd))) num_min <- as.data.frame( t(lapply(num_attributes, min))) num_max <- as.data.frame( t(lapply(num_attributes, max))) num_skew <- as.data.frame( t(lapply(num_attributes, skewness))) num_kurt <- as.data.frame( t(lapply(num_attributes, kurtosis))) table_desc <- t(bind_rows(num_min,num_max,num_mean,num_median,num_std,num_skew,num_kurt)) table_desc<- as.data.frame(table_desc) names(table_desc) <- c("min","max","mean","median","std","skew", "kurtosis") kable(table_desc, digits = 4) %>% kable_styling(full_width = F, bootstrap_options = c("striped", "hover", "condesend", "responsive"),html_font = "Cambria")
df1 %>% ggplot(aes(sales))+ geom_histogram(aes(y =..density..),col="black", fill="steelblue")+ stat_function(fun = dnorm, args = list(mean = mean(df1$sales), sd = sd(df1$sales)), col="red", lwd=1)
apply(cat_attributes, 2, function(x) length(unique(x)))
boxplot_01 <- df1 %>% filter(state_holiday != 0 & sales > 0 ) %>% ggplot(aes(x = state_holiday, y = sales, fill=state_holiday))+ scale_y_continuous(breaks = seq(0,40000,5000))+ geom_boxplot() boxplot_02 <- df1 %>% filter(state_holiday != 0 & sales > 0 ) %>% ggplot(aes(x = store_type, y = sales, fill=store_type))+ scale_y_continuous(breaks = seq(0,40000,5000))+ geom_boxplot() boxplot_03 <- df1 %>% filter(state_holiday != 0 & sales > 0 ) %>% ggplot(aes(x = assortment, y = sales, fill=assortment))+ scale_y_continuous(breaks = seq(0,40000,5000))+ geom_boxplot() grid.arrange(boxplot_01,boxplot_02,boxplot_03,nrow= 2,ncol=2)
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.