knitr::opts_chunk$set(echo = TRUE, message = F, warning = F, fig.width = 13, fig.height = 6, comment = "")

Importing Needed packages

library(tidyverse)
library(lubridate)
library(e1071) 
library(gridExtra)
library(janitor)
library(data.table)
library(funModeling)
library(kableExtra)
library(htmltools)

Helper Functions

Reading the data

# Dataset of Sales
df_sales_raw <- data.table::fread("/home/renato/repos/Rossmann/inst/Data/train.csv", stringsAsFactors = T,  na.strings = c("","'","NA"))

# Dataset of Stores
df_store_raw <- data.table::fread("/home/renato/repos/Rossmann/inst/Data/store.csv", stringsAsFactors = T,  na.strings=c("","'","NA"))

# merge
df_raw <- merge(df_sales_raw, df_store_raw, by= "Store")

Descricption of Data

Rename Columns

#making a copy of the dataset df_raw
df1 <- df_raw %>% 
  clean_names()

rmarkdown::paged_table(head(df1))

Data Dimensions

print(paste("Number of Rows: " ,nrow(df_raw)))
print(paste("Number of Cols: " ,ncol(df_raw)))

Data Types

# converting the "date" feature to datetime
df1$date <- ymd(df1$date)

glimpse(df1)

Checking NA

colSums(is.na(df1))

Fillout NA

# removing missing values
df1 <- df1 %>% 
  mutate(

    # replace the missing values with the value of 200000
    competition_distance = ifelse(is.na(competition_distance), 200000, competition_distance),

    # replace the missing values with the month in the date column     
    competition_open_since_month = ifelse(is.na(competition_open_since_month), month(date), competition_open_since_month),

    # replace the missing values with the year in the date column      
    competition_open_since_year = ifelse(is.na(competition_open_since_year), year(date), competition_open_since_year),

    # replace the missing values with the week in the date column      
    promo2since_week = ifelse(is.na(promo2since_week ), week(date), promo2since_week ),

    # replace the missing values with the year in the date column      
    promo2since_year = ifelse(is.na(promo2since_year), year(date), promo2since_year),

    month_map = month(date),

    month_map = month.abb[month_map])


# removing the blanks 
df1$promo_interval <- str_squish(df1$promo_interval)

# replacing missing values with 0
df1$promo_interval[df1$promo_interval==""] = "0"

# creating a column with the months that the promotion is active with value 1 and not active with value 0
df1 <- df1 %>% 
  mutate(is_promo = ifelse(promo_interval == "0",0,ifelse(str_detect(promo_interval, month_map),1,0)))

# viewing a random sample of the data

kable(t(df1[sample(nrow(df1), 5), ])) %>% 
  kable_styling(full_width = F, bootstrap_options = c("striped", "hover", "condesend", "responsive"),html_font = "Cambria")

Change Types

df1 <- df1 %>% 
  mutate(competition_distance = as.integer(competition_distance),
         month_map = as.factor(month_map))

Descriptive Statistics

# selecting only numeric features
num_attributes <- df1 %>% 
  keep(is.numeric)

# selecting only categorical features
cat_attributes <- df1 %>% 
  keep(is.factor)

Numeric Attributes

# Central Tendency  - mean , median
num_mean <- as.data.frame( t(lapply(num_attributes, mean)))

num_median <- as.data.frame( t(lapply(num_attributes, median)))

# dispersion - std, min, max, range, skew, kurtosis
num_std <- as.data.frame( t(lapply(num_attributes, sd)))

num_min <- as.data.frame( t(lapply(num_attributes, min)))

num_max <- as.data.frame( t(lapply(num_attributes, max)))

num_skew <- as.data.frame( t(lapply(num_attributes, skewness)))

num_kurt <- as.data.frame( t(lapply(num_attributes, kurtosis)))

table_desc <- t(bind_rows(num_min,num_max,num_mean,num_median,num_std,num_skew,num_kurt))

table_desc<- as.data.frame(table_desc)


names(table_desc) <- c("min","max","mean","median","std","skew", "kurtosis")


kable(table_desc, digits = 4) %>% 
  kable_styling(full_width = F, bootstrap_options = c("striped", "hover", "condesend", "responsive"),html_font = "Cambria")
df1 %>% 
  ggplot(aes(sales))+
  geom_histogram(aes(y =..density..),col="black", fill="steelblue")+
  stat_function(fun = dnorm, args = list(mean = mean(df1$sales), sd = sd(df1$sales)), col="red", lwd=1)

Categorical Attributes

apply(cat_attributes, 2, function(x) length(unique(x)))
boxplot_01 <- df1 %>% 
  filter(state_holiday != 0 & sales > 0 ) %>% 
  ggplot(aes(x = state_holiday, y = sales, fill=state_holiday))+
  scale_y_continuous(breaks = seq(0,40000,5000))+
  geom_boxplot()



boxplot_02 <- df1 %>% 
  filter(state_holiday != 0 & sales > 0 ) %>% 
  ggplot(aes(x = store_type, y = sales, fill=store_type))+
  scale_y_continuous(breaks = seq(0,40000,5000))+
  geom_boxplot()

boxplot_03 <- df1 %>% 
  filter(state_holiday != 0 & sales > 0 ) %>% 
  ggplot(aes(x = assortment, y = sales, fill=assortment))+
  scale_y_continuous(breaks = seq(0,40000,5000))+
  geom_boxplot()

grid.arrange(boxplot_01,boxplot_02,boxplot_03,nrow= 2,ncol=2)


rsprojeto/Rossmann documentation built on Feb. 7, 2021, 5:32 a.m.