# Global knitr chunk defaults: echo the code, hide messages and warnings,
# use wide figures, and print output without a comment prefix.
# TRUE/FALSE are spelled out because T/F are ordinary, reassignable variables.
knitr::opts_chunk$set(
  echo = TRUE,
  message = FALSE,
  warning = FALSE,
  fig.width = 13,
  fig.height = 6,
  comment = ""
)
library(tidyverse) library(lubridate) library(e1071) library(gridExtra) library(janitor) library(data.table) library(funModeling) library(kableExtra) library(htmltools) library(ISOweek) library(plotly) library(vcd) library(timetk) library(ggpubr) theme_set(theme_minimal())
#' Pairwise association matrix for categorical columns
#'
#' Cross-tabulates every pair of columns of `x` and extracts one association
#' statistic from `vcd::assocstats()` for each pair.
#'
#' @param x A data frame (or matrix) whose columns are categorical variables.
#' @param type Which element of the `assocstats()` result to extract:
#'   "cramer", "phi" or "contingency".
#' @return A square matrix with one row and one column per column of `x`,
#'   labelled with `colnames(x)`.
catcor <- function(x, type = c("cramer", "phi", "contingency")) {
  type <- match.arg(type)

  # Keep the caller's labels, then coerce so that `[[` extracts a column
  # regardless of whether `x` is a data.frame, tibble, data.table or matrix.
  labels <- colnames(x)
  cols <- as.data.frame(x, stringsAsFactors = FALSE)
  nc <- ncol(cols)

  # seq_len() stays empty for zero columns, where 1:nc would yield c(1, 0).
  pairs <- expand.grid(seq_len(nc), seq_len(nc))

  # Namespaced call instead of require(): a missing package fails loudly.
  stats <- mapply(
    function(i1, i2) vcd::assocstats(table(cols[[i1]], cols[[i2]]))[[type]],
    pairs[, 1],
    pairs[, 2]
  )

  res <- matrix(stats, nc, nc)
  rownames(res) <- colnames(res) <- labels
  res
}
# Location of the Rossmann CSV files and the strings read as missing values.
# Both were repeated verbatim in each fread() call; keeping them in one place
# means a path change only has to be made once.
data_dir <- "/home/renato/repos/Rossmann/inst/Data"
na_markers <- c("", "'", "NA")

# Dataset of Sales
df_sales_raw <- data.table::fread(
  file.path(data_dir, "train.csv"),
  stringsAsFactors = TRUE,
  na.strings = na_markers
)

# Dataset of Stores
df_store_raw <- data.table::fread(
  file.path(data_dir, "store.csv"),
  stringsAsFactors = TRUE,
  na.strings = na_markers
)

# Attach the store attributes to every sales record
df_raw <- merge(df_sales_raw, df_store_raw, by = "Store")
# Work on a copy of df_raw with cleaned column names, and preview its head
df1 <- clean_names(df_raw)
rmarkdown::paged_table(head(df1))

# Report the size of the merged raw table
raw_dims <- dim(df_raw)
print(paste("Number of Rows: ", raw_dims[1]))
print(paste("Number of Cols: ", raw_dims[2]))
# Parse the "date" column as a year-month-day date and inspect the structure
df1 <- df1 %>%
  mutate(date = ymd(date))
glimpse(df1)

# Number of missing values per column
colSums(is.na(df1))
# Fill in the missing values column by column
df1 <- df1 %>%
  mutate(
    # missing competitor distance -> 200000
    competition_distance = ifelse(
      is.na(competition_distance), 200000, competition_distance
    ),
    # missing competitor opening month / year -> month / year of the record
    competition_open_since_month = ifelse(
      is.na(competition_open_since_month), month(date),
      competition_open_since_month
    ),
    competition_open_since_year = ifelse(
      is.na(competition_open_since_year), year(date),
      competition_open_since_year
    ),
    # missing promo2 start week / year -> week / year of the record
    promo2since_week = ifelse(
      is.na(promo2since_week), week(date), promo2since_week
    ),
    promo2since_year = ifelse(
      is.na(promo2since_year), year(date), promo2since_year
    ),
    # abbreviated month name of the record, e.g. "Jan"
    month_map = month.abb[month(date)]
  )

# Normalise whitespace in the promo interval text
df1$promo_interval <- str_squish(df1$promo_interval)

# Mark records without a promo interval with "0". The data was read with ""
# listed in na.strings, so blank intervals may arrive as NA rather than "";
# comparing against "" alone would then leave them (and is_promo) as NA.
no_interval <- is.na(df1$promo_interval) | df1$promo_interval == ""
df1$promo_interval[no_interval] <- "0"

# is_promo is 1 when the record's month appears in the promo interval, else 0
df1 <- df1 %>%
  mutate(
    is_promo = ifelse(
      promo_interval != "0" & str_detect(promo_interval, month_map), 1, 0
    )
  )

# View a random sample of five records, one record per column.
# "condensed" replaces the misspelt "condesend", which is not a
# kable_styling() bootstrap option.
kable(t(df1[sample(nrow(df1), 5), ])) %>%
  kable_styling(
    full_width = F,
    bootstrap_options = c("striped", "hover", "condensed", "responsive"),
    html_font = "Cambria"
  )
# Store the competitor distance as an integer and month_map as a factor
df1 <- df1 %>%
  mutate(
    across(competition_distance, as.integer),
    across(month_map, as.factor)
  )
# Numeric columns only
num_attributes <- df1 %>%
  select(where(is.numeric))

# Factor (categorical) columns only
cat_attributes <- df1 %>%
  select(where(is.factor))
# Descriptive statistics for every numeric attribute.
# Central tendency: mean, median. Dispersion: std, min, max, skew, kurtosis.
summary_funs <- list(
  min = min,
  max = max,
  mean = mean,
  median = median,
  std = sd,
  skew = skewness,
  kurtosis = kurtosis
)

# One row per attribute, one plain numeric column per statistic. Building the
# table from numeric vectors (rather than transposed lists) keeps the columns
# numeric, which is what kable's `digits` rounding operates on.
table_desc <- as.data.frame(
  lapply(summary_funs, function(stat) {
    vapply(num_attributes, function(col) as.numeric(stat(col)), numeric(1))
  }),
  row.names = names(num_attributes)
)

# "condensed" replaces the misspelt "condesend", which is not a
# kable_styling() bootstrap option.
kable(table_desc, digits = 4, caption = "Numerical data description") %>%
  kable_styling(
    full_width = F,
    bootstrap_options = c("striped", "hover", "condensed", "responsive"),
    html_font = "Cambria"
  )
# Density histogram of sales, overlaid with the normal curve that has the
# same mean and standard deviation as the observed sales.
sales_mean <- mean(df1$sales)
sales_sd <- sd(df1$sales)

ggplot(df1, aes(x = sales)) +
  geom_histogram(aes(y = ..density..), colour = "black", fill = "steelblue") +
  stat_function(
    fun = dnorm,
    args = list(mean = sales_mean, sd = sales_sd),
    colour = "red",
    lwd = 1
  ) +
  labs(title = "Distribution Sales") +
  theme(plot.title = element_text(hjust = 0.5, size = 18))
# Number of distinct values in each categorical attribute.
# vapply() walks the columns directly; apply() would first coerce the whole
# data frame to a character matrix.
vapply(cat_attributes, function(x) length(unique(x)), integer(1))
# Records on state holidays with non-zero sales; shared by the three boxplots
holiday_sales <- df1 %>%
  filter(state_holiday != 0 & sales > 0)

# Boxplot of sales split (and filled) by the column named in `group_col`.
sales_boxplot <- function(data, group_col, plot_title) {
  ggplot(
    data,
    aes(x = .data[[group_col]], y = sales, fill = .data[[group_col]])
  ) +
    scale_y_continuous(breaks = seq(0, 40000, 5000)) +
    geom_boxplot() +
    # label the axis and legend with the plain column name
    labs(title = plot_title, x = group_col, fill = group_col) +
    theme(plot.title = element_text(hjust = 0.5, size = 18))
}

boxplot_01 <- sales_boxplot(
  holiday_sales, "state_holiday", "state_holiday vs sales"
)
boxplot_02 <- sales_boxplot(
  holiday_sales, "store_type", "store_type vs sales"
)
# title previously read "assortmet vs sales"
boxplot_03 <- sales_boxplot(
  holiday_sales, "assortment", "assortment vs sales"
)

grid.arrange(boxplot_01, boxplot_02, boxplot_03, nrow = 2, ncol = 2)
# Start the feature-engineering stage from a copy, leaving df1 untouched
df2 <- df1
# Embed the hypothesis mind-map image (absolute path on the author's machine)
knitr::include_graphics("/home/renato/repos/Rossmann/inst/img/MindMapsHypothesis.png")
1. Lojas com número maior de funcionários deveriam vender mais.
1. Stores with more employees should sell more.
2. Lojas com maior capacidade de estoque deveriam vender mais.
2. Stores with greater inventory capacity should sell more.
3. Lojas com maior porte deveriam vender mais.
3. Larger stores should sell more.
4. Lojas com maior sortimento deveriam vender mais.
4. Stores with larger assortments should sell more.
5. Lojas com competidores mais próximos deveriam vender menos.
5. Stores with closer competitors should sell less.
6. Lojas com competidores há mais tempo deveriam vender mais.
6. Stores whose competitors have been around longer should sell more.
1. Lojas que investem mais em Marketing deveriam vender mais.
1. Stores that invest more in Marketing should sell more.
2. Lojas com maior exposição de produto deveriam vender mais.
2. Stores with greater product exposure should sell more.
3. Lojas com produtos com preço menor deveriam vender mais.
3. Stores with lower priced products should sell more.
4. Lojas com promoções mais agressivas ( descontos maiores ), deveriam vender mais.
4. Stores with more aggressive promotions (bigger discounts), should sell more.
5. Lojas com promoções ativas por mais tempo deveriam vender mais.
5. Stores with active promotions for longer should sell more.
6. Lojas com mais dias de promoção deveriam vender mais.
6. Stores with more promotion days should sell more.
7. Lojas com mais promoções consecutivas deveriam vender mais.
7. Stores with more consecutive promotions should sell more.
1. Lojas abertas durante o feriado de Natal deveriam vender mais.
1. Stores open during the Christmas holiday should sell more.
2. Lojas deveriam vender mais ao longo dos anos.
2. Stores should sell more over the years.
3. Lojas deveriam vender mais no segundo semestre do ano.
3. Stores should sell more in the second half of the year.
4. Lojas deveriam vender mais depois do dia 10 de cada mês.
4. Stores should sell more after the 10th of each month.
5. Lojas deveriam vender menos aos finais de semana.
5. Stores should sell less on weekends.
6. Lojas deveriam vender menos durante os feriados escolares.
6. Stores should sell less during school holidays.
1. Lojas com maior sortimento deveriam vender mais.
1. Stores with larger assortments should sell more.
2. Lojas com competidores mais próximos deveriam vender menos.
2. Stores with closer competitors should sell less.
3. Lojas com competidores há mais tempo deveriam vender mais.
3. Stores whose competitors have been around longer should sell more.
4. Lojas com promoções ativas por mais tempo deveriam vender mais.
4. Stores with active promotions for longer should sell more.
5. Lojas com mais dias de promoção deveriam vender mais.
5. Stores with more promotion days should sell more.
6. Lojas com mais promoções consecutivas deveriam vender mais.
6. Stores with more consecutive promotions should sell more.
7. Lojas abertas durante o feriado de Natal deveriam vender mais.
7. Stores open during the Christmas holiday should sell more.
8. Lojas deveriam vender mais ao longo dos anos.
8. Stores should sell more over the years.
9. Lojas deveriam vender mais no segundo semestre do ano.
9. Stores should sell more in the second half of the year.
10. Lojas deveriam vender mais depois do dia 10 de cada mês.
10. Stores should sell more after the 10th of each month.
11. Lojas deveriam vender menos aos finais de semana.
11. Stores should sell less on weekends.
12. Lojas deveriam vender menos durante os feriados escolares.
12. Stores should sell less during school holidays.
# Date-derived features and time since the competitor opened
df2 <- df2 %>%
  mutate(
    # calendar parts of the record date
    year = as.integer(year(date)),
    month = month(date),
    day = day(date),
    week_of_year = week(date),
    # "year-week" label
    year_week = strftime(date, format = "%Y-%W"),
    # day component used to build the competitor opening date
    first_day_of_month = "01",
    # competitor opening date, taken as the first day of the opening month
    competition_since = make_date(
      competition_open_since_year,
      competition_open_since_month,
      first_day_of_month
    ),
    # time since the competitor opened, in 30-day months
    competition_time_month = as.integer(
      difftime(date, competition_since, units = "days") / 30
    )
  )

#' Convert an ISO year / week / weekday triple to a date
#'
#' Builds the "YYYY-Www-d" string expected by `ISOweek2date()` and returns the
#' day before the resulting date.
#'
#' @param ano Year.
#' @param semana Week number; zero-padded to two digits.
#' @param diadasemana Weekday number within the week.
#' @return A Date vector.
data_da_semana <- function(ano, semana, diadasemana) {
  w <- paste0(ano, "-W", sprintf("%02d", semana), "-", diadasemana)
  ISOweek2date(w) - 1
}

df2 <- df2 %>%
  mutate(
    # start date of promo2, from its year and week
    promo_since = data_da_semana(promo2since_year, promo2since_week, 1),
    # time since promo2 started, in whole weeks
    promo_time_week = difftime(date, promo_since, units = "days") / 7,
    promo_time_week = as.integer(promo_time_week),
    # a -> basic, b -> extra, anything else -> extended
    assortment = case_when(
      assortment == "a" ~ "basic",
      assortment == "b" ~ "extra",
      TRUE ~ "extended"
    ),
    # a -> public_holiday, b -> easter_holiday, c -> christmas,
    # anything else -> regular_day
    state_holiday = case_when(
      state_holiday == "a" ~ "public_holiday",
      state_holiday == "b" ~ "easter_holiday",
      state_holiday == "c" ~ "christmas",
      TRUE ~ "regular_day"
    )
  )
# Start the filtering stage from a copy of the engineered data
df3 <- df2

# Keep only days on which the store was open and sold something
df3 <- df3 %>%
  filter(open != 0, sales > 0)

# Drop features that are unavailable at prediction time or no longer needed
df3 <- df3 %>%
  select(-c(customers, open, promo_interval, month_map))
# Start the exploratory analysis from a copy of the filtered data
df4 <- df3

# Histogram of sales counts
ggplot(df4, aes(x = sales)) +
  geom_histogram(colour = "black", fill = "steelblue") +
  scale_x_continuous(breaks = seq(0, 40000, 2000)) +
  labs(title = "count vs sales") +
  theme(plot.title = element_text(hjust = 0.5, size = 18))
O maior número de vendas está entre 4000 e 9000 dólares.
Most sales fall between 4,000 and 9,000 dollars.
# One histogram per numeric attribute, each facet with its own scales.
# num_attributes already holds only numeric columns, so no extra filtering is
# needed; pivot_longer() replaces the superseded gather().
num_attributes %>%
  pivot_longer(everything(), names_to = "key", values_to = "value") %>%
  ggplot(aes(value)) +
  facet_wrap(~ key, scales = "free") +
  geom_histogram(col = "black", fill = "steelblue", bins = 25) +
  # print counts in fixed notation instead of scientific
  scale_y_continuous(labels = function(x) format(x, scientific = FALSE)) +
  labs(title = "Distribution of numerical variables") +
  theme(plot.title = element_text(hjust = 0.5, size = 18))
# Pieces shared by the six panels below
centered_title <- theme(plot.title = element_text(hjust = 0.5, size = 18))
plain_number <- function(x) format(x, scientific = FALSE)
holidays_only <- df4 %>%
  filter(state_holiday != "regular_day")

# state_holiday: record counts and sales density (regular days excluded)
barplot_state_holiday <- ggplot(
  holidays_only, aes(state_holiday, fill = state_holiday)
) +
  geom_bar(col = "black") +
  labs(title = "count vs state_holiday") +
  centered_title

density_state_holiday <- ggplot(
  holidays_only, aes(sales, fill = state_holiday)
) +
  geom_density(alpha = 0.4) +
  labs(title = "Distribution state holiday") +
  centered_title

# store_type: record counts and sales density
barplot_store_type <- ggplot(df4, aes(store_type, fill = store_type)) +
  geom_bar(col = "black") +
  scale_y_continuous(labels = plain_number) +
  labs(title = "count vs store_type") +
  centered_title

density_store_type <- ggplot(df4, aes(sales, fill = store_type)) +
  geom_density(alpha = 0.4) +
  labs(title = "Distribution store_type") +
  centered_title

# assortment: record counts and sales density
barplot_assortment <- ggplot(df4, aes(assortment, fill = assortment)) +
  geom_bar(col = "black") +
  scale_y_continuous(labels = plain_number) +
  labs(title = "count vs assortment") +
  centered_title

density_assortment <- ggplot(df4, aes(sales, fill = assortment)) +
  geom_density(alpha = 0.4) +
  labs(title = "Distribution assortment") +
  centered_title

# Bar plot on the left, density on the right, one row per variable
grid.arrange(
  barplot_state_holiday, density_state_holiday,
  barplot_store_type, density_store_type,
  barplot_assortment, density_assortment,
  nrow = 3, ncol = 2
)
H1. Lojas com maior sortimentos deveriam vender mais.
H1. Stores with larger assortments should sell more.
Falsa Lojas com maior sortimento , vendem menos.
False Stores with greater assortment , sell less.
# Re-display the sales density by assortment (density_assortment is built in an earlier chunk) with the same title and title styling
density_assortment + labs(title= "Distribution assortment")+ theme(plot.title = element_text(hjust = 0.5, size = 18))
Visualizando a variável assortment para verificar se algum momento o sortimento extra foi maior.
Viewing the assortment variable to check if the extra assortment was ever greater.
# Line chart of weekly sales totals, one line per assortment present in `data`.
weekly_assortment_plot <- function(data, plot_title) {
  data %>%
    group_by(date, assortment) %>%
    summarise_by_time(date, .by = "weeks", sales = sum(sales)) %>%
    ungroup() %>%
    ggplot(aes(date, sales)) +
    geom_line(aes(col = assortment), lwd = 1) +
    scale_y_continuous(labels = scales::label_number_si()) +
    scale_x_date(
      breaks = "2 month",
      minor_breaks = "1 week",
      date_labels = "%Y-%W"
    ) +
    labs(title = plot_title) +
    theme(plot.title = element_text(hjust = 0.5, size = 18))
}

# Top: all assortments. Bottom: everything except basic and extended.
ggarrange(
  weekly_assortment_plot(df4, "year/week vs sales / assortment"),
  weekly_assortment_plot(
    filter(df4, !(assortment %in% c("basic", "extended"))),
    "year/week vs sales / extra"
  ),
  ncol = 1
)
Há uma diferença de escala entre basic/extended para extra, por isso plotarei o sortimento extra sozinho.
There is a difference in scale between basic/extended and extra, so I will plot the extra assortment on its own.
H2. Lojas com competidores mais próximos deveriam vender menos.
H2. Stores with closer competitors should sell less.
Falsa Lojas com competidores mais próximos , vendem mais.
False Stores with closer competitors sell more.
# 1000 m wide distance bins from 0 to 20000 m, labelled "0-1000m", ...
bin_edges <- seq(0, 20000, 1000)
label <- paste0(head(bin_edges, -1), "-", tail(bin_edges, -1), "m")

df4$competition_dstance_binned <- cut(
  df4$competition_distance,
  breaks = bin_edges,
  labels = label
)

# Total sales per distance bin; records outside the bins (NA) are dropped
binned_sales <- df4 %>%
  drop_na(competition_dstance_binned) %>%
  group_by(competition_dstance_binned) %>%
  summarise(sales = sum(sales), .groups = "drop")

fig4 <- ggplot(
  binned_sales,
  aes(competition_dstance_binned, sales, fill = competition_dstance_binned)
) +
  geom_col(col = "black") +
  theme(axis.text.x = element_text(angle = 90), legend.position = "none") +
  scale_y_continuous(labels = scales::label_number_si()) +
  labs(title = "Distribution competition_distance vs sales") +
  theme(plot.title = element_text(hjust = 0.5, size = 18))

# Total sales per exact competitor distance
distance_sales <- df4 %>%
  group_by(competition_distance) %>%
  summarise(sales = sum(sales), .groups = "drop")

fig5 <- ggplot(distance_sales, aes(x = competition_distance, sales)) +
  geom_point(shape = 21, fill = "steelblue", size = 3, color = "white") +
  scale_y_continuous(labels = scales::label_number_si()) +
  labs(title = "competition_distance vs sales") +
  theme(plot.title = element_text(hjust = 0.5, size = 18))

grid.arrange(fig4, fig5, nrow = 2, ncol = 1)
# Pearson correlation between competition_distance and sales.
# "condensed" replaces the misspelt "condesend", which is not a
# kable_styling() bootstrap option.
df4 %>%
  select(competition_distance, sales) %>%
  cor(method = "pearson") %>%
  kable(caption = "Correlation Table Competition_distance vs Sales") %>%
  kable_styling(
    full_width = F,
    bootstrap_options = c("striped", "hover", "condensed", "responsive"),
    html_font = "Cambria"
  )
H3. Lojas com competidores há mais tempo deveriam vender mais.
H3. Stores whose competitors have been around longer should sell more.
Falsa Lojas com competidores há mais tempo vendem menos.
False Stores whose competitors have been around longer sell less.
# Total sales per competition_time_month, restricted to values below 120
# and different from 0. Both panels below use the same table.
competition_age_sales <- df4 %>%
  group_by(competition_time_month) %>%
  summarise(sales = sum(sales), .groups = "drop") %>%
  filter(competition_time_month < 120, competition_time_month != 0)

# Scatter plot with a linear trend line
competition_scatter <- ggplot(
  competition_age_sales, aes(competition_time_month, sales)
) +
  geom_point(col = "steelblue", size = 1) +
  geom_smooth(formula = "y~x", method = "lm", se = FALSE, color = "red") +
  scale_y_continuous(labels = scales::label_number_si()) +
  labs(title = "competition_time_month vs sales") +
  theme(plot.title = element_text(hjust = 0.5, size = 18))

# The same totals drawn as a series along competition_time_month
competition_series <- competition_age_sales %>%
  plot_time_series(
    competition_time_month, sales,
    .interactive = F,
    .title = "Competition_time_month Vs Sales",
    .x_lab = "competition_time_month",
    .y_lab = "sales",
    .line_color = "red",
    .line_size = 1
  ) +
  scale_y_continuous(labels = scales::label_number_si()) +
  scale_x_continuous(breaks = seq(-40, 120, 10)) +
  labs(title = "competition_time_month vs sales") +
  theme(plot.title = element_text(hjust = 0.5, size = 18))

ggarrange(competition_scatter, competition_series, ncol = 1)
# Pearson correlation between competition_time_month and sales.
# "condensed" replaces the misspelt "condesend", which is not a
# kable_styling() bootstrap option.
df4 %>%
  select(competition_time_month, sales) %>%
  cor(method = "pearson") %>%
  kable(caption = "Correlation Table Competition_time_month vs Sales") %>%
  kable_styling(
    full_width = F,
    bootstrap_options = c("striped", "hover", "condensed", "responsive"),
    html_font = "Cambria"
  )
H4. Lojas com promoções ativas por mais tempo deveriam vender mais.
H4. Stores with active promotions for longer should sell more.
Falsa Lojas com promoções ativas por mais tempo , vendem menos.
False Stores with active promotions longer , sell less.
# Read promo_time_week and sales from a separate CSV file (absolute path on the author's machine).
# NOTE(review): this bypasses the promo_time_week column computed on df2/df4 — confirm which source is intended.
aux1 <- data.table::fread("/home/renato/repos/Rossmann/inst/Data/meigarom.csv") %>% select(promo_time_week, sales)
# Total sales per promo_time_week, split into negative and positive weeks
promo_week_sales <- aux1 %>%
  group_by(promo_time_week) %>%
  summarise(sales = sum(sales))
weeks_before <- promo_week_sales %>%
  filter(promo_time_week < 0)
weeks_after <- promo_week_sales %>%
  filter(promo_time_week > 0)

# Bars for promo_time_week < 0
fig8 <- ggplot(weeks_before, aes(promo_time_week, sales)) +
  geom_col(fill = "steelblue", col = "black") +
  scale_y_continuous(labels = scales::label_number_si()) +
  labs(title = "promo_time_week < 0 vs sales") +
  theme(plot.title = element_text(hjust = 0.5, size = 18))

# Bars for promo_time_week > 0
fig9 <- ggplot(weeks_after, aes(promo_time_week, sales)) +
  geom_col(fill = "steelblue", col = "black") +
  scale_y_continuous(labels = scales::label_number_si()) +
  labs(title = "promo_time_week > 0 vs sales") +
  theme(plot.title = element_text(hjust = 0.5, size = 18))

# Points with a linear trend line for promo_time_week > 0
fig14 <- ggplot(weeks_after, aes(promo_time_week, sales)) +
  geom_point(shape = 21, fill = "steelblue", size = 3, color = "white") +
  scale_y_continuous(labels = scales::label_number_si()) +
  geom_smooth(formula = "y~x", method = "lm", se = FALSE, color = "red") +
  labs(title = "promo_time_week vs sales") +
  theme(plot.title = element_text(hjust = 0.5, size = 18))

grid.arrange(fig8, fig9, fig14, nrow = 3, ncol = 1)
# Pearson correlation between promo_time_week and sales.
# "condensed" replaces the misspelt "condesend", which is not a
# kable_styling() bootstrap option.
aux1 %>%
  select(promo_time_week, sales) %>%
  cor(method = "pearson") %>%
  kable(caption = "Correlation Table promo_time_week vs Sales") %>%
  kable_styling(
    full_width = F,
    bootstrap_options = c("striped", "hover", "condensed", "responsive"),
    html_font = "Cambria"
  )
H5. Lojas com mais promoções consecutivas deveriam vender mais.
H5. Stores with more consecutive promotions should sell more.
Falsa Lojas com mais promoções consecutivas , vendem menos.
False Stores with more consecutive promotions sell less.
# Total sales for each combination of promo and promo2.
# .groups = "drop" returns an ungrouped table and avoids the regrouping
# message; "condensed" replaces the misspelt "condesend", which is not a
# kable_styling() bootstrap option.
df4 %>%
  group_by(promo, promo2) %>%
  summarise(sales = sum(sales), .groups = "drop") %>%
  kable() %>%
  kable_styling(
    full_width = F,
    bootstrap_options = c("striped", "hover", "condensed", "responsive"),
    html_font = "Cambria"
  )
# Weekly sales totals for records with both promotions active ...
both_promos_weekly <- df4 %>%
  filter(promo == 1 & promo2 == 1) %>%
  group_by(date) %>%
  summarise_by_time(date, .by = "weeks", sales = sum(sales))

# ... and for records with promo active but promo2 inactive
promo_only_weekly <- df4 %>%
  filter(promo == 1 & promo2 == 0) %>%
  group_by(date) %>%
  summarise_by_time(date, .by = "weeks", sales = sum(sales))

# The second series excludes promo2, so it is labelled "Tradicional"; it was
# previously labelled "Extendida", the name used for promo2 in the first
# series' label.
pp <- ggplot() +
  aes(x = date) +
  geom_line(
    lwd = 1,
    data = both_promos_weekly,
    aes(
      y = sales,
      col = "Tradicional & Extendida",
      group = "Tradicional & Extendida"
    )
  ) +
  geom_line(
    lwd = 1,
    data = promo_only_weekly,
    aes(y = sales, col = "Tradicional", group = "Tradicional")
  ) +
  scale_y_continuous(labels = scales::label_number_si()) +
  scale_x_date(
    breaks = "2 month",
    minor_breaks = "1 week",
    date_labels = "%Y-%W"
  ) +
  theme(axis.text.x = element_text(angle = 90, vjust = 0.5, hjust = 1)) +
  labs(x = "year week") +
  labs(title = "year/week vs sales ") +
  theme(plot.title = element_text(hjust = 0.5, size = 18))

pp
H6. Lojas abertas durante o feriado de Natal deveriam vender mais.
H6. Stores open during the Christmas holiday should sell more.
Falsa Lojas abertas durante o feriado de Natal , vendem menos.
False Stores opened during the Christmas holiday , sell less.
# Total holiday sales per year and holiday type.
# The rows are summed first: drawing the raw records with stat = "identity"
# and a dodge position stacks every record's bar on the same spot, so the
# visible height is not the yearly total.
fig10 <- df4 %>%
  filter(state_holiday != "regular_day") %>%
  group_by(year, state_holiday) %>%
  summarise(sales = sum(sales), .groups = "drop") %>%
  ggplot(aes(year, sales, fill = state_holiday)) +
  geom_col(position = position_dodge()) +
  # the title previously repeated "count vs state_holiday" from the bar plot
  labs(title = "year vs sales / state_holiday") +
  theme(
    plot.title = element_text(hjust = 0.5, size = 18),
    legend.position = "none"
  )

# barplot_state_holiday is the holiday count plot built in an earlier chunk
grid.arrange(barplot_state_holiday, fig10, nrow = 2, ncol = 1)
H7. Lojas deveriam vender mais ao longo dos anos.
H7. Stores should sell more over the years.
Falsa Lojas vendem menos , ao longo dos anos.
False Stores sell less, over the years.
# Total sales per year, drawn as a line
yearly_sales <- df4 %>%
  group_by(year) %>%
  summarise(sales = sum(sales))

ggplot(yearly_sales, aes(year, sales)) +
  geom_line(colour = "steelblue", lwd = 1) +
  scale_x_continuous(breaks = 2013:2015) +
  scale_y_continuous(labels = scales::label_number_si()) +
  labs(title = "year vs sales") +
  theme(plot.title = element_text(hjust = 0.5, size = 18))
# Pearson correlation between year and sales.
# "condensed" replaces the misspelt "condesend", which is not a
# kable_styling() bootstrap option.
df4 %>%
  select(year, sales) %>%
  cor(method = "pearson") %>%
  kable(caption = "Correlation Table year vs Sales") %>%
  kable_styling(
    full_width = F,
    bootstrap_options = c("striped", "hover", "condensed", "responsive"),
    html_font = "Cambria"
  )
H8. Lojas deveriam vender mais no segundo semestre do ano.
H8. Stores should sell more in the second half of the year.
Falsa Lojas vendem menos , no segundo semestre do ano.
False Stores sell less, in the second half of the year.
# Total sales per calendar month, drawn as a line
monthly_sales <- df4 %>%
  group_by(month) %>%
  summarise(sales = sum(sales))

ggplot(monthly_sales, aes(month, sales)) +
  geom_line(colour = "darkgreen", lwd = 1) +
  scale_x_continuous(breaks = 1:12) +
  scale_y_continuous(labels = scales::label_number_si()) +
  labs(title = "month vs sales") +
  theme(plot.title = element_text(hjust = 0.5, size = 18))
# Pearson correlation between month and sales.
# "condensed" replaces the misspelt "condesend", which is not a
# kable_styling() bootstrap option.
df4 %>%
  select(month, sales) %>%
  cor(method = "pearson") %>%
  kable(caption = "Correlation Table month vs Sales") %>%
  kable_styling(
    full_width = F,
    bootstrap_options = c("striped", "hover", "condensed", "responsive"),
    html_font = "Cambria"
  )
H9. Lojas deveriam vender mais depois do dia 10 de cada mês.
H9. Stores should sell more after the 10th of each month.
Verdadeira Lojas vendem mais , depois do dia 10 de cada mes.
True Stores sell more after the 10th of each month.
# Total sales per day of the month, drawn as a line
daily_sales <- df4 %>%
  group_by(day) %>%
  summarise(sales = sum(sales))

fig10 <- ggplot(daily_sales, aes(day, sales)) +
  geom_line(colour = "darkgreen", lwd = 1) +
  scale_y_continuous(labels = scales::label_number_si()) +
  scale_x_continuous(breaks = seq(0, 31, 1)) +
  labs(title = "day vs sales") +
  theme(plot.title = element_text(hjust = 0.5, size = 18))

# Sales density for days 1-10 versus the rest of the month
month_part_sales <- df4 %>%
  mutate(day = ifelse(day > 10, "after_10_days", "before_10_days"))

fig11 <- ggplot(month_part_sales, aes(sales, fill = day)) +
  geom_density(alpha = 0.4) +
  labs(title = "Distribution sales / before_10_days / after_10_days ") +
  theme(plot.title = element_text(hjust = 0.5, size = 18))

grid.arrange(fig10, fig11, nrow = 2, ncol = 1)
# Pearson correlation between day of the month and sales.
# "condensed" replaces the misspelt "condesend", which is not a
# kable_styling() bootstrap option.
df4 %>%
  select(day, sales) %>%
  cor(method = "pearson") %>%
  kable(caption = "Correlation Table day vs Sales") %>%
  kable_styling(
    full_width = F,
    bootstrap_options = c("striped", "hover", "condensed", "responsive"),
    html_font = "Cambria"
  )
H10. Lojas deveriam vender menos aos finais de semana.
H10. Stores should sell less on weekends.
Verdadeira Lojas vendem menos , nos finais de semana.
True Stores sell less, on weekends.
# Total sales per day of the week, drawn as a line
weekday_sales <- df4 %>%
  group_by(day_of_week) %>%
  summarise(sales = sum(sales))

ggplot(weekday_sales, aes(day_of_week, sales)) +
  geom_line(colour = "darkred", lwd = 1) +
  scale_y_continuous(labels = scales::label_number_si()) +
  scale_x_continuous(breaks = 1:7) +
  labs(title = "day_of_week vs sales ") +
  theme(plot.title = element_text(hjust = 0.5, size = 18))
# Pearson correlation between day_of_week and sales.
# "condensed" replaces the misspelt "condesend", which is not a
# kable_styling() bootstrap option.
df4 %>%
  select(day_of_week, sales) %>%
  cor(method = "pearson") %>%
  kable(caption = "Correlation Table day_of_week vs Sales") %>%
  kable_styling(
    full_width = F,
    bootstrap_options = c("striped", "hover", "condensed", "responsive"),
    html_font = "Cambria"
  )
H11. Lojas deveriam vender menos durante os feriados escolares.
H11. Stores should sell less during school holidays.
Verdadeira Lojas vendem menos durante os feriados escolares, exceto nos meses de Julho e Agosto.
True Stores sell less, during school holidays, except July and August.
# Sales density split by the school_holiday flag
school_flagged <- df4 %>%
  mutate(school_holiday = as.factor(school_holiday))

fig12 <- ggplot(school_flagged, aes(sales, fill = school_holiday)) +
  geom_density(alpha = 0.4) +
  labs(title = "Distribution sales / school_holiday ") +
  theme(plot.title = element_text(hjust = 0.5, size = 18))

# Total sales per month and school_holiday flag, as side-by-side bars
school_monthly <- df4 %>%
  group_by(month, school_holiday) %>%
  summarise(sales = sum(sales)) %>%
  mutate(
    month = as.factor(month),
    school_holiday = as.factor(school_holiday)
  )

fig13 <- ggplot(school_monthly, aes(month, sales, fill = school_holiday)) +
  geom_bar(stat = "identity", position = position_dodge(), col = "black") +
  scale_y_continuous(labels = scales::label_number_si()) +
  labs(title = "month vs sales / school_holiday ") +
  theme(plot.title = element_text(hjust = 0.5, size = 18))

grid.arrange(fig12, fig13, nrow = 2, ncol = 1)
# Summary of the eleven hypotheses.
# H8 is recorded as "False" to match its write-up ("Stores sell less, in the
# second half of the year"); it was previously listed as "True".
# NOTE(review): the Relevance column is reproduced unchanged — confirm that it
# lines up with the current H1-H11 numbering.
hypothesis_summary <- data.frame(
  Hypotheses = c(
    "H1", "H2", "H3", "H4", "H5", "H6", "H7", "H8", "H9", "H10", "H11"
  ),
  Conclusions = c(
    "False", "False", "False", "False", "False", "False", "False", "False",
    "True", "True", "True"
  ),
  Relevance = c(
    "Medium", "Low", "Low", "High", "Medium", "Low", "Low", "Low",
    "Low", "Low", "Low"
  )
)

# "condensed" replaces the misspelt "condesend", which is not a
# kable_styling() bootstrap option.
kable(hypothesis_summary, caption = "Hypothesis Summary Table") %>%
  kable_styling(
    full_width = F,
    bootstrap_options = c("striped", "hover", "condensed"),
    html_font = "Cambria"
  )
# Correlation heatmap of the numeric attributes: lower triangle only,
# hierarchically ordered, with the coefficients printed in the cells.
# TRUE is spelled out because T is an ordinary, reassignable variable.
df4 %>%
  keep(is.numeric) %>%
  cor() %>%
  ggcorrplot::ggcorrplot(
    hc.order = TRUE,
    type = "lower",
    lab = TRUE,
    lab_size = 3,
    method = "square",
    colors = c("chocolate1", "white", "darkcyan"),
    ggtheme = theme_minimal()
  )
# Cramer's V heatmap of the categorical attributes.
# store_type is converted to character so that keep(is.character) retains it;
# the two helper text columns first_day_of_month and year_week are excluded.
# TRUE is spelled out because T is an ordinary, reassignable variable.
df4 %>%
  mutate(store_type = as.character(store_type)) %>%
  keep(is.character) %>%
  select(-first_day_of_month, -year_week) %>%
  as.data.frame() %>%
  catcor(type = "cramer") %>%
  ggcorrplot::ggcorrplot(
    hc.order = TRUE,
    type = "lower",
    lab = TRUE,
    lab_size = 3,
    method = "square",
    colors = c("chocolate1", "white", "steelblue"),
    ggtheme = theme_minimal()
  )
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.