library(rmarkdown) library(SmartEDA) library(knitr) library(ISLR) library(scales) library(gridExtra) library(ggplot2)
In this vignette we discuss how to customize summary statistics using the ExpCustomStat function from SmartEDA. The function returns a matrix object containing descriptive information on all input variables for each level (or combination of levels) of the categorical/group variables. While running the analysis we can also filter rows/cases of the data. Filters can be applied at the individual variable level or to the complete data, like base subsetting.
Function definition:
ExpCustomStat(data,Cvar=NULL,Nvar=NULL,stat=NULL,gpby=TRUE,filt=NULL,dcast=FALSE)
Key functionalities of ExpCustomStat are:
ExpCustomStat function: we will load the Carseats data from the ISLR package and work through different types of use cases using the ExpCustomStat function.
In this vignette, we will be using a simulated data set containing sales of child car seats at 400 different stores.
Data Source ISLR package.
Function source SmartEDA package
Carseats data from ISLR package:
options(width = 150) CData = ISLR::Carseats head(CData,5)
Categorical summaries describe the distribution of qualitative variables.
The number of observations in each category
"US", "Urban", "ShelveLoc" and "Education" variables: ExpCustomStat(Carseats,Cvar=c("US","Urban","ShelveLoc","Education"),gpby=FALSE)
Alternatively, a similar analysis can be run with the ExpCTable function from the same package; this function also includes cumulative percentages and totals.
ExpCTable(Carseats,Target=NULL,clim=5,nlim=15,round=2,bin=NULL,per=F)
"US", "Urban" and "ShelveLoc" variableExpCustomStat(Carseats,Cvar=c("US","Urban","ShelveLoc"),gpby=FALSE)
To produce cross tables which calculate counts and proportions for each combination of categorical variables we can use ExpCustomStat
NOTE: For cross-tabulation, set the input gpby=TRUE
"US" Vs "Urban"ExpCustomStat(Carseats,Cvar=c("US","Urban"),gpby=TRUE,filt=NULL)
We can also produce multidimensional tables based on three or more categorical variables
"US" Vs "Urban" Vs "ShelveLoc"ExpCustomStat(Carseats,Cvar=c("US","Urban","ShelveLoc"),gpby=TRUE,filt=NULL)
Suppose we want to know the number of stores by US and Urban location where the population size is greater than 150.
Population is > 150 "US" Vs "Urban"ExpCustomStat(Carseats,Cvar=c("US","Urban"),gpby=TRUE,filt="Population>150")
Population > 150 and Urban=="Yes""US" Vs "ShelveLoc"ExpCustomStat(Carseats,Cvar=c("US","ShelveLoc"),gpby=TRUE,filt="Urban=='Yes' & Population>150")
Numerical summaries describe the distribution of quantitative variables.
options(width = 150) ExpCustomStat(Carseats,Nvar=c("Population","Sales","CompPrice","Income"),stat = c('Count','mean','sum','var','sd','min','max','IQR'))
ExpCustomStat(Carseats,Nvar=c("Population","Sales","CompPrice","Income"),stat = c('min','p0.25','median','p0.75','max'))
Filter rows/cases of complete dataset where conditions are true
options(width = 150) ExpCustomStat(Carseats,Nvar=c("Population","Sales","CompPrice","Income"),stat = c('Count','mean','sum','var','min','median','max'),filt="Urban=='Yes'")
options(width=150) ExpCustomStat(Carseats,Nvar=c("Population","Sales","CompPrice","Income"),stat = c('Count','mean','sum','median','IQR'),filt="Urban=='Yes' & Population>150")
This is useful when we need to exclude redundant placeholder values such as '999', '9999', '-9', '-1111' or '888' from each selected variable.
Eg: dat = data.frame(x = c(23,24,34,999,12,12,23,999,45), y = c(1,3,4,999,0,999,0,8,999,0))
Exclude 999:
x = c(23,24,34,12,12,23,45) y = c(1,3,4,0,0,8,0)
data_sam = Carseats[,] data_sam[sample(1:400,30),"Sales"] <- 999 data_sam[sample(1:400,20),"CompPrice"] <- -9 data_sam[sample(1:400,45),"Income"] <- 999 ExpCustomStat(data_sam,Nvar=c("Population","Sales","CompPrice","Income"),stat = c('Count','mean','sum','min'),filt="All %ni% c(999,-9)")
Different filters for each numeric variable. For example, below are the conditions (logic) for each variable summary analysis.
"Population" - Consider only Good ShelveLoc (the quality of the shelving location for the car seats at each site) ShelveLoc=='Good'
"Sales" - Include only those stores that belong to an Urban location (Urban==Yes)
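"CompPrice" - Exclude stores where Price is greater than 150 (NOTE: the filter string used in the call below is Price>=150 - confirm which condition is intended)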
"Education" - All stores
"Income" - Include only stores in the US (US==Yes)
Table: Descriptive summary for Population, Sales, CompPrice, Education and Income based on the filters.
options(width = 150) ExpCustomStat(Carseats,Nvar=c("Population","Sales","CompPrice","Education","Income"),stat = c('Count','mean','sum','var','sd','IQR','median'),filt=c("ShelveLoc=='Good'^Urban=='Yes'^Price>=150^All^US=='Yes'"))
Descriptive summary for numerical variable by group level.
options(width = 150) ExpCustomStat(Carseats,Cvar = c("Urban","ShelveLoc"), Nvar=c("Population","Sales"), stat = c('Count','Prop','mean','min','P0.25','median','p0.75','max'),gpby=FALSE)
options(width = 150) ExpCustomStat(Carseats,Cvar = c("Urban","US","ShelveLoc"), Nvar=c("CompPrice","Income"), stat = c('Count','Prop','mean','sum','PS','min','max','IQR','sd'), gpby = TRUE)
options(width = 150) ExpCustomStat(Carseats,Cvar = c("Urban","US","ShelveLoc"), Nvar=c("CompPrice","Income"), stat = c('Count','Prop','mean','sum','PS','median','IQR'), gpby = TRUE,filt="Urban=='Yes'")
options(width = 150) data_sam = Carseats[,] data_sam[sample(1:400,30),"Sales"] <- 888 data_sam[sample(1:400,20),"CompPrice"] <- 999 data_sam[sample(1:400,45),"Income"] <- 999 ExpCustomStat(data_sam,Cvar = c("Urban","US","ShelveLoc"), Nvar=c("Sales","CompPrice","Income"), stat = c('Count','Prop','mean','sum','PS'), gpby = TRUE,filt="All %ni% c(888,999)")
Different base for each numeric variable.
"Population" - Consider only Good ShelveLoc (the quality of the shelving location for the car seats at each site) ShelveLoc=='Good'
"Sales" - Include only those stores that belong to an Urban location (Urban==Yes)
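"CompPrice" - Exclude stores where Price is greater than 150 (NOTE: the filter string used in the call below is Price>=150 - confirm which condition is intended)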
ExpCustomStat(Carseats,Cvar = c("Urban","US"), Nvar=c("Population","Sales","CompPrice"), stat = c('Count','Prop','mean','sum','var','IQR'), filt=c("ShelveLoc=='Good'^Urban=='Yes'^Price>=150"))
Reshape grouped data (dcast=TRUE)
options(width = 150) ExpCustomStat(Carseats,Cvar = c("Urban"), Nvar=c("Population","Sales"), stat = c('Count','Prop'),gpby=TRUE,dcast=TRUE)
##Frequency table for categorical variables ExpCustomStat(Carseats,Cvar=c("US","Urban","ShelveLoc"),gpby=FALSE) ##Crosstabulation between categorical variables ExpCustomStat(Carseats,Cvar=c("US","Urban"),gpby=TRUE,filt=NULL) ExpCustomStat(Carseats,Cvar=c("US","Urban","ShelveLoc"),gpby=TRUE,filt=NULL) ##Adding filters for custom tables ExpCustomStat(Carseats,Cvar=c("US","Urban"),gpby=TRUE,filt="Population>150") ExpCustomStat(Carseats,Cvar=c("US","ShelveLoc"),gpby=TRUE,filt="Urban=='Yes' & Population>150") ## Numeric variable summary ExpCustomStat(Carseats,Nvar=c("Population","Sales","CompPrice","Income"),stat = c('Count','mean','sum','var','min','max')) ExpCustomStat(Carseats,Nvar=c("Population","Sales","CompPrice","Income"),stat = c('min','p0.25','median','p0.75','max')) ## Adding filters for complete data (like base Subset) ExpCustomStat(Carseats,Nvar=c("Population","Sales","CompPrice","Income"),stat = c('Count','mean','sum','var'),filt="Urban=='Yes'") ExpCustomStat(Carseats,Nvar=c("Population","Sales","CompPrice","Income"),stat = c('Count','mean','sum'),filt="Urban=='Yes' & Population>150") ## Filter unique value from all the numeric variables ExpCustomStat(data_sam,Nvar=c("Population","Sales","CompPrice","Income"),stat = c('Count','mean','sum','min'),filt="All %ni% c(999,-9)") ## Adding filters at variable level ExpCustomStat(Carseats,Nvar=c("Population","Sales","CompPrice","Education","Income"),stat = c('Count','mean','sum','var','sd','IQR','median'),filt=c("ShelveLoc=='Good'^Urban=='Yes'^Price>=150^ ^US=='Yes'")) ##Numerical summaries by category ##Variable summary report (One group variable) ExpCustomStat(Carseats,Cvar = c("Urban","ShelveLoc"), Nvar=c("Population","Sales"), stat = c('Count','Prop','mean','min','P0.25','median','p0.75','max'),gpby=FALSE) ##Variable summary report (More than One group variable) ExpCustomStat(Carseats,Cvar = c("Urban","US","ShelveLoc"), Nvar=c("CompPrice","Income"), stat = 
c('Count','Prop','mean','sum','PS','min','max','IQR','sd'), gpby = TRUE) ##Variable summary report (More than One group variable) with filter ExpCustomStat(Carseats,Cvar = c("Urban","US","ShelveLoc"), Nvar=c("CompPrice","Income"), stat = c('Count','Prop','mean','sum','PS','P0.25','median','p0.75'), gpby = TRUE,filt="Urban=='Yes'") ExpCustomStat(data_sam,Cvar = c("Urban","US","ShelveLoc"), Nvar=c("Sales","CompPrice","Income"), stat = c('Count','Prop','mean','sum','PS'), gpby = TRUE,filt="All %ni% c(888,999)") ExpCustomStat(Carseats,Cvar = c("Urban","US"), Nvar=c("Population","Sales","CompPrice"), stat = c('Count','Prop','mean','sum','var','min','max'), filt=c("ShelveLoc=='Good'^Urban=='Yes'^Price>=150"))
Any scripts or data that you put into this service are public.
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.