# Packages attached for this vignette. SmartEDA supplies ExpCustomStat() and
# ExpCTable(); ISLR supplies the Carseats data set.
library(rmarkdown)
library(SmartEDA)
library(knitr)
library(ISLR)
library(scales)
library(gridExtra)
library(ggplot2)
In this vignette we will discuss how to customize the summary statistics using the ExpCustomStat function from SmartEDA. The output of this function is a matrix object containing descriptive information on all input variables for each level/combination of levels of the categorical/group variables. While running the analysis we can also filter rows/cases of the data. Filters can be applied at the individual variable level or to the complete data, like base subsetting.
Function definition:
# Usage signature; argument roles as exercised by the examples in this vignette:
#   data  - input data frame (Carseats in the examples)
#   Cvar  - names of the categorical / group variables
#   Nvar  - names of the numeric variables to summarise
#   stat  - names of the statistics to compute (e.g. 'Count', 'mean', 'p0.25')
#   gpby  - FALSE tabulates each Cvar separately; TRUE uses combinations of Cvar levels
#   filt  - filter expression given as a string; "^" separates per-variable filters
#   dcast - TRUE reshapes the grouped output
ExpCustomStat(data,Cvar=NULL,Nvar=NULL,stat=NULL,gpby=TRUE,filt=NULL,dcast=FALSE)
Key functionalities of ExpCustomStat
are:
ExpCustomStat function
We will load the Carseats data from the ISLR package and derive different types of use cases using the ExpCustomStat function.
In this vignette, we will be using a simulated data set containing sales of child car seats at 400 different stores.
Data Source ISLR package.
Function source SmartEDA package
Carseats data from ISLR package:
# Widen console output so the wide summary tables are not wrapped.
options(width = 150)

# Take the Carseats data from ISLR and preview its first five rows.
CData <- ISLR::Carseats
head(CData, 5)
Categorical summaries describe the distribution of qualitative variables.
The number of observations for particular category
"US", "Urban" and "ShelveLoc"
variable
# Frequency table for each listed variable separately (gpby = FALSE).
ExpCustomStat(
  Carseats,
  Cvar = c("US", "Urban", "ShelveLoc", "Education"),
  gpby = FALSE
)
Or we can run a similar analysis using the ExpCTable function from the same package; this function also includes cumulative percentages and totals.
# Frequency tables via ExpCTable(). `per` is spelled out as FALSE because the
# shorthand `F` is an ordinary variable that can be reassigned.
ExpCTable(
  Carseats,
  Target = NULL,
  clim = 5,
  nlim = 15,
  round = 2,
  bin = NULL,
  per = FALSE
)
"US", "Urban" and "ShelveLoc"
variable
# Frequency table for each listed variable separately (gpby = FALSE).
ExpCustomStat(
  Carseats,
  Cvar = c("US", "Urban", "ShelveLoc"),
  gpby = FALSE
)
To produce cross tables which calculate counts and proportions for each combination of categorical variables we can use ExpCustomStat
NOTE: For cross-tabulation, set the input gpby = TRUE
"US" Vs "Urban"
# Two-way cross table: counts for every US x Urban combination, no filter.
ExpCustomStat(
  Carseats,
  Cvar = c("US", "Urban"),
  gpby = TRUE,
  filt = NULL
)
We can also produce multidimensional tables based on three or more categorical variables
"US" Vs "Urban" Vs "ShelveLoc"
# Three-way cross table over US x Urban x ShelveLoc, no filter.
ExpCustomStat(
  Carseats,
  Cvar = c("US", "Urban", "ShelveLoc"),
  gpby = TRUE,
  filt = NULL
)
If we want to understand the number of stores by US and Urban location where the Population size is greater than 150:
Population > 150: "US" Vs "Urban"
# US x Urban cross table restricted to rows where Population exceeds 150.
ExpCustomStat(
  Carseats,
  Cvar = c("US", "Urban"),
  gpby = TRUE,
  filt = "Population>150"
)
Population > 150 and Urban == "Yes": "US" Vs "ShelveLoc"
# US x ShelveLoc cross table for urban stores with Population above 150.
ExpCustomStat(
  Carseats,
  Cvar = c("US", "ShelveLoc"),
  gpby = TRUE,
  filt = "Urban=='Yes' & Population>150"
)
Numerical summaries to describe the distribution for quantitative variables.
# Widen console output so the summary table is not wrapped.
options(width = 150)

# Count, location and spread statistics for four numeric columns.
ExpCustomStat(
  Carseats,
  Nvar = c("Population", "Sales", "CompPrice", "Income"),
  stat = c("Count", "mean", "sum", "var", "sd", "min", "max", "IQR")
)

# Same columns, summarised by min, p0.25, median, p0.75 and max.
ExpCustomStat(
  Carseats,
  Nvar = c("Population", "Sales", "CompPrice", "Income"),
  stat = c("min", "p0.25", "median", "p0.75", "max")
)
Filter rows/cases of complete dataset where conditions are true
# Widen console output so the summary table is not wrapped.
options(width = 150)

# Numeric summaries computed only on rows where Urban is "Yes".
ExpCustomStat(
  Carseats,
  Nvar = c("Population", "Sales", "CompPrice", "Income"),
  stat = c("Count", "mean", "sum", "var", "min", "median", "max"),
  filt = "Urban=='Yes'"
)

options(width = 150)

# Numeric summaries for urban stores whose Population exceeds 150.
ExpCustomStat(
  Carseats,
  Nvar = c("Population", "Sales", "CompPrice", "Income"),
  stat = c("Count", "mean", "sum", "median", "IQR"),
  filt = "Urban=='Yes' & Population>150"
)
This is useful when we need to exclude placeholder values such as '999', '9999', '-9', '-1111' or '888' from each selected variable.
# Example: two columns of equal length that use 999 as a placeholder value.
# The original snippet lacked the closing parenthesis and gave x nine values
# against ten for y; a trailing 999 is added to x so that the data frame can be
# built and the "Exclude 999" result for x is unchanged.
dat <- data.frame(
  x = c(23, 24, 34, 999, 12, 12, 23, 999, 45, 999),
  y = c(1, 3, 4, 999, 0, 999, 0, 8, 999, 0)
)
Exclude 999:
x = c(23,24,34,12,12,23,45) y = c(1,3,4,0,0,8,0)
# Work on a copy of Carseats so that the original data stays untouched.
data_sam <- Carseats[, ]

# Overwrite randomly chosen rows with placeholder codes (999 / -9).
# No seed is set, so the affected rows differ from run to run.
row_ids <- seq_len(nrow(data_sam))
data_sam[sample(row_ids, 30), "Sales"] <- 999
data_sam[sample(row_ids, 20), "CompPrice"] <- -9
data_sam[sample(row_ids, 45), "Income"] <- 999

# Summarise every numeric variable after dropping its 999 / -9 values.
ExpCustomStat(
  data_sam,
  Nvar = c("Population", "Sales", "CompPrice", "Income"),
  stat = c("Count", "mean", "sum", "min"),
  filt = "All %ni% c(999,-9)"
)
Different filters for each numeric variable. For example, below are the conditions (logic) for each variable summary analysis.
"Population" - Consider only Good ShelveLoc (the quality of the shelving location for the car seats at each site) ShelveLoc=='Good'
"Sales" - Include only those stores that belong to an Urban location (Urban==Yes)
"CompPrice" - Include only stores where Price is at least 150 (Price>=150)
"Education" - All stores
"Income" - Include only stores in the US (US==Yes)
Table: Descriptive summary for Population, Sales, CompPrice, Education and Income based on the filters.
# Widen console output so the summary table is not wrapped.
options(width = 150)

# One filter per numeric variable, separated by "^" and given in the same
# order as Nvar; "All" leaves Education unfiltered.
ExpCustomStat(
  Carseats,
  Nvar = c("Population", "Sales", "CompPrice", "Education", "Income"),
  stat = c("Count", "mean", "sum", "var", "sd", "IQR", "median"),
  filt = c("ShelveLoc=='Good'^Urban=='Yes'^Price>=150^All^US=='Yes'")
)
Descriptive summary for numerical variable by group level.
# Widen console output so the summary tables are not wrapped.
options(width = 150)

# Numeric summaries by each group variable separately (gpby = FALSE).
ExpCustomStat(
  Carseats,
  Cvar = c("Urban", "ShelveLoc"),
  Nvar = c("Population", "Sales"),
  stat = c("Count", "Prop", "mean", "min", "P0.25", "median", "p0.75", "max"),
  gpby = FALSE
)

options(width = 150)

# Numeric summaries for every Urban x US x ShelveLoc combination.
ExpCustomStat(
  Carseats,
  Cvar = c("Urban", "US", "ShelveLoc"),
  Nvar = c("CompPrice", "Income"),
  stat = c("Count", "Prop", "mean", "sum", "PS", "min", "max", "IQR", "sd"),
  gpby = TRUE
)

options(width = 150)

# Same grouping, restricted to rows where Urban is "Yes".
ExpCustomStat(
  Carseats,
  Cvar = c("Urban", "US", "ShelveLoc"),
  Nvar = c("CompPrice", "Income"),
  stat = c("Count", "Prop", "mean", "sum", "PS", "median", "IQR"),
  gpby = TRUE,
  filt = "Urban=='Yes'"
)

options(width = 150)

# Copy of Carseats with placeholder codes (888 / 999) written into randomly
# chosen rows. No seed is set, so the affected rows differ from run to run.
data_sam <- Carseats[, ]
row_ids <- seq_len(nrow(data_sam))
data_sam[sample(row_ids, 30), "Sales"] <- 888
data_sam[sample(row_ids, 20), "CompPrice"] <- 999
data_sam[sample(row_ids, 45), "Income"] <- 999

# Grouped summaries after dropping the 888 / 999 values from every variable.
ExpCustomStat(
  data_sam,
  Cvar = c("Urban", "US", "ShelveLoc"),
  Nvar = c("Sales", "CompPrice", "Income"),
  stat = c("Count", "Prop", "mean", "sum", "PS"),
  gpby = TRUE,
  filt = "All %ni% c(888,999)"
)
Different base for each numeric variable.
"Population" - Consider only Good ShelveLoc (the quality of the shelving location for the car seats at each site) ShelveLoc=='Good'
"Sales" - Include only those stores that belong to an Urban location (Urban==Yes)
"CompPrice" - Include only stores where Price is at least 150 (Price>=150)
# Grouped summaries with a separate filter per numeric variable; the "^"
# separated filters follow the order of Nvar.
ExpCustomStat(
  Carseats,
  Cvar = c("Urban", "US"),
  Nvar = c("Population", "Sales", "CompPrice"),
  stat = c("Count", "Prop", "mean", "sum", "var", "IQR"),
  filt = c("ShelveLoc=='Good'^Urban=='Yes'^Price>=150")
)
Reshaping grouped output
# Widen console output so the reshaped table is not wrapped.
options(width = 150)

# Count and Prop of Population and Sales by Urban, with the grouped result
# reshaped (dcast = TRUE).
ExpCustomStat(
  Carseats,
  Cvar = c("Urban"),
  Nvar = c("Population", "Sales"),
  stat = c("Count", "Prop"),
  gpby = TRUE,
  dcast = TRUE
)
# Consolidated listing of the calls used in this vignette. The line breaks are
# restored so that each "##" heading no longer comments out the code after it.
# NOTE(review): data_sam is not created in this listing; it must already exist
# (a copy of Carseats with placeholder values written in) before these run.

## Frequency table for categorical variables
ExpCustomStat(Carseats, Cvar = c("US", "Urban", "ShelveLoc"), gpby = FALSE)

## Crosstabulation between categorical variables
ExpCustomStat(Carseats, Cvar = c("US", "Urban"), gpby = TRUE, filt = NULL)
ExpCustomStat(
  Carseats,
  Cvar = c("US", "Urban", "ShelveLoc"),
  gpby = TRUE,
  filt = NULL
)

## Adding filters for custom tables
ExpCustomStat(
  Carseats,
  Cvar = c("US", "Urban"),
  gpby = TRUE,
  filt = "Population>150"
)
ExpCustomStat(
  Carseats,
  Cvar = c("US", "ShelveLoc"),
  gpby = TRUE,
  filt = "Urban=='Yes' & Population>150"
)

## Numeric variable summary
ExpCustomStat(
  Carseats,
  Nvar = c("Population", "Sales", "CompPrice", "Income"),
  stat = c("Count", "mean", "sum", "var", "min", "max")
)
ExpCustomStat(
  Carseats,
  Nvar = c("Population", "Sales", "CompPrice", "Income"),
  stat = c("min", "p0.25", "median", "p0.75", "max")
)

## Adding filters for complete data (like base Subset)
ExpCustomStat(
  Carseats,
  Nvar = c("Population", "Sales", "CompPrice", "Income"),
  stat = c("Count", "mean", "sum", "var"),
  filt = "Urban=='Yes'"
)
ExpCustomStat(
  Carseats,
  Nvar = c("Population", "Sales", "CompPrice", "Income"),
  stat = c("Count", "mean", "sum"),
  filt = "Urban=='Yes' & Population>150"
)

## Filter unique value from all the numeric variables
ExpCustomStat(
  data_sam,
  Nvar = c("Population", "Sales", "CompPrice", "Income"),
  stat = c("Count", "mean", "sum", "min"),
  filt = "All %ni% c(999,-9)"
)

## Adding filters at variable level
# NOTE(review): the blank slot (" ") for Education presumably means "no
# filter", like the "All" keyword used in the vignette body - confirm.
ExpCustomStat(
  Carseats,
  Nvar = c("Population", "Sales", "CompPrice", "Education", "Income"),
  stat = c("Count", "mean", "sum", "var", "sd", "IQR", "median"),
  filt = c("ShelveLoc=='Good'^Urban=='Yes'^Price>=150^ ^US=='Yes'")
)

## Numerical summaries by category
## Variable summary report (One group variable)
ExpCustomStat(
  Carseats,
  Cvar = c("Urban", "ShelveLoc"),
  Nvar = c("Population", "Sales"),
  stat = c("Count", "Prop", "mean", "min", "P0.25", "median", "p0.75", "max"),
  gpby = FALSE
)

## Variable summary report (More than One group variable)
ExpCustomStat(
  Carseats,
  Cvar = c("Urban", "US", "ShelveLoc"),
  Nvar = c("CompPrice", "Income"),
  stat = c("Count", "Prop", "mean", "sum", "PS", "min", "max", "IQR", "sd"),
  gpby = TRUE
)

## Variable summary report (More than One group variable) with filter
ExpCustomStat(
  Carseats,
  Cvar = c("Urban", "US", "ShelveLoc"),
  Nvar = c("CompPrice", "Income"),
  stat = c("Count", "Prop", "mean", "sum", "PS", "P0.25", "median", "p0.75"),
  gpby = TRUE,
  filt = "Urban=='Yes'"
)
ExpCustomStat(
  data_sam,
  Cvar = c("Urban", "US", "ShelveLoc"),
  Nvar = c("Sales", "CompPrice", "Income"),
  stat = c("Count", "Prop", "mean", "sum", "PS"),
  gpby = TRUE,
  filt = "All %ni% c(888,999)"
)
ExpCustomStat(
  Carseats,
  Cvar = c("Urban", "US"),
  Nvar = c("Population", "Sales", "CompPrice"),
  stat = c("Count", "Prop", "mean", "sum", "var", "min", "max"),
  filt = c("ShelveLoc=='Good'^Urban=='Yes'^Price>=150")
)
Any scripts or data that you put into this service are public.
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.