Custom table summarizing outcomes"

library(rmarkdown)
library(SmartEDA)
library(knitr)
library(ISLR)
library(scales)
library(gridExtra)
library(ggplot2)

1. Exploratory analysis - Custom tables, summary statistics

In this vignette we will discuss about how to customize the summary statistics using ExpCustomStat function from SmartEDA. The output of this function returns matrix object containing descriptive information on all input variables for each level/combination of levels in categorical/group variable. Also, while running the analysis we can filter row/cases of the data. We can apply the filters at individual variable level or complete data like base subsetting.

Function definition:

ExpCustomStat(data,Cvar=NULL,Nvar=NULL,stat=NULL,gpby=TRUE,filt=NULL,dcast=FALSE)

Key functionalities of ExpCustomStat are:

  1. Categorical data descriptive statistics (Frequencies, Proportions)
  2. Numerical data descriptive statistics (Mean, Median, Sum, Variance etc..)
  3. Comparison of numerical data based on categorical data
  4. Filter rows/cases where conditions are true. Options to apply filters at variable level or complete data set like base subsetting
  5. Options to calculate basic statistics like Mean, Median, Std.Dev, Variance, Count, Proportions, Quantiles, IQR, Percentages of Shares (PS) for numerical data

1.1 Usage of ExpCustomStat function

Will open the carseats data from ISLR package and drive different types of use cases using ExpCustomStat function.

In this vignette, we will be using a simulated data set containing sales of child car seats at 400 different stores.

Data Source ISLR package.

Function source SmartEDA package

Carseats data from ISLR package:

options(width = 150)
CData = ISLR::Carseats
head(CData,5)

2. Categorical summaries

Categorical summaries to describe the distribution for a qualitative variables.

2.1. Frequency table

The number of observations for particular category

ExpCustomStat(Carseats,Cvar=c("US","Urban","ShelveLoc","Education"),gpby=FALSE)

OR we can use similar analysis using ExpCTable function from same package, this functions includes cumulative percentages and Total

ExpCTable(Carseats,Target=NULL,clim=5,nlim=15,round=2,bin=NULL,per=F)
ExpCustomStat(Carseats,Cvar=c("US","Urban","ShelveLoc"),gpby=FALSE)

2.2. Crosstabulation (more than one categorical variable)

To produce cross tables which calculate counts and proportions for each combination of categorical variables we can use ExpCustomStat

NOTE: For crosstabulation change input gpby=TRUE

ExpCustomStat(Carseats,Cvar=c("US","Urban"),gpby=TRUE,filt=NULL)

We can also produce multidimensional tables based on three or more categorical variables

ExpCustomStat(Carseats,Cvar=c("US","Urban","ShelveLoc"),gpby=TRUE,filt=NULL)

2.3. Adding filters to tables

If we want to understand the number of stores in US and location is Urban for Population size greater than 150

ExpCustomStat(Carseats,Cvar=c("US","Urban"),gpby=TRUE,filt="Population>150")
ExpCustomStat(Carseats,Cvar=c("US","ShelveLoc"),gpby=TRUE,filt="Urban=='Yes' & Population>150")

3. Numerical summaries

Numerical summaries to describe the distribution for quantitative variables.

3.1. Numerical variable summary

options(width = 150)
ExpCustomStat(Carseats,Nvar=c("Population","Sales","CompPrice","Income"),stat = c('Count','mean','sum','var','sd','min','max','IQR'))
ExpCustomStat(Carseats,Nvar=c("Population","Sales","CompPrice","Income"),stat = c('min','p0.25','median','p0.75','max'))

3.2. Adding filters to complete data (like base subset)

Filter rows/cases of complete dataset where conditions are true

options(width = 150)
ExpCustomStat(Carseats,Nvar=c("Population","Sales","CompPrice","Income"),stat = c('Count','mean','sum','var','min','median','max'),filt="Urban=='Yes'")
options(width=150)
ExpCustomStat(Carseats,Nvar=c("Population","Sales","CompPrice","Income"),stat = c('Count','mean','sum','median','IQR'),filt="Urban=='Yes' & Population>150")

3.3. Filter out unique value from all the numeric variables

This will be useful when we need to exclude redundant values like '999' or '9999' or '-9' or '-1111', or '888' etc from each selected variable.

Eg:dat = data.frame(x = c(23,24,34,999,12,12,23,999,45), y = c(1,3,4,999,0,999,0,8,999,0)

Exclude 999:

x = c(23,24,34,12,12,23,45) y = c(1,3,4,0,0,8,0)

data_sam = Carseats[,]
data_sam[sample(1:400,30),"Sales"] <- 999
data_sam[sample(1:400,20),"CompPrice"] <- -9
data_sam[sample(1:400,45),"Income"] <- 999
ExpCustomStat(data_sam,Nvar=c("Population","Sales","CompPrice","Income"),stat = c('Count','mean','sum','min'),filt="All %ni% c(999,-9)")

3.4. Adding filters at variable level

Different filters for each numeric variable. For example, below are the conditions (logic) for each variable summary analysis.

"Population" - Consider only Good ShelveLoc (the quality of the shelving location for the car seats at each site) ShelveLoc=='Good'

"Sales" - Inculde only those store belongs to Urban location (Urban==Yes)

"CompPrice" - Exclude Price is greater than 150

"Education" - All stores

"Income" - Inculde only stores in US (US==Yes)

Table: Descriptive summary for Price, Population, Sales, CompPrice, Income based on the filters.

options(width = 150)
ExpCustomStat(Carseats,Nvar=c("Population","Sales","CompPrice","Education","Income"),stat = c('Count','mean','sum','var','sd','IQR','median'),filt=c("ShelveLoc=='Good'^Urban=='Yes'^Price>=150^All^US=='Yes'"))

4. Numerical summaries by category

Descriptive summary for numerical variable by group level.

4.1. Variable summary report (One group variable)

options(width = 150)
ExpCustomStat(Carseats,Cvar = c("Urban","ShelveLoc"), Nvar=c("Population","Sales"), stat = c('Count','Prop','mean','min','P0.25','median','p0.75','max'),gpby=FALSE)

4.2. Variable summary report (More than One group variable)

options(width = 150)
ExpCustomStat(Carseats,Cvar = c("Urban","US","ShelveLoc"), Nvar=c("CompPrice","Income"), stat = c('Count','Prop','mean','sum','PS','min','max','IQR','sd'), gpby = TRUE)

4.3. Variable summary report (More than One group variable) with filter

options(width = 150)
ExpCustomStat(Carseats,Cvar = c("Urban","US","ShelveLoc"), Nvar=c("CompPrice","Income"), stat = c('Count','Prop','mean','sum','PS','median','IQR'), gpby = TRUE,filt="Urban=='Yes'")
options(width = 150)
data_sam = Carseats[,]
data_sam[sample(1:400,30),"Sales"] <- 888
data_sam[sample(1:400,20),"CompPrice"] <- 999
data_sam[sample(1:400,45),"Income"] <- 999
ExpCustomStat(data_sam,Cvar = c("Urban","US","ShelveLoc"), Nvar=c("Sales","CompPrice","Income"), stat = c('Count','Prop','mean','sum','PS'), gpby = TRUE,filt="All %ni% c(888,999)")

Different base for each numeric variable.

"Population" - Consider only Good ShelveLoc (the quality of the shelving location for the car seats at each site) ShelveLoc=='Good'

"Sales" - Inculde only those store belongs to Urban location (Urban==Yes)

"CompPrice" - Exclude Price is greater than 150

ExpCustomStat(Carseats,Cvar = c("Urban","US"), Nvar=c("Population","Sales","CompPrice"), stat = c('Count','Prop','mean','sum','var','IQR'), filt=c("ShelveLoc=='Good'^Urban=='Yes'^Price>=150"))

5. Resahpe data

Reshapes a grouped data

options(width = 150)
ExpCustomStat(Carseats,Cvar = c("Urban"), Nvar=c("Population","Sales"), stat = c('Count','Prop'),gpby=TRUE,dcast=TRUE)
Example scripts
##Frequency table for categorical variables
ExpCustomStat(Carseats,Cvar=c("US","Urban","ShelveLoc"),gpby=FALSE)

##Crosstabulation between categorical variables
ExpCustomStat(Carseats,Cvar=c("US","Urban"),gpby=TRUE,filt=NULL)
ExpCustomStat(Carseats,Cvar=c("US","Urban","ShelveLoc"),gpby=TRUE,filt=NULL)

##Adding filters for custom tables
ExpCustomStat(Carseats,Cvar=c("US","Urban"),gpby=TRUE,filt="Population>150")
ExpCustomStat(Carseats,Cvar=c("US","ShelveLoc"),gpby=TRUE,filt="Urban=='Yes' & Population>150")

## Numeric variable summary
ExpCustomStat(Carseats,Nvar=c("Population","Sales","CompPrice","Income"),stat = c('Count','mean','sum','var','min','max'))
ExpCustomStat(Carseats,Nvar=c("Population","Sales","CompPrice","Income"),stat = c('min','p0.25','median','p0.75','max'))

## Adding filters for complete data (like base Subset)
ExpCustomStat(Carseats,Nvar=c("Population","Sales","CompPrice","Income"),stat = c('Count','mean','sum','var'),filt="Urban=='Yes'")
ExpCustomStat(Carseats,Nvar=c("Population","Sales","CompPrice","Income"),stat = c('Count','mean','sum'),filt="Urban=='Yes' & Population>150")

## Filter unique value from all the numeric variables
ExpCustomStat(data_sam,Nvar=c("Population","Sales","CompPrice","Income"),stat = c('Count','mean','sum','min'),filt="All %ni% c(999,-9)")

## Adding filters at variable level
ExpCustomStat(Carseats,Nvar=c("Population","Sales","CompPrice","Education","Income"),stat = c('Count','mean','sum','var','sd','IQR','median'),filt=c("ShelveLoc=='Good'^Urban=='Yes'^Price>=150^ ^US=='Yes'"))

##Numerical summaries by category
##Variable summary report (One group variable)
ExpCustomStat(Carseats,Cvar = c("Urban","ShelveLoc"), Nvar=c("Population","Sales"), stat = c('Count','Prop','mean','min','P0.25','median','p0.75','max'),gpby=FALSE)

##Variable summary report (More than One group variable)
ExpCustomStat(Carseats,Cvar = c("Urban","US","ShelveLoc"), Nvar=c("CompPrice","Income"), stat = c('Count','Prop','mean','sum','PS','min','max','IQR','sd'), gpby = TRUE)

##Variable summary report (More than One group variable) with filter
ExpCustomStat(Carseats,Cvar = c("Urban","US","ShelveLoc"), Nvar=c("CompPrice","Income"), stat = c('Count','Prop','mean','sum','PS','P0.25','median','p0.75'), gpby = TRUE,filt="Urban=='Yes'")
ExpCustomStat(data_sam,Cvar = c("Urban","US","ShelveLoc"), Nvar=c("Sales","CompPrice","Income"), stat = c('Count','Prop','mean','sum','PS'), gpby = TRUE,filt="All %ni% c(888,999)")
ExpCustomStat(Carseats,Cvar = c("Urban","US"), Nvar=c("Population","Sales","CompPrice"), stat = c('Count','Prop','mean','sum','var','min','max'), filt=c("ShelveLoc=='Good'^Urban=='Yes'^Price>=150"))
References


Try the SmartEDA package in your browser

Any scripts or data that you put into this service are public.

SmartEDA documentation built on Dec. 4, 2022, 1:15 a.m.