Exploratory Data Analysis (EDA)"
In Statsomat: Shiny Apps for Automated Data Analysis and Automated Interpretation

```{whites, eval=FALSE, echo = eval_rows}

Drop empty rows

rowsums <- data.frame(sapply(df,is.na)) rows_drop <- (which(rowSums(rowsums) == ncol(df))) df <- df[-rows_drop, ,drop=FALSE]

```r
eval_num <- FALSE
eval_numcol <- FALSE
if (exists("df_num")){
  if (ncol(df_num)>0){
     indices <- which(colnames(df_code) %in% colnames(df_num))
     cat("\\# `Column names of selected continuous variables`")
     cat("\\newline ")
     cat("`colnames_continuous = ")
     cat(paste0("c(", paste(indices, collapse=','), ")`"))
     eval_num <- TRUE
     eval_numcol <- (ncol(df_num)>1)
  }
}

```{whites, eval=FALSE, echo = eval_num}

Data frame of the continuous variables

df_num <- df[ ,colnames_continuous, drop=FALSE]

```r
eval_cat <- FALSE
eval_catcol <- FALSE
if (exists("df_factor")){
  if (ncol(df_factor)>0){
     indices <- which(colnames(df_code) %in% colnames(df_factor))
     cat("\\# `Column names of selected categorical variables`")
     cat("\\newline ")
     cat("`colnames_categorical = ")
     cat(paste0("c(", paste(indices, collapse=','), ")`"))
     eval_cat <- TRUE
     eval_catcol <- (ncol(df_factor)>1)
  }
}

```{whites, eval=FALSE, echo = eval_cat}

Data frame of the categorical variables

df_factor <- df[ ,colnames_categorical, drop=FALSE]

```{whites, eval=FALSE, echo = eval_num}

# Continuous variables
## Descriptive statistics

### Take over summary from psych package and add new stats 
stats_new <- psych::describe(df_num)

### Drop some stats which we do not need  
stats_new <- as.data.frame(stats_new)
stats_new <- stats_new[c(-1,-6,-10,-13)]

### Add new stats
stats_new$Variable <- colnames(df_num)
stats_new$ntotal <- nrow(df_num)

### Missings
stats_new$miss <- sapply(df_num, function(col) sum(is.na(col)))

### Complete rate
stats_new$complete <- sapply(df_num, function(col) (1-(sum(is.na(col)) / nrow(df_num)))*100)

### N Unique
stats_new$N_Unique <- sapply(df_num, function(col) length(unique(na.omit(col))))

### CV
stats_new$CV <- sapply(df_num, function(col) {
  ifelse(any(col <= 0, na.rm=TRUE), "-", round((sd(col, na.rm=TRUE) / mean(col, na.rm=TRUE)),2))
  })

### Reorder columns 
stats_new <- stats_new[,c(10,11,12,1,13,14,2:9,15)]

### Column names 
colnames(stats_new) <- c("Variable", "N Obs", "N Missing", "N Valid", "% Complete", "N Unique", "Mean", 
                         "SD", "Median", "MAD", "MIN", "MAX", "Skewness", "Kurtosis", "CV")

### Order by variable name
stats_new <- stats_new[order(stats_new$Variable),]

### Output
knitr::kable(stats_new, digits=2, row.names = FALSE, format="simple")

```{whites, eval=FALSE, echo = eval_num}

Continuous variables

Descriptive graphics: Histograms One Per Page

Order by variable name

df_num_order <- df_num[,order(colnames(df_num)),drop=FALSE]

Function to plot histogram for each variable

single_hist <- function(x, main = "Histogram", ylab="Relative Frequency", xlab=NULL, freq=FALSE, bcol="#2fa42d", dcol=c("#396e9f","#396e9f"), dlty=c("dotted", "solid"), breaks=21) {

h <- hist(x, plot=FALSE, breaks=breaks) m <- mean(x, na.rm=TRUE) s <- sd(x, na.rm=TRUE) d <- density(x, na.rm=TRUE)

# Set nice x and y axis limits xlims <- pretty(c(floor(h$breaks[1]),ceiling(last(h$breaks)))) ymax <- max(h$density) dmax <- max(d$y) ymax <- max(ymax,dmax)

# Plots plot(h, freq=freq, ylim=c(0, ymax*1.2), ylab=ylab, xlab=xlab, main=main, col=bcol, xlim = c(min(xlims), max(xlims))) lines(d, lty=dlty[1], col=dcol[1]) curve(dnorm(x,m,s), add=TRUE, lty=dlty[2], col=dcol[2])

}

Loop over variables

for (i in 1:ncol(df_num)){ single_hist(df_num_order[,i], main = paste("Histogram of ", colnames(df_num_order[i]))) }

```{whites, eval=FALSE, echo = eval_numcol}
# Continuous variables
## Descriptive graphics: Histograms Summary
k <- ceiling(ncol(df_num)/20)-1
for (i in 0:k){
  m <- 20*i+1
  n <- min(20*(i+1),ncol(df_num))
  multi.hist(df_num_order[,m:n], dcol=c("#396e9f","#396e9f"), 
             bcol= "#2fa42d", 
             dlty=c("dotted", "solid"), 
             main = colnames(df_num_order[,m:n])) 
}

```{whites, eval=FALSE, echo = eval_num}

Continuous variables

Descriptive graphics: Box-Plot One Per Page

Loop over variables

for (i in 1:ncol(df_num)){ boxplot(df_num_order[,c(i)], col = "#2fa42d", main = paste("Boxplot of",colnames(df_num_order[i])), xlab=paste(colnames(df_num_order[i])), horizontal = TRUE) }

```{whites, eval=FALSE, echo = eval_numcol}
# Continuous variables
## Descriptive graphics: Box-Plots Summary

### Set graphical parameters 
par(mfrow=c(ceiling(sqrt(length(df_num_order))), ceiling(sqrt(length(df_num_order)))), 
    mar=c(1.5,1,2,1), oma=c(1,1,1,1))

### Loop over variables   
for(i in 1:ncol(df_num)){
  boxplot(df_num_order[,c(i)], col = "#2fa42d", main = colnames(df_num_order[i]), 
          xlab=paste(colnames(df_num_order[i])), xaxt="n", horizontal = TRUE)
} 

### Restore original graphical settings 
par(opar)

```{whites, eval=FALSE, echo = eval_num}

Continuous variables

Descriptive graphics: ECDF Plots One Per Page

Loop over variables

for (i in 1:ncol(df_num)){

data <- as.data.frame(df_num_order[,c(i)]) colnames(data) <- "variable"

# Plot ECDF step_function <- ecdf(data$variable) plot(step_function, main=paste("ECDF Plot of", colnames(df_num_order[i])), xlab=colnames(df_num_order[i]), ylab="ECDF", cex=0.7, col="#2fa42d", do.points=TRUE)

# Plot CDF of normal distribution data_mean<- mean(data$variable, na.rm=TRUE) data_sd<- sd(data$variable, na.rm=TRUE) curve(pnorm(x, data_mean,data_sd), from=qnorm(0.0001, mean=data_mean, sd=data_sd), to=qnorm(0.9999, mean=data_mean, sd=data_sd), add=TRUE, col="#396e9f", lwd=2) }

```{whites, eval=FALSE, echo = eval_numcol}
# Continuous variables
## Graphics: ECDF Plots Summary

### ECDF function 
ecdf_plot <- function(i){

  data <- as.data.frame(df_num_order[,c(i)])
  colnames(data)<-"variable"

  # Plot ECDF
  step_function <- ecdf(data$variable)
  ecdf_plot <- plot(step_function,
                  main = colnames(df_num_order[i]),
                  xlab = colnames(df_num_order[i]), ylab = "ECDF",
                  cex = 0.7, col="#2fa42d", do.points = FALSE)

  # Plot CDF of normal distribution
  data_mean <- mean(data$variable, na.rm=TRUE)
  data_sd <- sd(data$variable, na.rm=TRUE)
  curve(pnorm(x, data_mean,data_sd),
        from = qnorm(0.0001, mean = data_mean, sd = data_sd), 
        to = qnorm(0.9999, mean = data_mean, sd = data_sd), 
        add = TRUE, col="#396e9f", lwd=0.5,pch=1)
}


### Set graphical parameters 
par(mfrow=c(ceiling(sqrt(length(df_num_order))), ceiling(sqrt(length(df_num_order)))), 
    mar=c(1.5,1,2,1), oma=c(1,1,1,1))

### Loop over variables  
for(i in 1:ncol(df_num)) ecdf_plot(i)

### Restore original graphical settings 
par(opar)

```{whites, eval=FALSE, echo = eval_num}

Continuous variables

Graphics: QQ Plots One Per Page

Define function for the QQ-Plot

qq_plot <- function(i, main, xlab, ylab){ var <- df_num_order[,i] qqplot(x = qnorm(ppoints(var), mean = mean(var, na.rm = TRUE), sd = sd(var, na.rm = TRUE)), y = var, xlim = c(min(var, na.rm = TRUE), max(var, na.rm = TRUE)), ylim = c(min(var, na.rm = TRUE), max(var, na.rm = TRUE)), main = main, xlab = xlab, ylab = ylab, col = "#2fa42d", cex=0.7, pch=19 ) abline(a = 0, b = 1, col = "#396e9f", lwd = 2) grid() }

Loop over variables

for (i in 1:ncol(df_num)){ qq_plot(i, main = paste("QQ-Plot of", colnames(df_num_order[i])), xlab = "Theoretical Quantiles, Normal Distribution", ylab = paste("Sample Quantiles for ", colnames(df_num_order[i])) ) }

```{whites, eval=FALSE, echo = eval_numcol}
# Continuous variables
## Graphics: QQ Plots Summary
### Set graphical parameters 
par(mfrow=c(ceiling(sqrt(length(df_num_order))), 
            ceiling(sqrt(length(df_num_order)))), 
    mar=c(1.5,1,2,1), oma=c(1,1,1,1))

### Loop over variables   
for(i in 1:ncol(df_num)){
  qq_plot(i, colnames(df_num_order[i]), "", "")
}

### Restore original graphical settings 
par(opar)

```{whites, eval=FALSE, echo = eval_cat}

Categorical variables

Descriptive statistics: Totals

Totals statistics

miss <- sapply(df_factor, function(col) sum(is.na(col))) complete <- sapply(df_factor, function(col) (1-(sum(is.na(col)) / nrow(df_factor)))*100) complete <- round(complete,3) totals <- data.frame(miss, complete) totals$Variable <- rownames(totals) totals$ntotal <- nrow(df_factor) totals$valid <- totals$ntotal - totals$miss totals$N_Unique <- sapply(df_factor, function(col) length(unique(col))) totals <- totals[,c(3,4,1,5,2,6)] totals <- totals[order(totals$Variable),] colnames(totals) <- c("Variable", "N Obs", "N Missing", "N Valid", "% Complete","N Unique")

Output

kable(totals, digits=2, row.names = FALSE, format="simple")

```{whites, eval=FALSE, echo = eval_cat}
# Categorical variables 
## Descriptive statistics: Frequencies

### Function stats per variable
discrete <- function(i){

  # Calculate individual statistics
  count <- table(df_factor[,i], useNA="always")
  perc <- as.data.frame(prop.table(count))
  perc$Percent <- perc$Freq*100
  perc$Freq <- NULL

  # Merge to one dataframe
  freq <- merge(count, perc, by="Var1")
  freq$Variable <- rep(colnames(df_factor)[i],nrow(freq))
  freq <- freq[,c(4,1,2,3)]
  colnames(freq) <- c("Variable", "Category","Frequency", "Percent")

  # Rename missing category
  if(length(is.na(freq$Category))>0){
    levels(freq$Category) <- c(levels(freq$Category),"Missing")
    freq$Category[is.na(freq$Category)] <- "Missing"
  }

  # Sort 
  freq_order <- freq[order(-freq[,4],freq[,2]),]

  # Add category "All other values" in case of more than 20 categories
  min <- min(20, length(unique(df_factor[,i])))
  if(min==20){
    freq_order$Category <- as.character(freq_order$Category)
    freq_order <- rbind(freq_order[1:20,], 
                        c(colnames(df_factor)[i], as.character("****All Other Values****"),
                          sum(freq_order$Frequency[-c(1:20)]), sum(freq_order$Percent[-c(1:20)])))
  } else {
    freq_order <- freq_order[1:min,]
  }
  return(freq_order)
}

### Loop over variables
cat_table <- discrete(1)
for (i in 1:ncol(df_factor)){
  if (i>1){
    cat_i <- discrete(i)
    cat_table <- rbind(cat_table, cat_i)
  }
}

### Sort by variable name 
cat_table <- cat_table[order(cat_table$Variable),]
cat_table$Percent <- round(as.numeric(cat_table$Percent),2)

### Output  
kable(cat_table, digits=2, row.names = FALSE, format="simple")

```{whites, eval=FALSE, echo = eval_cat}

Categorical variables

Descriptive graphics: Bar-Plots One Per Page

Data frame sorted by column name

df_factor_order <- df_factor[,order(colnames(df_factor)), drop=FALSE]

Loop over variables

for (i in 1:ncol(df_factor)){ counts <- table(df_factor_order[i], useNA = "ifany") names(counts)[is.na(names(counts))] <- "Missing" counts <- counts[order(counts)]

# Plot by case (e.g. category names length) if (any(nchar(names(counts), type = "chars") >= 11) || length(counts) > 12){

 if(length(counts) > 40){
    # Bar-Plot with suppressed category names 
    par(mar = c(6,6,4.1, 2.1), mgp = c(3, 1, 0))
    barplot(counts, col = "#2fa42d", main = paste("Barplot of ", colnames(df_factor_order[i])), xaxt="n",
            ylab = "Frequency", cex.names = 0.6, las = 2, xlab = colnames(df_factor_order[i]),
            ylim = range(pretty(c(0,counts))))


  } else {
 # Bar-Plot with shortened category names          
   par(mar = c(8, 8, 4.1, 2.1), mgp = c(6, 1, 0))
   names(counts) <- substr(names(counts), 1, 15)
   barplot(counts, col = "#2fa42d", main = paste("Barplot of ", colnames(df_factor_order[i])), ylab = "Frequency", 
           cex.names = 0.65, xlab= colnames(df_factor_order[i]), las=2, ylim=range(pretty(c(0,counts))))

  }

} else { # Bar-Plot with full-length names par(mar = c(6,6, 4.1, 2.1), mgp = c(5, 1, 0)) barplot(counts, col = "#2fa42d", main = paste("Barplot of ", colnames(df_factor_order[i])), ylab = "Frequency", cex.names = 0.7, ylim=range(pretty(c(0,counts))), xlab= colnames(df_factor_order[i])) } }

```{whites, eval=FALSE, echo = eval_catcol}
# Categorical variables 
## Descriptive graphics: Bar-Plots Summary

### Function for Bar-Plot per variable 
plot_bar <- function(i){
  counts <- table(df_factor_order[i], useNA = "ifany")
  names(counts)[is.na(names(counts))] <- "Missing"
  names(counts)[names(counts)=="NA"] <- "Missing"
  counts <- counts[order(counts)]
  barplot(counts, col = "#2fa42d", main = colnames(df_factor_order[i]),
          ylab = "Frequency",  xaxt="n", ylim=range(pretty(c(0,counts))))
}

### Set graphical parameters 
par(mfrow=c(ceiling(sqrt(length(df_factor))), ceiling(sqrt(length(df_factor)))), 
    mar=c(1.5,1,2,1), oma= c(1,1,1,1))

### Loop over variables 
for(i in 1:ncol(df_factor)) plot_bar(i)

### Restore original graphical settings 
par(opar)

```{whites, eval=FALSE, echo = eval_cat}

Categorical variables

Descriptive graphics: Pie-Plots One Per Page

Function to create frequency table for each variable

freqtable <- function(col){

# Replace NA with "Missing" col[is.na(col)] <- "Missing"

# Create table with frequencies pie_table_unsorted <- as.data.frame(table(col)) pie_table_sorted <- pie_table_unsorted[order(pie_table_unsorted$Freq, decreasing=TRUE),] colnames(pie_table_sorted) <- c("Category", "Frequency")

# If more than 20 categories: summarize the smallest categories to one category if (nrow(pie_table_sorted)>20){ pie_table_sorted$Category <- as.character(pie_table_sorted$Category) pie_table_summarized <- rbind(pie_table_sorted[c(1:20),], c(as.character("All Other Values"), sum(pie_table_sorted$Frequency[-c(1:20)]))) pie_table_sorted <- pie_table_summarized } pie_table_sorted$RelFreq <- as.numeric(pie_table_sorted$Frequency) / length(col) return(pie_table_sorted) }

Plot function

plot_pie <- function(table, title, title_size, legend_pos){ # Direction of the legend if (max(nchar(as.character(table[,1])))>15){ legend = "vertical" } else { legend = "horizontal" } plot <- ggplot(table, aes(x = "", y = RelFreq, fill = reorder(Category, RelFreq))) + guides(fill = guide_legend(title="", reverse = FALSE, direction = legend)) + ggtitle(title) + geom_col() + geom_text(aes(label = scales::percent(RelFreq,accuracy = 0.01)),
position = position_stack(vjust = 0.5), size = 2) + coord_polar("y", start = 0) + theme(axis.title.x = element_blank(),axis.title.y = element_blank(), axis.ticks = element_blank(), panel.grid = element_blank(), axis.text = element_blank(), legend.position = legend_pos, panel.background = element_blank(), plot.title = title_size) return(plot) }

Loop over variables

for(i in 1:ncol(df_factor)){ table <- freqtable(df_factor_order[,i]) title <- paste("Pie Chart of ", colnames(df_factor_order[i])) title_size <- element_text(hjust = 0.5, face = "bold") legend_pos <- "right" print(plot_pie(table=table, title=title, title_size=title_size, legend_pos=legend_pos)) }

```{whites, eval=FALSE, echo = eval_catcol}
# Categorical variables 
## Descriptive graphics: Pie-Plots Summary

### Save variable plots in a list
plots <- list()
for (i in 1:ncol(df_factor)){
  title <- substr(colnames(df_factor_order[i]), 1, 19)
  title_size <- element_text(size = 30 / min(25, ceiling(sqrt(ncol(df_factor)))),
                             face = "bold", hjust = 0.5)
  legend_pos <- "none"
  table <- freqtable(df_factor_order[,i])
  plots[[i]] <- plot_pie(table=table, title=title, title_size=title_size, legend_pos=legend_pos)
}


### Summary Plot
grid.arrange(grobs = plots, ncol = ceiling(sqrt(ncol(df_factor_order))))

Any scripts or data that you put into this service are public.

Statsomat documentation built on Nov. 17, 2021, 5:17 p.m.

rdrr.io home R language documentation Run R code online

CRAN packages Bioconductor packages R-Forge packages GitHub packages

Note that we can't provide technical support on individual packages. You should contact the package authors for that.

Statsomat
Shiny Apps for Automated Data Analysis and Automated Interpretation

Exploratory Data Analysis (EDA)"
In Statsomat: Shiny Apps for Automated Data Analysis and Automated Interpretation

Drop empty rows

Data frame of the continuous variables

Data frame of the categorical variables

Continuous variables

Descriptive graphics: Histograms One Per Page

Order by variable name

Function to plot histogram for each variable

Loop over variables

Continuous variables

Descriptive graphics: Box-Plot One Per Page

Loop over variables

Continuous variables

Descriptive graphics: ECDF Plots One Per Page

Loop over variables

Continuous variables

Graphics: QQ Plots One Per Page

Define function for the QQ-Plot

Loop over variables

Categorical variables

Descriptive statistics: Totals

Totals statistics

Output

Categorical variables

Descriptive graphics: Bar-Plots One Per Page

Data frame sorted by column name

Loop over variables

Categorical variables

Descriptive graphics: Pie-Plots One Per Page

Function to create frequency table for each variable

Plot function

Loop over variables

Try the Statsomat package in your browser

R Package Documentation

Browse R Packages

We want your feedback!

Statsomat Shiny Apps for Automated Data Analysis and Automated Interpretation

Exploratory Data Analysis (EDA)" In Statsomat: Shiny Apps for Automated Data Analysis and Automated Interpretation

Drop empty rows

Data frame of the continuous variables

Data frame of the categorical variables

Continuous variables

Descriptive graphics: Histograms One Per Page

Order by variable name

Function to plot histogram for each variable

Loop over variables

Continuous variables

Descriptive graphics: Box-Plot One Per Page

Loop over variables

Continuous variables

Descriptive graphics: ECDF Plots One Per Page

Loop over variables

Continuous variables

Graphics: QQ Plots One Per Page

Define function for the QQ-Plot

Loop over variables

Categorical variables

Descriptive statistics: Totals

Totals statistics

Output

Categorical variables

Descriptive graphics: Bar-Plots One Per Page

Data frame sorted by column name

Loop over variables

Categorical variables

Descriptive graphics: Pie-Plots One Per Page

Function to create frequency table for each variable

Plot function

Loop over variables

Try the Statsomat package in your browser

R Package Documentation

Browse R Packages

We want your feedback!

Statsomat
Shiny Apps for Automated Data Analysis and Automated Interpretation

Exploratory Data Analysis (EDA)"
In Statsomat: Shiny Apps for Automated Data Analysis and Automated Interpretation